simprim, vat refactor, batchrange

3 years ago · 5566b80e52
5 changed files with 245 additions and 286 deletions
--- a/src/kyupy/init.py
+++ b/src/kyupy/init.py
@ -76,6 +76,11 @@ def hr_time(seconds):
    return s
 def batchrange(nitems, maxsize):
    for offset in range(0, nitems, maxsize):
        yield offset, min(nitems-offset, maxsize)
 class Timer:
    def __init__(self, s=0): self.s = s
    def __enter__(self): self.start_time = time.perf_counter(); return self
--- a/src/kyupy/logic_sim.py
+++ b/src/kyupy/logic_sim.py
@ -11,10 +11,8 @@ import math
 import numpy as np
 from . import numba, logic, hr_bytes, sim
 from .sim import SimOps, SimPrim
-
+class LogicSim(sim.SimOps):
-class LogicSim(SimOps):
    """A bit-parallel naïve combinational simulator for 2-, 4-, or 8-valued logic.
    :param circuit: The circuit to simulate.
@ -36,17 +34,17 @@ class LogicSim(SimOps):
        self.s = np.zeros((2, self.s_len, 3, nbytes), dtype=np.uint8)
        self.s[:,:,1,:] = 255  # unassigned
-        self.pi_s_locs = np.flatnonzero(self.vat[self.ppi_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0)
+        self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
-        self.po_s_locs = np.flatnonzero(self.vat[self.ppo_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0)
+        self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
        self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
        self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
        self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
-        self.pi_c_locs = self.vat[self.ppi_offset+self.pi_s_locs, 0]
+        self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs]
-        self.po_c_locs = self.vat[self.ppo_offset+self.po_s_locs, 0]
+        self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs]
-        self.ppi_c_locs = self.vat[self.ppi_offset+self.ppio_s_locs, 0]
+        self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs]
-        self.ppo_c_locs = self.vat[self.ppo_offset+self.ppio_s_locs, 0]
+        self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs]
        self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
        self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
@ -103,34 +101,34 @@ class LogicSim(SimOps):
        nbytes = (sims - 1) // 8 + 1
        if self.m == 2:
            if inject_cb is None:
-                _prop_cpu(self.ops, self.vat, self.c[...,:nbytes])
+                _prop_cpu(self.ops, self.c_locs, self.c[...,:nbytes])
            else:
                for op, o0, i0, i1, i2, i3 in self.ops:
-                    o0, i0, i1, i2, i3 = [self.vat[x,0] for x in (o0, i0, i1, i2, i3)]
+                    o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)]
-                    if op == SimPrim.BUF1: self.c[o0]=self.c[i0]
+                    if op == sim.BUF1: self.c[o0]=self.c[i0]
-                    elif op == SimPrim.INV1: self.c[o0] = ~self.c[i0]
+                    elif op == sim.INV1: self.c[o0] = ~self.c[i0]
-                    elif op == SimPrim.AND2: self.c[o0] = self.c[i0] & self.c[i1]
+                    elif op == sim.AND2: self.c[o0] = self.c[i0] & self.c[i1]
-                    elif op == SimPrim.NAND2: self.c[o0] = ~(self.c[i0] & self.c[i1])
+                    elif op == sim.NAND2: self.c[o0] = ~(self.c[i0] & self.c[i1])
-                    elif op == SimPrim.OR2: self.c[o0] = self.c[i0] | self.c[i1]
+                    elif op == sim.OR2: self.c[o0] = self.c[i0] | self.c[i1]
-                    elif op == SimPrim.NOR2: self.c[o0] = ~(self.c[i0] | self.c[i1])
+                    elif op == sim.NOR2: self.c[o0] = ~(self.c[i0] | self.c[i1])
-                    elif op == SimPrim.XOR2: self.c[o0] = self.c[i0] ^ self.c[i1]
+                    elif op == sim.XOR2: self.c[o0] = self.c[i0] ^ self.c[i1]
-                    elif op == SimPrim.XNOR2: self.c[o0] = ~(self.c[i0] ^ self.c[i1])
+                    elif op == sim.XNOR2: self.c[o0] = ~(self.c[i0] ^ self.c[i1])
-                    else: print(f'unknown SimPrim {op}')
+                    else: print(f'unknown sim {op}')
                    inject_cb(o0, self.s[o0])
        elif self.m == 4:
            pass
        else:
            for op, o0, i0, i1, i2, i3 in self.ops:
-                o0, i0, i1, i2, i3 = [self.vat[x,0] for x in (o0, i0, i1, i2, i3)]
+                o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)]
-                if op == SimPrim.BUF1: self.c[o0]=self.c[i0]
+                if op == sim.BUF1: self.c[o0]=self.c[i0]
-                elif op == SimPrim.INV1: logic.bp_not(self.c[o0], self.c[i0])
+                elif op == sim.INV1: logic.bp_not(self.c[o0], self.c[i0])
-                elif op == SimPrim.AND2: logic.bp_and(self.c[o0], self.c[i0], self.c[i1])
+                elif op == sim.AND2: logic.bp_and(self.c[o0], self.c[i0], self.c[i1])
-                elif op == SimPrim.NAND2: logic.bp_and(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
+                elif op == sim.NAND2: logic.bp_and(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
-                elif op == SimPrim.OR2: logic.bp_or(self.c[o0], self.c[i0], self.c[i1])
+                elif op == sim.OR2: logic.bp_or(self.c[o0], self.c[i0], self.c[i1])
-                elif op == SimPrim.NOR2: logic.bp_or(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
+                elif op == sim.NOR2: logic.bp_or(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
-                elif op == SimPrim.XOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1])
+                elif op == sim.XOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1])
-                elif op == SimPrim.XNOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
+                elif op == sim.XNOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
-                else: print(f'unknown SimPrim {op}')
+                else: print(f'unknown sim {op}')
                if inject_cb is not None: inject_cb(o0, self.s[o0])
    def s_ppo_to_ppi(self):
@ -159,9 +157,9 @@ class LogicSim(SimOps):
@numba.njit
-def _prop_cpu(ops, vat, c):
+def _prop_cpu(ops, c_locs, c):
    for op, o0, i0, i1, i2, i3 in ops:
-        o0, i0, i1, i2, i3 = [vat[x,0] for x in (o0, i0, i1, i2, i3)]
+        o0, i0, i1, i2, i3 = [c_locs[x] for x in (o0, i0, i1, i2, i3)]
        if op == sim.BUF1: c[o0]=c[i0]
        elif op == sim.INV1: c[o0] = ~c[i0]
        elif op == sim.AND2: c[o0] = c[i0] & c[i1]
@ -170,4 +168,4 @@ def _prop_cpu(ops, vat, c):
        elif op == sim.NOR2: c[o0] = ~(c[i0] | c[i1])
        elif op == sim.XOR2: c[o0] = c[i0] ^ c[i1]
        elif op == sim.XNOR2: c[o0] = ~(c[i0] ^ c[i1])
-        else: print(f'unknown SimPrim {op}')
+        else: print(f'unknown sim {op}')
--- a/src/kyupy/sim.py
+++ b/src/kyupy/sim.py
@ -4,117 +4,70 @@ from bisect import bisect, insort_left
 import numpy as np
-BUF1 = 0b1010_1010_1010_1010
+BUF1 = np.uint16(0b1010_1010_1010_1010)
-INV1 = 0b0101_0101_0101_0101
+INV1 = ~BUF1
-
+
-NAND4 = 0b0111_1111_1111_1111
+AND2 = np.uint16(0b1000_1000_1000_1000)
-NAND3 = 0b0111_1111_0111_1111
+AND3 = np.uint16(0b1000_0000_1000_0000)
-NAND2 = 0b0111_0111_0111_0111
+AND4 = np.uint16(0b1000_0000_0000_0000)
-
+
-NOR4 = 0b0000_0000_0000_0001
+NAND2, NAND3, NAND4 = ~AND2, ~AND3, ~AND4
-NOR3 = 0b0000_0001_0000_0001
+
-NOR2 = 0b0001_0001_0001_0001
+OR2 = np.uint16(0b1110_1110_1110_1110)
-
+OR3 = np.uint16(0b1111_1110_1111_1110)
-AND4 = 0b1000_0000_0000_0000
+OR4 = np.uint16(0b1111_1111_1111_1110)
-AND3 = 0b1000_0000_1000_0000
+
-AND2 = 0b1000_1000_1000_1000
+NOR2, NOR3, NOR4 = ~OR2, ~OR3, ~OR4
-
+
-OR4 = 0b1111_1111_1111_1110
+XOR2 = np.uint16(0b0110_0110_0110_0110)
-OR3 = 0b1111_1110_1111_1110
+XOR3 = np.uint16(0b1001_0110_1001_0110)
-OR2 = 0b1110_1110_1110_1110
+XOR4 = np.uint16(0b0110_1001_1001_0110)
-
+
-XOR4 = 0b0110_1001_1001_0110
+XNOR2, XNOR3, XNOR4 = ~XOR2, ~XOR3, ~XOR4
-XOR3 = 0b1001_0110_1001_0110
+
-XOR2 = 0b0110_0110_0110_0110
+AO21 = np.uint16(0b1110_1010_1110_1010)
-
+AO22 = np.uint16(0b1111_1000_1000_1000)
-XNOR4 = 0b1001_0110_0110_1001
+OA21 = np.uint16(0b1010_1000_1010_1000)
-XNOR3 = 0b0110_1001_0110_1001
+OA22 = np.uint16(0b1110_1110_1110_0000)
-XNOR2 = 0b1001_1001_1001_1001
+
-
+AOI21, AOI22, OAI21, OAI22 = ~AO21, ~AO22, ~OA21, ~OA22
-AO22 = 0b1111_1000_1000_1000
+
-AOI22 = 0b0000_0111_0111_0111
+MUX21 = np.uint16(0b1110_0100_1110_0100)
-AO21 = 0b1110_1010_1110_1010
+
-AOI21 = 0b0001_0101_0001_0101
+names = dict([(v, k) for k, v in globals().items() if isinstance(v, np.uint16)])
-OA22 = 0b1110_1110_1110_0000
+
-OAI22 = 0b0001_0001_0001_1111
+kind_prefixes = {
-OA21 = 0b1010_1000_1010_1000
+    'nand': (NAND4, NAND3, NAND2),
-OAI21 = 0b0101_0111_0101_0111
+    'nor': (NOR4, NOR3, NOR2),
-MUX21 = 0b1110_0100_1110_0100
+    'and': (AND4, AND3, AND2),
-
+    'or': (OR4, OR3, OR2),
-class SimPrim:
+    'xor': (XOR4, XOR3, XOR2),
-    BUF1 = 0b1010_1010_1010_1010
+    'xnor': (XNOR4, XNOR3, XNOR2),
-    INV1 = 0b0101_0101_0101_0101
+
-
+    'not': (INV1, INV1, INV1),
-    NAND4 = 0b0111_1111_1111_1111
+    'inv': (INV1, INV1, INV1),
-    NAND3 = 0b0111_1111_0111_1111
+    'ibuf': (INV1, INV1, INV1),
-    NAND2 = 0b0111_0111_0111_0111
+    '__const1__': (INV1, INV1, INV1),
-
+    'tieh': (INV1, INV1, INV1),
-    NOR4 = 0b0000_0000_0000_0001
+
-    NOR3 = 0b0000_0001_0000_0001
+    'buf': (BUF1, BUF1, BUF1),
-    NOR2 = 0b0001_0001_0001_0001
+    'nbuf': (BUF1, BUF1, BUF1),
-
+    'delln': (BUF1, BUF1, BUF1),
-    AND4 = 0b1000_0000_0000_0000
+    '__const0__': (BUF1, BUF1, BUF1),
-    AND3 = 0b1000_0000_1000_0000
+    'tiel': (BUF1, BUF1, BUF1),
-    AND2 = 0b1000_1000_1000_1000
+
-
+    'ao22': (AO22, AO22, AO22),
-    OR4 = 0b1111_1111_1111_1110
+    'aoi22': (AOI22, AOI22, AOI22),
-    OR3 = 0b1111_1110_1111_1110
+    'ao21': (AO21, AO21, AO21),
-    OR2 = 0b1110_1110_1110_1110
+    'aoi21': (AOI21, AOI21, AOI21),
-
+
-    XOR4 = 0b0110_1001_1001_0110
+    'oa22': (OA22, OA22, OA22),
-    XOR3 = 0b1001_0110_1001_0110
+    'oai22': (OAI22, OAI22, OAI22),
-    XOR2 = 0b0110_0110_0110_0110
+    'oa21': (OA21, OA21, OA21),
-
+    'oai21': (OAI21, OAI21, OAI21),
-    XNOR4 = 0b1001_0110_0110_1001
+
-    XNOR3 = 0b0110_1001_0110_1001
+    'mux21': (MUX21, MUX21, MUX21),
-    XNOR2 = 0b1001_1001_1001_1001
+}
-    AO22 = 0b1111_1000_1000_1000
-    AOI22 = 0b0000_0111_0111_0111
-    AO21 = 0b1110_1010_1110_1010
-    AOI21 = 0b0001_0101_0001_0101
-    OA22 = 0b1110_1110_1110_0000
-    OAI22 = 0b0001_0001_0001_1111
-    OA21 = 0b1010_1000_1010_1000
-    OAI21 = 0b0101_0111_0101_0111
-    MUX21 = 0b1110_0100_1110_0100
-    kind_prefixes = {
-        'nand': (NAND4, NAND3, NAND2),
-        'nor': (NOR4, NOR3, NOR2),
-        'and': (AND4, AND3, AND2),
-        'or': (OR4, OR3, OR2),
-        'xor': (XOR4, XOR3, XOR2),
-        'xnor': (XNOR4, XNOR3, XNOR2),
-        'not': (INV1, INV1, INV1),
-        'inv': (INV1, INV1, INV1),
-        'ibuf': (INV1, INV1, INV1),
-        '__const1__': (INV1, INV1, INV1),
-        'tieh': (INV1, INV1, INV1),
-        'buf': (BUF1, BUF1, BUF1),
-        'nbuf': (BUF1, BUF1, BUF1),
-        'delln': (BUF1, BUF1, BUF1),
-        '__const0__': (BUF1, BUF1, BUF1),
-        'tiel': (BUF1, BUF1, BUF1),
-        'ao22': (AO22, AO22, AO22),
-        'aoi22': (AOI22, AOI22, AOI22),
-        'ao21': (AO21, AO21, AO21),
-        'aoi21': (AOI21, AOI21, AOI21),
-        'oa22': (OA22, OA22, OA22),
-        'oai22': (OAI22, OAI22, OAI22),
-        'oa21': (OA21, OA21, OA21),
-        'oai21': (OAI21, OAI21, OAI21),
-        'mux21': (MUX21, MUX21, MUX21),
-    }
-    @classmethod
-    def names(cls):
-        return dict([(v, k) for k, v in cls.__dict__.items() if isinstance(v, int)])
 class Heap:
    def __init__(self):
@ -184,7 +137,7 @@ class Heap:
 class SimOps:
    """A static scheduler that translates a Circuit into a topologically sorted list of basic logic operations (self.ops) and
-    a value allocation table (self.vat) for use in simulators.
+    a memory mapping (self.c_locs, self.c_caps) for use in simulators.
    :param circuit: The circuit to create a schedule for.
    :param strip_forks: If enabled, the scheduler will not include fork nodes to save simulation time.
@ -203,12 +156,12 @@ class SimOps:
        if isinstance(c_caps, int):
            c_caps = [c_caps] * len(circuit.lines)
-        # indices for state allocation table (sat)
+        # special locations and offsets in c_locs/c_caps
        self.zero_idx = len(circuit.lines)
        self.tmp_idx = self.zero_idx + 1
        self.ppi_offset = self.tmp_idx + 1
        self.ppo_offset = self.ppi_offset + len(self.s_nodes)
-        self.vat_len = self.ppo_offset + len(self.s_nodes)
+        self.c_locs_len = self.ppo_offset + len(self.s_nodes)
        # translate circuit structure into self.ops
        ops = []
@ -217,14 +170,14 @@ class SimOps:
            if n in interface_dict:
                inp_idx = self.ppi_offset + interface_dict[n]
                if len(n.outs) > 0 and n.outs[0] is not None:  # first output of a PI/PPI
-                    ops.append((SimPrim.BUF1, n.outs[0].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx)) 
+                    ops.append((BUF1, n.outs[0].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx)) 
                if 'dff' in n.kind.lower():  # second output of DFF is inverted
                    if len(n.outs) > 1 and n.outs[1] is not None:
-                        ops.append((SimPrim.INV1, n.outs[1].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx))
+                        ops.append((INV1, n.outs[1].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx))
                else:  # if not DFF, no output is inverted.
                    for o_line in n.outs[1:]:
                        if o_line is not None:
-                            ops.append((SimPrim.BUF1, o_line.index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx))
+                            ops.append((BUF1, o_line.index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx))
                continue
            # regular node, not PI/PPI or PO/PPO
            o0_idx = n.outs[0].index if len(n.outs) > 0 and n.outs[0] is not None else self.tmp_idx
@ -237,10 +190,10 @@ class SimOps:
                if not strip_forks:
                    for o_line in n.outs:
                        if o_line is not None:
-                            ops.append((SimPrim.BUF1, o_line.index, i0_idx, i1_idx, i2_idx, i3_idx))
+                            ops.append((BUF1, o_line.index, i0_idx, i1_idx, i2_idx, i3_idx))
                continue
            sp = None
-            for prefix, prims in SimPrim.kind_prefixes.items():
+            for prefix, prims in kind_prefixes.items():
                if kind.startswith(prefix):
                    sp = prims[0]
                    if i3_idx == self.zero_idx:
@ -256,7 +209,7 @@ class SimOps:
        self.ops = np.asarray(ops, dtype='int32')
        # create a map from fanout lines to stem lines for fork stripping
-        stems = np.zeros(self.vat_len, dtype='int32') - 1  # default to -1: 'no fanout line'
+        stems = np.zeros(self.c_locs_len, dtype='int32') - 1  # default to -1: 'no fanout line'
        if strip_forks:
            for f in circuit.forks.values():
                prev_line = f.ins[0]
@ -267,8 +220,8 @@ class SimOps:
                    stems[ol] = stem_idx
        # calculate level (distance from PI/PPI) and reference count for each line
-        levels = np.zeros(self.vat_len, dtype='int32')
+        levels = np.zeros(self.c_locs_len, dtype='int32')
-        ref_count = np.zeros(self.vat_len, dtype='int32')
+        ref_count = np.zeros(self.c_locs_len, dtype='int32')
        level_starts = [0]
        current_level = 1
        for i, op in enumerate(self.ops):
@ -289,21 +242,21 @@ class SimOps:
        self.level_stops = np.asarray(level_starts[1:] + [len(self.ops)], dtype='int32')
        # state allocation table. maps line and interface indices to self.state memory locations
-        self.vat = np.zeros((self.vat_len, 3), dtype='int')
+        self.c_locs = np.full((self.c_locs_len,), -1, dtype=np.int32)
-        self.vat[:, 0] = -1
+        self.c_caps = np.zeros((self.c_locs_len,), dtype=np.int32)
        h = Heap()
        # allocate and keep memory for special fields
-        self.vat[self.zero_idx] = h.alloc(1), 1, 0
+        self.c_locs[self.zero_idx], self.c_caps[self.zero_idx] = h.alloc(1), 1
-        self.vat[self.tmp_idx] = h.alloc(1), 1, 0
+        self.c_locs[self.tmp_idx], self.c_caps[self.tmp_idx] = h.alloc(1), 1
        ref_count[self.zero_idx] += 1
        ref_count[self.tmp_idx] += 1
        # allocate and keep memory for PI/PPI, keep memory for PO/PPO (allocated later)
        for i, n in enumerate(self.s_nodes):
            if len(n.outs) > 0:
-                self.vat[self.ppi_offset + i] = h.alloc(1), 1, 0
+                self.c_locs[self.ppi_offset + i], self.c_caps[self.ppi_offset + i] = h.alloc(1), 1
                ref_count[self.ppi_offset + i] += 1
            if len(n.ins) > 0:
                i0_idx = stems[n.ins[0]] if stems[n.ins[0]] >= 0 else n.ins[0]
@ -322,13 +275,13 @@ class SimOps:
                ref_count[i1_idx] -= 1
                ref_count[i2_idx] -= 1
                ref_count[i3_idx] -= 1
-                if ref_count[i0_idx] <= 0: free_list.append(self.vat[i0_idx, 0])
+                if ref_count[i0_idx] <= 0: free_list.append(self.c_locs[i0_idx])
-                if ref_count[i1_idx] <= 0: free_list.append(self.vat[i1_idx, 0])
+                if ref_count[i1_idx] <= 0: free_list.append(self.c_locs[i1_idx])
-                if ref_count[i2_idx] <= 0: free_list.append(self.vat[i2_idx, 0])
+                if ref_count[i2_idx] <= 0: free_list.append(self.c_locs[i2_idx])
-                if ref_count[i3_idx] <= 0: free_list.append(self.vat[i3_idx, 0])
+                if ref_count[i3_idx] <= 0: free_list.append(self.c_locs[i3_idx])
                o_idx = op[1]
                cap = c_caps[o_idx]
-                self.vat[o_idx] = h.alloc(cap), cap, 0
+                self.c_locs[o_idx], self.c_caps[o_idx] = h.alloc(cap), cap
            if not keep_signals:
                for loc in free_list:
                    h.free(loc)
@ -336,16 +289,15 @@ class SimOps:
        # copy memory location and capacity from stems to fanout lines
        for lidx, stem in enumerate(stems):
            if stem >= 0:  # if at a fanout line
-                self.vat[lidx] = self.vat[stem]
+                self.c_locs[lidx], self.c_caps[lidx] = self.c_locs[stem], self.c_caps[stem]
        # copy memory location to PO/PPO area
        for i, n in enumerate(self.s_nodes):
            if len(n.ins) > 0:
-                self.vat[self.ppo_offset + i] = self.vat[n.ins[0]]
+                self.c_locs[self.ppo_offset + i], self.c_caps[self.ppo_offset + i] = self.c_locs[n.ins[0]], self.c_caps[n.ins[0]]
        self.c_len = h.max_size
        from collections import defaultdict
        self.prim_counts = defaultdict(int)
-        names_dict = SimPrim.names()
+        for op, _, _, _, _, _ in self.ops: self.prim_counts[names[op]] += 1
-        for op, _, _, _, _, _ in self.ops: self.prim_counts[names_dict[op]] += 1
--- a/src/kyupy/wave_sim.py
+++ b/src/kyupy/wave_sim.py
@ -16,8 +16,7 @@ import math
 import numpy as np
-from . import numba, cuda, hr_bytes
+from . import numba, cuda, hr_bytes, sim
 from .sim import SimOps
 TMAX = np.float32(2 ** 127)
@ -29,7 +28,7 @@ TMIN = np.float32(-2 ** 127)
 """A large negative 32-bit floating point value used at the beginning of waveforms that start with logic-1."""
-class WaveSim(SimOps):
+class WaveSim(sim.SimOps):
    """A waveform-based combinational logic timing simulator running on CPU.
    :param circuit: The circuit to simulate.
@ -54,30 +53,31 @@ class WaveSim(SimOps):
        self.sims = sims
        self.c_len *= 4
-        self.vat[...,0:2] *= 4
+        self.c_locs[...] *= 4
        self.c_caps[...] *= 4
-        self.timing = np.zeros((self.vat_len, 2, 2))
+        self.timing = np.zeros((self.c_locs_len, 2, 2))
        self.timing[:len(timing)] = timing
        self.c = np.zeros((self.c_len, sims), dtype=np.float32) + TMAX
-        self.s = np.zeros((len(self.s_nodes), sims, 11), dtype=np.float32)
+        self.s = np.zeros((11, self.s_len, sims), dtype=np.float32)
        """Information about the logic values and transitions around the sequential elements (flip-flops) and ports.
        The first 3 values are read by ``s_to_c()``.
        The remaining values are written by ``c_to_s()``.
        The elements are as follows:
-        * ``s[..., 0]`` (P)PI initial value
+        * ``s[0]`` (P)PI initial value
-        * ``s[..., 1]`` (P)PI transition time
+        * ``s[1]`` (P)PI transition time
-        * ``s[..., 2]`` (P)PI final value
+        * ``s[2]`` (P)PI final value
-        * ``s[..., 3]`` (P)PO initial value
+        * ``s[3]`` (P)PO initial value
-        * ``s[..., 4]`` (P)PO earliest arrival time (EAT): The time at which the output transitioned from its initial value.
+        * ``s[4]`` (P)PO earliest arrival time (EAT): The time at which the output transitioned from its initial value.
-        * ``s[..., 5]`` (P)PO latest stabilization time (LST): The time at which the output settled to its final value.
+        * ``s[5]`` (P)PO latest stabilization time (LST): The time at which the output settled to its final value.
-        * ``s[..., 6]`` (P)PO final value
+        * ``s[6]`` (P)PO final value
-        * ``s[..., 7]`` (P)PO capture value: probability of capturing a 1 at a given capture time
+        * ``s[7]`` (P)PO capture value: probability of capturing a 1 at a given capture time
-        * ``s[..., 8]`` (P)PO sampled capture value: decided by random sampling according to a given seed.
+        * ``s[8]`` (P)PO sampled capture value: decided by random sampling according to a given seed.
-        * ``s[..., 9]`` (P)PO sampled capture slack: (capture time - LST) - decided by random sampling according to a given seed.
+        * ``s[9]`` (P)PO sampled capture slack: (capture time - LST) - decided by random sampling according to a given seed.
-        * ``s[..., 10]`` Overflow indicator: If non-zero, some signals in the input cone of this output had more
+        * ``s[10]`` Overflow indicator: If non-zero, some signals in the input cone of this output had more
          transitions than specified in ``c_caps``. Some transitions have been discarded, the
          final values in the waveforms are still valid.
        """
@ -85,19 +85,19 @@ class WaveSim(SimOps):
        self.params = np.zeros((sims, 4), dtype=np.float32)
        self.params[...,0] = 1.0
-        self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.vat, self.ops, self.params)])
+        self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.c_locs, self.c_caps, self.ops, self.params)])
-        self.pi_s_locs = np.flatnonzero(self.vat[self.ppi_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0)
+        self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
-        self.po_s_locs = np.flatnonzero(self.vat[self.ppo_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0)
+        self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
        self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
        self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
        self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
-        self.pi_c_locs = self.vat[self.ppi_offset+self.pi_s_locs, 0]
+        self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs]
-        self.po_c_locs = self.vat[self.ppo_offset+self.po_s_locs, 0]
+        self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs]
-        self.ppi_c_locs = self.vat[self.ppi_offset+self.ppio_s_locs, 0]
+        self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs]
-        self.ppo_c_locs = self.vat[self.ppo_offset+self.ppio_s_locs, 0]
+        self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs]
        self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
        self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
@ -112,7 +112,7 @@ class WaveSim(SimOps):
        Based on the data in ``self.s``, waveforms are generated on the input lines of the circuit.
        It modifies ``self.c``.
        """
-        sins = np.moveaxis(self.s[self.pippi_s_locs], -1, 0)
+        sins = self.s[:, self.pippi_s_locs]
        cond = (sins[2] != 0) + 2*(sins[0] != 0)  # choices order: 0 R F 1
        self.c[self.pippi_c_locs] = np.choose(cond, [TMAX, sins[1], TMIN, TMIN])
        self.c[self.pippi_c_locs+1] = np.choose(cond, [TMAX, TMAX, sins[1], TMAX])
@ -127,7 +127,7 @@ class WaveSim(SimOps):
        """
        sims = min(sims or self.sims, self.sims)
        for op_start, op_stop in zip(self.level_starts, self.level_stops):
-            level_eval_cpu(self.ops, op_start, op_stop, self.c, self.vat, 0, sims,
+            level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, 0, sims,
                                         self.timing, self.params, sd, seed)
    def c_to_s(self, time=TMAX, sd=0.0, seed=1):
@ -140,9 +140,9 @@ class WaveSim(SimOps):
        :param sd: A standard deviation for uncertainty in the actual capture time.
        :param seed: The random seed for a capture with uncertainty.
        """
-        for s_loc, (c_loc, c_len, _) in zip(self.poppo_s_locs, self.vat[self.ppo_offset+self.poppo_s_locs]):
+        for s_loc, c_loc, c_len in zip(self.poppo_s_locs, self.c_locs[self.ppo_offset+self.poppo_s_locs], self.c_caps[self.ppo_offset+self.poppo_s_locs]):
            for vector in range(self.sims):
-                self.s[s_loc, vector, 3:] = wave_capture_cpu(self.c, c_loc, c_len, vector, time=time, sd=sd, seed=seed)
+                self.s[3:, s_loc, vector] = wave_capture_cpu(self.c, c_loc, c_len, vector, time=time, sd=sd, seed=seed)
    def s_ppo_to_ppi(self, time=0.0):
        """Re-assigns the last sampled capture to the appropriate pseudo-primary inputs (PPI). 
@ -151,9 +151,9 @@ class WaveSim(SimOps):
        :param time: The transition time at the inputs (usually 0.0).
        """
-        self.s[self.ppio_s_locs, :, 0] = self.s[self.ppio_s_locs, :, 2]
+        self.s[0, self.ppio_s_locs] = self.s[2, self.ppio_s_locs]
-        self.s[self.ppio_s_locs, :, 1] = time
+        self.s[1, self.ppio_s_locs] = time
-        self.s[self.ppio_s_locs, :, 2] = self.s[self.ppio_s_locs, :, 8]
+        self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs]
@numba.njit
@ -173,7 +173,7 @@ def rand_gauss_cpu(seed, sd):
@numba.njit
-def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
+def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, seed=0):
    lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
    # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>>
@ -181,11 +181,12 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
-    a_mem = vat[a_idx, 0]
+    a_mem = c_locs[a_idx]
-    b_mem = vat[b_idx, 0]
+    b_mem = c_locs[b_idx]
-    c_mem = vat[c_idx, 0]
+    c_mem = c_locs[c_idx]
-    d_mem = vat[d_idx, 0]
+    d_mem = c_locs[d_idx]
-    z_mem, z_cap, _ = vat[z_idx]
+    z_mem = c_locs[z_idx]
    z_cap = c_caps[z_idx]
    a_cur = int(0)
    b_cur = int(0)
@ -280,12 +281,12 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
@numba.njit
-def level_eval_cpu(ops, op_start, op_stop, c, vat, st_start, st_stop, line_times, params, sd, seed):
+def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, st_start, st_stop, line_times, params, sd, seed):
    overflows = 0
    for op_idx in range(op_start, op_stop):
        op = ops[op_idx]
        for st_idx in range(st_start, st_stop):
-            wave_eval_cpu(op, c, vat, st_idx, line_times, params[st_idx], sd, seed)
+            wave_eval_cpu(op, c, c_locs, c_caps, st_idx, line_times, params[st_idx], sd, seed)
@numba.njit
@ -347,7 +348,8 @@ class WaveSimCuda(WaveSim):
        self.c = cuda.to_device(self.c)
        self.s = cuda.to_device(self.s)
        self.ops = cuda.to_device(self.ops)
-        self.vat = cuda.to_device(self.vat)
+        self.c_locs = cuda.to_device(self.c_locs)
        self.c_caps = cuda.to_device(self.c_caps)
        self.timing = cuda.to_device(self.timing)
        self.params = cuda.to_device(self.params)
@ -355,7 +357,7 @@ class WaveSimCuda(WaveSim):
    def s_to_c(self):
        grid_dim = self._grid_dim(self.sims, self.s_len)
-        wave_assign_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppi_offset)
+        wave_assign_gpu[grid_dim, self._block_dim](self.c, self.s, self.c_locs, self.ppi_offset)
    def _grid_dim(self, x, y):
        gx = math.ceil(x / self._block_dim[0])
@ -366,29 +368,29 @@ class WaveSimCuda(WaveSim):
        sims = min(sims or self.sims, self.sims)
        for op_start, op_stop in zip(self.level_starts, self.level_stops):
            grid_dim = self._grid_dim(sims, op_stop - op_start)
-            wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.vat, int(0),
+            wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, int(0),
                sims, self.timing, self.params, sd, seed)
        cuda.synchronize()
    def c_to_s(self, time=TMAX, sd=0.0, seed=1):
        grid_dim = self._grid_dim(self.sims, self.s_len)
-        wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppo_offset,
+        wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.c_locs, self.c_caps, self.ppo_offset,
            time, sd * math.sqrt(2), seed)
    def s_ppo_to_ppi(self, time=0.0):
        grid_dim = self._grid_dim(self.sims, self.s_len)
-        ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.vat, time, self.ppi_offset, self.ppo_offset)
+        ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.c_locs, time, self.ppi_offset, self.ppo_offset)
@cuda.jit()
-def wave_assign_gpu(c, s, vat, ppi_offset):
+def wave_assign_gpu(c, s, c_locs, ppi_offset):
    x, y = cuda.grid(2)
-    if y >= len(s): return
+    if y >= s.shape[1]: return
-    c_loc, c_len, _ = vat[ppi_offset + y]
+    c_loc = c_locs[ppi_offset + y]
    if c_loc < 0: return
    if x >= c.shape[-1]: return
-    value = int(s[y, x, 2] >= 0.5) | (2*int(s[y, x, 0] >= 0.5))
+    value = int(s[2, y, x] >= 0.5) | (2*int(s[0, y, x] >= 0.5))
-    ttime = s[y, x, 1]
+    ttime = s[1, y, x]
    if value == 0:
        c[c_loc, x] = TMAX
        c[c_loc+1, x] = TMAX
@ -421,7 +423,7 @@ def rand_gauss_gpu(seed, sd):
@cuda.jit()
-def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_times, param, sd, seed):
+def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, line_times, param, sd, seed):
    x, y = cuda.grid(2)
    st_idx = st_start + x
    op_idx = op_start + y
@ -442,11 +444,12 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim
    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
-    a_mem = vat[a_idx, 0]
+    a_mem = c_locs[a_idx]
-    b_mem = vat[b_idx, 0]
+    b_mem = c_locs[b_idx]
-    c_mem = vat[c_idx, 0]
+    c_mem = c_locs[c_idx]
-    d_mem = vat[d_idx, 0]
+    d_mem = c_locs[d_idx]
-    z_mem, z_cap, _ = vat[z_idx]
+    z_mem = c_locs[z_idx]
    z_cap = c_caps[z_idx]
    a_cur = int(0)
    b_cur = int(0)
@ -541,10 +544,11 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim
@cuda.jit()
-def wave_capture_gpu(c, s, vat, ppo_offset, time, s_sqrt2, seed):
+def wave_capture_gpu(c, s, c_locs, c_caps, ppo_offset, time, s_sqrt2, seed):
    x, y = cuda.grid(2)
-    if ppo_offset + y >= len(vat): return
+    if ppo_offset + y >= len(c_locs): return
-    line, tdim, _ = vat[ppo_offset + y]
+    line = c_locs[ppo_offset + y]
    tdim = c_caps[ppo_offset + y] 
    if line < 0: return
    if x >= c.shape[-1]: return
    vector = x
@ -588,25 +592,25 @@ def wave_capture_gpu(c, s, vat, ppo_offset, time, s_sqrt2, seed):
    else:
        acc = val
-    s[y, vector, 3] = (c[line, vector] <= TMIN)
+    s[3, y, vector] = (c[line, vector] <= TMIN)
-    s[y, vector, 4] = eat
+    s[4, y, vector] = eat
-    s[y, vector, 5] = lst
+    s[5, y, vector] = lst
-    s[y, vector, 6] = final
+    s[6, y, vector] = final
-    s[y, vector, 7] = acc
+    s[7, y, vector] = acc
-    s[y, vector, 8] = val
+    s[8, y, vector] = val
-    s[y, vector, 9] = 0  # TODO
+    s[9, y, vector] = 0  # TODO
-    s[y, vector, 10] = ovl
+    s[10, y, vector] = ovl
@cuda.jit()
def ppo_to_ppi_gpu(s, c_locs, time, ppi_offset, ppo_offset):
    """Update the leading components of the state array ``s`` in place.

    Each thread handles one ``(y, x)`` pair, where ``s`` is indexed as
    ``s[component, y, x]``. For every pair whose location entries at both
    ``ppi_offset + y`` and ``ppo_offset + y`` are non-negative:

    * component 0 receives the current component 2,
    * component 1 receives ``time``,
    * component 2 receives component 8.

    :param s: State array indexed ``[component, y, x]``; modified in place.
    :param c_locs: Location table indexed by ``ppi_offset + y`` and
        ``ppo_offset + y``; a negative entry causes the pair to be skipped.
    :param time: Value written to component 1 of every updated entry.
    :param ppi_offset: Base index into ``c_locs`` for the first lookup.
    :param ppo_offset: Base index into ``c_locs`` for the second lookup.
    """
    x, y = cuda.grid(2)
    # Axis 0 of ``s`` is the component axis. The thread indices address
    # axes 1 (y) and 2 (x), so those are the axes that must bound them.
    if y >= s.shape[1]: return
    if x >= s.shape[2]: return
    if c_locs[ppi_offset + y] < 0: return
    if c_locs[ppo_offset + y] < 0: return
    s[0, y, x] = s[2, y, x]
    s[1, y, x] = time
    s[2, y, x] = s[8, y, x]
--- a/tests/test_wave_sim.py
+++ b/tests/test_wave_sim.py
@ -2,17 +2,17 @@ import numpy as np
 from kyupy.wave_sim import WaveSim, WaveSimCuda, wave_eval_cpu, TMIN, TMAX
 from kyupy.logic_sim import LogicSim
-from kyupy import verilog, sdf, logic, bench
+from kyupy import logic, bench, sim
 from kyupy.logic import mvarray
 from kyupy.sim import SimPrim
 def test_nand_delays():
-    op = (SimPrim.NAND4, 4, 0, 1, 2, 3)
+    op = (sim.NAND4, 4, 0, 1, 2, 3)
    #op = (0b0111, 4, 0, 1)
    c = np.full((5*16, 1), TMAX)  # 5 waveforms of capacity 16
-    vat = np.zeros((5, 3), dtype='int')
+    c_locs = np.zeros((5,), dtype='int')
-    for i in range(5): vat[i] = i*16, 16, 0  # 1:1 mapping
+    c_caps = np.zeros((5,), dtype='int')
    for i in range(5): c_locs[i], c_caps[i] = i*16, 16  # 1:1 mapping
    # SDF specifies IOPATH delays with respect to output polarity
    # SDF pulse rejection value is determined by IOPATH causing last transition and polarity of last transition
@ -32,7 +32,7 @@ def test_nand_delays():
    def wave_assert(inputs, output):
        for i, a in zip(inputs, c.reshape(-1,16)): a[:len(i)] = i
-        wave_eval_cpu(op, c, vat, 0, line_times, sdata)
+        wave_eval_cpu(op, c, c_locs, c_caps, 0, line_times, sdata)
        for i, v in enumerate(output): np.testing.assert_allclose(c.reshape(-1,16)[4,i], v)
    wave_assert([[TMAX,TMAX],[TMAX,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # NAND(0,0,1,1) => 1
@ -53,29 +53,29 @@ def test_tiny_circuit():
    lt = np.zeros((len(c.lines), 2, 2))
    lt[:,0,:] = 1.0  # unit delay for all lines
    wsim = WaveSim(c, lt)
-    assert len(wsim.s) == 5
+    assert wsim.s.shape[1] == 5
    # values for x
-    wsim.s[0,0,:3] = 0, 0.1, 0
+    wsim.s[:3,0,0] = 0, 0.1, 0
-    wsim.s[0,1,:3] = 0, 0.2, 1
+    wsim.s[:3,0,1] = 0, 0.2, 1
-    wsim.s[0,2,:3] = 1, 0.3, 0
+    wsim.s[:3,0,2] = 1, 0.3, 0
-    wsim.s[0,3,:3] = 1, 0.4, 1
+    wsim.s[:3,0,3] = 1, 0.4, 1
    # values for y
-    wsim.s[1,0,:3] = 1, 0.5, 0
+    wsim.s[:3,1,0] = 1, 0.5, 0
-    wsim.s[1,1,:3] = 1, 0.6, 0
+    wsim.s[:3,1,1] = 1, 0.6, 0
-    wsim.s[1,2,:3] = 1, 0.7, 0
+    wsim.s[:3,1,2] = 1, 0.7, 0
-    wsim.s[1,3,:3] = 0, 0.8, 1
+    wsim.s[:3,1,3] = 0, 0.8, 1
    wsim.s_to_c()
-    x_c_loc = wsim.vat[wsim.ppi_offset+0, 0] # check x waveforms
+    x_c_loc = wsim.c_locs[wsim.ppi_offset+0] # check x waveforms
    np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 0], [TMAX, TMAX, TMAX])
    np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 1], [0.2, TMAX, TMAX])
    np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 2], [TMIN, 0.3, TMAX])
    np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 3], [TMIN, TMAX, TMAX])
-    y_c_loc = wsim.vat[wsim.ppi_offset+1, 0] # check y waveforms
+    y_c_loc = wsim.c_locs[wsim.ppi_offset+1] # check y waveforms
    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 0], [TMIN, 0.5, TMAX])
    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 1], [TMIN, 0.6, TMAX])
    np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 2], [TMIN, 0.7, TMAX])
@ -83,19 +83,19 @@ def test_tiny_circuit():
    wsim.c_prop()
-    a_c_loc = wsim.vat[wsim.ppo_offset+2, 0] # check a waveforms
+    a_c_loc = wsim.c_locs[wsim.ppo_offset+2] # check a waveforms
    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 0], [TMAX, TMAX, TMAX])
    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 1], [1.2, 1.6, TMAX])
    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 2], [TMIN, 1.3, TMAX])
    np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 3], [1.8, TMAX, TMAX])
-    o_c_loc = wsim.vat[wsim.ppo_offset+3, 0] # check o waveforms
+    o_c_loc = wsim.c_locs[wsim.ppo_offset+3] # check o waveforms
    np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 0], [TMIN, 1.5, TMAX])
    np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 1], [TMIN, TMAX, TMAX])
    np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 2], [TMIN, 1.7, TMAX])
    np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 3], [TMIN, TMAX, TMAX])
-    n_c_loc = wsim.vat[wsim.ppo_offset+4, 0] # check n waveforms
+    n_c_loc = wsim.c_locs[wsim.ppo_offset+4] # check n waveforms
    np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 0], [TMIN, TMAX, TMAX])
    np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 1], [TMIN, 1.2, TMAX])
    np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 2], [1.3, TMAX, TMAX])
@ -104,22 +104,22 @@ def test_tiny_circuit():
    wsim.c_to_s()
    # check a captures
-    np.testing.assert_allclose(wsim.s[2, 0, 3:7], [0, TMAX, TMIN, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 2, 0], [0, TMAX, TMIN, 0])
-    np.testing.assert_allclose(wsim.s[2, 1, 3:7], [0, 1.2, 1.6, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 2, 1], [0, 1.2, 1.6, 0])
-    np.testing.assert_allclose(wsim.s[2, 2, 3:7], [1, 1.3, 1.3, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 2, 2], [1, 1.3, 1.3, 0])
-    np.testing.assert_allclose(wsim.s[2, 3, 3:7], [0, 1.8, 1.8, 1])
+    np.testing.assert_allclose(wsim.s[3:7, 2, 3], [0, 1.8, 1.8, 1])
    # check o captures                
-    np.testing.assert_allclose(wsim.s[3, 0, 3:7], [1, 1.5, 1.5, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 3, 0], [1, 1.5, 1.5, 0])
-    np.testing.assert_allclose(wsim.s[3, 1, 3:7], [1, TMAX, TMIN, 1])
+    np.testing.assert_allclose(wsim.s[3:7, 3, 1], [1, TMAX, TMIN, 1])
-    np.testing.assert_allclose(wsim.s[3, 2, 3:7], [1, 1.7, 1.7, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 3, 2], [1, 1.7, 1.7, 0])
-    np.testing.assert_allclose(wsim.s[3, 3, 3:7], [1, TMAX, TMIN, 1])
+    np.testing.assert_allclose(wsim.s[3:7, 3, 3], [1, TMAX, TMIN, 1])
    # check n captures
-    np.testing.assert_allclose(wsim.s[4, 0, 3:7], [1, TMAX, TMIN, 1])
+    np.testing.assert_allclose(wsim.s[3:7, 4, 0], [1, TMAX, TMIN, 1])
-    np.testing.assert_allclose(wsim.s[4, 1, 3:7], [1, 1.2, 1.2, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 4, 1], [1, 1.2, 1.2, 0])
-    np.testing.assert_allclose(wsim.s[4, 2, 3:7], [0, 1.3, 1.3, 1])
+    np.testing.assert_allclose(wsim.s[3:7, 4, 2], [0, 1.3, 1.3, 1])
-    np.testing.assert_allclose(wsim.s[4, 3, 3:7], [0, TMAX, TMIN, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 4, 3], [0, TMAX, TMIN, 0])
 def compare_to_logic_sim(wsim: WaveSim):
@ -127,17 +127,17 @@ def compare_to_logic_sim(wsim: WaveSim):
    rng = np.random.default_rng(10)
    tests = rng.choice(choices, (wsim.s_len, wsim.sims))
-    wsim.s[:, :, 0] = (tests & 2) >> 1
+    wsim.s[0] = (tests & 2) >> 1
-    wsim.s[:, :, 3] = (tests & 2) >> 1
+    wsim.s[3] = (tests & 2) >> 1
-    wsim.s[:, :, 1] = 0.0
+    wsim.s[1] = 0.0
-    wsim.s[:, :, 2] = tests & 1
+    wsim.s[2] = tests & 1
-    wsim.s[:, :, 6] = tests & 1
+    wsim.s[6] = tests & 1
    wsim.s_to_c()
    wsim.c_prop()
    wsim.c_to_s()
-    resp = np.array(wsim.s[:, :, 6], dtype=np.uint8) | (np.array(wsim.s[:, :, 3], dtype=np.uint8)<<1)
+    resp = np.array(wsim.s[6], dtype=np.uint8) | (np.array(wsim.s[3], dtype=np.uint8)<<1)
    resp |= ((resp ^ (resp >> 1)) & 1) << 2  # transitions
    resp[wsim.pi_s_locs] = logic.UNASSIGNED