diff --git a/src/kyupy/__init__.py b/src/kyupy/__init__.py
index 897e832..9be6485 100644
--- a/src/kyupy/__init__.py
+++ b/src/kyupy/__init__.py
@@ -76,6 +76,11 @@ def hr_time(seconds):
     return s
 
 
+def batchrange(nitems, maxsize):
+    """Yield ``(offset, size)`` for consecutive batches of at most ``maxsize`` items covering ``nitems``."""
+    yield from ((start, min(nitems - start, maxsize)) for start in range(0, nitems, maxsize))
+
+
 class Timer:
     def __init__(self, s=0): self.s = s
     def __enter__(self): self.start_time = time.perf_counter(); return self
diff --git a/src/kyupy/logic_sim.py b/src/kyupy/logic_sim.py
index c1f9d45..422cc91 100644
--- a/src/kyupy/logic_sim.py
+++ b/src/kyupy/logic_sim.py
@@ -11,10 +11,8 @@ import math
 import numpy as np
 
 from . import numba, logic, hr_bytes, sim
-from .sim import SimOps, SimPrim
 
-
-class LogicSim(SimOps):
+class LogicSim(sim.SimOps):
     """A bit-parallel naïve combinational simulator for 2-, 4-, or 8-valued logic.
 
     :param circuit: The circuit to simulate.
@@ -36,17 +34,17 @@ class LogicSim(SimOps):
         self.s = np.zeros((2, self.s_len, 3, nbytes), dtype=np.uint8)
         self.s[:,:,1,:] = 255  # unassigned
 
-        self.pi_s_locs = np.flatnonzero(self.vat[self.ppi_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0)
-        self.po_s_locs = np.flatnonzero(self.vat[self.ppo_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0)
+        self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
+        self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
         self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
 
         self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
         self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
 
-        self.pi_c_locs = self.vat[self.ppi_offset+self.pi_s_locs, 0]
-        self.po_c_locs = self.vat[self.ppo_offset+self.po_s_locs, 0]
-        self.ppi_c_locs = self.vat[self.ppi_offset+self.ppio_s_locs, 0]
-        self.ppo_c_locs = self.vat[self.ppo_offset+self.ppio_s_locs, 0]
+        self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs]
+        self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs]
+        self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs]
+        self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs]
 
         self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
         self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
@@ -103,34 +101,34 @@ class LogicSim(SimOps):
         nbytes = (sims - 1) // 8 + 1
         if self.m == 2:
             if inject_cb is None:
-                _prop_cpu(self.ops, self.vat, self.c[...,:nbytes])
+                _prop_cpu(self.ops, self.c_locs, self.c[...,:nbytes])
             else:
                 for op, o0, i0, i1, i2, i3 in self.ops:
-                    o0, i0, i1, i2, i3 = [self.vat[x,0] for x in (o0, i0, i1, i2, i3)]
-                    if op == SimPrim.BUF1: self.c[o0]=self.c[i0]
-                    elif op == SimPrim.INV1: self.c[o0] = ~self.c[i0]
-                    elif op == SimPrim.AND2: self.c[o0] = self.c[i0] & self.c[i1]
-                    elif op == SimPrim.NAND2: self.c[o0] = ~(self.c[i0] & self.c[i1])
-                    elif op == SimPrim.OR2: self.c[o0] = self.c[i0] | self.c[i1]
-                    elif op == SimPrim.NOR2: self.c[o0] = ~(self.c[i0] | self.c[i1])
-                    elif op == SimPrim.XOR2: self.c[o0] = self.c[i0] ^ self.c[i1]
-                    elif op == SimPrim.XNOR2: self.c[o0] = ~(self.c[i0] ^ self.c[i1])
-                    else: print(f'unknown SimPrim {op}')
+                    o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)]
+                    if op == sim.BUF1: self.c[o0]=self.c[i0]
+                    elif op == sim.INV1: self.c[o0] = ~self.c[i0]
+                    elif op == sim.AND2: self.c[o0] = self.c[i0] & self.c[i1]
+                    elif op == sim.NAND2: self.c[o0] = ~(self.c[i0] & self.c[i1])
+                    elif op == sim.OR2: self.c[o0] = self.c[i0] | self.c[i1]
+                    elif op == sim.NOR2: self.c[o0] = ~(self.c[i0] | self.c[i1])
+                    elif op == sim.XOR2: self.c[o0] = self.c[i0] ^ self.c[i1]
+                    elif op == sim.XNOR2: self.c[o0] = ~(self.c[i0] ^ self.c[i1])
+                    else: print(f'unknown sim {op}')
                     inject_cb(o0, self.s[o0])
         elif self.m == 4:
             pass
         else:
             for op, o0, i0, i1, i2, i3 in self.ops:
-                o0, i0, i1, i2, i3 = [self.vat[x,0] for x in (o0, i0, i1, i2, i3)]
-                if op == SimPrim.BUF1: self.c[o0]=self.c[i0]
-                elif op == SimPrim.INV1: logic.bp_not(self.c[o0], self.c[i0])
-                elif op == SimPrim.AND2: logic.bp_and(self.c[o0], self.c[i0], self.c[i1])
-                elif op == SimPrim.NAND2: logic.bp_and(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
-                elif op == SimPrim.OR2: logic.bp_or(self.c[o0], self.c[i0], self.c[i1])
-                elif op == SimPrim.NOR2: logic.bp_or(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
-                elif op == SimPrim.XOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1])
-                elif op == SimPrim.XNOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
-                else: print(f'unknown SimPrim {op}')
+                o0, i0, i1, i2, i3 = [self.c_locs[x] for x in (o0, i0, i1, i2, i3)]
+                if op == sim.BUF1: self.c[o0]=self.c[i0]
+                elif op == sim.INV1: logic.bp_not(self.c[o0], self.c[i0])
+                elif op == sim.AND2: logic.bp_and(self.c[o0], self.c[i0], self.c[i1])
+                elif op == sim.NAND2: logic.bp_and(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
+                elif op == sim.OR2: logic.bp_or(self.c[o0], self.c[i0], self.c[i1])
+                elif op == sim.NOR2: logic.bp_or(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
+                elif op == sim.XOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1])
+                elif op == sim.XNOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
+                else: print(f'unknown sim {op}')
                 if inject_cb is not None: inject_cb(o0, self.s[o0])
 
     def s_ppo_to_ppi(self):
@@ -159,9 +157,9 @@ class LogicSim(SimOps):
 
 
 @numba.njit
-def _prop_cpu(ops, vat, c):
+def _prop_cpu(ops, c_locs, c):
     for op, o0, i0, i1, i2, i3 in ops:
-        o0, i0, i1, i2, i3 = [vat[x,0] for x in (o0, i0, i1, i2, i3)]
+        o0, i0, i1, i2, i3 = [c_locs[x] for x in (o0, i0, i1, i2, i3)]
         if op == sim.BUF1: c[o0]=c[i0]
         elif op == sim.INV1: c[o0] = ~c[i0]
         elif op == sim.AND2: c[o0] = c[i0] & c[i1]
@@ -170,4 +168,4 @@ def _prop_cpu(ops, vat, c):
         elif op == sim.NOR2: c[o0] = ~(c[i0] | c[i1])
         elif op == sim.XOR2: c[o0] = c[i0] ^ c[i1]
         elif op == sim.XNOR2: c[o0] = ~(c[i0] ^ c[i1])
-        else: print(f'unknown SimPrim {op}')
+        else: print(f'unknown sim {op}')
diff --git a/src/kyupy/sim.py b/src/kyupy/sim.py
index f0a9116..29da8fc 100644
--- a/src/kyupy/sim.py
+++ b/src/kyupy/sim.py
@@ -4,117 +4,70 @@ from bisect import bisect, insort_left
 
 import numpy as np
 
-BUF1 = 0b1010_1010_1010_1010
-INV1 = 0b0101_0101_0101_0101
-
-NAND4 = 0b0111_1111_1111_1111
-NAND3 = 0b0111_1111_0111_1111
-NAND2 = 0b0111_0111_0111_0111
-
-NOR4 = 0b0000_0000_0000_0001
-NOR3 = 0b0000_0001_0000_0001
-NOR2 = 0b0001_0001_0001_0001
-
-AND4 = 0b1000_0000_0000_0000
-AND3 = 0b1000_0000_1000_0000
-AND2 = 0b1000_1000_1000_1000
-
-OR4 = 0b1111_1111_1111_1110
-OR3 = 0b1111_1110_1111_1110
-OR2 = 0b1110_1110_1110_1110
-
-XOR4 = 0b0110_1001_1001_0110
-XOR3 = 0b1001_0110_1001_0110
-XOR2 = 0b0110_0110_0110_0110
-
-XNOR4 = 0b1001_0110_0110_1001
-XNOR3 = 0b0110_1001_0110_1001
-XNOR2 = 0b1001_1001_1001_1001
-
-AO22 = 0b1111_1000_1000_1000
-AOI22 = 0b0000_0111_0111_0111
-AO21 = 0b1110_1010_1110_1010
-AOI21 = 0b0001_0101_0001_0101
-OA22 = 0b1110_1110_1110_0000
-OAI22 = 0b0001_0001_0001_1111
-OA21 = 0b1010_1000_1010_1000
-OAI21 = 0b0101_0111_0101_0111
-MUX21 = 0b1110_0100_1110_0100
-
-class SimPrim:
-    BUF1 = 0b1010_1010_1010_1010
-    INV1 = 0b0101_0101_0101_0101
-
-    NAND4 = 0b0111_1111_1111_1111
-    NAND3 = 0b0111_1111_0111_1111
-    NAND2 = 0b0111_0111_0111_0111
-
-    NOR4 = 0b0000_0000_0000_0001
-    NOR3 = 0b0000_0001_0000_0001
-    NOR2 = 0b0001_0001_0001_0001
-
-    AND4 = 0b1000_0000_0000_0000
-    AND3 = 0b1000_0000_1000_0000
-    AND2 = 0b1000_1000_1000_1000
-
-    OR4 = 0b1111_1111_1111_1110
-    OR3 = 0b1111_1110_1111_1110
-    OR2 = 0b1110_1110_1110_1110
-
-    XOR4 = 0b0110_1001_1001_0110
-    XOR3 = 0b1001_0110_1001_0110
-    XOR2 = 0b0110_0110_0110_0110
-
-    XNOR4 = 0b1001_0110_0110_1001
-    XNOR3 = 0b0110_1001_0110_1001
-    XNOR2 = 0b1001_1001_1001_1001
-
-    AO22 = 0b1111_1000_1000_1000
-    AOI22 = 0b0000_0111_0111_0111
-    AO21 = 0b1110_1010_1110_1010
-    AOI21 = 0b0001_0101_0001_0101
-    OA22 = 0b1110_1110_1110_0000
-    OAI22 = 0b0001_0001_0001_1111
-    OA21 = 0b1010_1000_1010_1000
-    OAI21 = 0b0101_0111_0101_0111
-    MUX21 = 0b1110_0100_1110_0100
-
-    kind_prefixes = {
-        'nand': (NAND4, NAND3, NAND2),
-        'nor': (NOR4, NOR3, NOR2),
-        'and': (AND4, AND3, AND2),
-        'or': (OR4, OR3, OR2),
-        'xor': (XOR4, XOR3, XOR2),
-        'xnor': (XNOR4, XNOR3, XNOR2),
-
-        'not': (INV1, INV1, INV1),
-        'inv': (INV1, INV1, INV1),
-        'ibuf': (INV1, INV1, INV1),
-        '__const1__': (INV1, INV1, INV1),
-        'tieh': (INV1, INV1, INV1),
-
-        'buf': (BUF1, BUF1, BUF1),
-        'nbuf': (BUF1, BUF1, BUF1),
-        'delln': (BUF1, BUF1, BUF1),
-        '__const0__': (BUF1, BUF1, BUF1),
-        'tiel': (BUF1, BUF1, BUF1),
-
-        'ao22': (AO22, AO22, AO22),
-        'aoi22': (AOI22, AOI22, AOI22),
-        'ao21': (AO21, AO21, AO21),
-        'aoi21': (AOI21, AOI21, AOI21),
-
-        'oa22': (OA22, OA22, OA22),
-        'oai22': (OAI22, OAI22, OAI22),
-        'oa21': (OA21, OA21, OA21),
-        'oai21': (OAI21, OAI21, OAI21),
-
-        'mux21': (MUX21, MUX21, MUX21),
-    }
-
-    @classmethod
-    def names(cls):
-        return dict([(v, k) for k, v in cls.__dict__.items() if isinstance(v, int)])
+# Primitive operation codes as 16-bit np.uint16 patterns. They read as 4-input truth
+# tables: e.g. AND4 sets only the top bit and OR4 clears only the bottom bit.
+BUF1 = np.uint16(0b1010_1010_1010_1010)
+INV1 = ~BUF1
+
+AND2 = np.uint16(0b1000_1000_1000_1000)
+AND3 = np.uint16(0b1000_0000_1000_0000)
+AND4 = np.uint16(0b1000_0000_0000_0000)
+NAND2, NAND3, NAND4 = ~AND2, ~AND3, ~AND4
+
+OR2 = np.uint16(0b1110_1110_1110_1110)
+OR3 = np.uint16(0b1111_1110_1111_1110)
+OR4 = np.uint16(0b1111_1111_1111_1110)
+NOR2, NOR3, NOR4 = ~OR2, ~OR3, ~OR4
+
+XOR2 = np.uint16(0b0110_0110_0110_0110)
+XOR3 = np.uint16(0b1001_0110_1001_0110)
+XOR4 = np.uint16(0b0110_1001_1001_0110)
+XNOR2, XNOR3, XNOR4 = ~XOR2, ~XOR3, ~XOR4
+
+AO21 = np.uint16(0b1110_1010_1110_1010)
+AO22 = np.uint16(0b1111_1000_1000_1000)
+OA21 = np.uint16(0b1010_1000_1010_1000)
+OA22 = np.uint16(0b1110_1110_1110_0000)
+AOI21, AOI22, OAI21, OAI22 = ~AO21, ~AO22, ~OA21, ~OA22
+
+MUX21 = np.uint16(0b1110_0100_1110_0100)
+
+# Reverse lookup (code -> constant name); collects every module global that is an np.uint16 at this point.
+names = {code: name for name, code in globals().items() if isinstance(code, np.uint16)}
+
+# Maps a lower-case kind prefix to its primitive codes, ordered (4-input, 3-input, 2-input).
+kind_prefixes = {
+    'nand': (NAND4, NAND3, NAND2),
+    'nor': (NOR4, NOR3, NOR2),
+    'and': (AND4, AND3, AND2),
+    'or': (OR4, OR3, OR2),
+    'xor': (XOR4, XOR3, XOR2),
+    'xnor': (XNOR4, XNOR3, XNOR2),
+
+    'not': (INV1,) * 3,
+    'inv': (INV1,) * 3,
+    'ibuf': (INV1,) * 3,
+    '__const1__': (INV1,) * 3,
+    'tieh': (INV1,) * 3,
+
+    'buf': (BUF1,) * 3,
+    'nbuf': (BUF1,) * 3,
+    'delln': (BUF1,) * 3,
+    '__const0__': (BUF1,) * 3,
+    'tiel': (BUF1,) * 3,
+
+    'ao22': (AO22,) * 3,
+    'aoi22': (AOI22,) * 3,
+    'ao21': (AO21,) * 3,
+    'aoi21': (AOI21,) * 3,
+
+    'oa22': (OA22,) * 3,
+    'oai22': (OAI22,) * 3,
+    'oa21': (OA21,) * 3,
+    'oai21': (OAI21,) * 3,
+
+    'mux21': (MUX21,) * 3,
+}
 
 class Heap:
     def __init__(self):
@@ -184,7 +137,7 @@ class Heap:
 
 class SimOps:
     """A static scheduler that translates a Circuit into a topologically sorted list of basic logic operations (self.ops) and
-    a value allocation table (self.vat) for use in simulators.
+    a memory mapping (self.c_locs, self.c_caps) for use in simulators.
 
     :param circuit: The circuit to create a schedule for.
     :param strip_forks: If enabled, the scheduler will not include fork nodes to safe simulation time.
@@ -203,12 +156,12 @@ class SimOps:
         if isinstance(c_caps, int):
             c_caps = [c_caps] * len(circuit.lines)
 
-        # indices for state allocation table (sat)
+        # special locations and offsets in c_locs/c_caps
         self.zero_idx = len(circuit.lines)
         self.tmp_idx = self.zero_idx + 1
         self.ppi_offset = self.tmp_idx + 1
         self.ppo_offset = self.ppi_offset + len(self.s_nodes)
-        self.vat_len = self.ppo_offset + len(self.s_nodes)
+        self.c_locs_len = self.ppo_offset + len(self.s_nodes)
 
         # translate circuit structure into self.ops
         ops = []
@@ -217,14 +170,14 @@ class SimOps:
             if n in interface_dict:
                 inp_idx = self.ppi_offset + interface_dict[n]
                 if len(n.outs) > 0 and n.outs[0] is not None:  # first output of a PI/PPI
-                    ops.append((SimPrim.BUF1, n.outs[0].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx)) 
+                    ops.append((BUF1, n.outs[0].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx)) 
                 if 'dff' in n.kind.lower():  # second output of DFF is inverted
                     if len(n.outs) > 1 and n.outs[1] is not None:
-                        ops.append((SimPrim.INV1, n.outs[1].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx))
+                        ops.append((INV1, n.outs[1].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx))
                 else:  # if not DFF, no output is inverted.
                     for o_line in n.outs[1:]:
                         if o_line is not None:
-                            ops.append((SimPrim.BUF1, o_line.index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx))
+                            ops.append((BUF1, o_line.index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx))
                 continue
             # regular node, not PI/PPI or PO/PPO
             o0_idx = n.outs[0].index if len(n.outs) > 0 and n.outs[0] is not None else self.tmp_idx
@@ -237,10 +190,10 @@ class SimOps:
                 if not strip_forks:
                     for o_line in n.outs:
                         if o_line is not None:
-                            ops.append((SimPrim.BUF1, o_line.index, i0_idx, i1_idx, i2_idx, i3_idx))
+                            ops.append((BUF1, o_line.index, i0_idx, i1_idx, i2_idx, i3_idx))
                 continue
             sp = None
-            for prefix, prims in SimPrim.kind_prefixes.items():
+            for prefix, prims in kind_prefixes.items():
                 if kind.startswith(prefix):
                     sp = prims[0]
                     if i3_idx == self.zero_idx:
@@ -256,7 +209,7 @@ class SimOps:
         self.ops = np.asarray(ops, dtype='int32')
 
         # create a map from fanout lines to stem lines for fork stripping
-        stems = np.zeros(self.vat_len, dtype='int32') - 1  # default to -1: 'no fanout line'
+        stems = np.zeros(self.c_locs_len, dtype='int32') - 1  # default to -1: 'no fanout line'
         if strip_forks:
             for f in circuit.forks.values():
                 prev_line = f.ins[0]
@@ -267,8 +220,8 @@ class SimOps:
                     stems[ol] = stem_idx
 
         # calculate level (distance from PI/PPI) and reference count for each line
-        levels = np.zeros(self.vat_len, dtype='int32')
-        ref_count = np.zeros(self.vat_len, dtype='int32')
+        levels = np.zeros(self.c_locs_len, dtype='int32')
+        ref_count = np.zeros(self.c_locs_len, dtype='int32')
         level_starts = [0]
         current_level = 1
         for i, op in enumerate(self.ops):
@@ -289,21 +242,21 @@ class SimOps:
         self.level_stops = np.asarray(level_starts[1:] + [len(self.ops)], dtype='int32')
 
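-        # state allocation table. maps line and interface indices to self.state memory locations
+        # memory mapping: c_locs maps line and interface indices to memory locations (offsets into the simulators' c arrays), c_caps holds their capacities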
-        self.vat = np.zeros((self.vat_len, 3), dtype='int')
-        self.vat[:, 0] = -1
-
+        self.c_locs = np.full((self.c_locs_len,), -1, dtype=np.int32)
+        self.c_caps = np.zeros((self.c_locs_len,), dtype=np.int32)
+        
         h = Heap()
 
         # allocate and keep memory for special fields
-        self.vat[self.zero_idx] = h.alloc(1), 1, 0
-        self.vat[self.tmp_idx] = h.alloc(1), 1, 0
+        self.c_locs[self.zero_idx], self.c_caps[self.zero_idx] = h.alloc(1), 1
+        self.c_locs[self.tmp_idx], self.c_caps[self.tmp_idx] = h.alloc(1), 1
         ref_count[self.zero_idx] += 1
         ref_count[self.tmp_idx] += 1
 
         # allocate and keep memory for PI/PPI, keep memory for PO/PPO (allocated later)
         for i, n in enumerate(self.s_nodes):
             if len(n.outs) > 0:
-                self.vat[self.ppi_offset + i] = h.alloc(1), 1, 0
+                self.c_locs[self.ppi_offset + i], self.c_caps[self.ppi_offset + i] = h.alloc(1), 1
                 ref_count[self.ppi_offset + i] += 1
             if len(n.ins) > 0:
                 i0_idx = stems[n.ins[0]] if stems[n.ins[0]] >= 0 else n.ins[0]
@@ -322,13 +275,13 @@ class SimOps:
                 ref_count[i1_idx] -= 1
                 ref_count[i2_idx] -= 1
                 ref_count[i3_idx] -= 1
-                if ref_count[i0_idx] <= 0: free_list.append(self.vat[i0_idx, 0])
-                if ref_count[i1_idx] <= 0: free_list.append(self.vat[i1_idx, 0])
-                if ref_count[i2_idx] <= 0: free_list.append(self.vat[i2_idx, 0])
-                if ref_count[i3_idx] <= 0: free_list.append(self.vat[i3_idx, 0])
+                if ref_count[i0_idx] <= 0: free_list.append(self.c_locs[i0_idx])
+                if ref_count[i1_idx] <= 0: free_list.append(self.c_locs[i1_idx])
+                if ref_count[i2_idx] <= 0: free_list.append(self.c_locs[i2_idx])
+                if ref_count[i3_idx] <= 0: free_list.append(self.c_locs[i3_idx])
                 o_idx = op[1]
                 cap = c_caps[o_idx]
-                self.vat[o_idx] = h.alloc(cap), cap, 0
+                self.c_locs[o_idx], self.c_caps[o_idx] = h.alloc(cap), cap
             if not keep_signals:
                 for loc in free_list:
                     h.free(loc)
@@ -336,16 +289,15 @@ class SimOps:
         # copy memory location and capacity from stems to fanout lines
         for lidx, stem in enumerate(stems):
             if stem >= 0:  # if at a fanout line
-                self.vat[lidx] = self.vat[stem]
+                self.c_locs[lidx], self.c_caps[lidx] = self.c_locs[stem], self.c_caps[stem]
 
         # copy memory location to PO/PPO area
         for i, n in enumerate(self.s_nodes):
             if len(n.ins) > 0:
-                self.vat[self.ppo_offset + i] = self.vat[n.ins[0]]
+                self.c_locs[self.ppo_offset + i], self.c_caps[self.ppo_offset + i] = self.c_locs[n.ins[0]], self.c_caps[n.ins[0]]
 
         self.c_len = h.max_size
 
         from collections import defaultdict
         self.prim_counts = defaultdict(int)
-        names_dict = SimPrim.names()
-        for op, _, _, _, _, _ in self.ops: self.prim_counts[names_dict[op]] += 1
+        for op, _, _, _, _, _ in self.ops: self.prim_counts[names[op]] += 1
diff --git a/src/kyupy/wave_sim.py b/src/kyupy/wave_sim.py
index af1f02b..61054fc 100644
--- a/src/kyupy/wave_sim.py
+++ b/src/kyupy/wave_sim.py
@@ -16,8 +16,7 @@ import math
 
 import numpy as np
 
-from . import numba, cuda, hr_bytes
-from .sim import SimOps
+from . import numba, cuda, hr_bytes, sim
 
 
 TMAX = np.float32(2 ** 127)
@@ -29,7 +28,7 @@ TMIN = np.float32(-2 ** 127)
 """A large negative 32-bit floating point value used at the beginning of waveforms that start with logic-1."""
 
 
-class WaveSim(SimOps):
+class WaveSim(sim.SimOps):
     """A waveform-based combinational logic timing simulator running on CPU.
 
     :param circuit: The circuit to simulate.
@@ -54,30 +53,31 @@ class WaveSim(SimOps):
         self.sims = sims
 
         self.c_len *= 4
-        self.vat[...,0:2] *= 4
+        self.c_locs[...] *= 4
+        self.c_caps[...] *= 4
 
-        self.timing = np.zeros((self.vat_len, 2, 2))
+        self.timing = np.zeros((self.c_locs_len, 2, 2))
         self.timing[:len(timing)] = timing
 
         self.c = np.zeros((self.c_len, sims), dtype=np.float32) + TMAX
-        self.s = np.zeros((len(self.s_nodes), sims, 11), dtype=np.float32)
+        self.s = np.zeros((11, self.s_len, sims), dtype=np.float32)
         """Information about the logic values and transitions around the sequential elements (flip-flops) and ports.
 
         The first 3 values are read by ``s_to_c()``.
         The remaining values are written by ``c_to_s()``.
 
         The elements are as follows:
-        * ``s[..., 0]`` (P)PI initial value
-        * ``s[..., 1]`` (P)PI transition time
-        * ``s[..., 2]`` (P)PI final value
-        * ``s[..., 3]`` (P)PO initial value
-        * ``s[..., 4]`` (P)PO earliest arrival time (EAT): The time at which the output transitioned from its initial value.
-        * ``s[..., 5]`` (P)PO latest stabilization time (LST): The time at which the output settled to its final value.
-        * ``s[..., 6]`` (P)PO final value
-        * ``s[..., 7]`` (P)PO capture value: probability of capturing a 1 at a given capture time
-        * ``s[..., 8]`` (P)PO sampled capture value: decided by random sampling according to a given seed.
-        * ``s[..., 9]`` (P)PO sampled capture slack: (capture time - LST) - decided by random sampling according to a given seed.
-        * ``s[..., 10]`` Overflow indicator: If non-zero, some signals in the input cone of this output had more
+        * ``s[0]`` (P)PI initial value
+        * ``s[1]`` (P)PI transition time
+        * ``s[2]`` (P)PI final value
+        * ``s[3]`` (P)PO initial value
+        * ``s[4]`` (P)PO earliest arrival time (EAT): The time at which the output transitioned from its initial value.
+        * ``s[5]`` (P)PO latest stabilization time (LST): The time at which the output settled to its final value.
+        * ``s[6]`` (P)PO final value
+        * ``s[7]`` (P)PO capture value: probability of capturing a 1 at a given capture time
+        * ``s[8]`` (P)PO sampled capture value: decided by random sampling according to a given seed.
+        * ``s[9]`` (P)PO sampled capture slack: (capture time - LST) - decided by random sampling according to a given seed.
+        * ``s[10]`` Overflow indicator: If non-zero, some signals in the input cone of this output had more
           transitions than specified in ``c_caps``. Some transitions have been discarded, the
           final values in the waveforms are still valid.
         """
@@ -85,19 +85,19 @@ class WaveSim(SimOps):
         self.params = np.zeros((sims, 4), dtype=np.float32)
         self.params[...,0] = 1.0
 
-        self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.vat, self.ops, self.params)])
+        self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.c_locs, self.c_caps, self.ops, self.params)])
 
-        self.pi_s_locs = np.flatnonzero(self.vat[self.ppi_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0)
-        self.po_s_locs = np.flatnonzero(self.vat[self.ppo_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0)
+        self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
+        self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
         self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
 
         self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
         self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
 
-        self.pi_c_locs = self.vat[self.ppi_offset+self.pi_s_locs, 0]
-        self.po_c_locs = self.vat[self.ppo_offset+self.po_s_locs, 0]
-        self.ppi_c_locs = self.vat[self.ppi_offset+self.ppio_s_locs, 0]
-        self.ppo_c_locs = self.vat[self.ppo_offset+self.ppio_s_locs, 0]
+        self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs]
+        self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs]
+        self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs]
+        self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs]
 
         self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
         self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
@@ -112,7 +112,7 @@ class WaveSim(SimOps):
         Based on the data in ``self.s``, waveforms are generated on the input lines of the circuit.
         It modifies ``self.c``.
         """
-        sins = np.moveaxis(self.s[self.pippi_s_locs], -1, 0)
+        sins = self.s[:, self.pippi_s_locs]
         cond = (sins[2] != 0) + 2*(sins[0] != 0)  # choices order: 0 R F 1
         self.c[self.pippi_c_locs] = np.choose(cond, [TMAX, sins[1], TMIN, TMIN])
         self.c[self.pippi_c_locs+1] = np.choose(cond, [TMAX, TMAX, sins[1], TMAX])
@@ -127,7 +127,7 @@ class WaveSim(SimOps):
         """
         sims = min(sims or self.sims, self.sims)
         for op_start, op_stop in zip(self.level_starts, self.level_stops):
-            level_eval_cpu(self.ops, op_start, op_stop, self.c, self.vat, 0, sims,
+            level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, 0, sims,
                                          self.timing, self.params, sd, seed)
 
     def c_to_s(self, time=TMAX, sd=0.0, seed=1):
@@ -140,9 +140,9 @@ class WaveSim(SimOps):
         :param sd: A standard deviation for uncertainty in the actual capture time.
         :param seed: The random seed for a capture with uncertainty.
         """
-        for s_loc, (c_loc, c_len, _) in zip(self.poppo_s_locs, self.vat[self.ppo_offset+self.poppo_s_locs]):
+        for s_loc, c_loc, c_len in zip(self.poppo_s_locs, self.c_locs[self.ppo_offset+self.poppo_s_locs], self.c_caps[self.ppo_offset+self.poppo_s_locs]):
             for vector in range(self.sims):
-                self.s[s_loc, vector, 3:] = wave_capture_cpu(self.c, c_loc, c_len, vector, time=time, sd=sd, seed=seed)
+                self.s[3:, s_loc, vector] = wave_capture_cpu(self.c, c_loc, c_len, vector, time=time, sd=sd, seed=seed)
 
     def s_ppo_to_ppi(self, time=0.0):
         """Re-assigns the last sampled capture to the appropriate pseudo-primary inputs (PPI). 
@@ -151,9 +151,9 @@ class WaveSim(SimOps):
 
         :param time: The transition time at the inputs (usually 0.0).
         """
-        self.s[self.ppio_s_locs, :, 0] = self.s[self.ppio_s_locs, :, 2]
-        self.s[self.ppio_s_locs, :, 1] = time
-        self.s[self.ppio_s_locs, :, 2] = self.s[self.ppio_s_locs, :, 8]
+        self.s[0, self.ppio_s_locs] = self.s[2, self.ppio_s_locs]
+        self.s[1, self.ppio_s_locs] = time
+        self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs]
 
 
 @numba.njit
@@ -173,7 +173,7 @@ def rand_gauss_cpu(seed, sd):
 
 
 @numba.njit
-def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
+def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, seed=0):
     lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
 
     # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>>
@@ -181,11 +181,12 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
 
     _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
 
-    a_mem = vat[a_idx, 0]
-    b_mem = vat[b_idx, 0]
-    c_mem = vat[c_idx, 0]
-    d_mem = vat[d_idx, 0]
-    z_mem, z_cap, _ = vat[z_idx]
+    a_mem = c_locs[a_idx]
+    b_mem = c_locs[b_idx]
+    c_mem = c_locs[c_idx]
+    d_mem = c_locs[d_idx]
+    z_mem = c_locs[z_idx]
+    z_cap = c_caps[z_idx]
 
     a_cur = int(0)
     b_cur = int(0)
@@ -280,12 +281,12 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
 
 
 @numba.njit
-def level_eval_cpu(ops, op_start, op_stop, c, vat, st_start, st_stop, line_times, params, sd, seed):
+def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, st_start, st_stop, line_times, params, sd, seed):
     overflows = 0
     for op_idx in range(op_start, op_stop):
         op = ops[op_idx]
         for st_idx in range(st_start, st_stop):
-            wave_eval_cpu(op, c, vat, st_idx, line_times, params[st_idx], sd, seed)
+            wave_eval_cpu(op, c, c_locs, c_caps, st_idx, line_times, params[st_idx], sd, seed)
 
 
 @numba.njit
@@ -347,7 +348,8 @@ class WaveSimCuda(WaveSim):
         self.c = cuda.to_device(self.c)
         self.s = cuda.to_device(self.s)
         self.ops = cuda.to_device(self.ops)
-        self.vat = cuda.to_device(self.vat)
+        self.c_locs = cuda.to_device(self.c_locs)
+        self.c_caps = cuda.to_device(self.c_caps)
         self.timing = cuda.to_device(self.timing)
         self.params = cuda.to_device(self.params)
 
@@ -355,7 +357,7 @@ class WaveSimCuda(WaveSim):
 
     def s_to_c(self):
         grid_dim = self._grid_dim(self.sims, self.s_len)
-        wave_assign_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppi_offset)
+        wave_assign_gpu[grid_dim, self._block_dim](self.c, self.s, self.c_locs, self.ppi_offset)
 
     def _grid_dim(self, x, y):
         gx = math.ceil(x / self._block_dim[0])
@@ -366,29 +368,29 @@ class WaveSimCuda(WaveSim):
         sims = min(sims or self.sims, self.sims)
         for op_start, op_stop in zip(self.level_starts, self.level_stops):
             grid_dim = self._grid_dim(sims, op_stop - op_start)
-            wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.vat, int(0),
+            wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, int(0),
                 sims, self.timing, self.params, sd, seed)
         cuda.synchronize()
 
     def c_to_s(self, time=TMAX, sd=0.0, seed=1):
         grid_dim = self._grid_dim(self.sims, self.s_len)
-        wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppo_offset,
+        wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.c_locs, self.c_caps, self.ppo_offset,
             time, sd * math.sqrt(2), seed)
 
     def s_ppo_to_ppi(self, time=0.0):
         grid_dim = self._grid_dim(self.sims, self.s_len)
-        ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.vat, time, self.ppi_offset, self.ppo_offset)
+        ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.c_locs, time, self.ppi_offset, self.ppo_offset)
 
 
 @cuda.jit()
-def wave_assign_gpu(c, s, vat, ppi_offset):
+def wave_assign_gpu(c, s, c_locs, ppi_offset):
     x, y = cuda.grid(2)
-    if y >= len(s): return
-    c_loc, c_len, _ = vat[ppi_offset + y]
+    if y >= s.shape[1]: return
+    c_loc = c_locs[ppi_offset + y]
     if c_loc < 0: return
     if x >= c.shape[-1]: return
-    value = int(s[y, x, 2] >= 0.5) | (2*int(s[y, x, 0] >= 0.5))
-    ttime = s[y, x, 1]
+    value = int(s[2, y, x] >= 0.5) | (2*int(s[0, y, x] >= 0.5))
+    ttime = s[1, y, x]
     if value == 0:
         c[c_loc, x] = TMAX
         c[c_loc+1, x] = TMAX
@@ -421,7 +423,7 @@ def rand_gauss_gpu(seed, sd):
 
 
 @cuda.jit()
-def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_times, param, sd, seed):
+def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, line_times, param, sd, seed):
     x, y = cuda.grid(2)
     st_idx = st_start + x
     op_idx = op_start + y
@@ -442,11 +444,12 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim
 
     _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
 
-    a_mem = vat[a_idx, 0]
-    b_mem = vat[b_idx, 0]
-    c_mem = vat[c_idx, 0]
-    d_mem = vat[d_idx, 0]
-    z_mem, z_cap, _ = vat[z_idx]
+    a_mem = c_locs[a_idx]
+    b_mem = c_locs[b_idx]
+    c_mem = c_locs[c_idx]
+    d_mem = c_locs[d_idx]
+    z_mem = c_locs[z_idx]
+    z_cap = c_caps[z_idx]
 
     a_cur = int(0)
     b_cur = int(0)
@@ -541,10 +544,11 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim
 
 
 @cuda.jit()
-def wave_capture_gpu(c, s, vat, ppo_offset, time, s_sqrt2, seed):
+def wave_capture_gpu(c, s, c_locs, c_caps, ppo_offset, time, s_sqrt2, seed):
     x, y = cuda.grid(2)
-    if ppo_offset + y >= len(vat): return
-    line, tdim, _ = vat[ppo_offset + y]
+    if ppo_offset + y >= len(c_locs): return
+    line = c_locs[ppo_offset + y]
+    tdim = c_caps[ppo_offset + y] 
     if line < 0: return
     if x >= c.shape[-1]: return
     vector = x
@@ -588,25 +592,25 @@ def wave_capture_gpu(c, s, vat, ppo_offset, time, s_sqrt2, seed):
     else:
         acc = val
 
-    s[y, vector, 3] = (c[line, vector] <= TMIN)
-    s[y, vector, 4] = eat
-    s[y, vector, 5] = lst
-    s[y, vector, 6] = final
-    s[y, vector, 7] = acc
-    s[y, vector, 8] = val
-    s[y, vector, 9] = 0  # TODO
-    s[y, vector, 10] = ovl
+    s[3, y, vector] = (c[line, vector] <= TMIN)
+    s[4, y, vector] = eat
+    s[5, y, vector] = lst
+    s[6, y, vector] = final
+    s[7, y, vector] = acc
+    s[8, y, vector] = val
+    s[9, y, vector] = 0  # TODO
+    s[10, y, vector] = ovl
 
 
 @cuda.jit()
-def ppo_to_ppi_gpu(s, vat, time, ppi_offset, ppo_offset):
+def ppo_to_ppi_gpu(s, c_locs, time, ppi_offset, ppo_offset):
     x, y = cuda.grid(2)
-    if y >= s.shape[0]: return
-    if x >= s.shape[1]: return
+    if y >= s.shape[1]: return  # y is the axis-1 index of s in the writes below
+    if x >= s.shape[2]: return  # x is the axis-2 index of s in the writes below
 
-    if vat[ppi_offset + y, 0] < 0: return
-    if vat[ppo_offset + y, 0] < 0: return
+    if c_locs[ppi_offset + y] < 0: return
+    if c_locs[ppo_offset + y] < 0: return
 
-    s[y, x, 0] = s[y, x, 2]
-    s[y, x, 1] = time
-    s[y, x, 2] = s[y, x, 8]
+    s[0, y, x] = s[2, y, x]
+    s[1, y, x] = time
+    s[2, y, x] = s[8, y, x]
diff --git a/tests/test_wave_sim.py b/tests/test_wave_sim.py
index b8fc18c..6a4bad3 100644
--- a/tests/test_wave_sim.py
+++ b/tests/test_wave_sim.py
@@ -2,17 +2,17 @@ import numpy as np
 
 from kyupy.wave_sim import WaveSim, WaveSimCuda, wave_eval_cpu, TMIN, TMAX
 from kyupy.logic_sim import LogicSim
-from kyupy import verilog, sdf, logic, bench
+from kyupy import logic, bench, sim
 from kyupy.logic import mvarray
-from kyupy.sim import SimPrim
-
 
 def test_nand_delays():
-    op = (SimPrim.NAND4, 4, 0, 1, 2, 3)
+    op = (sim.NAND4, 4, 0, 1, 2, 3)
     #op = (0b0111, 4, 0, 1)
     c = np.full((5*16, 1), TMAX)  # 5 waveforms of capacity 16
-    vat = np.zeros((5, 3), dtype='int')
-    for i in range(5): vat[i] = i*16, 16, 0  # 1:1 mapping
+    c_locs = np.zeros((5,), dtype='int')
+    c_caps = np.zeros((5,), dtype='int')
+    
+    for i in range(5): c_locs[i], c_caps[i] = i*16, 16  # 1:1 mapping
 
     # SDF specifies IOPATH delays with respect to output polarity
     # SDF pulse rejection value is determined by IOPATH causing last transition and polarity of last transition
@@ -32,7 +32,7 @@ def test_nand_delays():
 
     def wave_assert(inputs, output):
         for i, a in zip(inputs, c.reshape(-1,16)): a[:len(i)] = i
-        wave_eval_cpu(op, c, vat, 0, line_times, sdata)
+        wave_eval_cpu(op, c, c_locs, c_caps, 0, line_times, sdata)
         for i, v in enumerate(output): np.testing.assert_allclose(c.reshape(-1,16)[4,i], v)
 
     wave_assert([[TMAX,TMAX],[TMAX,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # NAND(0,0,1,1) => 1
@@ -53,29 +53,29 @@ def test_tiny_circuit():
     lt = np.zeros((len(c.lines), 2, 2))
     lt[:,0,:] = 1.0  # unit delay for all lines
     wsim = WaveSim(c, lt)
-    assert len(wsim.s) == 5
+    assert wsim.s.shape[1] == 5
 
     # values for x
-    wsim.s[0,0,:3] = 0, 0.1, 0
-    wsim.s[0,1,:3] = 0, 0.2, 1
-    wsim.s[0,2,:3] = 1, 0.3, 0
-    wsim.s[0,3,:3] = 1, 0.4, 1
-
+    wsim.s[:3,0,0] = 0, 0.1, 0
+    wsim.s[:3,0,1] = 0, 0.2, 1
+    wsim.s[:3,0,2] = 1, 0.3, 0
+    wsim.s[:3,0,3] = 1, 0.4, 1
+             
     # values for y
-    wsim.s[1,0,:3] = 1, 0.5, 0
-    wsim.s[1,1,:3] = 1, 0.6, 0
-    wsim.s[1,2,:3] = 1, 0.7, 0
-    wsim.s[1,3,:3] = 0, 0.8, 1
+    wsim.s[:3,1,0] = 1, 0.5, 0
+    wsim.s[:3,1,1] = 1, 0.6, 0
+    wsim.s[:3,1,2] = 1, 0.7, 0
+    wsim.s[:3,1,3] = 0, 0.8, 1
 
     wsim.s_to_c()
 
-    x_c_loc = wsim.vat[wsim.ppi_offset+0, 0] # check x waveforms
+    x_c_loc = wsim.c_locs[wsim.ppi_offset+0] # check x waveforms
     np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 0], [TMAX, TMAX, TMAX])
     np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 1], [0.2, TMAX, TMAX])
     np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 2], [TMIN, 0.3, TMAX])
     np.testing.assert_allclose(wsim.c[x_c_loc:x_c_loc+3, 3], [TMIN, TMAX, TMAX])
 
-    y_c_loc = wsim.vat[wsim.ppi_offset+1, 0] # check y waveforms
+    y_c_loc = wsim.c_locs[wsim.ppi_offset+1] # check y waveforms
     np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 0], [TMIN, 0.5, TMAX])
     np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 1], [TMIN, 0.6, TMAX])
     np.testing.assert_allclose(wsim.c[y_c_loc:y_c_loc+3, 2], [TMIN, 0.7, TMAX])
@@ -83,19 +83,19 @@ def test_tiny_circuit():
 
     wsim.c_prop()
 
-    a_c_loc = wsim.vat[wsim.ppo_offset+2, 0] # check a waveforms
+    a_c_loc = wsim.c_locs[wsim.ppo_offset+2] # check a waveforms
     np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 0], [TMAX, TMAX, TMAX])
     np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 1], [1.2, 1.6, TMAX])
     np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 2], [TMIN, 1.3, TMAX])
     np.testing.assert_allclose(wsim.c[a_c_loc:a_c_loc+3, 3], [1.8, TMAX, TMAX])
 
-    o_c_loc = wsim.vat[wsim.ppo_offset+3, 0] # check o waveforms
+    o_c_loc = wsim.c_locs[wsim.ppo_offset+3] # check o waveforms
     np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 0], [TMIN, 1.5, TMAX])
     np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 1], [TMIN, TMAX, TMAX])
     np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 2], [TMIN, 1.7, TMAX])
     np.testing.assert_allclose(wsim.c[o_c_loc:o_c_loc+3, 3], [TMIN, TMAX, TMAX])
 
-    n_c_loc = wsim.vat[wsim.ppo_offset+4, 0] # check n waveforms
+    n_c_loc = wsim.c_locs[wsim.ppo_offset+4] # check n waveforms
     np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 0], [TMIN, TMAX, TMAX])
     np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 1], [TMIN, 1.2, TMAX])
     np.testing.assert_allclose(wsim.c[n_c_loc:n_c_loc+3, 2], [1.3, TMAX, TMAX])
@@ -104,22 +104,22 @@ def test_tiny_circuit():
     wsim.c_to_s()
 
     # check a captures
-    np.testing.assert_allclose(wsim.s[2, 0, 3:7], [0, TMAX, TMIN, 0])
-    np.testing.assert_allclose(wsim.s[2, 1, 3:7], [0, 1.2, 1.6, 0])
-    np.testing.assert_allclose(wsim.s[2, 2, 3:7], [1, 1.3, 1.3, 0])
-    np.testing.assert_allclose(wsim.s[2, 3, 3:7], [0, 1.8, 1.8, 1])
-
-    # check o captures
-    np.testing.assert_allclose(wsim.s[3, 0, 3:7], [1, 1.5, 1.5, 0])
-    np.testing.assert_allclose(wsim.s[3, 1, 3:7], [1, TMAX, TMIN, 1])
-    np.testing.assert_allclose(wsim.s[3, 2, 3:7], [1, 1.7, 1.7, 0])
-    np.testing.assert_allclose(wsim.s[3, 3, 3:7], [1, TMAX, TMIN, 1])
-
-    # check o captures
-    np.testing.assert_allclose(wsim.s[4, 0, 3:7], [1, TMAX, TMIN, 1])
-    np.testing.assert_allclose(wsim.s[4, 1, 3:7], [1, 1.2, 1.2, 0])
-    np.testing.assert_allclose(wsim.s[4, 2, 3:7], [0, 1.3, 1.3, 1])
-    np.testing.assert_allclose(wsim.s[4, 3, 3:7], [0, TMAX, TMIN, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 2, 0], [0, TMAX, TMIN, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 2, 1], [0, 1.2, 1.6, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 2, 2], [1, 1.3, 1.3, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 2, 3], [0, 1.8, 1.8, 1])
+                                      
+    # check o captures                
+    np.testing.assert_allclose(wsim.s[3:7, 3, 0], [1, 1.5, 1.5, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 3, 1], [1, TMAX, TMIN, 1])
+    np.testing.assert_allclose(wsim.s[3:7, 3, 2], [1, 1.7, 1.7, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 3, 3], [1, TMAX, TMIN, 1])
+                                      
+    # check n captures
+    np.testing.assert_allclose(wsim.s[3:7, 4, 0], [1, TMAX, TMIN, 1])
+    np.testing.assert_allclose(wsim.s[3:7, 4, 1], [1, 1.2, 1.2, 0])
+    np.testing.assert_allclose(wsim.s[3:7, 4, 2], [0, 1.3, 1.3, 1])
+    np.testing.assert_allclose(wsim.s[3:7, 4, 3], [0, TMAX, TMIN, 0])
 
 
 def compare_to_logic_sim(wsim: WaveSim):
@@ -127,17 +127,17 @@ def compare_to_logic_sim(wsim: WaveSim):
     rng = np.random.default_rng(10)
     tests = rng.choice(choices, (wsim.s_len, wsim.sims))
 
-    wsim.s[:, :, 0] = (tests & 2) >> 1
-    wsim.s[:, :, 3] = (tests & 2) >> 1
-    wsim.s[:, :, 1] = 0.0
-    wsim.s[:, :, 2] = tests & 1
-    wsim.s[:, :, 6] = tests & 1
+    wsim.s[0] = (tests & 2) >> 1
+    wsim.s[3] = (tests & 2) >> 1
+    wsim.s[1] = 0.0
+    wsim.s[2] = tests & 1
+    wsim.s[6] = tests & 1
 
     wsim.s_to_c()
     wsim.c_prop()
     wsim.c_to_s()
 
-    resp = np.array(wsim.s[:, :, 6], dtype=np.uint8) | (np.array(wsim.s[:, :, 3], dtype=np.uint8)<<1)
+    resp = np.array(wsim.s[6], dtype=np.uint8) | (np.array(wsim.s[3], dtype=np.uint8)<<1)
     resp |= ((resp ^ (resp >> 1)) & 1) << 2  # transitions
     resp[wsim.pi_s_locs] = logic.UNASSIGNED