From f04f1b0012e5680791b06c1f7ba47bc00befc456 Mon Sep 17 00:00:00 2001
From: Stefan Holst <holst@cse.kyutech.ac.jp>
Date: Mon, 20 Mar 2023 10:31:55 +0900
Subject: [PATCH] cleanup: move PI/PO/PPIO loc tables into SimOps, add c_caps_min, remove old wave_eval code

---
 src/kyupy/logic_sim.py |  30 +----
 src/kyupy/sim.py       |  29 ++++-
 src/kyupy/wave_sim.py  | 288 +----------------------------------------
 3 files changed, 28 insertions(+), 319 deletions(-)
diff --git a/src/kyupy/logic_sim.py b/src/kyupy/logic_sim.py
index 8eace75..b6d20a4 100644
--- a/src/kyupy/logic_sim.py
+++ b/src/kyupy/logic_sim.py
@@ -34,21 +34,6 @@ class LogicSim(sim.SimOps):
         self.s = np.zeros((2, self.s_len, 3, nbytes), dtype=np.uint8)
         self.s[:,:,1,:] = 255  # unassigned
 
-        self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
-        self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
-        self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
-
-        self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
-        self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
-
-        self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs]
-        self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs]
-        self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs]
-        self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs]
-
-        self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
-        self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
-
     def __repr__(self):
         return f'<LogicSim {self.circuit.name} sims={self.sims} m={self.m} state_mem={hr_bytes(self.c.nbytes)}>'
 
@@ -81,15 +66,6 @@ class LogicSim(sim.SimOps):
         """Propagate the input values towards the outputs (Perform all logic operations in topological order).
 
         If the circuit is sequential (it contains flip-flops), one call simulates one clock cycle.
-        Multiple clock cycles are simulated by a assign-propagate-capture loop:
-
-        .. code-block:: python
-
-           # initial state in state_bp
-           for cycle in range(10):  # simulate 10 clock cycles
-               sim.assign(state_bp)
-               sim.propagate()
-               sim.capture(state_bp)
 
         :param inject_cb: A callback function for manipulating intermediate signal values.
             This function is called with a line and its new logic values (in bit-parallel format) after
@@ -113,7 +89,7 @@ class LogicSim(sim.SimOps):
                     elif op == sim.NOR2: self.c[o0] = ~(self.c[i0] | self.c[i1])
                     elif op == sim.XOR2: self.c[o0] = self.c[i0] ^ self.c[i1]
                     elif op == sim.XNOR2: self.c[o0] = ~(self.c[i0] ^ self.c[i1])
-                    else: print(f'unknown sim {op}')
+                    else: print(f'unknown op {op}')
                     inject_cb(o0, self.s[o0])
         elif self.m == 4:
             pass
@@ -128,7 +104,7 @@ class LogicSim(sim.SimOps):
                 elif op == sim.NOR2: logic.bp_or(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
                 elif op == sim.XOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1])
                 elif op == sim.XNOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
-                else: print(f'unknown sim {op}')
+                else: print(f'unknown op {op}')
                 if inject_cb is not None: inject_cb(o0, self.s[o0])
 
     def s_ppo_to_ppi(self):
@@ -169,4 +145,4 @@ def _prop_cpu(ops, c_locs, c):
         elif op == sim.NOR2: c[o0] = ~(c[i0] | c[i1])
         elif op == sim.XOR2: c[o0] = c[i0] ^ c[i1]
         elif op == sim.XNOR2: c[o0] = ~(c[i0] ^ c[i1])
-        else: print(f'unknown sim {op}')
+        else: print(f'unknown op {op}')
diff --git a/src/kyupy/sim.py b/src/kyupy/sim.py
index 29da8fc..6bc5961 100644
--- a/src/kyupy/sim.py
+++ b/src/kyupy/sim.py
@@ -145,7 +145,7 @@ class SimOps:
     :param keep_signals: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces
         memory footprint, but intermediate signal waveforms become unaccessible after a propagation.
     """
-    def __init__(self, circuit, c_caps=1, c_reuse=False, strip_forks=False):
+    def __init__(self, circuit, c_caps=1, c_caps_min=1, c_reuse=False, strip_forks=False):
         self.circuit = circuit
         dffs = [n for n in circuit.nodes if 'dff' in n.kind.lower()]
         latches = [n for n in circuit.nodes if 'latch' in n.kind.lower()]
@@ -205,7 +205,7 @@ class SimOps:
                 print('unknown gate type', kind)
             else:
                 ops.append((sp, o0_idx, i0_idx, i1_idx, i2_idx, i3_idx))
-            
+
         self.ops = np.asarray(ops, dtype='int32')
 
         # create a map from fanout lines to stem lines for fork stripping
@@ -244,19 +244,19 @@ class SimOps:
         # state allocation table. maps line and interface indices to self.state memory locations
         self.c_locs = np.full((self.c_locs_len,), -1, dtype=np.int32)
         self.c_caps = np.zeros((self.c_locs_len,), dtype=np.int32)
-        
+
         h = Heap()
 
         # allocate and keep memory for special fields
-        self.c_locs[self.zero_idx], self.c_caps[self.zero_idx] = h.alloc(1), 1
-        self.c_locs[self.tmp_idx], self.c_caps[self.tmp_idx] = h.alloc(1), 1
+        self.c_locs[self.zero_idx], self.c_caps[self.zero_idx] = h.alloc(c_caps_min), c_caps_min
+        self.c_locs[self.tmp_idx], self.c_caps[self.tmp_idx] = h.alloc(c_caps_min), c_caps_min
         ref_count[self.zero_idx] += 1
         ref_count[self.tmp_idx] += 1
 
         # allocate and keep memory for PI/PPI, keep memory for PO/PPO (allocated later)
         for i, n in enumerate(self.s_nodes):
             if len(n.outs) > 0:
-                self.c_locs[self.ppi_offset + i], self.c_caps[self.ppi_offset + i] = h.alloc(1), 1
+                self.c_locs[self.ppi_offset + i], self.c_caps[self.ppi_offset + i] = h.alloc(c_caps_min), c_caps_min
                 ref_count[self.ppi_offset + i] += 1
             if len(n.ins) > 0:
                 i0_idx = stems[n.ins[0]] if stems[n.ins[0]] >= 0 else n.ins[0]
@@ -280,7 +280,7 @@ class SimOps:
                 if ref_count[i2_idx] <= 0: free_list.append(self.c_locs[i2_idx])
                 if ref_count[i3_idx] <= 0: free_list.append(self.c_locs[i3_idx])
                 o_idx = op[1]
-                cap = c_caps[o_idx]
+                cap = max(c_caps_min, c_caps[o_idx])
                 self.c_locs[o_idx], self.c_caps[o_idx] = h.alloc(cap), cap
             if not keep_signals:
                 for loc in free_list:
@@ -301,3 +301,18 @@ class SimOps:
         from collections import defaultdict
         self.prim_counts = defaultdict(int)
         for op, _, _, _, _, _ in self.ops: self.prim_counts[names[op]] += 1
+
+        self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
+        self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
+        self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
+
+        self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
+        self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
+
+        self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs]
+        self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs]
+        self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs]
+        self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs]
+
+        self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
+        self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
diff --git a/src/kyupy/wave_sim.py b/src/kyupy/wave_sim.py
index a6feffd..647872f 100644
--- a/src/kyupy/wave_sim.py
+++ b/src/kyupy/wave_sim.py
@@ -48,14 +48,8 @@ class WaveSim(sim.SimOps):
         memory footprint, but intermediate signal waveforms become unaccessible after a propagation.
     """
     def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False):
-        assert c_caps > 0 and c_caps % 4 == 0
-        super().__init__(circuit, c_caps=c_caps//4, c_reuse=c_reuse, strip_forks=strip_forks)
+        super().__init__(circuit, c_caps=c_caps, c_caps_min=4, c_reuse=c_reuse, strip_forks=strip_forks)
         self.sims = sims
-
-        self.c_len *= 4
-        self.c_locs[...] *= 4
-        self.c_caps[...] *= 4
-
         self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype)
         self.delays[:, :delays.shape[1]] = delays
 
@@ -87,21 +81,6 @@ class WaveSim(sim.SimOps):
 
         self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.c_locs, self.c_caps, self.ops, self.params)])
 
-        self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
-        self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
-        self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
-
-        self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
-        self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
-
-        self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs]
-        self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs]
-        self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs]
-        self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs]
-
-        self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
-        self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
-
     def __repr__(self):
         return f'<{type(self).__name__} {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \
                f'levels={len(self.level_starts)} mem={hr_bytes(self.nbytes)}>'
@@ -156,131 +135,6 @@ class WaveSim(sim.SimOps):
         self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs]
 
 
-@numba.njit
-def rand_gauss_cpu(seed, sd):
-    clamp = 0.5
-    if sd <= 0.0:
-        return 1.0
-    while True:
-        x = -6.0
-        for _ in range(12):
-            seed = int(0xDEECE66D) * seed + 0xB
-            x += float((seed >> 8) & 0xffffff) / float(1 << 24)
-        x *= sd
-        if abs(x) <= clamp:
-            break
-    return x + 1.0
-
-
-@numba.njit
-def wave_eval_cpu_old(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, seed=0):
-    lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
-
-    # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>>
-    overflows = int(0)
-
-    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
-
-    a_mem = c_locs[a_idx]
-    b_mem = c_locs[b_idx]
-    c_mem = c_locs[c_idx]
-    d_mem = c_locs[d_idx]
-    z_mem = c_locs[z_idx]
-    z_cap = c_caps[z_idx]
-
-    a_cur = int(0)
-    b_cur = int(0)
-    c_cur = int(0)
-    d_cur = int(0)
-    z_cur = lut & 1
-    if z_cur == 1:
-        cbuf[z_mem, st_idx] = TMIN
-
-    a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ a_mem ^ z_cur, sd) * param[0]
-    if int(param[1]) == a_idx: a += param[2+z_cur]
-    b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ b_mem ^ z_cur, sd) * param[0]
-    if int(param[1]) == b_idx: b += param[2+z_cur]
-    c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ c_mem ^ z_cur, sd) * param[0]
-    if int(param[1]) == c_idx: c += param[2+z_cur]
-    d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ d_mem ^ z_cur, sd) * param[0]
-    if int(param[1]) == d_idx: d += param[2+z_cur]
-
-    previous_t = TMIN
-
-    current_t = min(a, b, c, d)
-    inputs = int(0)
-
-    while current_t < TMAX:
-        z_val = z_cur & 1
-        if a == current_t:
-            a_cur += 1
-            a = cbuf[a_mem + a_cur, st_idx]
-            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[a_idx, 1, z_val] * rand_gauss_cpu(_seed ^ a_mem ^ z_val, sd) * param[0]
-            if int(param[1]) == a_idx:
-                a += param[2+(z_val^1)]
-                thresh += param[2+z_val]
-            inputs ^= 1
-            next_t = a
-
-        elif b == current_t:
-            b_cur += 1
-            b = cbuf[b_mem + b_cur, st_idx]
-            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[b_idx, 1, z_val] * rand_gauss_cpu(_seed ^ b_mem ^ z_val, sd) * param[0]
-            if int(param[1]) == b_idx:
-                b += param[2+(z_val^1)]
-                thresh += param[2+z_val]
-            inputs ^= 2
-            next_t = b
-
-        elif c == current_t:
-            c_cur += 1
-            c = cbuf[c_mem + c_cur, st_idx]
-            c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[c_idx, 1, z_val] * rand_gauss_cpu(_seed ^ c_mem ^ z_val, sd) * param[0]
-            if int(param[1]) == c_idx:
-                c += param[2+(z_val^1)]
-                thresh += param[2+z_val]
-            inputs ^= 4
-            next_t = c
-
-        else:
-            d_cur += 1
-            d = cbuf[d_mem + d_cur, st_idx]
-            d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[d_idx, 1, z_val] * rand_gauss_cpu(_seed ^ d_mem ^ z_val, sd) * param[0]
-            if int(param[1]) == d_idx:
-                d += param[2+(z_val^1)]
-                thresh += param[2+z_val]
-            inputs ^= 8
-            next_t = d
-
-        if (z_cur & 1) != ((lut >> inputs) & 1):
-            # we generate a toggle in z_mem, if:
-            #   ( it is the first toggle in z_mem OR
-            #   following toggle is earlier OR
-            #   pulse is wide enough ) AND enough space in z_mem.
-            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
-                if z_cur < (z_cap - 1):
-                    cbuf[z_mem + z_cur, st_idx] = current_t
-                    previous_t = current_t
-                    z_cur += 1
-                else:
-                    overflows += 1
-                    previous_t = cbuf[z_mem + z_cur - 1, st_idx]
-                    z_cur -= 1
-            else:
-                z_cur -= 1
-                previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
-
-        current_t = min(a, b, c, d)
-
-    # generate or propagate overflow flag
-    cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
-
-
-
 @numba.njit
 def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed=0):
     lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
@@ -379,6 +233,7 @@ def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed=
     # generate or propagate overflow flag
     cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
 
+
 @numba.njit
 def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, st_start, st_stop, delays, params, sd, seed):
     overflows = 0
@@ -505,143 +360,6 @@ def wave_assign_gpu(c, s, c_locs, ppi_offset):
     c[c_loc+2, x] = TMAX
 
 
-@cuda.jit(device=True)
-def rand_gauss_gpu(seed, sd):
-    clamp = 0.5
-    if sd <= 0.0:
-        return 1.0
-    while True:
-        x = -6.0
-        for _ in range(12):
-            seed = int(0xDEECE66D) * seed + 0xB
-            x += float((seed >> 8) & 0xffffff) / float(1 << 24)
-        x *= sd
-        if abs(x) <= clamp:
-            break
-    return x + 1.0
-
-
-@cuda.jit()
-def wave_eval_gpu_old(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, line_times, param, sd, seed):
-    x, y = cuda.grid(2)
-    st_idx = st_start + x
-    op_idx = op_start + y
-    if st_idx >= st_stop: return
-    if op_idx >= op_stop: return
-
-    lut = ops[op_idx, 0]
-    z_idx = ops[op_idx, 1]
-    a_idx = ops[op_idx, 2]
-    b_idx = ops[op_idx, 3]
-    c_idx = ops[op_idx, 4]
-    d_idx = ops[op_idx, 5]
-
-    param = param[st_idx]
-
-    # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>>
-    overflows = int(0)
-
-    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
-
-    a_mem = c_locs[a_idx]
-    b_mem = c_locs[b_idx]
-    c_mem = c_locs[c_idx]
-    d_mem = c_locs[d_idx]
-    z_mem = c_locs[z_idx]
-    z_cap = c_caps[z_idx]
-
-    a_cur = int(0)
-    b_cur = int(0)
-    c_cur = int(0)
-    d_cur = int(0)
-    z_cur = lut & 1
-    if z_cur == 1:
-        cbuf[z_mem, st_idx] = TMIN
-
-    a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ a_mem ^ z_cur, sd) * param[0]
-    if int(param[1]) == a_idx: a += param[2+z_cur]
-    b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ b_mem ^ z_cur, sd) * param[0]
-    if int(param[1]) == b_idx: b += param[2+z_cur]
-    c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ c_mem ^ z_cur, sd) * param[0]
-    if int(param[1]) == c_idx: c += param[2+z_cur]
-    d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ d_mem ^ z_cur, sd) * param[0]
-    if int(param[1]) == d_idx: d += param[2+z_cur]
-
-    previous_t = TMIN
-
-    current_t = min(a, b, c, d)
-    inputs = int(0)
-
-    while current_t < TMAX:
-        z_val = z_cur & 1
-        if a == current_t:
-            a_cur += 1
-            a = cbuf[a_mem + a_cur, st_idx]
-            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[a_idx, 1, z_val] * rand_gauss_gpu(_seed ^ a_mem ^ z_val, sd) * param[0]
-            if int(param[1]) == a_idx:
-                a += param[2+(z_val^1)]
-                thresh += param[2+z_val]
-            inputs ^= 1
-            next_t = a
-
-        elif b == current_t:
-            b_cur += 1
-            b = cbuf[b_mem + b_cur, st_idx]
-            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[b_idx, 1, z_val] * rand_gauss_gpu(_seed ^ b_mem ^ z_val, sd) * param[0]
-            if int(param[1]) == b_idx:
-                b += param[2+(z_val^1)]
-                thresh += param[2+z_val]
-            inputs ^= 2
-            next_t = b
-
-        elif c == current_t:
-            c_cur += 1
-            c = cbuf[c_mem + c_cur, st_idx]
-            c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[c_idx, 1, z_val] * rand_gauss_gpu(_seed ^ c_mem ^ z_val, sd) * param[0]
-            if int(param[1]) == c_idx:
-                c += param[2+(z_val^1)]
-                thresh += param[2+z_val]
-            inputs ^= 4
-            next_t = c
-
-        else:
-            d_cur += 1
-            d = cbuf[d_mem + d_cur, st_idx]
-            d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[d_idx, 1, z_val] * rand_gauss_gpu(_seed ^ d_mem ^ z_val, sd) * param[0]
-            if int(param[1]) == d_idx:
-                d += param[2+(z_val^1)]
-                thresh += param[2+z_val]
-            inputs ^= 8
-            next_t = d
-
-        if (z_cur & 1) != ((lut >> inputs) & 1):
-            # we generate a toggle in z_mem, if:
-            #   ( it is the first toggle in z_mem OR
-            #   following toggle is earlier OR
-            #   pulse is wide enough ) AND enough space in z_mem.
-            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
-                if z_cur < (z_cap - 1):
-                    cbuf[z_mem + z_cur, st_idx] = current_t
-                    previous_t = current_t
-                    z_cur += 1
-                else:
-                    overflows += 1
-                    previous_t = cbuf[z_mem + z_cur - 1, st_idx]
-                    z_cur -= 1
-            else:
-                z_cur -= 1
-                previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
-
-        current_t = min(a, b, c, d)
-
-    # generate or propagate overflow flag
-    cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
-
-
 @cuda.jit()
 def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, delays, param, sd, seed):
     x, y = cuda.grid(2)
@@ -759,7 +477,7 @@ def wave_capture_gpu(c, s, c_locs, c_caps, ppo_offset, time, s_sqrt2, seed):
     x, y = cuda.grid(2)
     if ppo_offset + y >= len(c_locs): return
     line = c_locs[ppo_offset + y]
-    tdim = c_caps[ppo_offset + y] 
+    tdim = c_caps[ppo_offset + y]
     if line < 0: return
     if x >= c.shape[-1]: return
     vector = x