diff --git a/src/kyupy/wave_sim4.py b/src/kyupy/wave_sim4.py
index 3a44f2a..183df03 100644
--- a/src/kyupy/wave_sim4.py
+++ b/src/kyupy/wave_sim4.py
@@ -10,7 +10,6 @@ The simulators are not event-based and are not capable of simulating sequential
 """
 
 import math
-from bisect import bisect, insort_left
 
 import numpy as np
 
@@ -76,19 +75,14 @@ class WaveSim(SimOps):
         * ``s[..., 8]`` (P)PO sampled capture value: decided by random sampling according to a given seed.
         * ``s[..., 9]`` (P)PO sampled capture slack: (capture time - LST) - decided by random sampling according to a given seed.
         * ``s[..., 10]`` Overflow indicator: If non-zero, some signals in the input cone of this output had more
-          transitions than specified in ``wavecaps``. Some transitions have been discarded, the
+          transitions than specified in ``c_caps``. Some transitions have been discarded, the
           final values in the waveforms are still valid.
         """
                      
         self.params = np.zeros((sims, 4), dtype=np.float32)
         self.params[...,0] = 1.0
 
-        m1 = np.array([2 ** x for x in range(7, -1, -1)], dtype=np.uint8)
-        m0 = ~m1
-        self.mask = np.rollaxis(np.vstack((m0, m1)), 1)
-
-        self.overflows = 0
-        self.lst_eat_valid = False
+        self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.vat, self.ops, self.params)])
 
         self.pi_s_locs = np.flatnonzero(self.vat[self.ppi_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0)
         self.po_s_locs = np.flatnonzero(self.vat[self.ppo_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0)
@@ -105,20 +99,9 @@ class WaveSim(SimOps):
         self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
         self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
 
-        self.wave_capture = numba.njit(WaveSim.wave_capture)
-
     def __repr__(self):
-        total_mem = self.c.nbytes + self.vat.nbytes + self.ops.nbytes + self.s.nbytes
-        return f'<WaveSim {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \
-               f'levels={len(self.level_starts)} mem={hr_bytes(total_mem)}>'
-
-    def get_line_delay(self, line, polarity):
-        """Returns the current delay of the given ``line`` and ``polarity`` in the simulation model."""
-        return self.timing[line, 0, polarity]
-
-    def set_line_delay(self, line, polarity, delay):
-        """Sets a new ``delay`` for the given ``line`` and ``polarity`` in the simulation model."""
-        self.timing[line, 0, polarity] = delay
+        return f'<{type(self).__name__} {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \
+               f'levels={len(self.level_starts)} mem={hr_bytes(self.nbytes)}>'
 
     def s_to_c(self):
         """Transfers values of sequential elements and primary inputs to the combinational portion.
@@ -141,9 +124,8 @@ class WaveSim(SimOps):
         """
         sims = min(sims or self.sims, self.sims)
         for op_start, op_stop in zip(self.level_starts, self.level_stops):
-            self.overflows += level_eval(self.ops, op_start, op_stop, self.c, self.vat, 0, sims,
+            level_eval_cpu(self.ops, op_start, op_stop, self.c, self.vat, 0, sims,
                                          self.timing, self.params, sd, seed)
-        self.lst_eat_valid = False
 
     def c_to_s(self, time=TMAX, sd=0.0, seed=1):
         """Simulates a capture operation at all sequential elements and primary outputs.
@@ -157,7 +139,7 @@ class WaveSim(SimOps):
         """
         for s_loc, (c_loc, c_len, _) in zip(self.poppo_s_locs, self.vat[self.ppo_offset+self.poppo_s_locs]):
             for vector in range(self.sims):
-                self.s[s_loc, vector, 3:] = self.wave_capture(self.c, c_loc, c_len, vector, time=time, sd=sd, seed=seed)
+                self.s[s_loc, vector, 3:] = wave_capture_cpu(self.c, c_loc, c_len, vector, time=time, sd=sd, seed=seed)
 
     def s_ppo_to_ppi(self, time=0.0):
         """Re-assigns the last sampled capture to the appropriate pseudo-primary inputs (PPI). 
@@ -170,64 +152,229 @@ class WaveSim(SimOps):
         self.s[self.ppio_s_locs, :, 1] = time
         self.s[self.ppio_s_locs, :, 2] = self.s[self.ppio_s_locs, :, 8]
 
-    @staticmethod
-    def wave_capture(c, c_loc, c_len, vector, time=TMAX, sd=0.0, seed=1):
-        s_sqrt2 = sd * math.sqrt(2)
-        m = 0.5
-        acc = 0.0
-        eat = TMAX
-        lst = TMIN
-        tog = 0
-        ovl = 0
-        val = int(0)
-        final = int(0)
-        w = c[c_loc:c_loc+c_len, vector]
-        for t in w:
-            if t >= TMAX:
-                if t == TMAX_OVL:
-                    ovl = 1
-                break
-            m = -m
-            final ^= 1
-            if t < time:
-                val ^= 1
-            if t <= TMIN: continue
-            if s_sqrt2 > 0:
-                acc += m * (1 + math.erf((t - time) / s_sqrt2))
-            eat = min(eat, t)
-            lst = max(lst, t)
-            tog += 1
-        if s_sqrt2 > 0:
-            if m < 0:
-                acc += 1
-            if acc >= 0.99:
-                val = 1
-            elif acc > 0.01:
-                seed = (seed << 4) + (vector << 20) + c_loc
-                seed = int(0xDEECE66D) * seed + 0xB
-                seed = int(0xDEECE66D) * seed + 0xB
-                rnd = float((seed >> 8) & 0xffffff) / float(1 << 24)
-                val = rnd < acc
-            else:
-                val = 0
-        else:
-            acc = val
 
-        return (w[0] <= TMIN), eat, lst, final, acc, val, 0, ovl
+@numba.njit
+def rand_gauss_cpu(seed, sd):
+    clamp = 0.5
+    if sd <= 0.0:
+        return 1.0
+    while True:
+        x = -6.0
+        for _ in range(12):
+            seed = int(0xDEECE66D) * seed + 0xB
+            x += float((seed >> 8) & 0xffffff) / float(1 << 24)
+        x *= sd
+        if abs(x) <= clamp:
+            break
+    return x + 1.0
+
+
+@numba.njit
+def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
+    lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
+
+    # >>> same code as wave_eval_gpu (except rand_gauss_*pu()-call) >>>
+    overflows = int(0)
+
+    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
+
+    a_mem = vat[a_idx, 0]
+    b_mem = vat[b_idx, 0]
+    c_mem = vat[c_idx, 0]
+    d_mem = vat[d_idx, 0]
+    z_mem, z_cap, _ = vat[z_idx]
+
+    a_cur = int(0)
+    b_cur = int(0)
+    c_cur = int(0)
+    d_cur = int(0)                                          
+    z_cur = lut & 1
+    if z_cur == 1:
+        cbuf[z_mem, st_idx] = TMIN
+
+    a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ a_mem ^ z_cur, sd) * param[0]
+    if int(param[1]) == a_idx: a += param[2+z_cur]
+    b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ b_mem ^ z_cur, sd) * param[0]
+    if int(param[1]) == b_idx: b += param[2+z_cur]
+    c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ c_mem ^ z_cur, sd) * param[0]
+    if int(param[1]) == c_idx: c += param[2+z_cur]
+    d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ d_mem ^ z_cur, sd) * param[0]
+    if int(param[1]) == d_idx: d += param[2+z_cur]
+    
+    previous_t = TMIN
+
+    current_t = min(a, b, c, d)
+    inputs = int(0)
+
+    while current_t < TMAX:
+        z_val = z_cur & 1
+        if a == current_t:
+            a_cur += 1
+            a = cbuf[a_mem + a_cur, st_idx]
+            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[a_idx, 1, z_val] * rand_gauss_cpu(_seed ^ a_mem ^ z_val, sd) * param[0]
+            if int(param[1]) == a_idx:
+                a += param[2+(z_val^1)]
+                thresh += param[2+z_val]
+            inputs ^= 1
+            next_t = a   
+        
+        elif b == current_t:
+            b_cur += 1
+            b = cbuf[b_mem + b_cur, st_idx]
+            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[b_idx, 1, z_val] * rand_gauss_cpu(_seed ^ b_mem ^ z_val, sd) * param[0]
+            if int(param[1]) == b_idx:
+                b += param[2+(z_val^1)]
+                thresh += param[2+z_val]
+            inputs ^= 2
+            next_t = b
+                
+        elif c == current_t:
+            c_cur += 1
+            c = cbuf[c_mem + c_cur, st_idx]
+            c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[c_idx, 1, z_val] * rand_gauss_cpu(_seed ^ c_mem ^ z_val, sd) * param[0]
+            if int(param[1]) == c_idx:
+                c += param[2+(z_val^1)]
+                thresh += param[2+z_val]
+            inputs ^= 4
+            next_t = c 
+                     
+        else:
+            d_cur += 1
+            d = cbuf[d_mem + d_cur, st_idx]
+            d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[d_idx, 1, z_val] * rand_gauss_cpu(_seed ^ d_mem ^ z_val, sd) * param[0]
+            if int(param[1]) == d_idx:
+                d += param[2+(z_val^1)]
+                thresh += param[2+z_val]
+            inputs ^= 8
+            next_t = d 
+       
+        if (z_cur & 1) != ((lut >> inputs) & 1):
+            # we generate a toggle in z_mem, if:
+            #   ( it is the first toggle in z_mem OR
+            #   following toggle is earlier OR
+            #   pulse is wide enough ) AND enough space in z_mem.
+            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
+                if z_cur < (z_cap - 1):
+                    cbuf[z_mem + z_cur, st_idx] = current_t
+                    previous_t = current_t
+                    z_cur += 1
+                else:
+                    overflows += 1
+                    previous_t = cbuf[z_mem + z_cur - 1, st_idx]
+                    z_cur -= 1
+            else:
+                z_cur -= 1
+                previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
+                
+        current_t = min(a, b, c, d)
 
+    # generate overflow flag or propagate from input
+    cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
+    
 
 @numba.njit
-def level_eval(ops, op_start, op_stop, c, vat, st_start, st_stop, line_times, params, sd, seed):
+def level_eval_cpu(ops, op_start, op_stop, c, vat, st_start, st_stop, line_times, params, sd, seed):
     overflows = 0
     for op_idx in range(op_start, op_stop):
         op = ops[op_idx]
         for st_idx in range(st_start, st_stop):
-            overflows += wave_eval(op, c, vat, st_idx, line_times, params[st_idx], sd, seed)
-    return overflows
+            wave_eval_cpu(op, c, vat, st_idx, line_times, params[st_idx], sd, seed)
 
 
 @numba.njit
-def rand_gauss(seed, sd):
+def wave_capture_cpu(c, c_loc, c_len, vector, time=TMAX, sd=0.0, seed=1):
+    s_sqrt2 = sd * math.sqrt(2)
+    m = 0.5
+    acc = 0.0
+    eat = TMAX
+    lst = TMIN
+    tog = 0
+    ovl = 0
+    val = int(0)
+    final = int(0)
+    w = c[c_loc:c_loc+c_len, vector]
+    for t in w:
+        if t >= TMAX:
+            if t == TMAX_OVL:
+                ovl = 1
+            break
+        m = -m
+        final ^= 1
+        if t < time:
+            val ^= 1
+        if t <= TMIN: continue
+        if s_sqrt2 > 0:
+            acc += m * (1 + math.erf((t - time) / s_sqrt2))
+        eat = min(eat, t)
+        lst = max(lst, t)
+        tog += 1
+    if s_sqrt2 > 0:
+        if m < 0:
+            acc += 1
+        if acc >= 0.99:
+            val = 1
+        elif acc > 0.01:
+            seed = (seed << 4) + (vector << 20) + c_loc
+            seed = int(0xDEECE66D) * seed + 0xB
+            seed = int(0xDEECE66D) * seed + 0xB
+            rnd = float((seed >> 8) & 0xffffff) / float(1 << 24)
+            val = rnd < acc
+        else:
+            val = 0
+    else:
+        acc = val
+
+    return (w[0] <= TMIN), eat, lst, final, acc, val, 0, ovl
+
+
+class WaveSimCuda(WaveSim):
+    """A GPU-accelerated waveform-based combinational logic timing simulator.
+
+    The API is the same as for :py:class:`WaveSim`.
+    All internal memories are mirrored into GPU memory upon construction.
+    Some operations like access to single waveforms can involve large communication overheads.
+    """
+    def __init__(self, circuit, timing, sims=8, c_caps=16, c_reuse=False, strip_forks=False):
+        super().__init__(circuit, timing, sims, c_caps, c_reuse, strip_forks)
+
+        self.c = cuda.to_device(self.c)
+        self.s = cuda.to_device(self.s)
+        self.ops = cuda.to_device(self.ops)
+        self.vat = cuda.to_device(self.vat)
+        self.timing = cuda.to_device(self.timing)
+        self.params = cuda.to_device(self.params)
+        
+        self._block_dim = (32, 16)
+
+    # TODO implement on GPU
+    #def s_to_c(self):
+
+    def _grid_dim(self, x, y):
+        gx = math.ceil(x / self._block_dim[0])
+        gy = math.ceil(y / self._block_dim[1])
+        return gx, gy
+    
+    def c_prop(self, sims=None, sd=0.0, seed=1):
+        sims = min(sims or self.sims, self.sims)
+        for op_start, op_stop in zip(self.level_starts, self.level_stops):
+            grid_dim = self._grid_dim(sims, op_stop - op_start)
+            wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.vat, int(0),
+                sims, self.timing, self.params, sd, seed)
+        cuda.synchronize()
+    
+    # TODO implement on GPU
+    #def c_to_s(self):
+    
+    # TODO implement on GPU
+    #def s_ppo_to_ppi(self, time=0.0):
+    
+
+@cuda.jit(device=True)
+def rand_gauss_gpu(seed, sd):
     clamp = 0.5
     if sd <= 0.0:
         return 1.0
@@ -242,9 +389,24 @@ def rand_gauss(seed, sd):
     return x + 1.0
 
 
-@numba.njit
-def wave_eval(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
-    lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
+@cuda.jit()
+def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_times, param, sd, seed):
+    x, y = cuda.grid(2)
+    st_idx = st_start + x
+    op_idx = op_start + y
+    if st_idx >= st_stop: return
+    if op_idx >= op_stop: return
+
+    lut = ops[op_idx, 0]
+    z_idx = ops[op_idx, 1]
+    a_idx = ops[op_idx, 2]
+    b_idx = ops[op_idx, 3]
+    c_idx = ops[op_idx, 4]
+    d_idx = ops[op_idx, 5]
+
+    param = param[st_idx]
+    
+    # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-call) >>>
     overflows = int(0)
 
     _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
@@ -263,13 +425,13 @@ def wave_eval(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
     if z_cur == 1:
         cbuf[z_mem, st_idx] = TMIN
 
-    a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss(_seed ^ a_mem ^ z_cur, sd) * param[0]
+    a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ a_mem ^ z_cur, sd) * param[0]
     if int(param[1]) == a_idx: a += param[2+z_cur]
-    b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss(_seed ^ b_mem ^ z_cur, sd) * param[0]
+    b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ b_mem ^ z_cur, sd) * param[0]
     if int(param[1]) == b_idx: b += param[2+z_cur]
-    c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss(_seed ^ c_mem ^ z_cur, sd) * param[0]
+    c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ c_mem ^ z_cur, sd) * param[0]
     if int(param[1]) == c_idx: c += param[2+z_cur]
-    d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss(_seed ^ d_mem ^ z_cur, sd) * param[0]
+    d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ d_mem ^ z_cur, sd) * param[0]
     if int(param[1]) == d_idx: d += param[2+z_cur]
     
     previous_t = TMIN
@@ -282,8 +444,8 @@ def wave_eval(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
         if a == current_t:
             a_cur += 1
             a = cbuf[a_mem + a_cur, st_idx]
-            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[a_idx, 1, z_val] * rand_gauss(_seed ^ a_mem ^ z_val, sd) * param[0]
+            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[a_idx, 1, z_val] * rand_gauss_gpu(_seed ^ a_mem ^ z_val, sd) * param[0]
             if int(param[1]) == a_idx:
                 a += param[2+(z_val^1)]
                 thresh += param[2+z_val]
@@ -293,8 +455,8 @@ def wave_eval(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
         elif b == current_t:
             b_cur += 1
             b = cbuf[b_mem + b_cur, st_idx]
-            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[b_idx, 1, z_val] * rand_gauss(_seed ^ b_mem ^ z_val, sd) * param[0]
+            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[b_idx, 1, z_val] * rand_gauss_gpu(_seed ^ b_mem ^ z_val, sd) * param[0]
             if int(param[1]) == b_idx:
                 b += param[2+(z_val^1)]
                 thresh += param[2+z_val]
@@ -304,8 +466,8 @@ def wave_eval(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
         elif c == current_t:
             c_cur += 1
             c = cbuf[c_mem + c_cur, st_idx]
-            c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[c_idx, 1, z_val] * rand_gauss(_seed ^ c_mem ^ z_val, sd) * param[0]
+            c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[c_idx, 1, z_val] * rand_gauss_gpu(_seed ^ c_mem ^ z_val, sd) * param[0]
             if int(param[1]) == c_idx:
                 c += param[2+(z_val^1)]
                 thresh += param[2+z_val]
@@ -315,19 +477,13 @@ def wave_eval(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
         else:
             d_cur += 1
             d = cbuf[d_mem + d_cur, st_idx]
-            d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[d_idx, 1, z_val] * rand_gauss(_seed ^ d_mem ^ z_val, sd) * param[0]
+            d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[d_idx, 1, z_val] * rand_gauss_gpu(_seed ^ d_mem ^ z_val, sd) * param[0]
             if int(param[1]) == d_idx:
                 d += param[2+(z_val^1)]
                 thresh += param[2+z_val]
             inputs ^= 8
             next_t = d 
-        #print("previous_t",previous_t)
-        #print("current_t",current_t) 
-        #print(current_t - previous_t)
-        #print(thresh)
-        #print(z_cur & 1)
-        #print((lut >> inputs) & 1)
        
         if (z_cur & 1) != ((lut >> inputs) & 1):
             # we generate a toggle in z_mem, if:
@@ -335,12 +491,8 @@ def wave_eval(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
             #   following toggle is earlier OR
             #   pulse is wide enough ) AND enough space in z_mem.
             if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
-                #print(current_t - previous_t)
-                #print(thresh)
-                #print(z_cap)
                 if z_cur < (z_cap - 1):
                     cbuf[z_mem + z_cur, st_idx] = current_t
-                    #print(cbuf[z_mem + z_cur, st_idx])
                     previous_t = current_t
                     z_cur += 1
                 else:
@@ -348,18 +500,10 @@ def wave_eval(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
                     previous_t = cbuf[z_mem + z_cur - 1, st_idx]
                     z_cur -= 1
             else:
-                #print(a)
                 z_cur -= 1
-                if z_cur > 0:
-                    previous_t = cbuf[z_mem + z_cur - 1, st_idx]
-                else:
-                    previous_t = TMIN
-        
+                previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
+                
         current_t = min(a, b, c, d)
-                     
-    if overflows > 0:
-        cbuf[z_mem + z_cur, st_idx] = TMAX_OVL
-    else:
-        cbuf[z_mem + z_cur, st_idx] = a if a == max(a, b, c, d) else b if b == max(a, b, c, d) else c if c == max(a, b, c, d) else d   # propagate overflow flags by storing biggest TMAX from input
 
-    return overflows
+    # generate overflow flag or propagate from input
+    cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
\ No newline at end of file
diff --git a/tests/conftest.py b/tests/conftest.py
index e82b2fc..5d88c6b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,8 +1,18 @@
 import pytest
 
 
-@pytest.fixture
+@pytest.fixture(scope='session')
 def mydir():
     import os
     from pathlib import Path
     return Path(os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))))
+
+@pytest.fixture(scope='session')
+def b14_circuit(mydir):
+    from kyupy import verilog
+    return verilog.load(mydir / 'b14.v.gz', branchforks=True)
+
+@pytest.fixture(scope='session')
+def b14_timing(mydir, b14_circuit):
+    from kyupy import sdf
+    return sdf.load(mydir / 'b14.sdf.gz').annotation(b14_circuit)
diff --git a/tests/test_wave_sim4.py b/tests/test_wave_sim4.py
index 9fe3d3d..918bd06 100644
--- a/tests/test_wave_sim4.py
+++ b/tests/test_wave_sim4.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from kyupy.wave_sim4 import WaveSim, wave_eval, TMIN, TMAX
+from kyupy.wave_sim4 import WaveSim, WaveSimCuda, wave_eval_cpu, TMIN, TMAX
 from kyupy.logic_sim import LogicSim
 from kyupy import verilog, sdf, logic, bench
 from kyupy.logic import MVArray, BPArray
@@ -32,7 +32,7 @@ def test_nand_delays():
 
     def wave_assert(inputs, output):
         for i, a in zip(inputs, c.reshape(-1,16)): a[:len(i)] = i
-        wave_eval(op, c, vat, 0, line_times, sdata)
+        wave_eval_cpu(op, c, vat, 0, line_times, sdata)
         for i, v in enumerate(output): np.testing.assert_allclose(c.reshape(-1,16)[4,i], v)
 
     wave_assert([[TMAX,TMAX],[TMAX,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # NAND(0,0,1,1) => 1
@@ -53,7 +53,6 @@ def test_tiny_circuit():
     lt = np.zeros((len(c.lines), 2, 2))
     lt[:,0,:] = 1.0  # unit delay for all lines
     wsim = WaveSim(c, lt)
-    print(wsim.prim_counts)
     assert len(wsim.s) == 5
     
     # values for x
@@ -157,18 +156,11 @@ def compare_to_logic_sim(wsim: WaveSim):
         assert res_str == exp_str
 
 
-def test_b14(mydir):
-    c = verilog.load(mydir / 'b14.v.gz', branchforks=True)
-    df = sdf.load(mydir / 'b14.sdf.gz')
-    lt = df.annotation(c)
-    wsim = WaveSim(c, lt, 8)
-    compare_to_logic_sim(wsim)
+def test_b14(b14_circuit, b14_timing):
+    compare_to_logic_sim(WaveSim(b14_circuit, b14_timing, 8))
 
+def test_b14_strip_forks(b14_circuit, b14_timing):
+    compare_to_logic_sim(WaveSim(b14_circuit, b14_timing, 8, strip_forks=True))
 
-def test_b14_strip_forks(mydir):
-    c = verilog.load(mydir / 'b14.v.gz', branchforks=True)
-    df = sdf.load(mydir / 'b14.sdf.gz')
-    lt = df.annotation(c)
-    wsim = WaveSim(c, lt, 8, strip_forks=True)
-    compare_to_logic_sim(wsim)
-
+def test_b14_cuda(b14_circuit, b14_timing):
+    compare_to_logic_sim(WaveSimCuda(b14_circuit, b14_timing, 8, strip_forks=True))