first gpu-code, cached test fixtures

3 years ago · 3497bfdc75
3 changed files with 271 additions and 125 deletions
--- a/src/kyupy/wave_sim4.py
+++ b/src/kyupy/wave_sim4.py
@@ -10,7 +10,6 @@ The simulators are not event-based and are not capable of simulating sequential
 """

 import math
-from bisect import bisect, insort_left

 import numpy as np

@@ -76,19 +75,14 @@ class WaveSim(SimOps):
        * ``s[..., 8]`` (P)PO sampled capture value: decided by random sampling according to a given seed.
        * ``s[..., 9]`` (P)PO sampled capture slack: (capture time - LST) - decided by random sampling according to a given seed.
        * ``s[..., 10]`` Overflow indicator: If non-zero, some signals in the input cone of this output had more
-          transitions than specified in ``wavecaps``. Some transitions have been discarded, the
+          transitions than specified in ``c_caps``. Some transitions have been discarded, the
          final values in the waveforms are still valid.
        """
                     
        self.params = np.zeros((sims, 4), dtype=np.float32)
        self.params[...,0] = 1.0

-        m1 = np.array([2 ** x for x in range(7, -1, -1)], dtype=np.uint8)
-        m0 = ~m1
-        self.mask = np.rollaxis(np.vstack((m0, m1)), 1)
-
-        self.overflows = 0
-        self.lst_eat_valid = False
+        self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.vat, self.ops, self.params)])

        self.pi_s_locs = np.flatnonzero(self.vat[self.ppi_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0)
        self.po_s_locs = np.flatnonzero(self.vat[self.ppo_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0)
@@ -105,20 +99,9 @@ class WaveSim(SimOps):
        self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
        self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])

-        self.wave_capture = numba.njit(WaveSim.wave_capture)
-
    def __repr__(self):
-        total_mem = self.c.nbytes + self.vat.nbytes + self.ops.nbytes + self.s.nbytes
-        return f'<WaveSim {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \
-               f'levels={len(self.level_starts)} mem={hr_bytes(total_mem)}>'
-
-    def get_line_delay(self, line, polarity):
-        """Returns the current delay of the given ``line`` and ``polarity`` in the simulation model."""
-        return self.timing[line, 0, polarity]
-
-    def set_line_delay(self, line, polarity, delay):
-        """Sets a new ``delay`` for the given ``line`` and ``polarity`` in the simulation model."""
-        self.timing[line, 0, polarity] = delay
+        return f'<{type(self).__name__} {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \
+               f'levels={len(self.level_starts)} mem={hr_bytes(self.nbytes)}>'

    def s_to_c(self):
        """Transfers values of sequential elements and primary inputs to the combinational portion.
@@ -141,9 +124,8 @@ class WaveSim(SimOps):
        """
        sims = min(sims or self.sims, self.sims)
        for op_start, op_stop in zip(self.level_starts, self.level_stops):
-            self.overflows += level_eval(self.ops, op_start, op_stop, self.c, self.vat, 0, sims,
+            level_eval_cpu(self.ops, op_start, op_stop, self.c, self.vat, 0, sims,
                                         self.timing, self.params, sd, seed)
-        self.lst_eat_valid = False

    def c_to_s(self, time=TMAX, sd=0.0, seed=1):
        """Simulates a capture operation at all sequential elements and primary outputs.
@@ -157,7 +139,7 @@ class WaveSim(SimOps):
        """
        for s_loc, (c_loc, c_len, _) in zip(self.poppo_s_locs, self.vat[self.ppo_offset+self.poppo_s_locs]):
            for vector in range(self.sims):
-                self.s[s_loc, vector, 3:] = self.wave_capture(self.c, c_loc, c_len, vector, time=time, sd=sd, seed=seed)
+                self.s[s_loc, vector, 3:] = wave_capture_cpu(self.c, c_loc, c_len, vector, time=time, sd=sd, seed=seed)

    def s_ppo_to_ppi(self, time=0.0):
        """Re-assigns the last sampled capture to the appropriate pseudo-primary inputs (PPI). 
@@ -170,8 +152,141 @@ class WaveSim(SimOps):
        self.s[self.ppio_s_locs, :, 1] = time
        self.s[self.ppio_s_locs, :, 2] = self.s[self.ppio_s_locs, :, 8]

-    @staticmethod
-    def wave_capture(c, c_loc, c_len, vector, time=TMAX, sd=0.0, seed=1):
+
+@numba.njit
+def rand_gauss_cpu(seed, sd):
+    clamp = 0.5
+    if sd <= 0.0:
+        return 1.0
+    while True:
+        x = -6.0
+        for _ in range(12):
+            seed = int(0xDEECE66D) * seed + 0xB
+            x += float((seed >> 8) & 0xffffff) / float(1 << 24)
+        x *= sd
+        if abs(x) <= clamp:
+            break
+    return x + 1.0
+
+
+@numba.njit
+def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
+    lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
+
+    # >>> same code as wave_eval_gpu (except rand_gauss_*pu()-call) >>>
+    overflows = int(0)
+
+    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
+
+    a_mem = vat[a_idx, 0]
+    b_mem = vat[b_idx, 0]
+    c_mem = vat[c_idx, 0]
+    d_mem = vat[d_idx, 0]
+    z_mem, z_cap, _ = vat[z_idx]
+
+    a_cur = int(0)
+    b_cur = int(0)
+    c_cur = int(0)
+    d_cur = int(0)                                          
+    z_cur = lut & 1
+    if z_cur == 1:
+        cbuf[z_mem, st_idx] = TMIN
+
+    a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ a_mem ^ z_cur, sd) * param[0]
+    if int(param[1]) == a_idx: a += param[2+z_cur]
+    b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ b_mem ^ z_cur, sd) * param[0]
+    if int(param[1]) == b_idx: b += param[2+z_cur]
+    c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ c_mem ^ z_cur, sd) * param[0]
+    if int(param[1]) == c_idx: c += param[2+z_cur]
+    d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ d_mem ^ z_cur, sd) * param[0]
+    if int(param[1]) == d_idx: d += param[2+z_cur]
+    
+    previous_t = TMIN
+
+    current_t = min(a, b, c, d)
+    inputs = int(0)
+
+    while current_t < TMAX:
+        z_val = z_cur & 1
+        if a == current_t:
+            a_cur += 1
+            a = cbuf[a_mem + a_cur, st_idx]
+            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[a_idx, 1, z_val] * rand_gauss_cpu(_seed ^ a_mem ^ z_val, sd) * param[0]
+            if int(param[1]) == a_idx:
+                a += param[2+(z_val^1)]
+                thresh += param[2+z_val]
+            inputs ^= 1
+            next_t = a   
+        
+        elif b == current_t:
+            b_cur += 1
+            b = cbuf[b_mem + b_cur, st_idx]
+            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[b_idx, 1, z_val] * rand_gauss_cpu(_seed ^ b_mem ^ z_val, sd) * param[0]
+            if int(param[1]) == b_idx:
+                b += param[2+(z_val^1)]
+                thresh += param[2+z_val]
+            inputs ^= 2
+            next_t = b
+                
+        elif c == current_t:
+            c_cur += 1
+            c = cbuf[c_mem + c_cur, st_idx]
+            c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[c_idx, 1, z_val] * rand_gauss_cpu(_seed ^ c_mem ^ z_val, sd) * param[0]
+            if int(param[1]) == c_idx:
+                c += param[2+(z_val^1)]
+                thresh += param[2+z_val]
+            inputs ^= 4
+            next_t = c 
+                     
+        else:
+            d_cur += 1
+            d = cbuf[d_mem + d_cur, st_idx]
+            d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[d_idx, 1, z_val] * rand_gauss_cpu(_seed ^ d_mem ^ z_val, sd) * param[0]
+            if int(param[1]) == d_idx:
+                d += param[2+(z_val^1)]
+                thresh += param[2+z_val]
+            inputs ^= 8
+            next_t = d 
+       
+        if (z_cur & 1) != ((lut >> inputs) & 1):
+            # we generate a toggle in z_mem, if:
+            #   ( it is the first toggle in z_mem OR
+            #   following toggle is earlier OR
+            #   pulse is wide enough ) AND enough space in z_mem.
+            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
+                if z_cur < (z_cap - 1):
+                    cbuf[z_mem + z_cur, st_idx] = current_t
+                    previous_t = current_t
+                    z_cur += 1
+                else:
+                    overflows += 1
+                    previous_t = cbuf[z_mem + z_cur - 1, st_idx]
+                    z_cur -= 1
+            else:
+                z_cur -= 1
+                previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
+                
+        current_t = min(a, b, c, d)
+
+    # generate overflow flag or propagate from input
+    cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
+    
+
+@numba.njit
+def level_eval_cpu(ops, op_start, op_stop, c, vat, st_start, st_stop, line_times, params, sd, seed):
+    overflows = 0
+    for op_idx in range(op_start, op_stop):
+        op = ops[op_idx]
+        for st_idx in range(st_start, st_stop):
+            wave_eval_cpu(op, c, vat, st_idx, line_times, params[st_idx], sd, seed)
+
+
+@numba.njit
+def wave_capture_cpu(c, c_loc, c_len, vector, time=TMAX, sd=0.0, seed=1):
    s_sqrt2 = sd * math.sqrt(2)
    m = 0.5
    acc = 0.0
@@ -216,18 +331,50 @@ class WaveSim(SimOps):
    return (w[0] <= TMIN), eat, lst, final, acc, val, 0, ovl


-@numba.njit
-def level_eval(ops, op_start, op_stop, c, vat, st_start, st_stop, line_times, params, sd, seed):
-    overflows = 0
-    for op_idx in range(op_start, op_stop):
-        op = ops[op_idx]
-        for st_idx in range(st_start, st_stop):
-            overflows += wave_eval(op, c, vat, st_idx, line_times, params[st_idx], sd, seed)
-    return overflows
+class WaveSimCuda(WaveSim):
+    """A GPU-accelerated waveform-based combinational logic timing simulator.

+    The API is the same as for :py:class:`WaveSim`.
+    All internal memories are mirrored into GPU memory upon construction.
+    Some operations like access to single waveforms can involve large communication overheads.
+    """
+    def __init__(self, circuit, timing, sims=8, c_caps=16, c_reuse=False, strip_forks=False):
+        super().__init__(circuit, timing, sims, c_caps, c_reuse, strip_forks)

-@numba.njit
-def rand_gauss(seed, sd):
+        self.c = cuda.to_device(self.c)
+        self.s = cuda.to_device(self.s)
+        self.ops = cuda.to_device(self.ops)
+        self.vat = cuda.to_device(self.vat)
+        self.timing = cuda.to_device(self.timing)
+        self.params = cuda.to_device(self.params)
+        
+        self._block_dim = (32, 16)
+
+    # TODO implement on GPU
+    #def s_to_c(self):
+
+    def _grid_dim(self, x, y):
+        gx = math.ceil(x / self._block_dim[0])
+        gy = math.ceil(y / self._block_dim[1])
+        return gx, gy
+    
+    def c_prop(self, sims=None, sd=0.0, seed=1):
+        sims = min(sims or self.sims, self.sims)
+        for op_start, op_stop in zip(self.level_starts, self.level_stops):
+            grid_dim = self._grid_dim(sims, op_stop - op_start)
+            wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.vat, int(0),
+                sims, self.timing, self.params, sd, seed)
+        cuda.synchronize()
+    
+    # TODO implement on GPU
+    #def c_to_s(self):
+    
+    # TODO implement on GPU
+    #def s_ppo_to_ppi(self, time=0.0):
+    
+
+@cuda.jit(device=True)
+def rand_gauss_gpu(seed, sd):
    clamp = 0.5
    if sd <= 0.0:
        return 1.0
@@ -242,9 +389,24 @@ def rand_gauss(seed, sd):
    return x + 1.0


-@numba.njit
-def wave_eval(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
-    lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
+@cuda.jit()
+def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_times, param, sd, seed):
+    x, y = cuda.grid(2)
+    st_idx = st_start + x
+    op_idx = op_start + y
+    if st_idx >= st_stop: return
+    if op_idx >= op_stop: return
+
+    lut = ops[op_idx, 0]
+    z_idx = ops[op_idx, 1]
+    a_idx = ops[op_idx, 2]
+    b_idx = ops[op_idx, 3]
+    c_idx = ops[op_idx, 4]
+    d_idx = ops[op_idx, 5]
+
+    param = param[st_idx]
+    
+    # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-call) >>>
    overflows = int(0)

    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
@@ -263,13 +425,13 @@ def wave_eval(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
    if z_cur == 1:
        cbuf[z_mem, st_idx] = TMIN

-    a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss(_seed ^ a_mem ^ z_cur, sd) * param[0]
+    a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ a_mem ^ z_cur, sd) * param[0]
    if int(param[1]) == a_idx: a += param[2+z_cur]
-    b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss(_seed ^ b_mem ^ z_cur, sd) * param[0]
+    b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ b_mem ^ z_cur, sd) * param[0]
    if int(param[1]) == b_idx: b += param[2+z_cur]
-    c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss(_seed ^ c_mem ^ z_cur, sd) * param[0]
+    c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ c_mem ^ z_cur, sd) * param[0]
    if int(param[1]) == c_idx: c += param[2+z_cur]
-    d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss(_seed ^ d_mem ^ z_cur, sd) * param[0]
+    d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ d_mem ^ z_cur, sd) * param[0]
    if int(param[1]) == d_idx: d += param[2+z_cur]
    
    previous_t = TMIN
@@ -282,8 +444,8 @@ def wave_eval(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
        if a == current_t:
            a_cur += 1
            a = cbuf[a_mem + a_cur, st_idx]
-            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[a_idx, 1, z_val] * rand_gauss(_seed ^ a_mem ^ z_val, sd) * param[0]
+            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[a_idx, 1, z_val] * rand_gauss_gpu(_seed ^ a_mem ^ z_val, sd) * param[0]
            if int(param[1]) == a_idx:
                a += param[2+(z_val^1)]
                thresh += param[2+z_val]
@@ -293,8 +455,8 @@ def wave_eval(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
        elif b == current_t:
            b_cur += 1
            b = cbuf[b_mem + b_cur, st_idx]
-            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[b_idx, 1, z_val] * rand_gauss(_seed ^ b_mem ^ z_val, sd) * param[0]
+            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[b_idx, 1, z_val] * rand_gauss_gpu(_seed ^ b_mem ^ z_val, sd) * param[0]
            if int(param[1]) == b_idx:
                b += param[2+(z_val^1)]
                thresh += param[2+z_val]
@@ -304,8 +466,8 @@ def wave_eval(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
        elif c == current_t:
            c_cur += 1
            c = cbuf[c_mem + c_cur, st_idx]
-            c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[c_idx, 1, z_val] * rand_gauss(_seed ^ c_mem ^ z_val, sd) * param[0]
+            c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[c_idx, 1, z_val] * rand_gauss_gpu(_seed ^ c_mem ^ z_val, sd) * param[0]
            if int(param[1]) == c_idx:
                c += param[2+(z_val^1)]
                thresh += param[2+z_val]
@@ -315,19 +477,13 @@ def wave_eval(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
        else:
            d_cur += 1
            d = cbuf[d_mem + d_cur, st_idx]
-            d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0]
-            thresh = line_times[d_idx, 1, z_val] * rand_gauss(_seed ^ d_mem ^ z_val, sd) * param[0]
+            d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0]
+            thresh = line_times[d_idx, 1, z_val] * rand_gauss_gpu(_seed ^ d_mem ^ z_val, sd) * param[0]
            if int(param[1]) == d_idx:
                d += param[2+(z_val^1)]
                thresh += param[2+z_val]
            inputs ^= 8
            next_t = d 
-        #print("previous_t",previous_t)
-        #print("current_t",current_t) 
-        #print(current_t - previous_t)
-        #print(thresh)
-        #print(z_cur & 1)
-        #print((lut >> inputs) & 1)
       
        if (z_cur & 1) != ((lut >> inputs) & 1):
            # we generate a toggle in z_mem, if:
@@ -335,12 +491,8 @@ def wave_eval(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
            #   following toggle is earlier OR
            #   pulse is wide enough ) AND enough space in z_mem.
            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
-                #print(current_t - previous_t)
-                #print(thresh)
-                #print(z_cap)
                if z_cur < (z_cap - 1):
                    cbuf[z_mem + z_cur, st_idx] = current_t
-                    #print(cbuf[z_mem + z_cur, st_idx])
                    previous_t = current_t
                    z_cur += 1
                else:
@@ -348,18 +500,10 @@ def wave_eval(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0):
                    previous_t = cbuf[z_mem + z_cur - 1, st_idx]
                    z_cur -= 1
            else:
-                #print(a)
                z_cur -= 1
-                if z_cur > 0:
-                    previous_t = cbuf[z_mem + z_cur - 1, st_idx]
-                else:
-                    previous_t = TMIN
+                previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
                
        current_t = min(a, b, c, d)

-    if overflows > 0:
-        cbuf[z_mem + z_cur, st_idx] = TMAX_OVL
-    else:
-        cbuf[z_mem + z_cur, st_idx] = a if a == max(a, b, c, d) else b if b == max(a, b, c, d) else c if c == max(a, b, c, d) else d   # propagate overflow flags by storing biggest TMAX from input
-
-    return overflows
+    # generate overflow flag or propagate from input
+    cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,8 +1,18 @@
 import pytest


-@pytest.fixture
+@pytest.fixture(scope='session')
 def mydir():
    import os
    from pathlib import Path
    return Path(os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))))
+
+@pytest.fixture(scope='session')
+def b14_circuit(mydir):
+    from kyupy import verilog
+    return verilog.load(mydir / 'b14.v.gz', branchforks=True)
+
+@pytest.fixture(scope='session')
+def b14_timing(mydir, b14_circuit):
+    from kyupy import sdf
+    return sdf.load(mydir / 'b14.sdf.gz').annotation(b14_circuit)
--- a/tests/test_wave_sim4.py
+++ b/tests/test_wave_sim4.py
@@ -1,6 +1,6 @@
 import numpy as np

-from kyupy.wave_sim4 import WaveSim, wave_eval, TMIN, TMAX
+from kyupy.wave_sim4 import WaveSim, WaveSimCuda, wave_eval_cpu, TMIN, TMAX
 from kyupy.logic_sim import LogicSim
 from kyupy import verilog, sdf, logic, bench
 from kyupy.logic import MVArray, BPArray
@@ -32,7 +32,7 @@ def test_nand_delays():

    def wave_assert(inputs, output):
        for i, a in zip(inputs, c.reshape(-1,16)): a[:len(i)] = i
-        wave_eval(op, c, vat, 0, line_times, sdata)
+        wave_eval_cpu(op, c, vat, 0, line_times, sdata)
        for i, v in enumerate(output): np.testing.assert_allclose(c.reshape(-1,16)[4,i], v)

    wave_assert([[TMAX,TMAX],[TMAX,TMAX],[TMIN,TMAX],[TMIN,TMAX]], [TMIN,TMAX]) # NAND(0,0,1,1) => 1
@@ -53,7 +53,6 @@ def test_tiny_circuit():
    lt = np.zeros((len(c.lines), 2, 2))
    lt[:,0,:] = 1.0  # unit delay for all lines
    wsim = WaveSim(c, lt)
-    print(wsim.prim_counts)
    assert len(wsim.s) == 5
    
    # values for x
@@ -157,18 +156,11 @@ def compare_to_logic_sim(wsim: WaveSim):
        assert res_str == exp_str


-def test_b14(mydir):
-    c = verilog.load(mydir / 'b14.v.gz', branchforks=True)
-    df = sdf.load(mydir / 'b14.sdf.gz')
-    lt = df.annotation(c)
-    wsim = WaveSim(c, lt, 8)
-    compare_to_logic_sim(wsim)
+def test_b14(b14_circuit, b14_timing):
+    compare_to_logic_sim(WaveSim(b14_circuit, b14_timing, 8))

+def test_b14_strip_forks(b14_circuit, b14_timing):
+    compare_to_logic_sim(WaveSim(b14_circuit, b14_timing, 8, strip_forks=True))

-def test_b14_strip_forks(mydir):
-    c = verilog.load(mydir / 'b14.v.gz', branchforks=True)
-    df = sdf.load(mydir / 'b14.sdf.gz')
-    lt = df.annotation(c)
-    wsim = WaveSim(c, lt, 8, strip_forks=True)
-    compare_to_logic_sim(wsim)
-
+def test_b14_cuda(b14_circuit, b14_timing):
+    compare_to_logic_sim(WaveSimCuda(b14_circuit, b14_timing, 8, strip_forks=True))