more on-gpu code, bump python requirement

3 years ago · 7bfc02e683
3 changed files with 109 additions and 26 deletions
--- a/setup.py
+++ b/setup.py
@ -14,7 +14,7 @@ setup(
    url='https://github.com/s-holst/kyupy',
    author='Stefan Holst',
    author_email='mail@s-holst.de',
-    python_requires='>=3.6',
+    python_requires='>=3.8',
    install_requires=[
        'numpy>=1.17.0',
        'lark-parser>=0.8.0'
@ -33,9 +33,8 @@ setup(
        'Operating System :: OS Independent',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3 :: Only',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
    ],
 )
--- a/src/kyupy/sim.py
+++ b/src/kyupy/sim.py
@ -158,6 +158,7 @@ class SimOps:
    def __init__(self, circuit, c_caps=1, c_reuse=False, strip_forks=False):
        self.circuit = circuit
        self.s_nodes = list(circuit.io_nodes) + [n for n in circuit.nodes if 'dff' in n.kind.lower()]
        self.s_len = len(self.s_nodes)
        keep_signals = not c_reuse
        if isinstance(c_caps, int):
--- a/src/kyupy/wave_sim.py
+++ b/src/kyupy/wave_sim.py
@ -353,17 +353,9 @@ class WaveSimCuda(WaveSim):
        self._block_dim = (32, 16)
    # TODO implement on GPU
    def s_to_c(self):
-        s = np.array(self.s)
+        grid_dim = self._grid_dim(self.sims, self.s_len)
-        c = np.array(self.c)
+        wave_assign_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppi_offset)
        sins = np.moveaxis(s[self.pippi_s_locs], -1, 0)
        cond = (sins[2] != 0) + 2*(sins[0] != 0)  # choices order: 0 R F 1
        c[self.pippi_c_locs] = np.choose(cond, [TMAX, sins[1], TMIN, TMIN])
        c[self.pippi_c_locs+1] = np.choose(cond, [TMAX, TMAX, sins[1], TMAX])
        c[self.pippi_c_locs+2] = TMAX
        self.s[:,:,:] = s
        self.c[:,:] = c
    def _grid_dim(self, x, y):
        gx = math.ceil(x / self._block_dim[0])
@ -378,19 +370,38 @@ class WaveSimCuda(WaveSim):
                sims, self.timing, self.params, sd, seed)
        cuda.synchronize()
    # TODO implement on GPU
    def c_to_s(self, time=TMAX, sd=0.0, seed=1):
-        s = np.array(self.s)
+        grid_dim = self._grid_dim(self.sims, self.s_len)
-        c = np.array(self.c)
+        wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppo_offset,
-        vat = np.array(self.vat)
+            time, sd * math.sqrt(2), seed)
-        for s_loc, (c_loc, c_len, _) in zip(self.poppo_s_locs, vat[self.ppo_offset+self.poppo_s_locs]):
+    
-            for vector in range(self.sims):
+    def s_ppo_to_ppi(self, time=0.0):
-                s[s_loc, vector, 3:] = wave_capture_cpu(c, c_loc, c_len, vector, time=time, sd=sd, seed=seed)
+        grid_dim = self._grid_dim(self.sims, self.s_len)
-        self.s[:,:,:] = s
+        ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.vat, time, self.ppi_offset, self.ppo_offset)
        self.c[:,:] = c
-    # TODO implement on GPU
+
-    #def s_ppo_to_ppi(self, time=0.0):
+@cuda.jit()
 def wave_assign_gpu(c, s, vat, ppi_offset):
    x, y = cuda.grid(2)
    if y >= len(s): return
    c_loc, c_len, _ = vat[ppi_offset + y]
    if c_loc < 0: return
    if x >= c.shape[-1]: return
    value = int(s[y, x, 2] >= 0.5) | (2*int(s[y, x, 0] >= 0.5))
    ttime = s[y, x, 1]
    if value == 0:
        c[c_loc, x] = TMAX
        c[c_loc+1, x] = TMAX
    elif value == 1:
        c[c_loc, x] = ttime
        c[c_loc+1, x] = TMAX
    elif value == 2:
        c[c_loc, x] = TMIN
        c[c_loc+1, x] = ttime
    else:
        c[c_loc, x] = TMIN
        c[c_loc+1, x] = TMAX
    c[c_loc+2, x] = TMAX
@cuda.jit(device=True)
@ -527,3 +538,75 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim
    # generate overflow flag or propagate from input
    cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
@cuda.jit()
 def wave_capture_gpu(c, s, vat, ppo_offset, time, s_sqrt2, seed):
    x, y = cuda.grid(2)
    if ppo_offset + y >= len(vat): return
    line, tdim, _ = vat[ppo_offset + y]
    if line < 0: return
    if x >= c.shape[-1]: return
    vector = x
    m = 0.5
    acc = 0.0
    eat = TMAX
    lst = TMIN
    tog = 0
    ovl = 0
    val = int(0)
    final = int(0)
    for tidx in range(tdim):
        t = c[line + tidx, vector]
        if t >= TMAX:
            if t == TMAX_OVL:
                ovl = 1
            break
        m = -m
        final ^= 1
        if t < time:
            val ^= 1
        if t <= TMIN: continue
        if s_sqrt2 > 0:
            acc += m * (1 + math.erf((t - time) / s_sqrt2))
        eat = min(eat, t)
        lst = max(lst, t)
        tog += 1
    if s_sqrt2 > 0:
        if m < 0:
            acc += 1
        if acc >= 0.99:
            val = 1
        elif acc > 0.01:
            seed = (seed << 4) + (vector << 20) + (y << 1)
            seed = int(0xDEECE66D) * seed + 0xB
            seed = int(0xDEECE66D) * seed + 0xB
            rnd = float((seed >> 8) & 0xffffff) / float(1 << 24)
            val = rnd < acc
        else:
            val = 0
    else:
        acc = val
    s[y, vector, 3] = (c[line, vector] <= TMIN)
    s[y, vector, 4] = eat
    s[y, vector, 5] = lst
    s[y, vector, 6] = final
    s[y, vector, 7] = acc
    s[y, vector, 8] = val
    s[y, vector, 9] = 0  # TODO
    s[y, vector, 10] = ovl
@cuda.jit()
 def ppo_to_ppi_gpu(s, vat, time, ppi_offset, ppo_offset):
    x, y = cuda.grid(2)
    if y >= s.shape[0]: return
    if x >= s.shape[1]: return
    if vat[ppi_offset + y, 0] < 0: return
    if vat[ppo_offset + y, 0] < 0: return
    s[y, x, 0] = s[y, x, 2]
    s[y, x, 1] = time
    s[y, x, 2] = s[y, x, 8]