From 7bfc02e68332898086376071e75fd84f321aeccf Mon Sep 17 00:00:00 2001 From: Stefan Holst Date: Sun, 26 Feb 2023 22:37:15 +0900 Subject: [PATCH] more on-gpu code, bump python requirement --- setup.py | 5 +- src/kyupy/sim.py | 1 + src/kyupy/wave_sim.py | 129 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 109 insertions(+), 26 deletions(-) diff --git a/setup.py b/setup.py index 0f443ff..47ef1f7 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ setup( url='https://github.com/s-holst/kyupy', author='Stefan Holst', author_email='mail@s-holst.de', - python_requires='>=3.6', + python_requires='>=3.8', install_requires=[ 'numpy>=1.17.0', 'lark-parser>=0.8.0' @@ -33,9 +33,8 @@ setup( 'Operating System :: OS Independent', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3 :: Only', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', ], ) diff --git a/src/kyupy/sim.py b/src/kyupy/sim.py index d45388e..ef1b2e5 100644 --- a/src/kyupy/sim.py +++ b/src/kyupy/sim.py @@ -158,6 +158,7 @@ class SimOps: def __init__(self, circuit, c_caps=1, c_reuse=False, strip_forks=False): self.circuit = circuit self.s_nodes = list(circuit.io_nodes) + [n for n in circuit.nodes if 'dff' in n.kind.lower()] + self.s_len = len(self.s_nodes) keep_signals = not c_reuse if isinstance(c_caps, int): diff --git a/src/kyupy/wave_sim.py b/src/kyupy/wave_sim.py index d3895d1..7605b6c 100644 --- a/src/kyupy/wave_sim.py +++ b/src/kyupy/wave_sim.py @@ -353,17 +353,9 @@ class WaveSimCuda(WaveSim): self._block_dim = (32, 16) - # TODO implement on GPU def s_to_c(self): - s = np.array(self.s) - c = np.array(self.c) - sins = np.moveaxis(s[self.pippi_s_locs], -1, 0) - cond = (sins[2] != 0) + 2*(sins[0] != 0) # choices order: 0 R F 1 - c[self.pippi_c_locs] = np.choose(cond, [TMAX, sins[1], TMIN, TMIN]) - c[self.pippi_c_locs+1] = np.choose(cond, [TMAX, TMAX, sins[1], TMAX]) - c[self.pippi_c_locs+2] = TMAX - self.s[:,:,:] = s - self.c[:,:] = c + grid_dim = self._grid_dim(self.sims, self.s_len) + wave_assign_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppi_offset) def _grid_dim(self, x, y): gx = math.ceil(x / self._block_dim[0]) @@ -378,20 +370,39 @@ class WaveSimCuda(WaveSim): sims, self.timing, self.params, sd, seed) cuda.synchronize() - # TODO implement on GPU def c_to_s(self, time=TMAX, sd=0.0, seed=1): - s = np.array(self.s) - c = np.array(self.c) - vat = np.array(self.vat) - for s_loc, (c_loc, c_len, _) in zip(self.poppo_s_locs, vat[self.ppo_offset+self.poppo_s_locs]): - for vector in range(self.sims): - s[s_loc, vector, 3:] = wave_capture_cpu(c, c_loc, c_len, vector, time=time, sd=sd, seed=seed) - self.s[:,:,:] = s - self.c[:,:] = c - - # TODO implement on GPU - #def s_ppo_to_ppi(self, time=0.0): + grid_dim = self._grid_dim(self.sims, self.s_len) + wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppo_offset, + time, sd * math.sqrt(2), seed) + def s_ppo_to_ppi(self, time=0.0): + grid_dim = self._grid_dim(self.sims, self.s_len) + ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.vat, time, self.ppi_offset, self.ppo_offset) + + +@cuda.jit() +def wave_assign_gpu(c, s, vat, ppi_offset): + x, y = cuda.grid(2) + if y >= len(s): return + c_loc, c_len, _ = vat[ppi_offset + y] + if c_loc < 0: return + if x >= c.shape[-1]: return + value = int(s[y, x, 2] >= 0.5) | (2*int(s[y, x, 0] >= 0.5)) + ttime = s[y, x, 1] + if value == 0: + c[c_loc, x] = TMAX + c[c_loc+1, x] = TMAX + elif value == 1: + c[c_loc, x] = ttime + c[c_loc+1, x] = TMAX + elif value == 2: + c[c_loc, x] = TMIN + c[c_loc+1, x] = ttime + else: + c[c_loc, x] = TMIN + c[c_loc+1, x] = TMAX + c[c_loc+2, x] = TMAX + @cuda.jit(device=True) def rand_gauss_gpu(seed, sd): @@ -526,4 +537,76 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim current_t = min(a, b, c, d) # generate overflow flag or propagate from input - cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d) \ No newline at end of file + cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d) + + +@cuda.jit() +def wave_capture_gpu(c, s, vat, ppo_offset, time, s_sqrt2, seed): + x, y = cuda.grid(2) + if ppo_offset + y >= len(vat): return + line, tdim, _ = vat[ppo_offset + y] + if line < 0: return + if x >= c.shape[-1]: return + vector = x + m = 0.5 + acc = 0.0 + eat = TMAX + lst = TMIN + tog = 0 + ovl = 0 + val = int(0) + final = int(0) + for tidx in range(tdim): + t = c[line + tidx, vector] + if t >= TMAX: + if t == TMAX_OVL: + ovl = 1 + break + m = -m + final ^= 1 + if t < time: + val ^= 1 + if t <= TMIN: continue + if s_sqrt2 > 0: + acc += m * (1 + math.erf((t - time) / s_sqrt2)) + eat = min(eat, t) + lst = max(lst, t) + tog += 1 + if s_sqrt2 > 0: + if m < 0: + acc += 1 + if acc >= 0.99: + val = 1 + elif acc > 0.01: + seed = (seed << 4) + (vector << 20) + (y << 1) + seed = int(0xDEECE66D) * seed + 0xB + seed = int(0xDEECE66D) * seed + 0xB + rnd = float((seed >> 8) & 0xffffff) / float(1 << 24) + val = rnd < acc + else: + val = 0 + else: + acc = val + + s[y, vector, 3] = (c[line, vector] <= TMIN) + s[y, vector, 4] = eat + s[y, vector, 5] = lst + s[y, vector, 6] = final + s[y, vector, 7] = acc + s[y, vector, 8] = val + s[y, vector, 9] = 0 # TODO + s[y, vector, 10] = ovl + + +@cuda.jit() +def ppo_to_ppi_gpu(s, vat, time, ppi_offset, ppo_offset): + x, y = cuda.grid(2) + if y >= s.shape[0]: return + if x >= s.shape[1]: return + + if vat[ppi_offset + y, 0] < 0: return + if vat[ppo_offset + y, 0] < 0: return + + s[y, x, 0] = s[y, x, 2] + s[y, x, 1] = time + s[y, x, 2] = s[y, x, 8]