From 1af346c97a08c9992691322fa0b3697f2cce70b7 Mon Sep 17 00:00:00 2001 From: Stefan Holst Date: Mon, 23 Nov 2020 15:11:42 +0900 Subject: [PATCH] overflow notification and wavecap statistics on GPU --- kyupy/wave_sim.py | 35 ++++++++++++++++---------- kyupy/wave_sim_cuda.py | 57 ++++++++++++++++++++++++++++++++++-------- tests/test_wave_sim.py | 8 +++--- 3 files changed, 72 insertions(+), 28 deletions(-) diff --git a/kyupy/wave_sim.py b/kyupy/wave_sim.py index 153d431..fa8d585 100644 --- a/kyupy/wave_sim.py +++ b/kyupy/wave_sim.py @@ -4,6 +4,7 @@ from . import numba TMAX = np.float32(2 ** 127) # almost np.PINF for 32-bit floating point values +TMAX_OVL = np.float32(1.1 * 2 ** 127) # almost np.PINF with overflow mark TMIN = np.float32(-2 ** 127) # almost np.NINF for 32-bit floating point values @@ -16,7 +17,7 @@ class WaveSim: self.lst_eat_valid = False - self.cdata = np.zeros((len(self.interface), sims, 6), dtype='float32') + self.cdata = np.zeros((len(self.interface), sims, 7), dtype='float32') if type(wavecaps) is int: wavecaps = [wavecaps] * len(circuit.lines) @@ -25,25 +26,25 @@ class WaveSim: # state allocation table. maps line and interface indices to self.state memory locations - self.sat = np.zeros((len(circuit.lines) + 2 + 2 * len(self.interface), 2), dtype='int') + self.sat = np.zeros((len(circuit.lines) + 2 + 2 * len(self.interface), 3), dtype='int') self.sat[:, 0] = -1 filled = 0 for lidx, cap in enumerate(wavecaps): - self.sat[lidx] = filled, cap + self.sat[lidx] = filled, cap, 0 filled += cap self.zero_idx = len(circuit.lines) - self.sat[self.zero_idx] = filled, intf_wavecap + self.sat[self.zero_idx] = filled, intf_wavecap, 0 filled += intf_wavecap self.tmp_idx = self.zero_idx + 1 - self.sat[self.tmp_idx] = filled, intf_wavecap + self.sat[self.tmp_idx] = filled, intf_wavecap, 0 filled += intf_wavecap self.ppi_offset = self.tmp_idx + 1 self.ppo_offset = self.ppi_offset + len(self.interface) for i, n in enumerate(self.interface): if len(n.outs) > 0: - self.sat[self.ppi_offset + i] = filled, intf_wavecap + self.sat[self.ppi_offset + i] = filled, intf_wavecap, 0 filled += intf_wavecap if len(n.ins) > 0: self.sat[self.ppo_offset + i] = self.sat[n.ins[0].index] @@ -161,7 +162,7 @@ class WaveSim: def wave(self, line, vector): if line < 0: return [TMAX] - mem, wcap = self.sat[line] + mem, wcap, _ = self.sat[line] if mem < 0: return [TMAX] return self.state[mem:mem + wcap, vector] @@ -186,8 +187,8 @@ class WaveSim: def reassign(self, time=0.0): for i, node in enumerate(self.interface): - ppi_loc = self.sat[self.ppi_offset + i] - ppo_loc = self.sat[self.ppo_offset + i] + ppi_loc = self.sat[self.ppi_offset + i, 0] + ppo_loc = self.sat[self.ppo_offset + i, 0] if ppi_loc < 0 or ppo_loc < 0: continue for sidx in range(self.sims): ival = self.val(self.ppi_offset + i, sidx, TMAX) > 0.5 @@ -274,10 +275,14 @@ class WaveSim: eat = TMAX lst = TMIN tog = 0 + ovl = 0 val = int(0) final = int(0) for t in self.wave(line, vector): - if t >= TMAX: break + if t >= TMAX: + if t == TMAX_OVL: + ovl = 1 + break m = -m final ^= 1 if t < time: @@ -304,7 +309,7 @@ class WaveSim: else: acc = val - return acc, val, final, (val != final), eat, lst + return acc, val, final, (val != final), eat, lst, ovl @numba.njit @@ -342,7 +347,7 @@ def wave_eval(op, state, sat, st_idx, line_times, sd=0.0, seed=0): a_mem = sat[a_idx, 0] b_mem = sat[b_idx, 0] - z_mem, z_cap = sat[z_idx] + z_mem, z_cap, _ = sat[z_idx] a_cur = int(0) b_cur = int(0) @@ -397,5 +402,9 @@ def wave_eval(op, state, sat, st_idx, line_times, sd=0.0, seed=0): previous_t = TMIN current_t = min(a, b) - state[z_mem + z_cur, st_idx] = TMAX + if overflows > 0: + state[z_mem + z_cur, st_idx] = TMAX_OVL + else: + state[z_mem + z_cur, st_idx] = a if a > b else b # propagate overflow flags by storing biggest TMAX from input + return overflows diff --git a/kyupy/wave_sim_cuda.py b/kyupy/wave_sim_cuda.py index 9e92901..1b6ee80 100644 --- a/kyupy/wave_sim_cuda.py +++ b/kyupy/wave_sim_cuda.py @@ -4,6 +4,7 @@ from .wave_sim import WaveSim from . import cuda TMAX = np.float32(2 ** 127) # almost np.PINF for 32-bit floating point values +TMAX_OVL = np.float32(1.1 * 2 ** 127) # almost np.PINF with overflow mark TMIN = np.float32(-2 ** 127) # almost np.NINF for 32-bit floating point values @@ -64,7 +65,7 @@ class WaveSimCuda(WaveSim): def wave(self, line, vector): if line < 0: return None - mem, wcap = self.sat[line] + mem, wcap, _ = self.sat[line] if mem < 0: return None return self.d_state[mem:mem + wcap, vector] @@ -86,16 +87,41 @@ class WaveSimCuda(WaveSim): reassign_kernel[grid_dim, self._block_dim](self.d_state, self.d_sat, self.ppi_offset, self.ppo_offset, self.d_cdata, time) cuda.synchronize() + + def wavecaps(self): + gx = math.ceil(len(self.circuit.lines) / 512) + wavecaps_kernel[gx, 512](self.d_state, self.d_sat, self.sims) + self.sat[...] = self.d_sat + return self.sat[..., 2] +@cuda.jit() +def wavecaps_kernel(state, sat, sims): + idx = cuda.grid(1) + if idx >= len(sat): return + + lidx, lcap, _ = sat[idx] + if lidx < 0: return + + wcap = 0 + for sidx in range(sims): + for tidx in range(lcap): + t = state[lidx + tidx, sidx] + if tidx > wcap: + wcap = tidx + if t >= TMAX: break + + sat[idx, 2] = wcap + 1 + + @cuda.jit() def reassign_kernel(state, sat, ppi_offset, ppo_offset, cdata, ppi_time): vector, y = cuda.grid(2) if vector >= state.shape[-1]: return if ppo_offset + y >= len(sat): return - ppo, ppo_cap = sat[ppo_offset + y] - ppi, ppi_cap = sat[ppi_offset + y] + ppo, ppo_cap, _ = sat[ppo_offset + y] + ppi, ppi_cap, _ = sat[ppi_offset + y] if ppo < 0: return if ppi < 0: return @@ -121,7 +147,7 @@ def reassign_kernel(state, sat, ppi_offset, ppo_offset, cdata, ppi_time): def capture_kernel(state, sat, ppo_offset, cdata, time, s_sqrt2, seed): x, y = cuda.grid(2) if ppo_offset + y >= len(sat): return - line, tdim = sat[ppo_offset + y] + line, tdim, _ = sat[ppo_offset + y] if line < 0: return if x >= state.shape[-1]: return vector = x @@ -130,11 +156,15 @@ def capture_kernel(state, sat, ppo_offset, cdata, time, s_sqrt2, seed): eat = TMAX lst = TMIN tog = 0 + ovl = 0 val = int(0) final = int(0) for tidx in range(tdim): t = state[line + tidx, vector] - if t >= TMAX: break + if t >= TMAX: + if t == TMAX_OVL: + ovl = 1 + break m = -m final ^= 1 if t < time: @@ -167,6 +197,7 @@ def capture_kernel(state, sat, ppo_offset, cdata, time, s_sqrt2, seed): cdata[y, vector, 3] = (val != final) cdata[y, vector, 4] = eat cdata[y, vector, 5] = lst + cdata[y, vector, 6] = ovl @cuda.jit() @@ -219,12 +250,13 @@ def wave_kernel(ops, op_start, op_stop, state, sat, st_start, st_stop, line_time z_idx = ops[op_idx, 1] a_idx = ops[op_idx, 2] b_idx = ops[op_idx, 3] + overflows = int(0) + + _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1) - z_mem, z_cap = sat[z_idx] a_mem = sat[a_idx, 0] b_mem = sat[b_idx, 0] - - _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1) + z_mem, z_cap, _ = sat[z_idx] a_cur = int(0) b_cur = int(0) @@ -268,7 +300,7 @@ def wave_kernel(ops, op_start, op_stop, state, sat, st_start, st_stop, line_time previous_t = current_t z_cur += 1 else: - # overflows += 1 + overflows += 1 previous_t = state[z_mem + z_cur - 1, st_idx] z_cur -= 1 else: @@ -278,5 +310,8 @@ def wave_kernel(ops, op_start, op_stop, state, sat, st_start, st_stop, line_time else: previous_t = TMIN current_t = min(a, b) - - state[z_mem + z_cur, st_idx] = TMAX + + if overflows > 0: + state[z_mem + z_cur, st_idx] = TMAX_OVL + else: + state[z_mem + z_cur, st_idx] = a if a > b else b # propagate overflow flags by storing biggest TMAX from input diff --git a/tests/test_wave_sim.py b/tests/test_wave_sim.py index 38560da..8723cdc 100644 --- a/tests/test_wave_sim.py +++ b/tests/test_wave_sim.py @@ -26,10 +26,10 @@ def test_wave_eval(): a = state[0:16, 0] b = state[16:32, 0] z = state[32:, 0] - sat = np.zeros((3, 2), dtype='int') - sat[0] = 0, 16 - sat[1] = 16, 16 - sat[2] = 32, 16 + sat = np.zeros((3, 3), dtype='int') + sat[0] = 0, 16, 0 + sat[1] = 16, 16, 0 + sat[2] = 32, 16, 0 wave_eval((0b0111, 2, 0, 1), state, sat, 0, line_times) assert TMIN == z[0]