|
|
@ -47,8 +47,8 @@ class WaveSim(sim.SimOps): |
|
|
|
:param c_reuse: If enabled, memory of intermediate signal waveforms will be re-used. This greatly reduces |
|
|
|
:param c_reuse: If enabled, memory of intermediate signal waveforms will be re-used. This greatly reduces |
|
|
|
memory footprint, but intermediate signal waveforms become inaccessible after a propagation. |
|
|
|
memory footprint, but intermediate signal waveforms become inaccessible after a propagation. |
|
|
|
""" |
|
|
|
""" |
|
|
|
def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False): |
|
|
|
def __init__(self, circuit, delays, sims=8, c_caps=16, a_ctrl=None, c_reuse=False, strip_forks=False): |
|
|
|
super().__init__(circuit, c_caps=c_caps, c_caps_min=4, c_reuse=c_reuse, strip_forks=strip_forks) |
|
|
|
super().__init__(circuit, c_caps=c_caps, c_caps_min=4, a_ctrl=a_ctrl, c_reuse=c_reuse, strip_forks=strip_forks) |
|
|
|
self.sims = sims |
|
|
|
self.sims = sims |
|
|
|
if delays.ndim == 3: delays = np.expand_dims(delays, axis=0) |
|
|
|
if delays.ndim == 3: delays = np.expand_dims(delays, axis=0) |
|
|
|
self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype) |
|
|
|
self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype) |
|
|
@ -78,6 +78,9 @@ class WaveSim(sim.SimOps): |
|
|
|
final values in the waveforms are still valid. |
|
|
|
final values in the waveforms are still valid. |
|
|
|
""" |
|
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self.abuf_len = self.ops[:,6].max() + 1 |
|
|
|
|
|
|
|
self.abuf = np.zeros((self.abuf_len, sims), dtype=np.int32) if self.abuf_len > 0 else np.zeros((1, 1), dtype=np.int32) |
|
|
|
|
|
|
|
|
|
|
|
self.simctl_int = np.zeros((2, sims), dtype=np.int32) |
|
|
|
self.simctl_int = np.zeros((2, sims), dtype=np.int32) |
|
|
|
"""Per-simulation delay configuration. |
|
|
|
"""Per-simulation delay configuration. |
|
|
|
|
|
|
|
|
|
|
@ -113,7 +116,7 @@ class WaveSim(sim.SimOps): |
|
|
|
""" |
|
|
|
""" |
|
|
|
sims = min(sims or self.sims, self.sims) |
|
|
|
sims = min(sims or self.sims, self.sims) |
|
|
|
for op_start, op_stop in zip(self.level_starts, self.level_stops): |
|
|
|
for op_start, op_stop in zip(self.level_starts, self.level_stops): |
|
|
|
level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, 0, sims, self.delays, self.simctl_int, seed) |
|
|
|
level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.abuf, 0, sims, self.delays, self.simctl_int, seed) |
|
|
|
|
|
|
|
|
|
|
|
def c_to_s(self, time=TMAX, sd=0.0, seed=1): |
|
|
|
def c_to_s(self, time=TMAX, sd=0.0, seed=1): |
|
|
|
"""Simulates a capture operation at all sequential elements and primary outputs. |
|
|
|
"""Simulates a capture operation at all sequential elements and primary outputs. |
|
|
@ -141,9 +144,16 @@ class WaveSim(sim.SimOps): |
|
|
|
self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs] |
|
|
|
self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _wave_eval(lut, z_idx, a_idx, b_idx, c_idx, d_idx, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0): |
|
|
|
def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0): |
|
|
|
overflows = int(0) |
|
|
|
overflows = int(0) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
lut = op[0] |
|
|
|
|
|
|
|
z_idx = op[1] |
|
|
|
|
|
|
|
a_idx = op[2] |
|
|
|
|
|
|
|
b_idx = op[3] |
|
|
|
|
|
|
|
c_idx = op[4] |
|
|
|
|
|
|
|
d_idx = op[5] |
|
|
|
|
|
|
|
|
|
|
|
if len(delays) > 1: |
|
|
|
if len(delays) > 1: |
|
|
|
if simctl_int[1] == 0: |
|
|
|
if simctl_int[1] == 0: |
|
|
|
delays = delays[seed] |
|
|
|
delays = delays[seed] |
|
|
@ -240,22 +250,26 @@ def _wave_eval(lut, z_idx, a_idx, b_idx, c_idx, d_idx, cbuf, c_locs, c_caps, sim |
|
|
|
# generate or propagate overflow flag |
|
|
|
# generate or propagate overflow flag |
|
|
|
cbuf[z_mem + z_cur, sim] = TMAX_OVL if overflows > 0 else max(a, b, c, d) |
|
|
|
cbuf[z_mem + z_cur, sim] = TMAX_OVL if overflows > 0 else max(a, b, c, d) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
nrise = max(0, (z_cur+1) // 2 - (cbuf[z_mem, sim] == TMIN)) |
|
|
|
|
|
|
|
nfall = z_cur // 2 |
|
|
|
|
|
|
|
|
|
|
|
_wave_eval_cpu = numba.njit(_wave_eval) |
|
|
|
return nrise, nfall |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@numba.njit |
|
|
|
wave_eval_cpu = numba.njit(_wave_eval) |
|
|
|
def wave_eval_cpu(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0): |
|
|
|
|
|
|
|
lut, z_idx, a_idx, b_idx, c_idx, d_idx = op |
|
|
|
|
|
|
|
_wave_eval_cpu(lut, z_idx, a_idx, b_idx, c_idx, d_idx, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@numba.njit |
|
|
|
@numba.njit |
|
|
|
def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, sim_start, sim_stop, delays, simctl_int, seed): |
|
|
|
def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, abuf, sim_start, sim_stop, delays, simctl_int, seed): |
|
|
|
for op_idx in range(op_start, op_stop): |
|
|
|
for op_idx in range(op_start, op_stop): |
|
|
|
op = ops[op_idx] |
|
|
|
op = ops[op_idx] |
|
|
|
for sim in range(sim_start, sim_stop): |
|
|
|
for sim in range(sim_start, sim_stop): |
|
|
|
wave_eval_cpu(op, c, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed) |
|
|
|
nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed) |
|
|
|
|
|
|
|
a_loc = op[6] |
|
|
|
|
|
|
|
a_wr = op[7] |
|
|
|
|
|
|
|
a_wf = op[8] |
|
|
|
|
|
|
|
if a_loc >= 0: |
|
|
|
|
|
|
|
abuf[a_loc, sim] += nrise*a_wr + nfall*a_wf |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@numba.njit |
|
|
|
@numba.njit |
|
|
@ -311,8 +325,8 @@ class WaveSimCuda(WaveSim): |
|
|
|
All internal memories are mirrored into GPU memory upon construction. |
|
|
|
All internal memories are mirrored into GPU memory upon construction. |
|
|
|
Some operations like access to single waveforms can involve large communication overheads. |
|
|
|
Some operations like access to single waveforms can involve large communication overheads. |
|
|
|
""" |
|
|
|
""" |
|
|
|
def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False): |
|
|
|
def __init__(self, circuit, delays, sims=8, c_caps=16, a_ctrl=None, c_reuse=False, strip_forks=False): |
|
|
|
super().__init__(circuit, delays, sims, c_caps, c_reuse, strip_forks) |
|
|
|
super().__init__(circuit, delays, sims, c_caps, a_ctrl=a_ctrl, c_reuse=c_reuse, strip_forks=strip_forks) |
|
|
|
|
|
|
|
|
|
|
|
self.c = cuda.to_device(self.c) |
|
|
|
self.c = cuda.to_device(self.c) |
|
|
|
self.s = cuda.to_device(self.s) |
|
|
|
self.s = cuda.to_device(self.s) |
|
|
@ -321,6 +335,7 @@ class WaveSimCuda(WaveSim): |
|
|
|
self.c_caps = cuda.to_device(self.c_caps) |
|
|
|
self.c_caps = cuda.to_device(self.c_caps) |
|
|
|
self.delays = cuda.to_device(self.delays) |
|
|
|
self.delays = cuda.to_device(self.delays) |
|
|
|
self.simctl_int = cuda.to_device(self.simctl_int) |
|
|
|
self.simctl_int = cuda.to_device(self.simctl_int) |
|
|
|
|
|
|
|
self.abuf = cuda.to_device(self.abuf) |
|
|
|
|
|
|
|
|
|
|
|
self._block_dim = (32, 16) |
|
|
|
self._block_dim = (32, 16) |
|
|
|
|
|
|
|
|
|
|
@ -333,6 +348,7 @@ class WaveSimCuda(WaveSim): |
|
|
|
state['c_caps'] = np.array(self.c_caps) |
|
|
|
state['c_caps'] = np.array(self.c_caps) |
|
|
|
state['delays'] = np.array(self.delays) |
|
|
|
state['delays'] = np.array(self.delays) |
|
|
|
state['simctl_int'] = np.array(self.simctl_int) |
|
|
|
state['simctl_int'] = np.array(self.simctl_int) |
|
|
|
|
|
|
|
state['abuf'] = np.array(self.abuf) |
|
|
|
return state |
|
|
|
return state |
|
|
|
|
|
|
|
|
|
|
|
def __setstate__(self, state): |
|
|
|
def __setstate__(self, state): |
|
|
@ -344,6 +360,7 @@ class WaveSimCuda(WaveSim): |
|
|
|
self.c_caps = cuda.to_device(self.c_caps) |
|
|
|
self.c_caps = cuda.to_device(self.c_caps) |
|
|
|
self.delays = cuda.to_device(self.delays) |
|
|
|
self.delays = cuda.to_device(self.delays) |
|
|
|
self.simctl_int = cuda.to_device(self.simctl_int) |
|
|
|
self.simctl_int = cuda.to_device(self.simctl_int) |
|
|
|
|
|
|
|
self.abuf = cuda.to_device(self.abuf) |
|
|
|
|
|
|
|
|
|
|
|
def s_to_c(self): |
|
|
|
def s_to_c(self): |
|
|
|
grid_dim = self._grid_dim(self.sims, self.s_len) |
|
|
|
grid_dim = self._grid_dim(self.sims, self.s_len) |
|
|
@ -355,7 +372,7 @@ class WaveSimCuda(WaveSim): |
|
|
|
sims = min(sims or self.sims, self.sims) |
|
|
|
sims = min(sims or self.sims, self.sims) |
|
|
|
for op_start, op_stop in zip(self.level_starts, self.level_stops): |
|
|
|
for op_start, op_stop in zip(self.level_starts, self.level_stops): |
|
|
|
grid_dim = self._grid_dim(sims, op_stop - op_start) |
|
|
|
grid_dim = self._grid_dim(sims, op_stop - op_start) |
|
|
|
wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, int(0), |
|
|
|
wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.abuf, int(0), |
|
|
|
sims, self.delays, self.simctl_int, seed) |
|
|
|
sims, self.delays, self.simctl_int, seed) |
|
|
|
cuda.synchronize() |
|
|
|
cuda.synchronize() |
|
|
|
|
|
|
|
|
|
|
@ -397,21 +414,24 @@ _wave_eval_gpu = cuda.jit(_wave_eval, device=True) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cuda.jit() |
|
|
|
@cuda.jit() |
|
|
|
def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, sim_start, sim_stop, delays, simctl_int, seed): |
|
|
|
def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, abuf, sim_start, sim_stop, delays, simctl_int, seed): |
|
|
|
x, y = cuda.grid(2) |
|
|
|
x, y = cuda.grid(2) |
|
|
|
sim = sim_start + x |
|
|
|
sim = sim_start + x |
|
|
|
op_idx = op_start + y |
|
|
|
op_idx = op_start + y |
|
|
|
if sim >= sim_stop: return |
|
|
|
if sim >= sim_stop: return |
|
|
|
if op_idx >= op_stop: return |
|
|
|
if op_idx >= op_stop: return |
|
|
|
|
|
|
|
|
|
|
|
lut = ops[op_idx, 0] |
|
|
|
op = ops[op_idx] |
|
|
|
z_idx = ops[op_idx, 1] |
|
|
|
a_loc = op[6] |
|
|
|
a_idx = ops[op_idx, 2] |
|
|
|
a_wr = op[7] |
|
|
|
b_idx = ops[op_idx, 3] |
|
|
|
a_wf = op[8] |
|
|
|
c_idx = ops[op_idx, 4] |
|
|
|
|
|
|
|
d_idx = ops[op_idx, 5] |
|
|
|
nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed) |
|
|
|
|
|
|
|
|
|
|
|
_wave_eval_gpu(lut, z_idx, a_idx, b_idx, c_idx, d_idx, cbuf, c_locs, c_caps, sim, delays, simctl_int[:, sim], seed) |
|
|
|
# accumulate WSA into abuf |
|
|
|
|
|
|
|
if a_loc >= 0: |
|
|
|
|
|
|
|
#abuf[a_loc, sim] += nrise*a_wr + nfall*a_wf |
|
|
|
|
|
|
|
cuda.atomic.add(abuf, (a_loc, sim), nrise*a_wr + nfall*a_wf) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cuda.jit() |
|
|
|
@cuda.jit() |
|
|
|