|
|
|
@ -47,7 +47,7 @@ class WaveSim(sim.SimOps):
@@ -47,7 +47,7 @@ class WaveSim(sim.SimOps):
|
|
|
|
|
:param keep_waveforms: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces |
|
|
|
|
memory footprint, but intermediate signal waveforms become inaccessible after a propagation. |
|
|
|
|
""" |
|
|
|
|
def __init__(self, circuit, timing, sims=8, c_caps=16, c_reuse=False, strip_forks=False): |
|
|
|
|
def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False): |
|
|
|
|
assert c_caps > 0 and c_caps % 4 == 0 |
|
|
|
|
super().__init__(circuit, c_caps=c_caps//4, c_reuse=c_reuse, strip_forks=strip_forks) |
|
|
|
|
self.sims = sims |
|
|
|
@ -56,8 +56,8 @@ class WaveSim(sim.SimOps):
@@ -56,8 +56,8 @@ class WaveSim(sim.SimOps):
|
|
|
|
|
self.c_locs[...] *= 4 |
|
|
|
|
self.c_caps[...] *= 4 |
|
|
|
|
|
|
|
|
|
self.timing = np.zeros((self.c_locs_len, 2, 2)) |
|
|
|
|
self.timing[:len(timing)] = timing |
|
|
|
|
self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype) |
|
|
|
|
self.delays[:, :delays.shape[1]] = delays |
|
|
|
|
|
|
|
|
|
self.c = np.zeros((self.c_len, sims), dtype=np.float32) + TMAX |
|
|
|
|
self.s = np.zeros((11, self.s_len, sims), dtype=np.float32) |
|
|
|
@ -128,7 +128,7 @@ class WaveSim(sim.SimOps):
@@ -128,7 +128,7 @@ class WaveSim(sim.SimOps):
|
|
|
|
|
sims = min(sims or self.sims, self.sims) |
|
|
|
|
for op_start, op_stop in zip(self.level_starts, self.level_stops): |
|
|
|
|
level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, 0, sims, |
|
|
|
|
self.timing, self.params, sd, seed) |
|
|
|
|
self.delays, self.params, sd, seed) |
|
|
|
|
|
|
|
|
|
def c_to_s(self, time=TMAX, sd=0.0, seed=1): |
|
|
|
|
"""Simulates a capture operation at all sequential elements and primary outputs. |
|
|
|
@ -173,7 +173,7 @@ def rand_gauss_cpu(seed, sd):
@@ -173,7 +173,7 @@ def rand_gauss_cpu(seed, sd):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@numba.njit |
|
|
|
|
def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, seed=0): |
|
|
|
|
def wave_eval_cpu_old(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, seed=0): |
|
|
|
|
lut, z_idx, a_idx, b_idx, c_idx, d_idx = op |
|
|
|
|
|
|
|
|
|
# >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>> |
|
|
|
@ -191,7 +191,7 @@ def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, s
@@ -191,7 +191,7 @@ def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, s
|
|
|
|
|
a_cur = int(0) |
|
|
|
|
b_cur = int(0) |
|
|
|
|
c_cur = int(0) |
|
|
|
|
d_cur = int(0) |
|
|
|
|
d_cur = int(0) |
|
|
|
|
z_cur = lut & 1 |
|
|
|
|
if z_cur == 1: |
|
|
|
|
cbuf[z_mem, st_idx] = TMIN |
|
|
|
@ -276,17 +276,116 @@ def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, s
@@ -276,17 +276,116 @@ def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, s
|
|
|
|
|
|
|
|
|
|
current_t = min(a, b, c, d) |
|
|
|
|
|
|
|
|
|
# generate overflow flag or propagate from input |
|
|
|
|
# generate or propagate overflow flag |
|
|
|
|
cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@numba.njit |
|
|
|
|
def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, st_start, st_stop, line_times, params, sd, seed): |
|
|
|
|
def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed=0):
    """Evaluates one cell for one simulation slot and writes its output waveform into ``cbuf``.

    The (up to four) input waveforms are read from column ``st_idx`` of ``cbuf`` and merged in
    time order. Every input edge toggles one bit of the look-up-table index. Whenever the
    look-up-table output differs from the current output value, an output edge is either stored
    or, if the resulting pulse is not wider than the delay of the triggering input, cancelled
    against the most recently stored output edge.

    :param op: Sequence that unpacks into ``(lut, z_idx, a_idx, b_idx, c_idx, d_idx)``: the
        look-up-table bits, the output line index and the four input line indices.
    :param cbuf: Waveform buffer indexed as ``[row, st_idx]``. Entries are transition times; the
        merge loop stops as soon as the next time of every input is not below ``TMAX``.
    :param c_locs: Maps a line index to the first ``cbuf`` row of that line's waveform.
    :param c_caps: Maps a line index to its waveform capacity; only ``c_caps[z_idx]`` is read.
    :param st_idx: Column of ``cbuf`` (simulation slot) to process.
    :param delays: Array indexed as ``[delay_set, line, 0, output_value]``. If it holds more than
        one delay set, one set is chosen pseudo-randomly from ``seed``, ``z_idx`` and ``st_idx``.
    :param param: Not used by this function.
    :param sd: Not used by this function.
    :param seed: Integer mixed into the delay-set selection.
    """
    lut, z_idx, a_idx, b_idx, c_idx, d_idx = op

    # >>> same logic as wave_eval_gpu (except rand_gauss_*pu()-calls) >>>
    n_overflows = int(0)

    # Choose the delay set. With several sets available, the index is derived from seed,
    # output line and slot by four multiply-add steps, so the choice is reproducible.
    if len(delays) > 1:
        rnd = (seed << 4) + (z_idx << 20) + (st_idx << 1)
        for _ in range(4):
            rnd = int(0xDEECE66D) * rnd + 0xB
        dly = delays[rnd % len(delays)]
    else:
        dly = delays[0]

    # first cbuf row of every involved waveform and the capacity of the output waveform
    a_mem = c_locs[a_idx]
    b_mem = c_locs[b_idx]
    c_mem = c_locs[c_idx]
    d_mem = c_locs[d_idx]
    z_mem = c_locs[z_idx]
    z_cap = c_caps[z_idx]

    # read cursors into the input waveforms
    a_cur = int(0)
    b_cur = int(0)
    c_cur = int(0)
    d_cur = int(0)

    # write cursor into the output waveform; its lowest bit always equals the output value.
    # An initial output of 1 (LUT entry for input mask 0) is stored as an edge at TMIN.
    z_cur = lut & 1
    if z_cur == 1:
        cbuf[z_mem, st_idx] = TMIN

    z_val = z_cur  # current output value, selects which delay is applied to the inputs

    # next pending (delayed) edge time of each input
    a = cbuf[a_mem + a_cur, st_idx] + dly[a_idx, 0, z_val]
    b = cbuf[b_mem + b_cur, st_idx] + dly[b_idx, 0, z_val]
    c = cbuf[c_mem + c_cur, st_idx] + dly[c_idx, 0, z_val]
    d = cbuf[d_mem + d_cur, st_idx] + dly[d_idx, 0, z_val]

    previous_t = TMIN  # time of the most recently stored output edge
    current_t = min(a, b, c, d)
    inputs = int(0)  # bit i is toggled by every consumed edge on input i

    while current_t < TMAX:
        # Consume the earliest input edge (ties are resolved in the order a, b, c, d).
        # thresh: delay of that input for the current output value (pulse-width threshold).
        # next_t: the following edge on the same input, delayed as if the output had flipped.
        if a == current_t:
            a_cur += 1
            inputs ^= 1
            thresh = dly[a_idx, 0, z_val]
            t_in = cbuf[a_mem + a_cur, st_idx]
            a = t_in + thresh
            next_t = t_in + dly[a_idx, 0, z_val ^ 1]
        elif b == current_t:
            b_cur += 1
            inputs ^= 2
            thresh = dly[b_idx, 0, z_val]
            t_in = cbuf[b_mem + b_cur, st_idx]
            b = t_in + thresh
            next_t = t_in + dly[b_idx, 0, z_val ^ 1]
        elif c == current_t:
            c_cur += 1
            inputs ^= 4
            thresh = dly[c_idx, 0, z_val]
            t_in = cbuf[c_mem + c_cur, st_idx]
            c = t_in + thresh
            next_t = t_in + dly[c_idx, 0, z_val ^ 1]
        else:
            d_cur += 1
            inputs ^= 8
            thresh = dly[d_idx, 0, z_val]
            t_in = cbuf[d_mem + d_cur, st_idx]
            d = t_in + thresh
            next_t = t_in + dly[d_idx, 0, z_val ^ 1]

        new_val = (lut >> inputs) & 1
        if new_val != (z_cur & 1):
            # The output toggles. The edge is stored if ...
            keep_edge = (
                z_cur == 0  # it is the first edge in z_mem ...
                or next_t < current_t  # -OR- the next edge on the SAME input is EARLIER
                                       # (current edge is needed to filter BOTH in the next iteration) ...
                or (current_t - previous_t) > thresh  # -OR- the generated pulse is wider than the threshold.
            )
            if not keep_edge:
                # pulse too short: drop the previously stored edge instead of adding a new one
                z_cur -= 1
                previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
            elif z_cur < (z_cap - 1):  # enough space in z_mem?
                cbuf[z_mem + z_cur, st_idx] = current_t
                previous_t = current_t
                z_cur += 1
            else:
                # out of space: count the overflow and cancel against the last stored edge
                n_overflows += 1
                previous_t = cbuf[z_mem + z_cur - 1, st_idx]
                z_cur -= 1

            # output value of cell changed. update all delayed inputs.
            z_val ^= 1
            a = cbuf[a_mem + a_cur, st_idx] + dly[a_idx, 0, z_val]
            b = cbuf[b_mem + b_cur, st_idx] + dly[b_idx, 0, z_val]
            c = cbuf[c_mem + c_cur, st_idx] + dly[c_idx, 0, z_val]
            d = cbuf[d_mem + d_cur, st_idx] + dly[d_idx, 0, z_val]

        current_t = min(a, b, c, d)

    # terminate the output waveform: generate or propagate overflow flag
    cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if n_overflows > 0 else max(a, b, c, d)
|
|
|
|
|
|
|
|
|
@numba.njit |
|
|
|
|
def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, st_start, st_stop, delays, params, sd, seed):
    """Evaluates all operations of one level for a range of simulation slots on the CPU.

    Calls ``wave_eval_cpu`` once for every combination of an operation in
    ``ops[op_start:op_stop]`` and a slot index in ``range(st_start, st_stop)``.

    :param ops: Indexable collection of operations; ``ops[i]`` is passed on as ``op``.
    :param op_start: Index of the first operation to evaluate.
    :param op_stop: Index one past the last operation to evaluate.
    :param c: Waveform buffer that is read and written by ``wave_eval_cpu``.
    :param c_locs: Passed through to ``wave_eval_cpu``.
    :param c_caps: Passed through to ``wave_eval_cpu``.
    :param st_start: First simulation slot to evaluate.
    :param st_stop: Simulation slot one past the last one to evaluate.
    :param delays: Delay data passed through to ``wave_eval_cpu``.
    :param params: Indexed by slot; ``params[st_idx]`` is passed on as ``param``.
    :param sd: Passed through to ``wave_eval_cpu``.
    :param seed: Passed through to ``wave_eval_cpu``.
    """
    # Each (operation, slot) pair is evaluated exactly once. The former additional call that
    # passed `line_times` is gone: that name is not a parameter of this function anymore.
    for op_idx in range(op_start, op_stop):
        op = ops[op_idx]
        for st_idx in range(st_start, st_stop):
            wave_eval_cpu(op, c, c_locs, c_caps, st_idx, delays, params[st_idx], sd, seed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@numba.njit |
|
|
|
@ -342,15 +441,15 @@ class WaveSimCuda(WaveSim):
@@ -342,15 +441,15 @@ class WaveSimCuda(WaveSim):
|
|
|
|
|
All internal memories are mirrored into GPU memory upon construction. |
|
|
|
|
Some operations like access to single waveforms can involve large communication overheads. |
|
|
|
|
""" |
|
|
|
|
def __init__(self, circuit, timing, sims=8, c_caps=16, c_reuse=False, strip_forks=False): |
|
|
|
|
super().__init__(circuit, timing, sims, c_caps, c_reuse, strip_forks) |
|
|
|
|
def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False): |
|
|
|
|
super().__init__(circuit, delays, sims, c_caps, c_reuse, strip_forks) |
|
|
|
|
|
|
|
|
|
self.c = cuda.to_device(self.c) |
|
|
|
|
self.s = cuda.to_device(self.s) |
|
|
|
|
self.ops = cuda.to_device(self.ops) |
|
|
|
|
self.c_locs = cuda.to_device(self.c_locs) |
|
|
|
|
self.c_caps = cuda.to_device(self.c_caps) |
|
|
|
|
self.timing = cuda.to_device(self.timing) |
|
|
|
|
self.delays = cuda.to_device(self.delays) |
|
|
|
|
self.params = cuda.to_device(self.params) |
|
|
|
|
|
|
|
|
|
self._block_dim = (32, 16) |
|
|
|
@ -369,7 +468,7 @@ class WaveSimCuda(WaveSim):
@@ -369,7 +468,7 @@ class WaveSimCuda(WaveSim):
|
|
|
|
|
for op_start, op_stop in zip(self.level_starts, self.level_stops): |
|
|
|
|
grid_dim = self._grid_dim(sims, op_stop - op_start) |
|
|
|
|
wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, int(0), |
|
|
|
|
sims, self.timing, self.params, sd, seed) |
|
|
|
|
sims, self.delays, self.params, sd, seed) |
|
|
|
|
cuda.synchronize() |
|
|
|
|
|
|
|
|
|
def c_to_s(self, time=TMAX, sd=0.0, seed=1): |
|
|
|
@ -423,7 +522,7 @@ def rand_gauss_gpu(seed, sd):
@@ -423,7 +522,7 @@ def rand_gauss_gpu(seed, sd):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cuda.jit() |
|
|
|
|
def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, line_times, param, sd, seed): |
|
|
|
|
def wave_eval_gpu_old(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, line_times, param, sd, seed): |
|
|
|
|
x, y = cuda.grid(2) |
|
|
|
|
st_idx = st_start + x |
|
|
|
|
op_idx = op_start + y |
|
|
|
@ -539,7 +638,119 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_sto
@@ -539,7 +638,119 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_sto
|
|
|
|
|
|
|
|
|
|
current_t = min(a, b, c, d) |
|
|
|
|
|
|
|
|
|
# generate overflow flag or propagate from input |
|
|
|
|
# generate or propagate overflow flag |
|
|
|
|
cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cuda.jit() |
|
|
|
|
def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, delays, param, sd, seed):
    """CUDA kernel: evaluates one cell for one simulation slot per thread and writes its waveform into ``cbuf``.

    The 2-D thread index ``(x, y)`` from ``cuda.grid(2)`` selects slot ``st_start + x`` and
    operation ``op_start + y``; threads outside ``[st_start, st_stop)`` or
    ``[op_start, op_stop)`` return immediately.

    The (up to four) input waveforms are read from column ``st_idx`` of ``cbuf`` and merged in
    time order. Every input edge toggles one bit of the look-up-table index. Whenever the
    look-up-table output differs from the current output value, an output edge is either stored
    or, if the resulting pulse is not wider than the delay of the triggering input, cancelled
    against the most recently stored output edge.

    :param ops: Array indexed as ``[op_idx, k]`` with columns
        ``lut, z_idx, a_idx, b_idx, c_idx, d_idx`` for ``k = 0..5``.
    :param op_start: Index of the first operation handled by this launch.
    :param op_stop: Index one past the last operation handled by this launch.
    :param cbuf: Waveform buffer indexed as ``[row, st_idx]``. Entries are transition times; the
        merge loop stops as soon as the next time of every input is not below ``TMAX``.
    :param c_locs: Maps a line index to the first ``cbuf`` row of that line's waveform.
    :param c_caps: Maps a line index to its waveform capacity; only ``c_caps[z_idx]`` is read.
    :param st_start: First simulation slot handled by this launch.
    :param st_stop: Simulation slot one past the last one handled by this launch.
    :param delays: Array indexed as ``[delay_set, line, 0, output_value]``. If it holds more than
        one delay set, one set is chosen pseudo-randomly from ``seed``, ``z_idx`` and ``st_idx``.
    :param param: Indexed by slot; the selected entry is read but not used further here.
    :param sd: Not used by this kernel.
    :param seed: Integer mixed into the delay-set selection.
    """
    x, y = cuda.grid(2)
    st_idx = st_start + x
    op_idx = op_start + y
    if st_idx >= st_stop or op_idx >= op_stop:
        return

    lut = ops[op_idx, 0]
    z_idx = ops[op_idx, 1]
    a_idx = ops[op_idx, 2]
    b_idx = ops[op_idx, 3]
    c_idx = ops[op_idx, 4]
    d_idx = ops[op_idx, 5]

    slot_param = param[st_idx]  # per-slot parameters; not used by the code below

    # >>> same logic as wave_eval_cpu (except rand_gauss_*pu()-calls) >>>
    n_overflows = int(0)

    # Choose the delay set. With several sets available, the index is derived from seed,
    # output line and slot by four multiply-add steps, so the choice is reproducible.
    if len(delays) > 1:
        rnd = (seed << 4) + (z_idx << 20) + (st_idx << 1)
        for _ in range(4):
            rnd = int(0xDEECE66D) * rnd + 0xB
        dly = delays[rnd % len(delays)]
    else:
        dly = delays[0]

    # first cbuf row of every involved waveform and the capacity of the output waveform
    a_mem = c_locs[a_idx]
    b_mem = c_locs[b_idx]
    c_mem = c_locs[c_idx]
    d_mem = c_locs[d_idx]
    z_mem = c_locs[z_idx]
    z_cap = c_caps[z_idx]

    # read cursors into the input waveforms
    a_cur = int(0)
    b_cur = int(0)
    c_cur = int(0)
    d_cur = int(0)

    # write cursor into the output waveform; its lowest bit always equals the output value.
    # An initial output of 1 (LUT entry for input mask 0) is stored as an edge at TMIN.
    z_cur = lut & 1
    if z_cur == 1:
        cbuf[z_mem, st_idx] = TMIN

    z_val = z_cur  # current output value, selects which delay is applied to the inputs

    # next pending (delayed) edge time of each input
    a = cbuf[a_mem + a_cur, st_idx] + dly[a_idx, 0, z_val]
    b = cbuf[b_mem + b_cur, st_idx] + dly[b_idx, 0, z_val]
    c = cbuf[c_mem + c_cur, st_idx] + dly[c_idx, 0, z_val]
    d = cbuf[d_mem + d_cur, st_idx] + dly[d_idx, 0, z_val]

    previous_t = TMIN  # time of the most recently stored output edge
    current_t = min(a, b, c, d)
    inputs = int(0)  # bit i is toggled by every consumed edge on input i

    while current_t < TMAX:
        # Consume the earliest input edge (ties are resolved in the order a, b, c, d).
        # thresh: delay of that input for the current output value (pulse-width threshold).
        # next_t: the following edge on the same input, delayed as if the output had flipped.
        if a == current_t:
            a_cur += 1
            inputs ^= 1
            thresh = dly[a_idx, 0, z_val]
            t_in = cbuf[a_mem + a_cur, st_idx]
            a = t_in + thresh
            next_t = t_in + dly[a_idx, 0, z_val ^ 1]
        elif b == current_t:
            b_cur += 1
            inputs ^= 2
            thresh = dly[b_idx, 0, z_val]
            t_in = cbuf[b_mem + b_cur, st_idx]
            b = t_in + thresh
            next_t = t_in + dly[b_idx, 0, z_val ^ 1]
        elif c == current_t:
            c_cur += 1
            inputs ^= 4
            thresh = dly[c_idx, 0, z_val]
            t_in = cbuf[c_mem + c_cur, st_idx]
            c = t_in + thresh
            next_t = t_in + dly[c_idx, 0, z_val ^ 1]
        else:
            d_cur += 1
            inputs ^= 8
            thresh = dly[d_idx, 0, z_val]
            t_in = cbuf[d_mem + d_cur, st_idx]
            d = t_in + thresh
            next_t = t_in + dly[d_idx, 0, z_val ^ 1]

        new_val = (lut >> inputs) & 1
        if new_val != (z_cur & 1):
            # The output toggles. The edge is stored if ...
            keep_edge = (
                z_cur == 0  # it is the first edge in z_mem ...
                or next_t < current_t  # -OR- the next edge on the SAME input is EARLIER
                                       # (current edge is needed to filter BOTH in the next iteration) ...
                or (current_t - previous_t) > thresh  # -OR- the generated pulse is wider than the threshold.
            )
            if not keep_edge:
                # pulse too short: drop the previously stored edge instead of adding a new one
                z_cur -= 1
                previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
            elif z_cur < (z_cap - 1):  # enough space in z_mem?
                cbuf[z_mem + z_cur, st_idx] = current_t
                previous_t = current_t
                z_cur += 1
            else:
                # out of space: count the overflow and cancel against the last stored edge
                n_overflows += 1
                previous_t = cbuf[z_mem + z_cur - 1, st_idx]
                z_cur -= 1

            # output value of cell changed. update all delayed inputs.
            z_val ^= 1
            a = cbuf[a_mem + a_cur, st_idx] + dly[a_idx, 0, z_val]
            b = cbuf[b_mem + b_cur, st_idx] + dly[b_idx, 0, z_val]
            c = cbuf[c_mem + c_cur, st_idx] + dly[c_idx, 0, z_val]
            d = cbuf[d_mem + d_cur, st_idx] + dly[d_idx, 0, z_val]

        current_t = min(a, b, c, d)

    # terminate the output waveform: generate or propagate overflow flag
    cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if n_overflows > 0 else max(a, b, c, d)
|
|
|
|
|
|
|
|
|
|
|
|
|
|