|
|
@ -48,14 +48,8 @@ class WaveSim(sim.SimOps): |
|
|
|
memory footprint, but intermediate signal waveforms become inaccessible after a propagation. |
|
|
|
memory footprint, but intermediate signal waveforms become inaccessible after a propagation. |
|
|
|
""" |
|
|
|
""" |
|
|
|
def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False): |
|
|
|
def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False): |
|
|
|
assert c_caps > 0 and c_caps % 4 == 0 |
|
|
|
super().__init__(circuit, c_caps=c_caps, c_caps_min=4, c_reuse=c_reuse, strip_forks=strip_forks) |
|
|
|
super().__init__(circuit, c_caps=c_caps//4, c_reuse=c_reuse, strip_forks=strip_forks) |
|
|
|
|
|
|
|
self.sims = sims |
|
|
|
self.sims = sims |
|
|
|
|
|
|
|
|
|
|
|
self.c_len *= 4 |
|
|
|
|
|
|
|
self.c_locs[...] *= 4 |
|
|
|
|
|
|
|
self.c_caps[...] *= 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype) |
|
|
|
self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype) |
|
|
|
self.delays[:, :delays.shape[1]] = delays |
|
|
|
self.delays[:, :delays.shape[1]] = delays |
|
|
|
|
|
|
|
|
|
|
@ -87,21 +81,6 @@ class WaveSim(sim.SimOps): |
|
|
|
|
|
|
|
|
|
|
|
self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.c_locs, self.c_caps, self.ops, self.params)]) |
|
|
|
self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.c_locs, self.c_caps, self.ops, self.params)]) |
|
|
|
|
|
|
|
|
|
|
|
self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0) |
|
|
|
|
|
|
|
self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0) |
|
|
|
|
|
|
|
self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs]) |
|
|
|
|
|
|
|
self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs] |
|
|
|
|
|
|
|
self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs] |
|
|
|
|
|
|
|
self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs] |
|
|
|
|
|
|
|
self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs]) |
|
|
|
|
|
|
|
self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def __repr__(self): |
|
|
|
def __repr__(self): |
|
|
|
return f'<{type(self).__name__} {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \ |
|
|
|
return f'<{type(self).__name__} {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \ |
|
|
|
f'levels={len(self.level_starts)} mem={hr_bytes(self.nbytes)}>' |
|
|
|
f'levels={len(self.level_starts)} mem={hr_bytes(self.nbytes)}>' |
|
|
@ -156,131 +135,6 @@ class WaveSim(sim.SimOps): |
|
|
|
self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs] |
|
|
|
self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@numba.njit
def rand_gauss_cpu(seed, sd):
    """Return a deterministic pseudo-random factor centred on 1.0.

    For ``sd <= 0.0`` the result is exactly ``1.0``.  Otherwise a value is
    built by summing 12 pseudo-uniform numbers in ``[0, 1)`` and subtracting
    6 (a bell-shaped approximation around 0), scaling it by ``sd`` and
    retrying until the scaled value lies within ``[-0.5, 0.5]``.  The
    returned factor is therefore always within ``[0.5, 1.5]``.

    :param seed: Integer start state of the generator; the same seed always
        yields the same result.
    :param sd: Scale applied to the raw deviation.
    :return: ``1.0`` plus the accepted deviation.
    """
    if sd <= 0.0:
        return 1.0
    limit = 0.5
    while True:
        total = -6.0
        for _ in range(12):
            # Linear congruential step; the generator state carries over
            # into the next attempt when a draw is rejected.
            seed = int(0xDEECE66D) * seed + 0xB
            # Bits 8..31 of the state mapped onto [0, 1).
            total += float((seed >> 8) & 0xffffff) / float(1 << 24)
        deviation = total * sd
        if abs(deviation) <= limit:
            return deviation + 1.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@numba.njit
def wave_eval_cpu_old(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, seed=0):
    """Evaluate one look-up-table operation for one column of ``cbuf``.

    The toggle times of up to four input lines are read from ``cbuf``,
    merged in time order, and the resulting output toggles are written to
    the output line's region of ``cbuf``.  Nothing is returned; the result
    is stored in ``cbuf`` in place.

    :param op: Sequence ``(lut, z_idx, a_idx, b_idx, c_idx, d_idx)``.  Bit
        ``i`` of ``lut`` is the output value for input combination ``i``;
        bit 0 is therefore the initial output value.
    :param cbuf: 2-D buffer indexed ``[row, st_idx]`` that holds the toggle
        times of all lines.
    :param c_locs: Maps a line index to its first row in ``cbuf``.
    :param c_caps: Maps a line index to the number of rows reserved for it.
    :param st_idx: Column of ``cbuf`` to operate on.
    :param line_times: Indexed ``[line, 0, polarity]`` for the delay added
        to an input toggle and ``[line, 1, polarity]`` for the minimum
        pulse width used when filtering output toggles.
    :param param: ``param[0]`` scales all delays and thresholds;
        ``param[1]`` names a line index that receives an extra delay of
        ``param[2]`` or ``param[3]`` depending on the polarity.
    :param sd: Forwarded to ``rand_gauss_cpu``.
    :param seed: Mixed with ``z_idx`` and ``st_idx`` into the seeds passed
        to ``rand_gauss_cpu``.
    """
    lut, z_idx, a_idx, b_idx, c_idx, d_idx = op

    # Per the original note, this body is kept in line with wave_eval_cpu
    # apart from the rand_gauss_*pu() calls.
    n_overflow = int(0)

    rng_base = (seed << 4) + (z_idx << 20) + (st_idx << 1)

    # First row of every involved line and the capacity of the output line.
    a_mem = c_locs[a_idx]
    b_mem = c_locs[b_idx]
    c_mem = c_locs[c_idx]
    d_mem = c_locs[d_idx]
    z_mem = c_locs[z_idx]
    z_cap = c_caps[z_idx]

    # Read/write cursors relative to the rows above.
    a_cur = int(0)
    b_cur = int(0)
    c_cur = int(0)
    d_cur = int(0)
    z_cur = lut & 1
    if z_cur == 1:
        # Output starts at 1: record that as a toggle at TMIN.
        cbuf[z_mem, st_idx] = TMIN

    # param is not written in this function, so these reads are invariant.
    scale = param[0]
    fault_line = int(param[1])

    a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_cpu(rng_base ^ a_mem ^ z_cur, sd) * scale
    if fault_line == a_idx:
        a += param[2 + z_cur]
    b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_cpu(rng_base ^ b_mem ^ z_cur, sd) * scale
    if fault_line == b_idx:
        b += param[2 + z_cur]
    c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_cpu(rng_base ^ c_mem ^ z_cur, sd) * scale
    if fault_line == c_idx:
        c += param[2 + z_cur]
    d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_cpu(rng_base ^ d_mem ^ z_cur, sd) * scale
    if fault_line == d_idx:
        d += param[2 + z_cur]

    last_t = TMIN

    now_t = min(a, b, c, d)
    in_state = int(0)

    while now_t < TMAX:
        z_val = z_cur & 1
        z_inv = z_val ^ 1
        # Consume the input whose pending toggle is due now, fetch its next
        # toggle and flip its bit in the current input combination.
        if a == now_t:
            a_cur += 1
            a = cbuf[a_mem + a_cur, st_idx]
            a += line_times[a_idx, 0, z_inv] * rand_gauss_cpu(rng_base ^ a_mem ^ z_inv, sd) * scale
            min_pulse = line_times[a_idx, 1, z_val] * rand_gauss_cpu(rng_base ^ a_mem ^ z_val, sd) * scale
            if fault_line == a_idx:
                a += param[2 + z_inv]
                min_pulse += param[2 + z_val]
            in_state ^= 1
            upcoming_t = a

        elif b == now_t:
            b_cur += 1
            b = cbuf[b_mem + b_cur, st_idx]
            b += line_times[b_idx, 0, z_inv] * rand_gauss_cpu(rng_base ^ b_mem ^ z_inv, sd) * scale
            min_pulse = line_times[b_idx, 1, z_val] * rand_gauss_cpu(rng_base ^ b_mem ^ z_val, sd) * scale
            if fault_line == b_idx:
                b += param[2 + z_inv]
                min_pulse += param[2 + z_val]
            in_state ^= 2
            upcoming_t = b

        elif c == now_t:
            c_cur += 1
            c = cbuf[c_mem + c_cur, st_idx]
            c += line_times[c_idx, 0, z_inv] * rand_gauss_cpu(rng_base ^ c_mem ^ z_inv, sd) * scale
            min_pulse = line_times[c_idx, 1, z_val] * rand_gauss_cpu(rng_base ^ c_mem ^ z_val, sd) * scale
            if fault_line == c_idx:
                c += param[2 + z_inv]
                min_pulse += param[2 + z_val]
            in_state ^= 4
            upcoming_t = c

        else:
            d_cur += 1
            d = cbuf[d_mem + d_cur, st_idx]
            d += line_times[d_idx, 0, z_inv] * rand_gauss_cpu(rng_base ^ d_mem ^ z_inv, sd) * scale
            min_pulse = line_times[d_idx, 1, z_val] * rand_gauss_cpu(rng_base ^ d_mem ^ z_val, sd) * scale
            if fault_line == d_idx:
                d += param[2 + z_inv]
                min_pulse += param[2 + z_val]
            in_state ^= 8
            upcoming_t = d

        # z_cur has not changed since z_val was taken, so z_val is the
        # current output value.
        if z_val != ((lut >> in_state) & 1):
            # we generate a toggle in z_mem, if:
            #   ( it is the first toggle in z_mem OR
            #   following toggle is earlier OR
            #   pulse is wide enough ) AND enough space in z_mem.
            if z_cur == 0 or upcoming_t < now_t or (now_t - last_t) > min_pulse:
                if z_cur < (z_cap - 1):
                    cbuf[z_mem + z_cur, st_idx] = now_t
                    last_t = now_t
                    z_cur += 1
                else:
                    # No room left: count the overflow and drop the
                    # previous toggle instead of adding a new one.
                    n_overflow += 1
                    last_t = cbuf[z_mem + z_cur - 1, st_idx]
                    z_cur -= 1
            else:
                # Pulse too narrow: cancel the previous toggle.
                z_cur -= 1
                if z_cur > 0:
                    last_t = cbuf[z_mem + z_cur - 1, st_idx]
                else:
                    last_t = TMIN

        now_t = min(a, b, c, d)

    # generate or propagate overflow flag
    if n_overflow > 0:
        cbuf[z_mem + z_cur, st_idx] = TMAX_OVL
    else:
        cbuf[z_mem + z_cur, st_idx] = max(a, b, c, d)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@numba.njit |
|
|
|
@numba.njit |
|
|
|
def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed=0): |
|
|
|
def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed=0): |
|
|
|
lut, z_idx, a_idx, b_idx, c_idx, d_idx = op |
|
|
|
lut, z_idx, a_idx, b_idx, c_idx, d_idx = op |
|
|
@ -379,6 +233,7 @@ def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed= |
|
|
|
# generate or propagate overflow flag |
|
|
|
# generate or propagate overflow flag |
|
|
|
cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d) |
|
|
|
cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@numba.njit |
|
|
|
@numba.njit |
|
|
|
def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, st_start, st_stop, delays, params, sd, seed): |
|
|
|
def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, st_start, st_stop, delays, params, sd, seed): |
|
|
|
overflows = 0 |
|
|
|
overflows = 0 |
|
|
@ -505,143 +360,6 @@ def wave_assign_gpu(c, s, c_locs, ppi_offset): |
|
|
|
c[c_loc+2, x] = TMAX |
|
|
|
c[c_loc+2, x] = TMAX |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cuda.jit(device=True)
def rand_gauss_gpu(seed, sd):
    """Device-side twin of the CPU helper: deterministic factor around 1.0.

    For ``sd <= 0.0`` the result is exactly ``1.0``.  Otherwise 12
    pseudo-uniform numbers in ``[0, 1)`` are summed, 6 is subtracted, and
    the result is scaled by ``sd``; draws are repeated until the scaled
    value lies within ``[-0.5, 0.5]``.  The returned factor is therefore
    always within ``[0.5, 1.5]``.

    :param seed: Integer start state of the generator; the same seed always
        yields the same result.
    :param sd: Scale applied to the raw deviation.
    :return: ``1.0`` plus the accepted deviation.
    """
    if sd <= 0.0:
        return 1.0
    limit = 0.5
    while True:
        total = -6.0
        for _ in range(12):
            # Linear congruential step; the state is not reset between
            # rejected attempts.
            seed = int(0xDEECE66D) * seed + 0xB
            # Bits 8..31 of the state mapped onto [0, 1).
            total += float((seed >> 8) & 0xffffff) / float(1 << 24)
        deviation = total * sd
        if abs(deviation) <= limit:
            return deviation + 1.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cuda.jit()
def wave_eval_gpu_old(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, line_times, param, sd, seed):
    """CUDA kernel: evaluate one look-up-table operation per thread.

    The 2-D thread index selects the ``cbuf`` column
    (``st_idx = st_start + x``) and the operation (``op_idx = op_start + y``);
    threads outside ``[st_start, st_stop)`` x ``[op_start, op_stop)`` exit
    immediately.  Each remaining thread merges the toggle times of up to
    four input lines in time order and writes the resulting output toggles
    into the output line's region of ``cbuf``.

    :param ops: 2-D array; row ``op_idx`` holds
        ``(lut, z_idx, a_idx, b_idx, c_idx, d_idx)``.  Bit ``i`` of ``lut``
        is the output value for input combination ``i``.
    :param op_start: First operation index handled by this launch.
    :param op_stop: One past the last operation index.
    :param cbuf: 2-D buffer indexed ``[row, st_idx]`` that holds the toggle
        times of all lines.
    :param c_locs: Maps a line index to its first row in ``cbuf``.
    :param c_caps: Maps a line index to the number of rows reserved for it.
    :param st_start: First ``cbuf`` column handled by this launch.
    :param st_stop: One past the last column.
    :param line_times: Indexed ``[line, 0, polarity]`` for the delay added
        to an input toggle and ``[line, 1, polarity]`` for the minimum
        pulse width used when filtering output toggles.
    :param param: 2-D array; row ``st_idx`` holds a delay scale factor at
        index 0, a line index at index 1, and the extra delays applied to
        that line at indices 2 and 3 (selected by polarity).
    :param sd: Forwarded to ``rand_gauss_gpu``.
    :param seed: Mixed with ``z_idx`` and ``st_idx`` into the seeds passed
        to ``rand_gauss_gpu``.
    """
    gx, gy = cuda.grid(2)
    st_idx = st_start + gx
    op_idx = op_start + gy
    if st_idx >= st_stop or op_idx >= op_stop:
        return

    lut = ops[op_idx, 0]
    z_idx = ops[op_idx, 1]
    a_idx = ops[op_idx, 2]
    b_idx = ops[op_idx, 3]
    c_idx = ops[op_idx, 4]
    d_idx = ops[op_idx, 5]

    # Parameter row belonging to this column.
    sim_param = param[st_idx]

    # Per the original note, this body is kept in line with wave_eval_cpu
    # apart from the rand_gauss_*pu() calls.
    n_overflow = int(0)

    rng_base = (seed << 4) + (z_idx << 20) + (st_idx << 1)

    # First row of every involved line and the capacity of the output line.
    a_mem = c_locs[a_idx]
    b_mem = c_locs[b_idx]
    c_mem = c_locs[c_idx]
    d_mem = c_locs[d_idx]
    z_mem = c_locs[z_idx]
    z_cap = c_caps[z_idx]

    # Read/write cursors relative to the rows above.
    a_cur = int(0)
    b_cur = int(0)
    c_cur = int(0)
    d_cur = int(0)
    z_cur = lut & 1
    if z_cur == 1:
        # Output starts at 1: record that as a toggle at TMIN.
        cbuf[z_mem, st_idx] = TMIN

    # The parameter row is not written by this kernel, so these reads are
    # invariant.
    scale = sim_param[0]
    fault_line = int(sim_param[1])

    a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_gpu(rng_base ^ a_mem ^ z_cur, sd) * scale
    if fault_line == a_idx:
        a += sim_param[2 + z_cur]
    b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_gpu(rng_base ^ b_mem ^ z_cur, sd) * scale
    if fault_line == b_idx:
        b += sim_param[2 + z_cur]
    c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_gpu(rng_base ^ c_mem ^ z_cur, sd) * scale
    if fault_line == c_idx:
        c += sim_param[2 + z_cur]
    d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_gpu(rng_base ^ d_mem ^ z_cur, sd) * scale
    if fault_line == d_idx:
        d += sim_param[2 + z_cur]

    last_t = TMIN

    now_t = min(a, b, c, d)
    in_state = int(0)

    while now_t < TMAX:
        z_val = z_cur & 1
        z_inv = z_val ^ 1
        # Consume the input whose pending toggle is due now, fetch its next
        # toggle and flip its bit in the current input combination.
        if a == now_t:
            a_cur += 1
            a = cbuf[a_mem + a_cur, st_idx]
            a += line_times[a_idx, 0, z_inv] * rand_gauss_gpu(rng_base ^ a_mem ^ z_inv, sd) * scale
            min_pulse = line_times[a_idx, 1, z_val] * rand_gauss_gpu(rng_base ^ a_mem ^ z_val, sd) * scale
            if fault_line == a_idx:
                a += sim_param[2 + z_inv]
                min_pulse += sim_param[2 + z_val]
            in_state ^= 1
            upcoming_t = a

        elif b == now_t:
            b_cur += 1
            b = cbuf[b_mem + b_cur, st_idx]
            b += line_times[b_idx, 0, z_inv] * rand_gauss_gpu(rng_base ^ b_mem ^ z_inv, sd) * scale
            min_pulse = line_times[b_idx, 1, z_val] * rand_gauss_gpu(rng_base ^ b_mem ^ z_val, sd) * scale
            if fault_line == b_idx:
                b += sim_param[2 + z_inv]
                min_pulse += sim_param[2 + z_val]
            in_state ^= 2
            upcoming_t = b

        elif c == now_t:
            c_cur += 1
            c = cbuf[c_mem + c_cur, st_idx]
            c += line_times[c_idx, 0, z_inv] * rand_gauss_gpu(rng_base ^ c_mem ^ z_inv, sd) * scale
            min_pulse = line_times[c_idx, 1, z_val] * rand_gauss_gpu(rng_base ^ c_mem ^ z_val, sd) * scale
            if fault_line == c_idx:
                c += sim_param[2 + z_inv]
                min_pulse += sim_param[2 + z_val]
            in_state ^= 4
            upcoming_t = c

        else:
            d_cur += 1
            d = cbuf[d_mem + d_cur, st_idx]
            d += line_times[d_idx, 0, z_inv] * rand_gauss_gpu(rng_base ^ d_mem ^ z_inv, sd) * scale
            min_pulse = line_times[d_idx, 1, z_val] * rand_gauss_gpu(rng_base ^ d_mem ^ z_val, sd) * scale
            if fault_line == d_idx:
                d += sim_param[2 + z_inv]
                min_pulse += sim_param[2 + z_val]
            in_state ^= 8
            upcoming_t = d

        # z_cur has not changed since z_val was taken, so z_val is the
        # current output value.
        if z_val != ((lut >> in_state) & 1):
            # we generate a toggle in z_mem, if:
            #   ( it is the first toggle in z_mem OR
            #   following toggle is earlier OR
            #   pulse is wide enough ) AND enough space in z_mem.
            if z_cur == 0 or upcoming_t < now_t or (now_t - last_t) > min_pulse:
                if z_cur < (z_cap - 1):
                    cbuf[z_mem + z_cur, st_idx] = now_t
                    last_t = now_t
                    z_cur += 1
                else:
                    # No room left: count the overflow and drop the
                    # previous toggle instead of adding a new one.
                    n_overflow += 1
                    last_t = cbuf[z_mem + z_cur - 1, st_idx]
                    z_cur -= 1
            else:
                # Pulse too narrow: cancel the previous toggle.
                z_cur -= 1
                if z_cur > 0:
                    last_t = cbuf[z_mem + z_cur - 1, st_idx]
                else:
                    last_t = TMIN

        now_t = min(a, b, c, d)

    # generate or propagate overflow flag
    if n_overflow > 0:
        cbuf[z_mem + z_cur, st_idx] = TMAX_OVL
    else:
        cbuf[z_mem + z_cur, st_idx] = max(a, b, c, d)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cuda.jit() |
|
|
|
@cuda.jit() |
|
|
|
def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, delays, param, sd, seed): |
|
|
|
def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, delays, param, sd, seed): |
|
|
|
x, y = cuda.grid(2) |
|
|
|
x, y = cuda.grid(2) |
|
|
|