Browse Source

cleanup

devel
Stefan Holst 2 years ago
parent
commit
f04f1b0012
  1. 30
      src/kyupy/logic_sim.py
  2. 25
      src/kyupy/sim.py
  3. 286
      src/kyupy/wave_sim.py

30
src/kyupy/logic_sim.py

@ -34,21 +34,6 @@ class LogicSim(sim.SimOps):
self.s = np.zeros((2, self.s_len, 3, nbytes), dtype=np.uint8) self.s = np.zeros((2, self.s_len, 3, nbytes), dtype=np.uint8)
self.s[:,:,1,:] = 255 # unassigned self.s[:,:,1,:] = 255 # unassigned
self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs]
self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs]
self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs]
self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs]
self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
def __repr__(self): def __repr__(self):
return f'<LogicSim {self.circuit.name} sims={self.sims} m={self.m} state_mem={hr_bytes(self.c.nbytes)}>' return f'<LogicSim {self.circuit.name} sims={self.sims} m={self.m} state_mem={hr_bytes(self.c.nbytes)}>'
@ -81,15 +66,6 @@ class LogicSim(sim.SimOps):
"""Propagate the input values towards the outputs (Perform all logic operations in topological order). """Propagate the input values towards the outputs (Perform all logic operations in topological order).
If the circuit is sequential (it contains flip-flops), one call simulates one clock cycle. If the circuit is sequential (it contains flip-flops), one call simulates one clock cycle.
Multiple clock cycles are simulated by a assign-propagate-capture loop:
.. code-block:: python
# initial state in state_bp
for cycle in range(10): # simulate 10 clock cycles
sim.assign(state_bp)
sim.propagate()
sim.capture(state_bp)
:param inject_cb: A callback function for manipulating intermediate signal values. :param inject_cb: A callback function for manipulating intermediate signal values.
This function is called with a line and its new logic values (in bit-parallel format) after This function is called with a line and its new logic values (in bit-parallel format) after
@ -113,7 +89,7 @@ class LogicSim(sim.SimOps):
elif op == sim.NOR2: self.c[o0] = ~(self.c[i0] | self.c[i1]) elif op == sim.NOR2: self.c[o0] = ~(self.c[i0] | self.c[i1])
elif op == sim.XOR2: self.c[o0] = self.c[i0] ^ self.c[i1] elif op == sim.XOR2: self.c[o0] = self.c[i0] ^ self.c[i1]
elif op == sim.XNOR2: self.c[o0] = ~(self.c[i0] ^ self.c[i1]) elif op == sim.XNOR2: self.c[o0] = ~(self.c[i0] ^ self.c[i1])
else: print(f'unknown sim {op}') else: print(f'unknown op {op}')
inject_cb(o0, self.s[o0]) inject_cb(o0, self.s[o0])
elif self.m == 4: elif self.m == 4:
pass pass
@ -128,7 +104,7 @@ class LogicSim(sim.SimOps):
elif op == sim.NOR2: logic.bp_or(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0]) elif op == sim.NOR2: logic.bp_or(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
elif op == sim.XOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1]) elif op == sim.XOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1])
elif op == sim.XNOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0]) elif op == sim.XNOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
else: print(f'unknown sim {op}') else: print(f'unknown op {op}')
if inject_cb is not None: inject_cb(o0, self.s[o0]) if inject_cb is not None: inject_cb(o0, self.s[o0])
def s_ppo_to_ppi(self): def s_ppo_to_ppi(self):
@ -169,4 +145,4 @@ def _prop_cpu(ops, c_locs, c):
elif op == sim.NOR2: c[o0] = ~(c[i0] | c[i1]) elif op == sim.NOR2: c[o0] = ~(c[i0] | c[i1])
elif op == sim.XOR2: c[o0] = c[i0] ^ c[i1] elif op == sim.XOR2: c[o0] = c[i0] ^ c[i1]
elif op == sim.XNOR2: c[o0] = ~(c[i0] ^ c[i1]) elif op == sim.XNOR2: c[o0] = ~(c[i0] ^ c[i1])
else: print(f'unknown sim {op}') else: print(f'unknown op {op}')

25
src/kyupy/sim.py

@ -145,7 +145,7 @@ class SimOps:
:param keep_signals: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces :param keep_signals: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces
memory footprint, but intermediate signal waveforms become unaccessible after a propagation. memory footprint, but intermediate signal waveforms become unaccessible after a propagation.
""" """
def __init__(self, circuit, c_caps=1, c_reuse=False, strip_forks=False): def __init__(self, circuit, c_caps=1, c_caps_min=1, c_reuse=False, strip_forks=False):
self.circuit = circuit self.circuit = circuit
dffs = [n for n in circuit.nodes if 'dff' in n.kind.lower()] dffs = [n for n in circuit.nodes if 'dff' in n.kind.lower()]
latches = [n for n in circuit.nodes if 'latch' in n.kind.lower()] latches = [n for n in circuit.nodes if 'latch' in n.kind.lower()]
@ -248,15 +248,15 @@ class SimOps:
h = Heap() h = Heap()
# allocate and keep memory for special fields # allocate and keep memory for special fields
self.c_locs[self.zero_idx], self.c_caps[self.zero_idx] = h.alloc(1), 1 self.c_locs[self.zero_idx], self.c_caps[self.zero_idx] = h.alloc(c_caps_min), c_caps_min
self.c_locs[self.tmp_idx], self.c_caps[self.tmp_idx] = h.alloc(1), 1 self.c_locs[self.tmp_idx], self.c_caps[self.tmp_idx] = h.alloc(c_caps_min), c_caps_min
ref_count[self.zero_idx] += 1 ref_count[self.zero_idx] += 1
ref_count[self.tmp_idx] += 1 ref_count[self.tmp_idx] += 1
# allocate and keep memory for PI/PPI, keep memory for PO/PPO (allocated later) # allocate and keep memory for PI/PPI, keep memory for PO/PPO (allocated later)
for i, n in enumerate(self.s_nodes): for i, n in enumerate(self.s_nodes):
if len(n.outs) > 0: if len(n.outs) > 0:
self.c_locs[self.ppi_offset + i], self.c_caps[self.ppi_offset + i] = h.alloc(1), 1 self.c_locs[self.ppi_offset + i], self.c_caps[self.ppi_offset + i] = h.alloc(c_caps_min), c_caps_min
ref_count[self.ppi_offset + i] += 1 ref_count[self.ppi_offset + i] += 1
if len(n.ins) > 0: if len(n.ins) > 0:
i0_idx = stems[n.ins[0]] if stems[n.ins[0]] >= 0 else n.ins[0] i0_idx = stems[n.ins[0]] if stems[n.ins[0]] >= 0 else n.ins[0]
@ -280,7 +280,7 @@ class SimOps:
if ref_count[i2_idx] <= 0: free_list.append(self.c_locs[i2_idx]) if ref_count[i2_idx] <= 0: free_list.append(self.c_locs[i2_idx])
if ref_count[i3_idx] <= 0: free_list.append(self.c_locs[i3_idx]) if ref_count[i3_idx] <= 0: free_list.append(self.c_locs[i3_idx])
o_idx = op[1] o_idx = op[1]
cap = c_caps[o_idx] cap = max(c_caps_min, c_caps[o_idx])
self.c_locs[o_idx], self.c_caps[o_idx] = h.alloc(cap), cap self.c_locs[o_idx], self.c_caps[o_idx] = h.alloc(cap), cap
if not keep_signals: if not keep_signals:
for loc in free_list: for loc in free_list:
@ -301,3 +301,18 @@ class SimOps:
from collections import defaultdict from collections import defaultdict
self.prim_counts = defaultdict(int) self.prim_counts = defaultdict(int)
for op, _, _, _, _, _ in self.ops: self.prim_counts[names[op]] += 1 for op, _, _, _, _, _ in self.ops: self.prim_counts[names[op]] += 1
self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs]
self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs]
self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs]
self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs]
self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])

286
src/kyupy/wave_sim.py

@ -48,14 +48,8 @@ class WaveSim(sim.SimOps):
memory footprint, but intermediate signal waveforms become unaccessible after a propagation. memory footprint, but intermediate signal waveforms become unaccessible after a propagation.
""" """
def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False): def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False):
assert c_caps > 0 and c_caps % 4 == 0 super().__init__(circuit, c_caps=c_caps, c_caps_min=4, c_reuse=c_reuse, strip_forks=strip_forks)
super().__init__(circuit, c_caps=c_caps//4, c_reuse=c_reuse, strip_forks=strip_forks)
self.sims = sims self.sims = sims
self.c_len *= 4
self.c_locs[...] *= 4
self.c_caps[...] *= 4
self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype) self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype)
self.delays[:, :delays.shape[1]] = delays self.delays[:, :delays.shape[1]] = delays
@ -87,21 +81,6 @@ class WaveSim(sim.SimOps):
self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.c_locs, self.c_caps, self.ops, self.params)]) self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.c_locs, self.c_caps, self.ops, self.params)])
self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs]
self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs]
self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs]
self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs]
self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
def __repr__(self): def __repr__(self):
return f'<{type(self).__name__} {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \ return f'<{type(self).__name__} {self.circuit.name} sims={self.sims} ops={len(self.ops)} ' + \
f'levels={len(self.level_starts)} mem={hr_bytes(self.nbytes)}>' f'levels={len(self.level_starts)} mem={hr_bytes(self.nbytes)}>'
@ -156,131 +135,6 @@ class WaveSim(sim.SimOps):
self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs] self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs]
@numba.njit
def rand_gauss_cpu(seed, sd):
clamp = 0.5
if sd <= 0.0:
return 1.0
while True:
x = -6.0
for _ in range(12):
seed = int(0xDEECE66D) * seed + 0xB
x += float((seed >> 8) & 0xffffff) / float(1 << 24)
x *= sd
if abs(x) <= clamp:
break
return x + 1.0
@numba.njit
def wave_eval_cpu_old(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, seed=0):
lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
# >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>>
overflows = int(0)
_seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
a_mem = c_locs[a_idx]
b_mem = c_locs[b_idx]
c_mem = c_locs[c_idx]
d_mem = c_locs[d_idx]
z_mem = c_locs[z_idx]
z_cap = c_caps[z_idx]
a_cur = int(0)
b_cur = int(0)
c_cur = int(0)
d_cur = int(0)
z_cur = lut & 1
if z_cur == 1:
cbuf[z_mem, st_idx] = TMIN
a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ a_mem ^ z_cur, sd) * param[0]
if int(param[1]) == a_idx: a += param[2+z_cur]
b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ b_mem ^ z_cur, sd) * param[0]
if int(param[1]) == b_idx: b += param[2+z_cur]
c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ c_mem ^ z_cur, sd) * param[0]
if int(param[1]) == c_idx: c += param[2+z_cur]
d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ d_mem ^ z_cur, sd) * param[0]
if int(param[1]) == d_idx: d += param[2+z_cur]
previous_t = TMIN
current_t = min(a, b, c, d)
inputs = int(0)
while current_t < TMAX:
z_val = z_cur & 1
if a == current_t:
a_cur += 1
a = cbuf[a_mem + a_cur, st_idx]
a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0]
thresh = line_times[a_idx, 1, z_val] * rand_gauss_cpu(_seed ^ a_mem ^ z_val, sd) * param[0]
if int(param[1]) == a_idx:
a += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 1
next_t = a
elif b == current_t:
b_cur += 1
b = cbuf[b_mem + b_cur, st_idx]
b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0]
thresh = line_times[b_idx, 1, z_val] * rand_gauss_cpu(_seed ^ b_mem ^ z_val, sd) * param[0]
if int(param[1]) == b_idx:
b += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 2
next_t = b
elif c == current_t:
c_cur += 1
c = cbuf[c_mem + c_cur, st_idx]
c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0]
thresh = line_times[c_idx, 1, z_val] * rand_gauss_cpu(_seed ^ c_mem ^ z_val, sd) * param[0]
if int(param[1]) == c_idx:
c += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 4
next_t = c
else:
d_cur += 1
d = cbuf[d_mem + d_cur, st_idx]
d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0]
thresh = line_times[d_idx, 1, z_val] * rand_gauss_cpu(_seed ^ d_mem ^ z_val, sd) * param[0]
if int(param[1]) == d_idx:
d += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 8
next_t = d
if (z_cur & 1) != ((lut >> inputs) & 1):
# we generate a toggle in z_mem, if:
# ( it is the first toggle in z_mem OR
# following toggle is earlier OR
# pulse is wide enough ) AND enough space in z_mem.
if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
if z_cur < (z_cap - 1):
cbuf[z_mem + z_cur, st_idx] = current_t
previous_t = current_t
z_cur += 1
else:
overflows += 1
previous_t = cbuf[z_mem + z_cur - 1, st_idx]
z_cur -= 1
else:
z_cur -= 1
previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
current_t = min(a, b, c, d)
# generate or propagate overflow flag
cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
@numba.njit @numba.njit
def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed=0): def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed=0):
lut, z_idx, a_idx, b_idx, c_idx, d_idx = op lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
@ -379,6 +233,7 @@ def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed=
# generate or propagate overflow flag # generate or propagate overflow flag
cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d) cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
@numba.njit @numba.njit
def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, st_start, st_stop, delays, params, sd, seed): def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, st_start, st_stop, delays, params, sd, seed):
overflows = 0 overflows = 0
@ -505,143 +360,6 @@ def wave_assign_gpu(c, s, c_locs, ppi_offset):
c[c_loc+2, x] = TMAX c[c_loc+2, x] = TMAX
@cuda.jit(device=True)
def rand_gauss_gpu(seed, sd):
clamp = 0.5
if sd <= 0.0:
return 1.0
while True:
x = -6.0
for _ in range(12):
seed = int(0xDEECE66D) * seed + 0xB
x += float((seed >> 8) & 0xffffff) / float(1 << 24)
x *= sd
if abs(x) <= clamp:
break
return x + 1.0
@cuda.jit()
def wave_eval_gpu_old(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, line_times, param, sd, seed):
x, y = cuda.grid(2)
st_idx = st_start + x
op_idx = op_start + y
if st_idx >= st_stop: return
if op_idx >= op_stop: return
lut = ops[op_idx, 0]
z_idx = ops[op_idx, 1]
a_idx = ops[op_idx, 2]
b_idx = ops[op_idx, 3]
c_idx = ops[op_idx, 4]
d_idx = ops[op_idx, 5]
param = param[st_idx]
# >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>>
overflows = int(0)
_seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
a_mem = c_locs[a_idx]
b_mem = c_locs[b_idx]
c_mem = c_locs[c_idx]
d_mem = c_locs[d_idx]
z_mem = c_locs[z_idx]
z_cap = c_caps[z_idx]
a_cur = int(0)
b_cur = int(0)
c_cur = int(0)
d_cur = int(0)
z_cur = lut & 1
if z_cur == 1:
cbuf[z_mem, st_idx] = TMIN
a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ a_mem ^ z_cur, sd) * param[0]
if int(param[1]) == a_idx: a += param[2+z_cur]
b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ b_mem ^ z_cur, sd) * param[0]
if int(param[1]) == b_idx: b += param[2+z_cur]
c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ c_mem ^ z_cur, sd) * param[0]
if int(param[1]) == c_idx: c += param[2+z_cur]
d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ d_mem ^ z_cur, sd) * param[0]
if int(param[1]) == d_idx: d += param[2+z_cur]
previous_t = TMIN
current_t = min(a, b, c, d)
inputs = int(0)
while current_t < TMAX:
z_val = z_cur & 1
if a == current_t:
a_cur += 1
a = cbuf[a_mem + a_cur, st_idx]
a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0]
thresh = line_times[a_idx, 1, z_val] * rand_gauss_gpu(_seed ^ a_mem ^ z_val, sd) * param[0]
if int(param[1]) == a_idx:
a += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 1
next_t = a
elif b == current_t:
b_cur += 1
b = cbuf[b_mem + b_cur, st_idx]
b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0]
thresh = line_times[b_idx, 1, z_val] * rand_gauss_gpu(_seed ^ b_mem ^ z_val, sd) * param[0]
if int(param[1]) == b_idx:
b += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 2
next_t = b
elif c == current_t:
c_cur += 1
c = cbuf[c_mem + c_cur, st_idx]
c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0]
thresh = line_times[c_idx, 1, z_val] * rand_gauss_gpu(_seed ^ c_mem ^ z_val, sd) * param[0]
if int(param[1]) == c_idx:
c += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 4
next_t = c
else:
d_cur += 1
d = cbuf[d_mem + d_cur, st_idx]
d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0]
thresh = line_times[d_idx, 1, z_val] * rand_gauss_gpu(_seed ^ d_mem ^ z_val, sd) * param[0]
if int(param[1]) == d_idx:
d += param[2+(z_val^1)]
thresh += param[2+z_val]
inputs ^= 8
next_t = d
if (z_cur & 1) != ((lut >> inputs) & 1):
# we generate a toggle in z_mem, if:
# ( it is the first toggle in z_mem OR
# following toggle is earlier OR
# pulse is wide enough ) AND enough space in z_mem.
if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
if z_cur < (z_cap - 1):
cbuf[z_mem + z_cur, st_idx] = current_t
previous_t = current_t
z_cur += 1
else:
overflows += 1
previous_t = cbuf[z_mem + z_cur - 1, st_idx]
z_cur -= 1
else:
z_cur -= 1
previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
current_t = min(a, b, c, d)
# generate or propagate overflow flag
cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
@cuda.jit() @cuda.jit()
def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, delays, param, sd, seed): def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, delays, param, sd, seed):
x, y = cuda.grid(2) x, y = cuda.grid(2)

Loading…
Cancel
Save