|
|
@ -16,8 +16,7 @@ import math |
|
|
|
|
|
|
|
|
|
|
|
import numpy as np |
|
|
|
import numpy as np |
|
|
|
|
|
|
|
|
|
|
|
from . import numba, cuda, hr_bytes |
|
|
|
from . import numba, cuda, hr_bytes, sim |
|
|
|
from .sim import SimOps |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TMAX = np.float32(2 ** 127) |
|
|
|
TMAX = np.float32(2 ** 127) |
|
|
@ -29,7 +28,7 @@ TMIN = np.float32(-2 ** 127) |
|
|
|
"""A large negative 32-bit floating point value used at the beginning of waveforms that start with logic-1.""" |
|
|
|
"""A large negative 32-bit floating point value used at the beginning of waveforms that start with logic-1.""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class WaveSim(SimOps): |
|
|
|
class WaveSim(sim.SimOps): |
|
|
|
"""A waveform-based combinational logic timing simulator running on CPU. |
|
|
|
"""A waveform-based combinational logic timing simulator running on CPU. |
|
|
|
|
|
|
|
|
|
|
|
:param circuit: The circuit to simulate. |
|
|
|
:param circuit: The circuit to simulate. |
|
|
@ -54,30 +53,31 @@ class WaveSim(SimOps): |
|
|
|
self.sims = sims |
|
|
|
self.sims = sims |
|
|
|
|
|
|
|
|
|
|
|
self.c_len *= 4 |
|
|
|
self.c_len *= 4 |
|
|
|
self.vat[...,0:2] *= 4 |
|
|
|
self.c_locs[...] *= 4 |
|
|
|
|
|
|
|
self.c_caps[...] *= 4 |
|
|
|
|
|
|
|
|
|
|
|
self.timing = np.zeros((self.vat_len, 2, 2)) |
|
|
|
self.timing = np.zeros((self.c_locs_len, 2, 2)) |
|
|
|
self.timing[:len(timing)] = timing |
|
|
|
self.timing[:len(timing)] = timing |
|
|
|
|
|
|
|
|
|
|
|
self.c = np.zeros((self.c_len, sims), dtype=np.float32) + TMAX |
|
|
|
self.c = np.zeros((self.c_len, sims), dtype=np.float32) + TMAX |
|
|
|
self.s = np.zeros((len(self.s_nodes), sims, 11), dtype=np.float32) |
|
|
|
self.s = np.zeros((11, self.s_len, sims), dtype=np.float32) |
|
|
|
"""Information about the logic values and transitions around the sequential elements (flip-flops) and ports. |
|
|
|
"""Information about the logic values and transitions around the sequential elements (flip-flops) and ports. |
|
|
|
|
|
|
|
|
|
|
|
The first 3 values are read by ``s_to_c()``. |
|
|
|
The first 3 values are read by ``s_to_c()``. |
|
|
|
The remaining values are written by ``c_to_s()``. |
|
|
|
The remaining values are written by ``c_to_s()``. |
|
|
|
|
|
|
|
|
|
|
|
The elements are as follows: |
|
|
|
The elements are as follows: |
|
|
|
* ``s[..., 0]`` (P)PI initial value |
|
|
|
* ``s[0]`` (P)PI initial value |
|
|
|
* ``s[..., 1]`` (P)PI transition time |
|
|
|
* ``s[1]`` (P)PI transition time |
|
|
|
* ``s[..., 2]`` (P)PI final value |
|
|
|
* ``s[2]`` (P)PI final value |
|
|
|
* ``s[..., 3]`` (P)PO initial value |
|
|
|
* ``s[3]`` (P)PO initial value |
|
|
|
* ``s[..., 4]`` (P)PO earliest arrival time (EAT): The time at which the output transitioned from its initial value. |
|
|
|
* ``s[4]`` (P)PO earliest arrival time (EAT): The time at which the output transitioned from its initial value. |
|
|
|
* ``s[..., 5]`` (P)PO latest stabilization time (LST): The time at which the output settled to its final value. |
|
|
|
* ``s[5]`` (P)PO latest stabilization time (LST): The time at which the output settled to its final value. |
|
|
|
* ``s[..., 6]`` (P)PO final value |
|
|
|
* ``s[6]`` (P)PO final value |
|
|
|
* ``s[..., 7]`` (P)PO capture value: probability of capturing a 1 at a given capture time |
|
|
|
* ``s[7]`` (P)PO capture value: probability of capturing a 1 at a given capture time |
|
|
|
* ``s[..., 8]`` (P)PO sampled capture value: decided by random sampling according to a given seed. |
|
|
|
* ``s[8]`` (P)PO sampled capture value: decided by random sampling according to a given seed. |
|
|
|
* ``s[..., 9]`` (P)PO sampled capture slack: (capture time - LST) - decided by random sampling according to a given seed. |
|
|
|
* ``s[9]`` (P)PO sampled capture slack: (capture time - LST) - decided by random sampling according to a given seed. |
|
|
|
* ``s[..., 10]`` Overflow indicator: If non-zero, some signals in the input cone of this output had more |
|
|
|
* ``s[10]`` Overflow indicator: If non-zero, some signals in the input cone of this output had more |
|
|
|
transitions than specified in ``c_caps``. Some transitions have been discarded, the |
|
|
|
transitions than specified in ``c_caps``. Some transitions have been discarded, the |
|
|
|
final values in the waveforms are still valid. |
|
|
|
final values in the waveforms are still valid. |
|
|
|
""" |
|
|
|
""" |
|
|
@ -85,19 +85,19 @@ class WaveSim(SimOps): |
|
|
|
self.params = np.zeros((sims, 4), dtype=np.float32) |
|
|
|
self.params = np.zeros((sims, 4), dtype=np.float32) |
|
|
|
self.params[...,0] = 1.0 |
|
|
|
self.params[...,0] = 1.0 |
|
|
|
|
|
|
|
|
|
|
|
self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.vat, self.ops, self.params)]) |
|
|
|
self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.c_locs, self.c_caps, self.ops, self.params)]) |
|
|
|
|
|
|
|
|
|
|
|
self.pi_s_locs = np.flatnonzero(self.vat[self.ppi_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0) |
|
|
|
self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0) |
|
|
|
self.po_s_locs = np.flatnonzero(self.vat[self.ppo_offset+np.arange(len(self.circuit.io_nodes)), 0] >= 0) |
|
|
|
self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0) |
|
|
|
self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes)) |
|
|
|
self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes)) |
|
|
|
|
|
|
|
|
|
|
|
self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs]) |
|
|
|
self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs]) |
|
|
|
self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs]) |
|
|
|
self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs]) |
|
|
|
|
|
|
|
|
|
|
|
self.pi_c_locs = self.vat[self.ppi_offset+self.pi_s_locs, 0] |
|
|
|
self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs] |
|
|
|
self.po_c_locs = self.vat[self.ppo_offset+self.po_s_locs, 0] |
|
|
|
self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs] |
|
|
|
self.ppi_c_locs = self.vat[self.ppi_offset+self.ppio_s_locs, 0] |
|
|
|
self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs] |
|
|
|
self.ppo_c_locs = self.vat[self.ppo_offset+self.ppio_s_locs, 0] |
|
|
|
self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs] |
|
|
|
|
|
|
|
|
|
|
|
self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs]) |
|
|
|
self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs]) |
|
|
|
self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs]) |
|
|
|
self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs]) |
|
|
@ -112,7 +112,7 @@ class WaveSim(SimOps): |
|
|
|
Based on the data in ``self.s``, waveforms are generated on the input lines of the circuit. |
|
|
|
Based on the data in ``self.s``, waveforms are generated on the input lines of the circuit. |
|
|
|
It modifies ``self.c``. |
|
|
|
It modifies ``self.c``. |
|
|
|
""" |
|
|
|
""" |
|
|
|
sins = np.moveaxis(self.s[self.pippi_s_locs], -1, 0) |
|
|
|
sins = self.s[:, self.pippi_s_locs] |
|
|
|
cond = (sins[2] != 0) + 2*(sins[0] != 0) # choices order: 0 R F 1 |
|
|
|
cond = (sins[2] != 0) + 2*(sins[0] != 0) # choices order: 0 R F 1 |
|
|
|
self.c[self.pippi_c_locs] = np.choose(cond, [TMAX, sins[1], TMIN, TMIN]) |
|
|
|
self.c[self.pippi_c_locs] = np.choose(cond, [TMAX, sins[1], TMIN, TMIN]) |
|
|
|
self.c[self.pippi_c_locs+1] = np.choose(cond, [TMAX, TMAX, sins[1], TMAX]) |
|
|
|
self.c[self.pippi_c_locs+1] = np.choose(cond, [TMAX, TMAX, sins[1], TMAX]) |
|
|
@ -127,7 +127,7 @@ class WaveSim(SimOps): |
|
|
|
""" |
|
|
|
""" |
|
|
|
sims = min(sims or self.sims, self.sims) |
|
|
|
sims = min(sims or self.sims, self.sims) |
|
|
|
for op_start, op_stop in zip(self.level_starts, self.level_stops): |
|
|
|
for op_start, op_stop in zip(self.level_starts, self.level_stops): |
|
|
|
level_eval_cpu(self.ops, op_start, op_stop, self.c, self.vat, 0, sims, |
|
|
|
level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, 0, sims, |
|
|
|
self.timing, self.params, sd, seed) |
|
|
|
self.timing, self.params, sd, seed) |
|
|
|
|
|
|
|
|
|
|
|
def c_to_s(self, time=TMAX, sd=0.0, seed=1): |
|
|
|
def c_to_s(self, time=TMAX, sd=0.0, seed=1): |
|
|
@ -140,9 +140,9 @@ class WaveSim(SimOps): |
|
|
|
:param sd: A standard deviation for uncertainty in the actual capture time. |
|
|
|
:param sd: A standard deviation for uncertainty in the actual capture time. |
|
|
|
:param seed: The random seed for a capture with uncertainty. |
|
|
|
:param seed: The random seed for a capture with uncertainty. |
|
|
|
""" |
|
|
|
""" |
|
|
|
for s_loc, (c_loc, c_len, _) in zip(self.poppo_s_locs, self.vat[self.ppo_offset+self.poppo_s_locs]): |
|
|
|
for s_loc, c_loc, c_len in zip(self.poppo_s_locs, self.c_locs[self.ppo_offset+self.poppo_s_locs], self.c_caps[self.ppo_offset+self.poppo_s_locs]): |
|
|
|
for vector in range(self.sims): |
|
|
|
for vector in range(self.sims): |
|
|
|
self.s[s_loc, vector, 3:] = wave_capture_cpu(self.c, c_loc, c_len, vector, time=time, sd=sd, seed=seed) |
|
|
|
self.s[3:, s_loc, vector] = wave_capture_cpu(self.c, c_loc, c_len, vector, time=time, sd=sd, seed=seed) |
|
|
|
|
|
|
|
|
|
|
|
def s_ppo_to_ppi(self, time=0.0): |
|
|
|
def s_ppo_to_ppi(self, time=0.0): |
|
|
|
"""Re-assigns the last sampled capture to the appropriate pseudo-primary inputs (PPI). |
|
|
|
"""Re-assigns the last sampled capture to the appropriate pseudo-primary inputs (PPI). |
|
|
@ -151,9 +151,9 @@ class WaveSim(SimOps): |
|
|
|
|
|
|
|
|
|
|
|
:param time: The transition time at the inputs (usually 0.0). |
|
|
|
:param time: The transition time at the inputs (usually 0.0). |
|
|
|
""" |
|
|
|
""" |
|
|
|
self.s[self.ppio_s_locs, :, 0] = self.s[self.ppio_s_locs, :, 2] |
|
|
|
self.s[0, self.ppio_s_locs] = self.s[2, self.ppio_s_locs] |
|
|
|
self.s[self.ppio_s_locs, :, 1] = time |
|
|
|
self.s[1, self.ppio_s_locs] = time |
|
|
|
self.s[self.ppio_s_locs, :, 2] = self.s[self.ppio_s_locs, :, 8] |
|
|
|
self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@numba.njit |
|
|
|
@numba.njit |
|
|
@ -173,7 +173,7 @@ def rand_gauss_cpu(seed, sd): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@numba.njit |
|
|
|
@numba.njit |
|
|
|
def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0): |
|
|
|
def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, seed=0): |
|
|
|
lut, z_idx, a_idx, b_idx, c_idx, d_idx = op |
|
|
|
lut, z_idx, a_idx, b_idx, c_idx, d_idx = op |
|
|
|
|
|
|
|
|
|
|
|
# >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>> |
|
|
|
# >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>> |
|
|
@ -181,11 +181,12 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0): |
|
|
|
|
|
|
|
|
|
|
|
_seed = (seed << 4) + (z_idx << 20) + (st_idx << 1) |
|
|
|
_seed = (seed << 4) + (z_idx << 20) + (st_idx << 1) |
|
|
|
|
|
|
|
|
|
|
|
a_mem = vat[a_idx, 0] |
|
|
|
a_mem = c_locs[a_idx] |
|
|
|
b_mem = vat[b_idx, 0] |
|
|
|
b_mem = c_locs[b_idx] |
|
|
|
c_mem = vat[c_idx, 0] |
|
|
|
c_mem = c_locs[c_idx] |
|
|
|
d_mem = vat[d_idx, 0] |
|
|
|
d_mem = c_locs[d_idx] |
|
|
|
z_mem, z_cap, _ = vat[z_idx] |
|
|
|
z_mem = c_locs[z_idx] |
|
|
|
|
|
|
|
z_cap = c_caps[z_idx] |
|
|
|
|
|
|
|
|
|
|
|
a_cur = int(0) |
|
|
|
a_cur = int(0) |
|
|
|
b_cur = int(0) |
|
|
|
b_cur = int(0) |
|
|
@ -280,12 +281,12 @@ def wave_eval_cpu(op, cbuf, vat, st_idx, line_times, param, sd=0.0, seed=0): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@numba.njit |
|
|
|
@numba.njit |
|
|
|
def level_eval_cpu(ops, op_start, op_stop, c, vat, st_start, st_stop, line_times, params, sd, seed): |
|
|
|
def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, st_start, st_stop, line_times, params, sd, seed): |
|
|
|
overflows = 0 |
|
|
|
overflows = 0 |
|
|
|
for op_idx in range(op_start, op_stop): |
|
|
|
for op_idx in range(op_start, op_stop): |
|
|
|
op = ops[op_idx] |
|
|
|
op = ops[op_idx] |
|
|
|
for st_idx in range(st_start, st_stop): |
|
|
|
for st_idx in range(st_start, st_stop): |
|
|
|
wave_eval_cpu(op, c, vat, st_idx, line_times, params[st_idx], sd, seed) |
|
|
|
wave_eval_cpu(op, c, c_locs, c_caps, st_idx, line_times, params[st_idx], sd, seed) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@numba.njit |
|
|
|
@numba.njit |
|
|
@ -347,7 +348,8 @@ class WaveSimCuda(WaveSim): |
|
|
|
self.c = cuda.to_device(self.c) |
|
|
|
self.c = cuda.to_device(self.c) |
|
|
|
self.s = cuda.to_device(self.s) |
|
|
|
self.s = cuda.to_device(self.s) |
|
|
|
self.ops = cuda.to_device(self.ops) |
|
|
|
self.ops = cuda.to_device(self.ops) |
|
|
|
self.vat = cuda.to_device(self.vat) |
|
|
|
self.c_locs = cuda.to_device(self.c_locs) |
|
|
|
|
|
|
|
self.c_caps = cuda.to_device(self.c_caps) |
|
|
|
self.timing = cuda.to_device(self.timing) |
|
|
|
self.timing = cuda.to_device(self.timing) |
|
|
|
self.params = cuda.to_device(self.params) |
|
|
|
self.params = cuda.to_device(self.params) |
|
|
|
|
|
|
|
|
|
|
@ -355,7 +357,7 @@ class WaveSimCuda(WaveSim): |
|
|
|
|
|
|
|
|
|
|
|
def s_to_c(self): |
|
|
|
def s_to_c(self): |
|
|
|
grid_dim = self._grid_dim(self.sims, self.s_len) |
|
|
|
grid_dim = self._grid_dim(self.sims, self.s_len) |
|
|
|
wave_assign_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppi_offset) |
|
|
|
wave_assign_gpu[grid_dim, self._block_dim](self.c, self.s, self.c_locs, self.ppi_offset) |
|
|
|
|
|
|
|
|
|
|
|
def _grid_dim(self, x, y): |
|
|
|
def _grid_dim(self, x, y): |
|
|
|
gx = math.ceil(x / self._block_dim[0]) |
|
|
|
gx = math.ceil(x / self._block_dim[0]) |
|
|
@ -366,29 +368,29 @@ class WaveSimCuda(WaveSim): |
|
|
|
sims = min(sims or self.sims, self.sims) |
|
|
|
sims = min(sims or self.sims, self.sims) |
|
|
|
for op_start, op_stop in zip(self.level_starts, self.level_stops): |
|
|
|
for op_start, op_stop in zip(self.level_starts, self.level_stops): |
|
|
|
grid_dim = self._grid_dim(sims, op_stop - op_start) |
|
|
|
grid_dim = self._grid_dim(sims, op_stop - op_start) |
|
|
|
wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.vat, int(0), |
|
|
|
wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, int(0), |
|
|
|
sims, self.timing, self.params, sd, seed) |
|
|
|
sims, self.timing, self.params, sd, seed) |
|
|
|
cuda.synchronize() |
|
|
|
cuda.synchronize() |
|
|
|
|
|
|
|
|
|
|
|
def c_to_s(self, time=TMAX, sd=0.0, seed=1): |
|
|
|
def c_to_s(self, time=TMAX, sd=0.0, seed=1): |
|
|
|
grid_dim = self._grid_dim(self.sims, self.s_len) |
|
|
|
grid_dim = self._grid_dim(self.sims, self.s_len) |
|
|
|
wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppo_offset, |
|
|
|
wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.c_locs, self.c_caps, self.ppo_offset, |
|
|
|
time, sd * math.sqrt(2), seed) |
|
|
|
time, sd * math.sqrt(2), seed) |
|
|
|
|
|
|
|
|
|
|
|
def s_ppo_to_ppi(self, time=0.0): |
|
|
|
def s_ppo_to_ppi(self, time=0.0): |
|
|
|
grid_dim = self._grid_dim(self.sims, self.s_len) |
|
|
|
grid_dim = self._grid_dim(self.sims, self.s_len) |
|
|
|
ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.vat, time, self.ppi_offset, self.ppo_offset) |
|
|
|
ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.c_locs, time, self.ppi_offset, self.ppo_offset) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cuda.jit() |
|
|
|
@cuda.jit() |
|
|
|
def wave_assign_gpu(c, s, vat, ppi_offset): |
|
|
|
def wave_assign_gpu(c, s, c_locs, ppi_offset): |
|
|
|
x, y = cuda.grid(2) |
|
|
|
x, y = cuda.grid(2) |
|
|
|
if y >= len(s): return |
|
|
|
if y >= s.shape[1]: return |
|
|
|
c_loc, c_len, _ = vat[ppi_offset + y] |
|
|
|
c_loc = c_locs[ppi_offset + y] |
|
|
|
if c_loc < 0: return |
|
|
|
if c_loc < 0: return |
|
|
|
if x >= c.shape[-1]: return |
|
|
|
if x >= c.shape[-1]: return |
|
|
|
value = int(s[y, x, 2] >= 0.5) | (2*int(s[y, x, 0] >= 0.5)) |
|
|
|
value = int(s[2, y, x] >= 0.5) | (2*int(s[0, y, x] >= 0.5)) |
|
|
|
ttime = s[y, x, 1] |
|
|
|
ttime = s[1, y, x] |
|
|
|
if value == 0: |
|
|
|
if value == 0: |
|
|
|
c[c_loc, x] = TMAX |
|
|
|
c[c_loc, x] = TMAX |
|
|
|
c[c_loc+1, x] = TMAX |
|
|
|
c[c_loc+1, x] = TMAX |
|
|
@ -421,7 +423,7 @@ def rand_gauss_gpu(seed, sd): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cuda.jit() |
|
|
|
@cuda.jit() |
|
|
|
def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_times, param, sd, seed): |
|
|
|
def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, line_times, param, sd, seed): |
|
|
|
x, y = cuda.grid(2) |
|
|
|
x, y = cuda.grid(2) |
|
|
|
st_idx = st_start + x |
|
|
|
st_idx = st_start + x |
|
|
|
op_idx = op_start + y |
|
|
|
op_idx = op_start + y |
|
|
@ -442,11 +444,12 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim |
|
|
|
|
|
|
|
|
|
|
|
_seed = (seed << 4) + (z_idx << 20) + (st_idx << 1) |
|
|
|
_seed = (seed << 4) + (z_idx << 20) + (st_idx << 1) |
|
|
|
|
|
|
|
|
|
|
|
a_mem = vat[a_idx, 0] |
|
|
|
a_mem = c_locs[a_idx] |
|
|
|
b_mem = vat[b_idx, 0] |
|
|
|
b_mem = c_locs[b_idx] |
|
|
|
c_mem = vat[c_idx, 0] |
|
|
|
c_mem = c_locs[c_idx] |
|
|
|
d_mem = vat[d_idx, 0] |
|
|
|
d_mem = c_locs[d_idx] |
|
|
|
z_mem, z_cap, _ = vat[z_idx] |
|
|
|
z_mem = c_locs[z_idx] |
|
|
|
|
|
|
|
z_cap = c_caps[z_idx] |
|
|
|
|
|
|
|
|
|
|
|
a_cur = int(0) |
|
|
|
a_cur = int(0) |
|
|
|
b_cur = int(0) |
|
|
|
b_cur = int(0) |
|
|
@ -541,10 +544,11 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cuda.jit() |
|
|
|
@cuda.jit() |
|
|
|
def wave_capture_gpu(c, s, vat, ppo_offset, time, s_sqrt2, seed): |
|
|
|
def wave_capture_gpu(c, s, c_locs, c_caps, ppo_offset, time, s_sqrt2, seed): |
|
|
|
x, y = cuda.grid(2) |
|
|
|
x, y = cuda.grid(2) |
|
|
|
if ppo_offset + y >= len(vat): return |
|
|
|
if ppo_offset + y >= len(c_locs): return |
|
|
|
line, tdim, _ = vat[ppo_offset + y] |
|
|
|
line = c_locs[ppo_offset + y] |
|
|
|
|
|
|
|
tdim = c_caps[ppo_offset + y] |
|
|
|
if line < 0: return |
|
|
|
if line < 0: return |
|
|
|
if x >= c.shape[-1]: return |
|
|
|
if x >= c.shape[-1]: return |
|
|
|
vector = x |
|
|
|
vector = x |
|
|
@ -588,25 +592,25 @@ def wave_capture_gpu(c, s, vat, ppo_offset, time, s_sqrt2, seed): |
|
|
|
else: |
|
|
|
else: |
|
|
|
acc = val |
|
|
|
acc = val |
|
|
|
|
|
|
|
|
|
|
|
s[y, vector, 3] = (c[line, vector] <= TMIN) |
|
|
|
s[3, y, vector] = (c[line, vector] <= TMIN) |
|
|
|
s[y, vector, 4] = eat |
|
|
|
s[4, y, vector] = eat |
|
|
|
s[y, vector, 5] = lst |
|
|
|
s[5, y, vector] = lst |
|
|
|
s[y, vector, 6] = final |
|
|
|
s[6, y, vector] = final |
|
|
|
s[y, vector, 7] = acc |
|
|
|
s[7, y, vector] = acc |
|
|
|
s[y, vector, 8] = val |
|
|
|
s[8, y, vector] = val |
|
|
|
s[y, vector, 9] = 0 # TODO |
|
|
|
s[9, y, vector] = 0 # TODO |
|
|
|
s[y, vector, 10] = ovl |
|
|
|
s[10, y, vector] = ovl |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cuda.jit() |
|
|
|
@cuda.jit() |
|
|
|
def ppo_to_ppi_gpu(s, vat, time, ppi_offset, ppo_offset): |
|
|
|
def ppo_to_ppi_gpu(s, c_locs, time, ppi_offset, ppo_offset): |
|
|
|
x, y = cuda.grid(2) |
|
|
|
x, y = cuda.grid(2) |
|
|
|
if y >= s.shape[0]: return |
|
|
|
if y >= s.shape[0]: return |
|
|
|
if x >= s.shape[1]: return |
|
|
|
if x >= s.shape[1]: return |
|
|
|
|
|
|
|
|
|
|
|
if vat[ppi_offset + y, 0] < 0: return |
|
|
|
if c_locs[ppi_offset + y] < 0: return |
|
|
|
if vat[ppo_offset + y, 0] < 0: return |
|
|
|
if c_locs[ppo_offset + y] < 0: return |
|
|
|
|
|
|
|
|
|
|
|
s[y, x, 0] = s[y, x, 2] |
|
|
|
s[0, y, x] = s[2, y, x] |
|
|
|
s[y, x, 1] = time |
|
|
|
s[1, y, x] = time |
|
|
|
s[y, x, 2] = s[y, x, 8] |
|
|
|
s[2, y, x] = s[8, y, x] |
|
|
|