|
|
|
@ -353,17 +353,9 @@ class WaveSimCuda(WaveSim):
@@ -353,17 +353,9 @@ class WaveSimCuda(WaveSim):
|
|
|
|
|
|
|
|
|
|
self._block_dim = (32, 16) |
|
|
|
|
|
|
|
|
|
# TODO implement on GPU |
|
|
|
|
def s_to_c(self): |
|
|
|
|
s = np.array(self.s) |
|
|
|
|
c = np.array(self.c) |
|
|
|
|
sins = np.moveaxis(s[self.pippi_s_locs], -1, 0) |
|
|
|
|
cond = (sins[2] != 0) + 2*(sins[0] != 0) # choices order: 0 R F 1 |
|
|
|
|
c[self.pippi_c_locs] = np.choose(cond, [TMAX, sins[1], TMIN, TMIN]) |
|
|
|
|
c[self.pippi_c_locs+1] = np.choose(cond, [TMAX, TMAX, sins[1], TMAX]) |
|
|
|
|
c[self.pippi_c_locs+2] = TMAX |
|
|
|
|
self.s[:,:,:] = s |
|
|
|
|
self.c[:,:] = c |
|
|
|
|
grid_dim = self._grid_dim(self.sims, self.s_len) |
|
|
|
|
wave_assign_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppi_offset) |
|
|
|
|
|
|
|
|
|
def _grid_dim(self, x, y): |
|
|
|
|
gx = math.ceil(x / self._block_dim[0]) |
|
|
|
@ -378,19 +370,38 @@ class WaveSimCuda(WaveSim):
@@ -378,19 +370,38 @@ class WaveSimCuda(WaveSim):
|
|
|
|
|
sims, self.timing, self.params, sd, seed) |
|
|
|
|
cuda.synchronize() |
|
|
|
|
|
|
|
|
|
# TODO implement on GPU |
|
|
|
|
def c_to_s(self, time=TMAX, sd=0.0, seed=1): |
|
|
|
|
s = np.array(self.s) |
|
|
|
|
c = np.array(self.c) |
|
|
|
|
vat = np.array(self.vat) |
|
|
|
|
for s_loc, (c_loc, c_len, _) in zip(self.poppo_s_locs, vat[self.ppo_offset+self.poppo_s_locs]): |
|
|
|
|
for vector in range(self.sims): |
|
|
|
|
s[s_loc, vector, 3:] = wave_capture_cpu(c, c_loc, c_len, vector, time=time, sd=sd, seed=seed) |
|
|
|
|
self.s[:,:,:] = s |
|
|
|
|
self.c[:,:] = c |
|
|
|
|
grid_dim = self._grid_dim(self.sims, self.s_len) |
|
|
|
|
wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppo_offset, |
|
|
|
|
time, sd * math.sqrt(2), seed) |
|
|
|
|
|
|
|
|
|
def s_ppo_to_ppi(self, time=0.0): |
|
|
|
|
grid_dim = self._grid_dim(self.sims, self.s_len) |
|
|
|
|
ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.vat, time, self.ppi_offset, self.ppo_offset) |
|
|
|
|
|
|
|
|
|
# TODO implement on GPU |
|
|
|
|
#def s_ppo_to_ppi(self, time=0.0): |
|
|
|
|
|
|
|
|
|
@cuda.jit() |
|
|
|
|
def wave_assign_gpu(c, s, vat, ppi_offset): |
|
|
|
|
x, y = cuda.grid(2) |
|
|
|
|
if y >= len(s): return |
|
|
|
|
c_loc, c_len, _ = vat[ppi_offset + y] |
|
|
|
|
if c_loc < 0: return |
|
|
|
|
if x >= c.shape[-1]: return |
|
|
|
|
value = int(s[y, x, 2] >= 0.5) | (2*int(s[y, x, 0] >= 0.5)) |
|
|
|
|
ttime = s[y, x, 1] |
|
|
|
|
if value == 0: |
|
|
|
|
c[c_loc, x] = TMAX |
|
|
|
|
c[c_loc+1, x] = TMAX |
|
|
|
|
elif value == 1: |
|
|
|
|
c[c_loc, x] = ttime |
|
|
|
|
c[c_loc+1, x] = TMAX |
|
|
|
|
elif value == 2: |
|
|
|
|
c[c_loc, x] = TMIN |
|
|
|
|
c[c_loc+1, x] = ttime |
|
|
|
|
else: |
|
|
|
|
c[c_loc, x] = TMIN |
|
|
|
|
c[c_loc+1, x] = TMAX |
|
|
|
|
c[c_loc+2, x] = TMAX |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cuda.jit(device=True) |
|
|
|
@ -527,3 +538,75 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim
@@ -527,3 +538,75 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim
|
|
|
|
|
|
|
|
|
|
# generate overflow flag or propagate from input |
|
|
|
|
cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cuda.jit() |
|
|
|
|
def wave_capture_gpu(c, s, vat, ppo_offset, time, s_sqrt2, seed): |
|
|
|
|
x, y = cuda.grid(2) |
|
|
|
|
if ppo_offset + y >= len(vat): return |
|
|
|
|
line, tdim, _ = vat[ppo_offset + y] |
|
|
|
|
if line < 0: return |
|
|
|
|
if x >= c.shape[-1]: return |
|
|
|
|
vector = x |
|
|
|
|
m = 0.5 |
|
|
|
|
acc = 0.0 |
|
|
|
|
eat = TMAX |
|
|
|
|
lst = TMIN |
|
|
|
|
tog = 0 |
|
|
|
|
ovl = 0 |
|
|
|
|
val = int(0) |
|
|
|
|
final = int(0) |
|
|
|
|
for tidx in range(tdim): |
|
|
|
|
t = c[line + tidx, vector] |
|
|
|
|
if t >= TMAX: |
|
|
|
|
if t == TMAX_OVL: |
|
|
|
|
ovl = 1 |
|
|
|
|
break |
|
|
|
|
m = -m |
|
|
|
|
final ^= 1 |
|
|
|
|
if t < time: |
|
|
|
|
val ^= 1 |
|
|
|
|
if t <= TMIN: continue |
|
|
|
|
if s_sqrt2 > 0: |
|
|
|
|
acc += m * (1 + math.erf((t - time) / s_sqrt2)) |
|
|
|
|
eat = min(eat, t) |
|
|
|
|
lst = max(lst, t) |
|
|
|
|
tog += 1 |
|
|
|
|
if s_sqrt2 > 0: |
|
|
|
|
if m < 0: |
|
|
|
|
acc += 1 |
|
|
|
|
if acc >= 0.99: |
|
|
|
|
val = 1 |
|
|
|
|
elif acc > 0.01: |
|
|
|
|
seed = (seed << 4) + (vector << 20) + (y << 1) |
|
|
|
|
seed = int(0xDEECE66D) * seed + 0xB |
|
|
|
|
seed = int(0xDEECE66D) * seed + 0xB |
|
|
|
|
rnd = float((seed >> 8) & 0xffffff) / float(1 << 24) |
|
|
|
|
val = rnd < acc |
|
|
|
|
else: |
|
|
|
|
val = 0 |
|
|
|
|
else: |
|
|
|
|
acc = val |
|
|
|
|
|
|
|
|
|
s[y, vector, 3] = (c[line, vector] <= TMIN) |
|
|
|
|
s[y, vector, 4] = eat |
|
|
|
|
s[y, vector, 5] = lst |
|
|
|
|
s[y, vector, 6] = final |
|
|
|
|
s[y, vector, 7] = acc |
|
|
|
|
s[y, vector, 8] = val |
|
|
|
|
s[y, vector, 9] = 0 # TODO |
|
|
|
|
s[y, vector, 10] = ovl |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cuda.jit() |
|
|
|
|
def ppo_to_ppi_gpu(s, vat, time, ppi_offset, ppo_offset): |
|
|
|
|
x, y = cuda.grid(2) |
|
|
|
|
if y >= s.shape[0]: return |
|
|
|
|
if x >= s.shape[1]: return |
|
|
|
|
|
|
|
|
|
if vat[ppi_offset + y, 0] < 0: return |
|
|
|
|
if vat[ppo_offset + y, 0] < 0: return |
|
|
|
|
|
|
|
|
|
s[y, x, 0] = s[y, x, 2] |
|
|
|
|
s[y, x, 1] = time |
|
|
|
|
s[y, x, 2] = s[y, x, 8] |
|
|
|
|