|  |  | @ -353,17 +353,9 @@ class WaveSimCuda(WaveSim): | 
			
		
	
		
		
			
				
					
					|  |  |  |          |  |  |  |          | 
			
		
	
		
		
			
				
					
					|  |  |  |         self._block_dim = (32, 16) |  |  |  |         self._block_dim = (32, 16) | 
			
		
	
		
		
			
				
					
					|  |  |  | 
 |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |     # TODO implement on GPU |  |  |  |  | 
			
		
	
		
		
			
				
					
					|  |  |  |     def s_to_c(self): |  |  |  |     def s_to_c(self): | 
			
		
	
		
		
			
				
					
					|  |  |  |         s = np.array(self.s) |  |  |  |         grid_dim = self._grid_dim(self.sims, self.s_len) | 
			
				
				
			
		
	
		
		
			
				
					
					|  |  |  |         c = np.array(self.c) |  |  |  |         wave_assign_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppi_offset) | 
			
				
				
			
		
	
		
		
			
				
					
					|  |  |  |         sins = np.moveaxis(s[self.pippi_s_locs], -1, 0) |  |  |  |  | 
			
		
	
		
		
			
				
					
					|  |  |  |         cond = (sins[2] != 0) + 2*(sins[0] != 0)  # choices order: 0 R F 1 |  |  |  |  | 
			
		
	
		
		
			
				
					
					|  |  |  |         c[self.pippi_c_locs] = np.choose(cond, [TMAX, sins[1], TMIN, TMIN]) |  |  |  |  | 
			
		
	
		
		
			
				
					
					|  |  |  |         c[self.pippi_c_locs+1] = np.choose(cond, [TMAX, TMAX, sins[1], TMAX]) |  |  |  |  | 
			
		
	
		
		
			
				
					
					|  |  |  |         c[self.pippi_c_locs+2] = TMAX |  |  |  |  | 
			
		
	
		
		
			
				
					
					|  |  |  |         self.s[:,:,:] = s |  |  |  |  | 
			
		
	
		
		
			
				
					
					|  |  |  |         self.c[:,:] = c |  |  |  |  | 
			
		
	
		
		
	
		
		
	
		
		
			
				
					
					|  |  |  | 
 |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |     def _grid_dim(self, x, y): |  |  |  |     def _grid_dim(self, x, y): | 
			
		
	
		
		
			
				
					
					|  |  |  |         gx = math.ceil(x / self._block_dim[0]) |  |  |  |         gx = math.ceil(x / self._block_dim[0]) | 
			
		
	
	
		
		
			
				
					|  |  | @ -378,19 +370,38 @@ class WaveSimCuda(WaveSim): | 
			
		
	
		
		
			
				
					
					|  |  |  |                 sims, self.timing, self.params, sd, seed) |  |  |  |                 sims, self.timing, self.params, sd, seed) | 
			
		
	
		
		
			
				
					
					|  |  |  |         cuda.synchronize() |  |  |  |         cuda.synchronize() | 
			
		
	
		
		
			
				
					
					|  |  |  |      |  |  |  |      | 
			
		
	
		
		
			
				
					
					|  |  |  |     # TODO implement on GPU |  |  |  |  | 
			
		
	
		
		
			
				
					
					|  |  |  |     def c_to_s(self, time=TMAX, sd=0.0, seed=1): |  |  |  |     def c_to_s(self, time=TMAX, sd=0.0, seed=1): | 
			
		
	
		
		
			
				
					
					|  |  |  |         s = np.array(self.s) |  |  |  |         grid_dim = self._grid_dim(self.sims, self.s_len) | 
			
				
				
			
		
	
		
		
			
				
					
					|  |  |  |         c = np.array(self.c) |  |  |  |         wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppo_offset, | 
			
				
				
			
		
	
		
		
			
				
					
					|  |  |  |         vat = np.array(self.vat) |  |  |  |             time, sd * math.sqrt(2), seed) | 
			
				
				
			
		
	
		
		
			
				
					
					|  |  |  |         for s_loc, (c_loc, c_len, _) in zip(self.poppo_s_locs, vat[self.ppo_offset+self.poppo_s_locs]): |  |  |  |      | 
			
				
				
			
		
	
		
		
			
				
					
					|  |  |  |             for vector in range(self.sims): |  |  |  |     def s_ppo_to_ppi(self, time=0.0): | 
			
				
				
			
		
	
		
		
			
				
					
					|  |  |  |                 s[s_loc, vector, 3:] = wave_capture_cpu(c, c_loc, c_len, vector, time=time, sd=sd, seed=seed) |  |  |  |         grid_dim = self._grid_dim(self.sims, self.s_len) | 
			
				
				
			
		
	
		
		
			
				
					
					|  |  |  |         self.s[:,:,:] = s |  |  |  |         ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.vat, time, self.ppi_offset, self.ppo_offset) | 
			
				
				
			
		
	
		
		
			
				
					
					|  |  |  |         self.c[:,:] = c |  |  |  |  | 
			
		
	
		
		
	
		
		
	
		
		
	
		
		
	
		
		
	
		
		
	
		
		
	
		
		
			
				
					
					|  |  |  | 
 |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |     # TODO implement on GPU |  |  |  | 
 | 
			
				
				
			
		
	
		
		
			
				
					
					|  |  |  |     #def s_ppo_to_ppi(self, time=0.0): |  |  |  | @cuda.jit() | 
			
				
				
			
		
	
		
		
	
		
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | def wave_assign_gpu(c, s, vat, ppi_offset): | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     x, y = cuda.grid(2) | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     if y >= len(s): return | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     c_loc, c_len, _ = vat[ppi_offset + y] | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     if c_loc < 0: return | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     if x >= c.shape[-1]: return | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     value = int(s[y, x, 2] >= 0.5) | (2*int(s[y, x, 0] >= 0.5)) | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     ttime = s[y, x, 1] | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     if value == 0: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         c[c_loc, x] = TMAX | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         c[c_loc+1, x] = TMAX | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     elif value == 1: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         c[c_loc, x] = ttime | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         c[c_loc+1, x] = TMAX | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     elif value == 2: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         c[c_loc, x] = TMIN | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         c[c_loc+1, x] = ttime | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     else: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         c[c_loc, x] = TMIN | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         c[c_loc+1, x] = TMAX | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     c[c_loc+2, x] = TMAX | 
			
		
	
		
		
			
				
					
					|  |  |  | 
 |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  | 
 |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  | @cuda.jit(device=True) |  |  |  | @cuda.jit(device=True) | 
			
		
	
	
		
		
			
				
					|  |  | @ -527,3 +538,75 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim | 
			
		
	
		
		
			
				
					
					|  |  |  | 
 |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |     # generate overflow flag or propagate from input |  |  |  |     # generate overflow flag or propagate from input | 
			
		
	
		
		
			
				
					
					|  |  |  |     cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d) |  |  |  |     cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d) | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | @cuda.jit() | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | def wave_capture_gpu(c, s, vat, ppo_offset, time, s_sqrt2, seed): | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     x, y = cuda.grid(2) | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     if ppo_offset + y >= len(vat): return | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     line, tdim, _ = vat[ppo_offset + y] | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     if line < 0: return | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     if x >= c.shape[-1]: return | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     vector = x | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     m = 0.5 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     acc = 0.0 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     eat = TMAX | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     lst = TMIN | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     tog = 0 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     ovl = 0 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     val = int(0) | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     final = int(0) | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     for tidx in range(tdim): | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         t = c[line + tidx, vector] | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         if t >= TMAX: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |             if t == TMAX_OVL: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |                 ovl = 1 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |             break | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         m = -m | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         final ^= 1 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         if t < time: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |             val ^= 1 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         if t <= TMIN: continue | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         if s_sqrt2 > 0: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |             acc += m * (1 + math.erf((t - time) / s_sqrt2)) | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         eat = min(eat, t) | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         lst = max(lst, t) | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         tog += 1 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     if s_sqrt2 > 0: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         if m < 0: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |             acc += 1 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         if acc >= 0.99: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |             val = 1 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         elif acc > 0.01: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |             seed = (seed << 4) + (vector << 20) + (y << 1) | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |             seed = int(0xDEECE66D) * seed + 0xB | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |             seed = int(0xDEECE66D) * seed + 0xB | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |             rnd = float((seed >> 8) & 0xffffff) / float(1 << 24) | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |             val = rnd < acc | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         else: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |             val = 0 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     else: | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |         acc = val | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     s[y, vector, 3] = (c[line, vector] <= TMIN) | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     s[y, vector, 4] = eat | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     s[y, vector, 5] = lst | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     s[y, vector, 6] = final | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     s[y, vector, 7] = acc | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     s[y, vector, 8] = val | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     s[y, vector, 9] = 0  # TODO | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     s[y, vector, 10] = ovl | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | 
 | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | @cuda.jit() | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  | def ppo_to_ppi_gpu(s, vat, time, ppi_offset, ppo_offset): | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     x, y = cuda.grid(2) | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     if y >= s.shape[0]: return | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     if x >= s.shape[1]: return | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |      | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     if vat[ppi_offset + y, 0] < 0: return | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     if vat[ppo_offset + y, 0] < 0: return | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |      | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     s[y, x, 0] = s[y, x, 2] | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     s[y, x, 1] = time | 
			
		
	
		
		
			
				
					
					|  |  |  |  |  |  |  |     s[y, x, 2] = s[y, x, 8] | 
			
		
	
	
		
		
			
				
					|  |  | 
 |