|  |  |  | @ -99,13 +99,26 @@ class WaveSim(sim.SimOps):@@ -99,13 +99,26 @@ class WaveSim(sim.SimOps): | 
			
		
	
		
			
				
					|  |  |  |  |         self.simctl_int[0] = range(sims)  # unique seed for each sim by default, zero this to pick same delays for all sims. | 
			
		
	
		
			
				
					|  |  |  |  |         self.simctl_int[1] = 2  # random picking by default. | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |         self.e = np.zeros((self.c_locs_len, sims), dtype=np.uint8)  # aux data for each line and sim | 
			
		
	
		
			
				
					|  |  |  |  |         # flat array for line use information | 
			
		
	
		
			
				
					|  |  |  |  |         line_use = defaultdict(list) | 
			
		
	
		
			
				
					|  |  |  |  |         for lidx in range(len(self.circuit.lines)): | 
			
		
	
		
			
				
					|  |  |  |  |             if self.line_use_start[lidx] < 0: continue | 
			
		
	
		
			
				
					|  |  |  |  |             if self.line_use_stop[lidx] < 0: | 
			
		
	
		
			
				
					|  |  |  |  |                 log.warn(f'line {lidx} never read?') | 
			
		
	
		
			
				
					|  |  |  |  |             for i in range(self.line_use_start[lidx], self.line_use_stop[lidx]): | 
			
		
	
		
			
				
					|  |  |  |  |                 line_use[i].append(lidx) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |         self.line_use_counts = np.array([len(line_use[i]) for i in range(len(self.levels))], dtype=np.int32) | 
			
		
	
		
			
				
					|  |  |  |  |         self.line_use_offsets = np.zeros_like(self.line_use_counts) | 
			
		
	
		
			
				
					|  |  |  |  |         self.line_use_offsets[1:] = self.line_use_counts.cumsum()[:-1] | 
			
		
	
		
			
				
					|  |  |  |  |         self.line_use = np.hstack([line_use[i] for i in range(len(self.levels))]) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |         self.h = np.zeros((self.c_locs_len, sims), dtype=np.float32)  # hashes of generated waveforms | 
			
		
	
		
			
				
					|  |  |  |  |         self.h_base = np.zeros_like(self.h)  # base hashes to compare to | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |         self.error_counts = np.zeros(self.s_len, dtype=np.uint32)  # number of capture errors by PPO | 
			
		
	
		
			
				
					|  |  |  |  |         self.lsts = np.zeros(self.s_len, dtype=np.float32)  # LST by PPO | 
			
		
	
		
			
				
					|  |  |  |  |         self.overflows = np.zeros(self.s_len, dtype=np.uint32)  # Overflows by PPO | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |         self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.e, self.c_locs, self.c_caps, self.ops, self.simctl_int)]) | 
			
		
	
		
			
				
					|  |  |  |  |         self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.h, self.c_locs, self.c_caps, self.ops, self.simctl_int)]) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     def __repr__(self): | 
			
		
	
		
			
				
					|  |  |  |  |         dev = 'GPU' if hasattr(self.c, 'copy_to_host') else 'CPU' | 
			
		
	
	
		
			
				
					|  |  |  | @ -131,7 +144,7 @@ class WaveSim(sim.SimOps):@@ -131,7 +144,7 @@ class WaveSim(sim.SimOps): | 
			
		
	
		
			
				
					|  |  |  |  |         """ | 
			
		
	
		
			
				
					|  |  |  |  |         sims = min(sims or self.sims, self.sims) | 
			
		
	
		
			
				
					|  |  |  |  |         for op_start, op_stop in zip(self.level_starts, self.level_stops): | 
			
		
	
		
			
				
					|  |  |  |  |             level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, 0, sims, self.delays, self.simctl_int, seed) | 
			
		
	
		
			
				
					|  |  |  |  |             level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.h, self.abuf, 0, sims, self.delays, self.simctl_int, seed) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     def c_to_s(self, time=TMAX, sd=0.0, seed=1): | 
			
		
	
		
			
				
					|  |  |  |  |         """Simulates a capture operation at all sequential elements and primary outputs. | 
			
		
	
	
		
			
				
					|  |  |  | @ -159,7 +172,7 @@ class WaveSim(sim.SimOps):@@ -159,7 +172,7 @@ class WaveSim(sim.SimOps): | 
			
		
	
		
			
				
					|  |  |  |  |         self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs] | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed): | 
			
		
	
		
			
				
					|  |  |  |  | def _wave_eval(op, cbuf, c_locs, c_caps, hbuf, sim, delays, simctl_int, seed): | 
			
		
	
		
			
				
					|  |  |  |  |     overflows = int(0) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     lut = op[0] | 
			
		
	
	
		
			
				
					|  |  |  | @ -189,6 +202,8 @@ def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed):@@ -189,6 +202,8 @@ def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed): | 
			
		
	
		
			
				
					|  |  |  |  |     z_mem = c_locs[z_idx] | 
			
		
	
		
			
				
					|  |  |  |  |     z_cap = c_caps[z_idx] | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     h = np.float32(0) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     a_cur = int(0) | 
			
		
	
		
			
				
					|  |  |  |  |     b_cur = int(0) | 
			
		
	
		
			
				
					|  |  |  |  |     c_cur = int(0) | 
			
		
	
	
		
			
				
					|  |  |  | @ -236,6 +251,7 @@ def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed):@@ -236,6 +251,7 @@ def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed): | 
			
		
	
		
			
				
					|  |  |  |  |             next_t = cbuf[d_mem + d_cur, sim] + delays[d_idx, (d_cur & 1) ^ 1, z_val ^ 1] | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |         if (z_cur & 1) != ((lut >> inputs) & 1): | 
			
		
	
		
			
				
					|  |  |  |  |             h += h*3 + max(current_t, -10)  # hash based on generated transitions before filtering | 
			
		
	
		
			
				
					|  |  |  |  |             # we generate an edge in z_mem, if ... | 
			
		
	
		
			
				
					|  |  |  |  |             if (z_cur == 0                            # it is the first edge in z_mem ... | 
			
		
	
		
			
				
					|  |  |  |  |                 or next_t < current_t                 # -OR- the next edge on SAME input is EARLIER (need current edge to filter BOTH in next iteration) ... | 
			
		
	
	
		
			
				
					|  |  |  | @ -265,15 +281,11 @@ def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed):@@ -265,15 +281,11 @@ def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed): | 
			
		
	
		
			
				
					|  |  |  |  |     # generate or propagate overflow flag | 
			
		
	
		
			
				
					|  |  |  |  |     cbuf[z_mem + z_cur, sim] = TMAX_OVL if overflows > 0 else max(a, b, c, d) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     hbuf[z_idx, sim] = h | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     nrise = max(0, (z_cur+1) // 2 - (cbuf[z_mem, sim] == TMIN)) | 
			
		
	
		
			
				
					|  |  |  |  |     nfall = z_cur // 2 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     e = int(((cbuf[z_mem, sim] == TMIN) << 1) & 2)  # initial value | 
			
		
	
		
			
				
					|  |  |  |  |     e |= z_val  # final value | 
			
		
	
		
			
				
					|  |  |  |  |     e |= (nrise + nfall)<<2  # number of transitions | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     ebuf[z_idx, sim] = e | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     return nrise, nfall | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
	
		
			
				
					|  |  |  | @ -281,11 +293,11 @@ wave_eval_cpu = numba.njit(_wave_eval)@@ -281,11 +293,11 @@ wave_eval_cpu = numba.njit(_wave_eval) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | @numba.njit | 
			
		
	
		
			
				
					|  |  |  |  | def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, ebuf, abuf, sim_start, sim_stop, delays, simctl_int, seed): | 
			
		
	
		
			
				
					|  |  |  |  | def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, hbuf, abuf, sim_start, sim_stop, delays, simctl_int, seed): | 
			
		
	
		
			
				
					|  |  |  |  |     for op_idx in range(op_start, op_stop): | 
			
		
	
		
			
				
					|  |  |  |  |         op = ops[op_idx] | 
			
		
	
		
			
				
					|  |  |  |  |         for sim in range(sim_start, sim_stop): | 
			
		
	
		
			
				
					|  |  |  |  |             nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, ebuf, sim, delays, simctl_int[:, sim], seed) | 
			
		
	
		
			
				
					|  |  |  |  |             nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, hbuf, sim, delays, simctl_int[:, sim], seed) | 
			
		
	
		
			
				
					|  |  |  |  |             a_loc = op[6] | 
			
		
	
		
			
				
					|  |  |  |  |             a_wr = op[7] | 
			
		
	
		
			
				
					|  |  |  |  |             a_wf = op[8] | 
			
		
	
	
		
			
				
					|  |  |  | @ -358,10 +370,10 @@ class WaveSimCuda(WaveSim):@@ -358,10 +370,10 @@ class WaveSimCuda(WaveSim): | 
			
		
	
		
			
				
					|  |  |  |  |         self.delays = cuda.to_device(self.delays) | 
			
		
	
		
			
				
					|  |  |  |  |         self.simctl_int = cuda.to_device(self.simctl_int) | 
			
		
	
		
			
				
					|  |  |  |  |         self.abuf = cuda.to_device(self.abuf) | 
			
		
	
		
			
				
					|  |  |  |  |         self.e = cuda.to_device(self.e) | 
			
		
	
		
			
				
					|  |  |  |  |         self.h = cuda.to_device(self.h) | 
			
		
	
		
			
				
					|  |  |  |  |         self.h_base = cuda.to_device(self.h_base) | 
			
		
	
		
			
				
					|  |  |  |  |         self.line_use = cuda.to_device(self.line_use) | 
			
		
	
		
			
				
					|  |  |  |  |         self.error_counts = cuda.to_device(self.error_counts) | 
			
		
	
		
			
				
					|  |  |  |  |         self.lsts = cuda.to_device(self.lsts) | 
			
		
	
		
			
				
					|  |  |  |  |         self.overflows = cuda.to_device(self.overflows) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |         self.retval_int = cuda.to_device(np.array([0], dtype=np.int32)) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
	
		
			
				
					|  |  |  | @ -377,10 +389,10 @@ class WaveSimCuda(WaveSim):@@ -377,10 +389,10 @@ class WaveSimCuda(WaveSim): | 
			
		
	
		
			
				
					|  |  |  |  |         state['delays'] = np.array(self.delays) | 
			
		
	
		
			
				
					|  |  |  |  |         state['simctl_int'] = np.array(self.simctl_int) | 
			
		
	
		
			
				
					|  |  |  |  |         state['abuf'] = np.array(self.abuf) | 
			
		
	
		
			
				
					|  |  |  |  |         state['e'] = np.array(self.e) | 
			
		
	
		
			
				
					|  |  |  |  |         state['h'] = np.array(self.h) | 
			
		
	
		
			
				
					|  |  |  |  |         state['h_base'] = np.array(self.h_base) | 
			
		
	
		
			
				
					|  |  |  |  |         state['line_use'] = np.array(self.line_use) | 
			
		
	
		
			
				
					|  |  |  |  |         state['error_counts'] = np.array(self.error_counts) | 
			
		
	
		
			
				
					|  |  |  |  |         state['lsts'] = np.array(self.lsts) | 
			
		
	
		
			
				
					|  |  |  |  |         state['overflows'] = np.array(self.overflows) | 
			
		
	
		
			
				
					|  |  |  |  |         state['retval_int'] = np.array(self.retval_int) | 
			
		
	
		
			
				
					|  |  |  |  |         return state | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
	
		
			
				
					|  |  |  | @ -394,10 +406,10 @@ class WaveSimCuda(WaveSim):@@ -394,10 +406,10 @@ class WaveSimCuda(WaveSim): | 
			
		
	
		
			
				
					|  |  |  |  |         self.delays = cuda.to_device(self.delays) | 
			
		
	
		
			
				
					|  |  |  |  |         self.simctl_int = cuda.to_device(self.simctl_int) | 
			
		
	
		
			
				
					|  |  |  |  |         self.abuf = cuda.to_device(self.abuf) | 
			
		
	
		
			
				
					|  |  |  |  |         self.e = cuda.to_device(self.e) | 
			
		
	
		
			
				
					|  |  |  |  |         self.h = cuda.to_device(self.h) | 
			
		
	
		
			
				
					|  |  |  |  |         self.h_base = cuda.to_device(self.h_base) | 
			
		
	
		
			
				
					|  |  |  |  |         self.line_use = cuda.to_device(self.line_use) | 
			
		
	
		
			
				
					|  |  |  |  |         self.error_counts = cuda.to_device(self.error_counts) | 
			
		
	
		
			
				
					|  |  |  |  |         self.lsts = cuda.to_device(self.lsts) | 
			
		
	
		
			
				
					|  |  |  |  |         self.overflows = cuda.to_device(self.overflows) | 
			
		
	
		
			
				
					|  |  |  |  |         self.retval_int = cuda.to_device(self.retval_int) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     def s_to_c(self): | 
			
		
	
	
		
			
				
					|  |  |  | @ -412,7 +424,7 @@ class WaveSimCuda(WaveSim):@@ -412,7 +424,7 @@ class WaveSimCuda(WaveSim): | 
			
		
	
		
			
				
					|  |  |  |  |             if op_from > op_start: continue | 
			
		
	
		
			
				
					|  |  |  |  |             if op_to is not None and op_to <= op_start: break | 
			
		
	
		
			
				
					|  |  |  |  |             grid_dim = self._grid_dim(sims, op_stop - op_start) | 
			
		
	
		
			
				
					|  |  |  |  |             wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, int(0), | 
			
		
	
		
			
				
					|  |  |  |  |             wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.h, self.abuf, int(0), | 
			
		
	
		
			
				
					|  |  |  |  |                 sims, self.delays, self.simctl_int, seed) | 
			
		
	
		
			
				
					|  |  |  |  |         cuda.synchronize() | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
	
		
			
				
					|  |  |  | @ -421,7 +433,7 @@ class WaveSimCuda(WaveSim):@@ -421,7 +433,7 @@ class WaveSimCuda(WaveSim): | 
			
		
	
		
			
				
					|  |  |  |  |         op_start = self.level_starts[level] | 
			
		
	
		
			
				
					|  |  |  |  |         op_stop = self.level_stops[level] | 
			
		
	
		
			
				
					|  |  |  |  |         grid_dim = self._grid_dim(sims, op_stop - op_start) | 
			
		
	
		
			
				
					|  |  |  |  |         wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, int(0), | 
			
		
	
		
			
				
					|  |  |  |  |         wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.h, self.abuf, int(0), | 
			
		
	
		
			
				
					|  |  |  |  |             sims, self.delays, self.simctl_int, seed) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     def c_to_s(self, time=TMAX, sd=0.0, seed=1): | 
			
		
	
	
		
			
				
					|  |  |  | @ -433,38 +445,23 @@ class WaveSimCuda(WaveSim):@@ -433,38 +445,23 @@ class WaveSimCuda(WaveSim): | 
			
		
	
		
			
				
					|  |  |  |  |         grid_dim = self._grid_dim(self.sims, self.s_len) | 
			
		
	
		
			
				
					|  |  |  |  |         ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.c_locs, time, self.ppi_offset, self.ppo_offset) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     def acc_error_counts(self, sims=None): | 
			
		
	
		
			
				
					|  |  |  |  |         sims = min(sims or self.sims, self.sims) | 
			
		
	
		
			
				
					|  |  |  |  |         grid_dim = cdiv(self.s_len, 256) | 
			
		
	
		
			
				
					|  |  |  |  |         acc_error_counts_gpu[grid_dim, 256](self.s, sims, self.error_counts) | 
			
		
	
		
			
				
					|  |  |  |  |     def set_base_hashes(self): | 
			
		
	
		
			
				
					|  |  |  |  |         nitems = self.h_base.shape[0] * self.h_base.shape[1] | 
			
		
	
		
			
				
					|  |  |  |  |         grid_dim = cdiv(nitems, 256) | 
			
		
	
		
			
				
					|  |  |  |  |         memcpy_gpu[grid_dim, 256](self.h, self.h_base, nitems) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     def reset_error_counts(self): | 
			
		
	
		
			
				
					|  |  |  |  |         self.error_counts[:] = 0 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     def get_error_counts(self): | 
			
		
	
		
			
				
					|  |  |  |  |         return np.array(self.error_counts) | 
			
		
	
		
			
				
					|  |  |  |  |     def compare_hashes_level(self, lv): | 
			
		
	
		
			
				
					|  |  |  |  |         self.retval_int[0] = 0 | 
			
		
	
		
			
				
					|  |  |  |  |         grid_dim = self._grid_dim(self.sims, self.line_use_counts[lv]) | 
			
		
	
		
			
				
					|  |  |  |  |         diff_hash_gpu[grid_dim, self._block_dim](self.h, self.h_base, self.line_use, self.line_use_offsets[lv], | 
			
		
	
		
			
				
					|  |  |  |  |                                                  self.line_use_counts[lv], self.retval_int) | 
			
		
	
		
			
				
					|  |  |  |  |         return self.retval_int[0] | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     def acc_overflows(self, sims=None): | 
			
		
	
		
			
				
					|  |  |  |  |     def calc_error_counts(self, sims=None): | 
			
		
	
		
			
				
					|  |  |  |  |         sims = min(sims or self.sims, self.sims) | 
			
		
	
		
			
				
					|  |  |  |  |         grid_dim = cdiv(self.s_len, 256) | 
			
		
	
		
			
				
					|  |  |  |  |         acc_overflows_gpu[grid_dim, 256](self.s, sims, self.overflows) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     def reset_overflows(self): | 
			
		
	
		
			
				
					|  |  |  |  |         self.overflows[:] = 0 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     def get_overflows(self): | 
			
		
	
		
			
				
					|  |  |  |  |         return np.array(self.overflows) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     def acc_lsts(self, sims=None): | 
			
		
	
		
			
				
					|  |  |  |  |         sims = min(sims or self.sims, self.sims) | 
			
		
	
		
			
				
					|  |  |  |  |         grid_dim = cdiv(self.s_len, 256) | 
			
		
	
		
			
				
					|  |  |  |  |         acc_lsts_gpu[grid_dim, 256](self.s, sims, self.lsts) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     def reset_lsts(self): | 
			
		
	
		
			
				
					|  |  |  |  |         self.lsts[:] = 0.0 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     def get_lsts(self): | 
			
		
	
		
			
				
					|  |  |  |  |         return np.array(self.lsts) | 
			
		
	
		
			
				
					|  |  |  |  |         calc_error_counts_gpu[grid_dim, 256](self.s, sims, self.error_counts) | 
			
		
	
		
			
				
					|  |  |  |  |         return np.array(self.error_counts) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | @cuda.jit() | 
			
		
	
	
		
			
				
					|  |  |  | @ -476,33 +473,23 @@ def memcpy_gpu (src, dst, nitems):@@ -476,33 +473,23 @@ def memcpy_gpu (src, dst, nitems): | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | @cuda.jit() | 
			
		
	
		
			
				
					|  |  |  |  | def acc_error_counts_gpu(s, sims, error_counts): | 
			
		
	
		
			
				
					|  |  |  |  |     x = cuda.grid(1) | 
			
		
	
		
			
				
					|  |  |  |  |     if x >= s.shape[1]: return | 
			
		
	
		
			
				
					|  |  |  |  |     cnt = 0 | 
			
		
	
		
			
				
					|  |  |  |  |     for i in range(sims): | 
			
		
	
		
			
				
					|  |  |  |  |         cnt += (s[6,x,i] != s[8,x,i]) | 
			
		
	
		
			
				
					|  |  |  |  |     error_counts[x] += cnt | 
			
		
	
		
			
				
					|  |  |  |  | def diff_hash_gpu(hbuf1, hbuf2, h_locs, h_locs_offset, h_locs_cnt, differs): | 
			
		
	
		
			
				
					|  |  |  |  |     x, y = cuda.grid(2) | 
			
		
	
		
			
				
					|  |  |  |  |     if x >= hbuf1.shape[1]: return | 
			
		
	
		
			
				
					|  |  |  |  |     if y >= h_locs_cnt: return | 
			
		
	
		
			
				
					|  |  |  |  |     h_loc = h_locs[h_locs_offset+y] | 
			
		
	
		
			
				
					|  |  |  |  |     if hbuf1[h_loc, x] != hbuf2[h_loc, x]: | 
			
		
	
		
			
				
					|  |  |  |  |         differs[0] = 1 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | @cuda.jit() | 
			
		
	
		
			
				
					|  |  |  |  | def acc_overflows_gpu(s, sims, overflows): | 
			
		
	
		
			
				
					|  |  |  |  | def calc_error_counts_gpu(s, sims, error_counts): | 
			
		
	
		
			
				
					|  |  |  |  |     x = cuda.grid(1) | 
			
		
	
		
			
				
					|  |  |  |  |     if x >= s.shape[1]: return | 
			
		
	
		
			
				
					|  |  |  |  |     cnt = 0 | 
			
		
	
		
			
				
					|  |  |  |  |     for i in range(sims): | 
			
		
	
		
			
				
					|  |  |  |  |         cnt += s[10,x,i] | 
			
		
	
		
			
				
					|  |  |  |  |     overflows[x] += cnt | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | @cuda.jit() | 
			
		
	
		
			
				
					|  |  |  |  | def acc_lsts_gpu(s, sims, lsts): | 
			
		
	
		
			
				
					|  |  |  |  |     x = cuda.grid(1) | 
			
		
	
		
			
				
					|  |  |  |  |     if x >= s.shape[1]: return | 
			
		
	
		
			
				
					|  |  |  |  |     lst = 0 | 
			
		
	
		
			
				
					|  |  |  |  |     for i in range(sims): | 
			
		
	
		
			
				
					|  |  |  |  |         lst = max(lst, s[5,x,i]) | 
			
		
	
		
			
				
					|  |  |  |  |     lsts[x] = max(lsts[x], lst) | 
			
		
	
		
			
				
					|  |  |  |  |         cnt += (s[6,x,i] != s[8,x,i]) | 
			
		
	
		
			
				
					|  |  |  |  |     error_counts[x] = cnt | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | @cuda.jit() | 
			
		
	
	
		
			
				
					|  |  |  | @ -533,7 +520,7 @@ _wave_eval_gpu = cuda.jit(_wave_eval, device=True)@@ -533,7 +520,7 @@ _wave_eval_gpu = cuda.jit(_wave_eval, device=True) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  | @cuda.jit() | 
			
		
	
		
			
				
					|  |  |  |  | def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, ebuf, abuf, sim_start, sim_stop, delays, simctl_int, seed): | 
			
		
	
		
			
				
					|  |  |  |  | def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, hbuf, abuf, sim_start, sim_stop, delays, simctl_int, seed): | 
			
		
	
		
			
				
					|  |  |  |  |     x, y = cuda.grid(2) | 
			
		
	
		
			
				
					|  |  |  |  |     sim = sim_start + x | 
			
		
	
		
			
				
					|  |  |  |  |     op_idx = op_start + y | 
			
		
	
	
		
			
				
					|  |  |  | @ -545,7 +532,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, ebuf, abuf, sim_@@ -545,7 +532,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, ebuf, abuf, sim_ | 
			
		
	
		
			
				
					|  |  |  |  |     a_wr = op[7] | 
			
		
	
		
			
				
					|  |  |  |  |     a_wf = op[8] | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int[:, sim], seed) | 
			
		
	
		
			
				
					|  |  |  |  |     nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, hbuf, sim, delays, simctl_int[:, sim], seed) | 
			
		
	
		
			
				
					|  |  |  |  | 
 | 
			
		
	
		
			
				
					|  |  |  |  |     # accumulate WSA into abuf | 
			
		
	
		
			
				
					|  |  |  |  |     if a_loc >= 0: | 
			
		
	
	
		
			
				
					|  |  |  | 
 |