From f59e97afa9e658125edc244a17ddda029157f38d Mon Sep 17 00:00:00 2001
From: Stefan Holst <holst@csn.kyutech.ac.jp>
Date: Fri, 5 Apr 2024 23:24:00 +0900
Subject: [PATCH] remove hashes, add lst, overflow, ebuf

---
 src/kyupy/wave_sim.py | 137 +++++++++++++++++++++++-------------------
 1 file changed, 75 insertions(+), 62 deletions(-)

diff --git a/src/kyupy/wave_sim.py b/src/kyupy/wave_sim.py
index e02cbed..8a25e54 100644
--- a/src/kyupy/wave_sim.py
+++ b/src/kyupy/wave_sim.py
@@ -99,26 +99,13 @@ class WaveSim(sim.SimOps):
         self.simctl_int[0] = range(sims)  # unique seed for each sim by default, zero this to pick same delays for all sims.
         self.simctl_int[1] = 2  # random picking by default.
 
-        # flat array for line use information
-        line_use = defaultdict(list)
-        for lidx in range(len(self.circuit.lines)):
-            if self.line_use_start[lidx] < 0: continue
-            if self.line_use_stop[lidx] < 0:
-                log.warn(f'line {lidx} never read?')
-            for i in range(self.line_use_start[lidx], self.line_use_stop[lidx]):
-                line_use[i].append(lidx)
-
-        self.line_use_counts = np.array([len(line_use[i]) for i in range(len(self.levels))], dtype=np.int32)
-        self.line_use_offsets = np.zeros_like(self.line_use_counts)
-        self.line_use_offsets[1:] = self.line_use_counts.cumsum()[:-1]
-        self.line_use = np.hstack([line_use[i] for i in range(len(self.levels))])
-
-        self.h = np.zeros((self.c_locs_len, sims), dtype=np.float32)  # hashes of generated waveforms
-        self.h_base = np.zeros_like(self.h)  # base hashes to compare to
+        self.e = np.zeros((self.c_locs_len, sims), dtype=np.uint8)  # aux data for each line and sim
 
         self.error_counts = np.zeros(self.s_len, dtype=np.uint32)  # number of capture errors by PPO
+        self.lsts = np.zeros(self.s_len, dtype=np.float32)  # LST by PPO
+        self.overflows = np.zeros(self.s_len, dtype=np.uint32)  # Overflows by PPO
 
-        self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.h, self.c_locs, self.c_caps, self.ops, self.simctl_int)])
+        self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.e, self.c_locs, self.c_caps, self.ops, self.simctl_int)])
 
     def __repr__(self):
         dev = 'GPU' if hasattr(self.c, 'copy_to_host') else 'CPU'
@@ -144,7 +131,7 @@ class WaveSim(sim.SimOps):
         """
         sims = min(sims or self.sims, self.sims)
         for op_start, op_stop in zip(self.level_starts, self.level_stops):
-            level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.h, self.abuf, 0, sims, self.delays, self.simctl_int, seed)
+            level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, 0, sims, self.delays, self.simctl_int, seed)
 
     def c_to_s(self, time=TMAX, sd=0.0, seed=1):
         """Simulates a capture operation at all sequential elements and primary outputs.
@@ -172,7 +159,7 @@ class WaveSim(sim.SimOps):
         self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs]
 
 
-def _wave_eval(op, cbuf, c_locs, c_caps, hbuf, sim, delays, simctl_int, seed):
+def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed):
     overflows = int(0)
 
     lut = op[0]
@@ -202,8 +189,6 @@ def _wave_eval(op, cbuf, c_locs, c_caps, hbuf, sim, delays, simctl_int, seed):
     z_mem = c_locs[z_idx]
     z_cap = c_caps[z_idx]
 
-    h = np.float32(0)
-
     a_cur = int(0)
     b_cur = int(0)
     c_cur = int(0)
@@ -251,7 +236,6 @@ def _wave_eval(op, cbuf, c_locs, c_caps, hbuf, sim, delays, simctl_int, seed):
             next_t = cbuf[d_mem + d_cur, sim] + delays[d_idx, (d_cur & 1) ^ 1, z_val ^ 1]
 
         if (z_cur & 1) != ((lut >> inputs) & 1):
-            h += h*3 + max(current_t, -10)  # hash based on generated transitions before filtering
             # we generate an edge in z_mem, if ...
             if (z_cur == 0                            # it is the first edge in z_mem ...
                 or next_t < current_t                 # -OR- the next edge on SAME input is EARLIER (need current edge to filter BOTH in next iteration) ...
@@ -281,11 +265,15 @@ def _wave_eval(op, cbuf, c_locs, c_caps, hbuf, sim, delays, simctl_int, seed):
     # generate or propagate overflow flag
     cbuf[z_mem + z_cur, sim] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
 
-    hbuf[z_idx, sim] = h
-
     nrise = max(0, (z_cur+1) // 2 - (cbuf[z_mem, sim] == TMIN))
     nfall = z_cur // 2
 
+    e = int(((cbuf[z_mem, sim] == TMIN) << 1) & 2)  # bit 1: initial value (set when the first entry of the output waveform is TMIN)
+    e |= z_val  # bit 0: final value (assumes z_val is 0 or 1 here -- TODO confirm)
+    e |= (nrise + nfall)<<2  # bits 2 and up: number of transitions
+
+    ebuf[z_idx, sim] = e  # NOTE(review): WaveSim allocates this buffer as uint8, leaving 6 bits for the count -- confirm nrise + nfall stays below 64
+
     return nrise, nfall
 
 
@@ -293,11 +281,11 @@ wave_eval_cpu = numba.njit(_wave_eval)
 
 
 @numba.njit
-def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, hbuf, abuf, sim_start, sim_stop, delays, simctl_int, seed):
+def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, ebuf, abuf, sim_start, sim_stop, delays, simctl_int, seed):
     for op_idx in range(op_start, op_stop):
         op = ops[op_idx]
         for sim in range(sim_start, sim_stop):
-            nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, hbuf, sim, delays, simctl_int[:, sim], seed)
+            nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, ebuf, sim, delays, simctl_int[:, sim], seed)
             a_loc = op[6]
             a_wr = op[7]
             a_wf = op[8]
@@ -370,10 +358,10 @@ class WaveSimCuda(WaveSim):
         self.delays = cuda.to_device(self.delays)
         self.simctl_int = cuda.to_device(self.simctl_int)
         self.abuf = cuda.to_device(self.abuf)
-        self.h = cuda.to_device(self.h)
-        self.h_base = cuda.to_device(self.h_base)
-        self.line_use = cuda.to_device(self.line_use)
+        self.e = cuda.to_device(self.e)
         self.error_counts = cuda.to_device(self.error_counts)
+        self.lsts = cuda.to_device(self.lsts)
+        self.overflows = cuda.to_device(self.overflows)
 
         self.retval_int = cuda.to_device(np.array([0], dtype=np.int32))
 
@@ -389,10 +377,10 @@ class WaveSimCuda(WaveSim):
         state['delays'] = np.array(self.delays)
         state['simctl_int'] = np.array(self.simctl_int)
         state['abuf'] = np.array(self.abuf)
-        state['h'] = np.array(self.h)
-        state['h_base'] = np.array(self.h_base)
-        state['line_use'] = np.array(self.line_use)
+        state['e'] = np.array(self.e)
         state['error_counts'] = np.array(self.error_counts)
+        state['lsts'] = np.array(self.lsts)
+        state['overflows'] = np.array(self.overflows)
         state['retval_int'] = np.array(self.retval_int)
         return state
 
@@ -406,10 +394,10 @@ class WaveSimCuda(WaveSim):
         self.delays = cuda.to_device(self.delays)
         self.simctl_int = cuda.to_device(self.simctl_int)
         self.abuf = cuda.to_device(self.abuf)
-        self.h = cuda.to_device(self.h)
-        self.h_base = cuda.to_device(self.h_base)
-        self.line_use = cuda.to_device(self.line_use)
+        self.e = cuda.to_device(self.e)
         self.error_counts = cuda.to_device(self.error_counts)
+        self.lsts = cuda.to_device(self.lsts)
+        self.overflows = cuda.to_device(self.overflows)
         self.retval_int = cuda.to_device(self.retval_int)
 
     def s_to_c(self):
@@ -424,7 +412,7 @@ class WaveSimCuda(WaveSim):
             if op_from > op_start: continue
             if op_to is not None and op_to <= op_start: break
             grid_dim = self._grid_dim(sims, op_stop - op_start)
-            wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.h, self.abuf, int(0),
+            wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, int(0),
                 sims, self.delays, self.simctl_int, seed)
         cuda.synchronize()
 
@@ -433,7 +421,7 @@ class WaveSimCuda(WaveSim):
         op_start = self.level_starts[level]
         op_stop = self.level_stops[level]
         grid_dim = self._grid_dim(sims, op_stop - op_start)
-        wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.h, self.abuf, int(0),
+        wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, int(0),
             sims, self.delays, self.simctl_int, seed)
 
     def c_to_s(self, time=TMAX, sd=0.0, seed=1):
@@ -445,23 +433,38 @@ class WaveSimCuda(WaveSim):
         grid_dim = self._grid_dim(self.sims, self.s_len)
         ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.c_locs, time, self.ppi_offset, self.ppo_offset)
 
-    def set_base_hashes(self):
-        nitems = self.h_base.shape[0] * self.h_base.shape[1]
-        grid_dim = cdiv(nitems, 256)
-        memcpy_gpu[grid_dim, 256](self.h, self.h_base, nitems)
+    def acc_error_counts(self, sims=None):  # adds this run's mismatches to self.error_counts; returns nothing
+        sims = min(sims or self.sims, self.sims)  # None or 0 selects every sim; larger requests are capped at self.sims
+        grid_dim = cdiv(self.s_len, 256)  # blocks of 256 threads covering s_len entries (assuming cdiv is ceiling division)
+        acc_error_counts_gpu[grid_dim, 256](self.s, sims, self.error_counts)  # totals keep growing until reset_error_counts()
 
-    def compare_hashes_level(self, lv):
-        self.retval_int[0] = 0
-        grid_dim = self._grid_dim(self.sims, self.line_use_counts[lv])
-        diff_hash_gpu[grid_dim, self._block_dim](self.h, self.h_base, self.line_use, self.line_use_offsets[lv],
-                                                 self.line_use_counts[lv], self.retval_int)
-        return self.retval_int[0]
+    def reset_error_counts(self):  # start a fresh accumulation window for acc_error_counts()
+        self.error_counts[:] = 0  # slice assignment zeroes the existing buffer in place
+
+    def get_error_counts(self):  # read back the totals gathered by acc_error_counts()
+        return np.array(self.error_counts)  # converted to a NumPy array, same conversion __getstate__ uses
 
-    def calc_error_counts(self, sims=None):
+    def acc_overflows(self, sims=None):  # adds this run's overflow counts to self.overflows; returns nothing
         sims = min(sims or self.sims, self.sims)
         grid_dim = cdiv(self.s_len, 256)
-        calc_error_counts_gpu[grid_dim, 256](self.s, sims, self.error_counts)
-        return np.array(self.error_counts)
+        acc_overflows_gpu[grid_dim, 256](self.s, sims, self.overflows)  # totals keep growing until reset_overflows()
+
+    def reset_overflows(self):  # start a fresh accumulation window for acc_overflows()
+        self.overflows[:] = 0  # slice assignment zeroes the existing buffer in place
+
+    def get_overflows(self):  # read back the totals gathered by acc_overflows()
+        return np.array(self.overflows)  # converted to a NumPy array, same conversion __getstate__ uses
+
+    def acc_lsts(self, sims=None):  # folds this run's maxima into self.lsts; returns nothing
+        sims = min(sims or self.sims, self.sims)  # None or 0 selects every sim; larger requests are capped at self.sims
+        grid_dim = cdiv(self.s_len, 256)  # blocks of 256 threads covering s_len entries (assuming cdiv is ceiling division)
+        acc_lsts_gpu[grid_dim, 256](self.s, sims, self.lsts)  # running maximum is kept until reset_lsts()
+
+    def reset_lsts(self):  # start a fresh accumulation window for acc_lsts()
+        self.lsts[:] = 0.0  # 0 is also the kernel's starting maximum, so values below 0 are never recorded
+
+    def get_lsts(self):  # read back the maxima gathered by acc_lsts()
+        return np.array(self.lsts)  # converted to a NumPy array, same conversion __getstate__ uses
 
 
 @cuda.jit()
@@ -473,23 +476,33 @@ def memcpy_gpu (src, dst, nitems):
 
 
 @cuda.jit()
-def diff_hash_gpu(hbuf1, hbuf2, h_locs, h_locs_offset, h_locs_cnt, differs):
-    x, y = cuda.grid(2)
-    if x >= hbuf1.shape[1]: return
-    if y >= h_locs_cnt: return
-    h_loc = h_locs[h_locs_offset+y]
-    if hbuf1[h_loc, x] != hbuf2[h_loc, x]:
-        differs[0] = 1
+def acc_error_counts_gpu(s, sims, error_counts):  # per index x: count sims whose s[6] and s[8] entries differ
+    x = cuda.grid(1)  # absolute thread index; each thread handles one position along axis 1 of s
+    if x >= s.shape[1]: return  # surplus threads in the last block have nothing to do
+    cnt = 0
+    for i in range(sims):
+        cnt += (s[6,x,i] != s[8,x,i])  # presumably captured vs. expected value -- confirm against the layout of s
+    error_counts[x] += cnt  # accumulate across calls instead of overwriting the previous total
 
 
 @cuda.jit()
-def calc_error_counts_gpu(s, sims, error_counts):
+def acc_overflows_gpu(s, sims, overflows):  # per index x: sum s[10] over the first `sims` simulations
     x = cuda.grid(1)
     if x >= s.shape[1]: return
     cnt = 0
     for i in range(sims):
-        cnt += (s[6,x,i] != s[8,x,i])
-    error_counts[x] = cnt
+        cnt += s[10,x,i]  # NOTE(review): if s is a float array, cnt becomes float here -- confirm
+    overflows[x] += cnt  # accumulate across calls; the counter is allocated as uint32, so a float cnt is cast on store
+
+
+@cuda.jit()
+def acc_lsts_gpu(s, sims, lsts):  # per index x: maximum of s[5] over the first `sims` simulations
+    x = cuda.grid(1)  # absolute thread index; each thread handles one position along axis 1 of s
+    if x >= s.shape[1]: return  # surplus threads in the last block have nothing to do
+    lst = 0  # starting at 0 means values below 0 never show up in the result
+    for i in range(sims):
+        lst = max(lst, s[5,x,i])
+    lsts[x] = max(lsts[x], lst)  # keep the largest value seen across calls
 
 
 @cuda.jit()
@@ -520,7 +533,7 @@ _wave_eval_gpu = cuda.jit(_wave_eval, device=True)
 
 
 @cuda.jit()
-def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, hbuf, abuf, sim_start, sim_stop, delays, simctl_int, seed):
+def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, ebuf, abuf, sim_start, sim_stop, delays, simctl_int, seed):
     x, y = cuda.grid(2)
     sim = sim_start + x
     op_idx = op_start + y
@@ -532,7 +545,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, hbuf, abuf, sim_
     a_wr = op[7]
     a_wf = op[8]
 
-    nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, hbuf, sim, delays, simctl_int[:, sim], seed)
+    nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int[:, sim], seed)
 
     # accumulate WSA into abuf
     if a_loc >= 0: