remove hashes, add lst, overflow, ebuf

a fast 6v sim
allow interconnect annotations without forks
6 changed files with 237 additions and 75 deletions
--- a/src/kyupy/circuit.py
+++ b/src/kyupy/circuit.py
@@ -40,6 +40,11 @@ class GrowingList(list):
            self.has_nones = fi < len(self)
        return fi

+    def without_nones(self):
+        for item in self:
+            if item is not None:
+                yield item
+

 class IndexList(list):
    def __delitem__(self, index):
--- a/src/kyupy/logic.py
+++ b/src/kyupy/logic.py
@@ -269,6 +269,18 @@ def mv_to_bp(mva):
    return np.packbits(unpackbits(mva)[...,:3], axis=-2, bitorder='little').swapaxes(-1,-2)


+def mv_init(mva):
+    """Returns the initial binary values for mva.
+    """
+    return (mva>>1) & ((mva>>2)|mva) & 1
+
+
+def mv_final(mva):
+    """Returns the final binary value of mva.
+    """
+    return mva & ((mva>>2)|(mva>>1)) & 1
+
+
 def bparray(*a):
    """Converts (lists of) Boolean values or strings into a bit-parallel array.

--- a/src/kyupy/logic_sim.py
+++ b/src/kyupy/logic_sim.py
@@ -10,9 +10,10 @@ import math

 import numpy as np

-from . import numba, logic, hr_bytes, sim, eng
+from . import numba, logic, hr_bytes, sim, eng, cdiv
 from .circuit import Circuit

+
 class LogicSim(sim.SimOps):
    """A bit-parallel naïve combinational simulator for 2-, 4-, or 8-valued logic.

@@ -28,7 +29,7 @@ class LogicSim(sim.SimOps):
        self.m = m
        self.mdim = math.ceil(math.log2(m))
        self.sims = sims
-        nbytes = (sims - 1) // 8 + 1
+        nbytes = cdiv(sims, 8)

        self.c = np.zeros((self.c_len, self.mdim, nbytes), dtype=np.uint8)
        self.s = np.zeros((2, self.s_len, 3, nbytes), dtype=np.uint8)
@@ -334,3 +335,101 @@ def _prop_cpu(ops, c_locs, c):
        elif op == sim.OAI211: c[o0] = ~((c[i0] | c[i1]) & c[i2] & c[i3])
        elif op == sim.MUX21: c[o0] = (c[i0] & ~c[i2]) | (c[i1] & c[i2])
        else: print(f'unknown op {op}')
+
+
+class LogicSim6V(sim.SimOps):
+    """A bit-parallel naïve combinational simulator for 6-valued logic.
+
+    :param circuit: The circuit to simulate.
+    :param sims: The number of parallel logic simulations to perform.
+    :param c_reuse: If True, intermediate signal values may get overwritten when not needed anymore to save memory.
+    :param strip_forks: If True, forks are not included in the simulation model to save memory and simulation time.
+    """
+    def __init__(self, circuit: Circuit, sims: int = 8, c_reuse: bool = False, strip_forks: bool = False):
+        super().__init__(circuit, c_reuse=c_reuse, strip_forks=strip_forks)
+        self.sims = sims
+        nbytes = cdiv(sims, 8)
+
+        self.c = np.zeros((self.c_len, 3, nbytes), dtype=np.uint8)
+        self.s = np.zeros((2, self.s_len, self.sims), dtype=np.uint8)
+        """Logic values of the sequential elements (flip-flops) and ports.
+
+        It is a pair of arrays in mv storage format:
+
+        * ``s[0]`` Assigned values. Simulator will read (P)PI value from here.
+        * ``s[1]`` Result values. Simulator will write (P)PO values here.
+
+        Access this array to assign new values to the (P)PIs or read values from the (P)POs.
+        """
+
+    def __repr__(self):
+        return f'{{name: "{self.circuit.name}", sims: {self.sims}, c_bytes: {eng(self.c.nbytes)}}}'
+
+    def s_to_c(self):
+        """Assigns the values from ``s[0]`` to the inputs of the combinational portion.
+        """
+        self.c[self.pippi_c_locs] = logic.mv_to_bp(self.s[0, self.pippi_s_locs])
+
+    def c_prop(self):
+        c_prop_cpu(self.ops, self.c, self.c_locs)
+
+    def c_to_s(self):
+        """Captures the results of the combinational portion into ``s[1]``.
+        """
+        self.s[1, self.poppo_s_locs] = logic.bp_to_mv(self.c[self.poppo_c_locs])[:,:self.sims]
+
+
+@numba.njit
+def c_prop_cpu(ops, c, c_locs):
+    inv_op = np.array([255, 255, 0], dtype=np.uint8)[np.newaxis, :, np.newaxis]
+    for op, o0l, i0l, i1l, i2l, i3l in ops[:,:6]:
+        o0, i0, i1, i2, i3 = [c[c_locs[x]] for x in (o0l, i0l, i1l, i2l, i3l)]
+        if op == sim.BUF1 or op == sim.INV1:
+            o0[...] = i0
+        elif op == sim.AND2 or op == sim.NAND2:
+            o0[0] = i0[0] & i1[0]
+            o0[1] = i0[1] & i1[1]
+            o0[2] = (i0[2]&(i1[0]|i1[1]|i1[2])|
+                     i1[2]&(i0[0]|i0[1]|i0[2]))
+        elif op == sim.AND3 or op == sim.NAND3:
+            o0[0] = i0[0] & i1[0] & i2[0]
+            o0[1] = i0[1] & i1[1] & i2[1]
+            o0[2] = (i0[2]&(i1[0]|i1[1]|i1[2])&(i2[0]|i2[1]|i2[2])|
+                     i1[2]&(i0[0]|i0[1]|i0[2])&(i2[0]|i2[1]|i2[2])|
+                     i2[2]&(i0[0]|i0[1]|i0[2])&(i1[0]|i1[1]|i1[2]))
+        elif op == sim.AND4 or op == sim.NAND4:
+            o0[0] = i0[0] & i1[0] & i2[0] & i3[0]
+            o0[1] = i0[1] & i1[1] & i2[1] & i3[1]
+            o0[2] = (i0[2]&(i1[0]|i1[1]|i1[2])&(i2[0]|i2[1]|i2[2])&(i3[0]|i3[1]|i3[2])|
+                     i1[2]&(i0[0]|i0[1]|i0[2])&(i2[0]|i2[1]|i2[2])&(i3[0]|i3[1]|i3[2])|
+                     i2[2]&(i0[0]|i0[1]|i0[2])&(i1[0]|i1[1]|i1[2])&(i3[0]|i3[1]|i3[2])|
+                     i3[2]&(i0[0]|i0[1]|i0[2])&(i1[0]|i1[1]|i1[2])&(i2[0]|i2[1]|i2[2]))
+        elif op == sim.OR2 or op == sim.NOR2:
+            o0[0] = i0[0] | i1[0]
+            o0[1] = i0[1] | i1[1]
+            o0[2] = (i0[2]&(~i1[0]|~i1[1]|i1[2])|
+                     i1[2]&(~i0[0]|~i0[1]|i0[2]))
+        elif op == sim.OR3 or op == sim.NOR3:
+            o0[0] = i0[0] | i1[0] | i2[0]
+            o0[1] = i0[1] | i1[1] | i2[1]
+            o0[2] = (i0[2]&(~i1[0]|~i1[1]|i1[2])&(~i2[0]|~i2[1]|i2[2])|
+                     i1[2]&(~i0[0]|~i0[1]|i0[2])&(~i2[0]|~i2[1]|i2[2])|
+                     i2[2]&(~i0[0]|~i0[1]|i0[2])&(~i1[0]|~i1[1]|i1[2]))
+        elif op == sim.OR4 or op == sim.NOR4:
+            o0[0] = i0[0] | i1[0] | i2[0] | i3[0]
+            o0[1] = i0[1] | i1[1] | i2[1] | i3[1]
+            o0[2] = (i0[2]&(~i1[0]|~i1[1]|i1[2])&(~i2[0]|~i2[1]|i2[2])&(~i3[0]|~i3[1]|i3[2])|
+                     i1[2]&(~i0[0]|~i0[1]|i0[2])&(~i2[0]|~i2[1]|i2[2])&(~i3[0]|~i3[1]|i3[2])|
+                     i2[2]&(~i0[0]|~i0[1]|i0[2])&(~i1[0]|~i1[1]|i1[2])&(~i3[0]|~i3[1]|i3[2])|
+                     i3[2]&(~i0[0]|~i0[1]|i0[2])&(~i1[0]|~i1[1]|i1[2])&(~i2[0]|~i2[1]|i2[2]))
+        elif op == sim.XOR2 or op == sim.XNOR2:
+            o0[0] = i0[0] ^ i1[0]
+            o0[1] = i0[1] ^ i1[1]
+            o0[2] = i0[2] | i1[2]
+        else: print(f'unknown op {op}')
+
+        if (op == sim.INV1 or
+            op == sim.NAND2 or op == sim.NAND3 or op == sim.NAND4 or
+            op == sim.NOR2 or op == sim.NOR3 or op == sim.NOR4 or
+            op == sim.XNOR2):
+            o0[...] = o0 ^ inv_op
--- a/src/kyupy/sdf.py
+++ b/src/kyupy/sdf.py
@@ -103,6 +103,7 @@ class DelayFile:

        delays = np.zeros((len(circuit.lines), 2, 2, 3))  # dataset last during construction.

+        nonfork_annotations = 0
        for n1, n2, *delvals in self._interconnects:
            delvals = [d if len(d) > 0 else [0, 0, 0] for d in delvals]
            if max(max(delvals)) == 0: continue
@@ -120,19 +121,27 @@ class DelayFile:
                log.warn(f'No line to annotate pin {pn2} of {c2}')
                continue
            f1, f2 = c1.outs[p1].reader, c2.ins[p2].driver  # find the forks between cells.
-            assert f1.kind == '__fork__'
-            assert f2.kind == '__fork__'
-            if f1 != f2:  # at least two forks, make sure f2 is a branchfork connected to f1
-                assert len(f2.outs) == 1
-                assert f1.outs[f2.ins[0].driver_pin] == f2.ins[0]
-                line = f2.ins[0]
-            elif len(f2.outs) == 1:  # f1==f2, only OK when there is no fanout.
-                line = f2.ins[0]
+            if f1 == c2 and f2 == c1:
+                nonfork_annotations += 1
+                if nonfork_annotations < 10:
+                    log.warn(f'No fork between {c1.name}/{p1} and {c2.name}/{p2}, using {c2.name}/{p2}')
+                line = c2.ins[p2]
            else:
-                log.warn(f'No branchfork to annotate interconnect delay {c1.name}/{p1}->{c2.name}/{p2}')
-                continue
+                assert f1.kind == '__fork__'
+                assert f2.kind == '__fork__'
+                if len(f2.outs) == 1:
+                    assert f1 == f2 or f1.outs[f2.ins[0].driver_pin] == f2.ins[0]
+                    line = f2.ins[0]
+                else:
+                    nonfork_annotations += 1
+                    if nonfork_annotations < 10:
+                        log.warn(f'No branchfork between {c1.name}/{p1} and {c2.name}/{p2}, using {c2.name}/{p2}')
+                    line = c2.ins[p2]
            delays[line, :] = delvals

+        if nonfork_annotations > 0:
+            log.warn(f'{nonfork_annotations} interconnect annotations were moved to gate inputs due to missing forks.')
+
        return np.moveaxis(delays, -1, 0)


--- a/src/kyupy/wave_sim.py
+++ b/src/kyupy/wave_sim.py
@@ -99,26 +99,13 @@ class WaveSim(sim.SimOps):
        self.simctl_int[0] = range(sims)  # unique seed for each sim by default, zero this to pick same delays for all sims.
        self.simctl_int[1] = 2  # random picking by default.

-        # flat array for line use information
-        line_use = defaultdict(list)
-        for lidx in range(len(self.circuit.lines)):
-            if self.line_use_start[lidx] < 0: continue
-            if self.line_use_stop[lidx] < 0:
-                log.warn(f'line {lidx} never read?')
-            for i in range(self.line_use_start[lidx], self.line_use_stop[lidx]):
-                line_use[i].append(lidx)
-
-        self.line_use_counts = np.array([len(line_use[i]) for i in range(len(self.levels))], dtype=np.int32)
-        self.line_use_offsets = np.zeros_like(self.line_use_counts)
-        self.line_use_offsets[1:] = self.line_use_counts.cumsum()[:-1]
-        self.line_use = np.hstack([line_use[i] for i in range(len(self.levels))])
-
-        self.h = np.zeros((self.c_locs_len, sims), dtype=np.float32)  # hashes of generated waveforms
-        self.h_base = np.zeros_like(self.h)  # base hashes to compare to
+        self.e = np.zeros((self.c_locs_len, sims), dtype=np.uint8)  # aux data for each line and sim

        self.error_counts = np.zeros(self.s_len, dtype=np.uint32)  # number of capture errors by PPO
+        self.lsts = np.zeros(self.s_len, dtype=np.float32)  # LST by PPO
+        self.overflows = np.zeros(self.s_len, dtype=np.uint32)  # Overflows by PPO

-        self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.h, self.c_locs, self.c_caps, self.ops, self.simctl_int)])
+        self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.e, self.c_locs, self.c_caps, self.ops, self.simctl_int)])

    def __repr__(self):
        dev = 'GPU' if hasattr(self.c, 'copy_to_host') else 'CPU'
@@ -144,7 +131,7 @@ class WaveSim(sim.SimOps):
        """
        sims = min(sims or self.sims, self.sims)
        for op_start, op_stop in zip(self.level_starts, self.level_stops):
-            level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.h, self.abuf, 0, sims, self.delays, self.simctl_int, seed)
+            level_eval_cpu(self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, 0, sims, self.delays, self.simctl_int, seed)

    def c_to_s(self, time=TMAX, sd=0.0, seed=1):
        """Simulates a capture operation at all sequential elements and primary outputs.
@@ -172,7 +159,7 @@ class WaveSim(sim.SimOps):
        self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs]


-def _wave_eval(op, cbuf, c_locs, c_caps, hbuf, sim, delays, simctl_int, seed):
+def _wave_eval(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int, seed):
    overflows = int(0)

    lut = op[0]
@@ -202,8 +189,6 @@ def _wave_eval(op, cbuf, c_locs, c_caps, hbuf, sim, delays, simctl_int, seed):
    z_mem = c_locs[z_idx]
    z_cap = c_caps[z_idx]

-    h = np.float32(0)
-
    a_cur = int(0)
    b_cur = int(0)
    c_cur = int(0)
@@ -251,7 +236,6 @@ def _wave_eval(op, cbuf, c_locs, c_caps, hbuf, sim, delays, simctl_int, seed):
            next_t = cbuf[d_mem + d_cur, sim] + delays[d_idx, (d_cur & 1) ^ 1, z_val ^ 1]

        if (z_cur & 1) != ((lut >> inputs) & 1):
-            h += h*3 + max(current_t, -10)  # hash based on generated transitions before filtering
            # we generate an edge in z_mem, if ...
            if (z_cur == 0                            # it is the first edge in z_mem ...
                or next_t < current_t                 # -OR- the next edge on SAME input is EARLIER (need current edge to filter BOTH in next iteration) ...
@@ -281,11 +265,15 @@ def _wave_eval(op, cbuf, c_locs, c_caps, hbuf, sim, delays, simctl_int, seed):
    # generate or propagate overflow flag
    cbuf[z_mem + z_cur, sim] = TMAX_OVL if overflows > 0 else max(a, b, c, d)

-    hbuf[z_idx, sim] = h
-
    nrise = max(0, (z_cur+1) // 2 - (cbuf[z_mem, sim] == TMIN))
    nfall = z_cur // 2

+    e = int(((cbuf[z_mem, sim] == TMIN) << 1) & 2)  # initial value
+    e |= z_val  # final value
+    e |= (nrise + nfall)<<2  # number of transitions
+
+    ebuf[z_idx, sim] = e
+
    return nrise, nfall


@@ -293,11 +281,11 @@ wave_eval_cpu = numba.njit(_wave_eval)


@numba.njit
-def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, hbuf, abuf, sim_start, sim_stop, delays, simctl_int, seed):
+def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, ebuf, abuf, sim_start, sim_stop, delays, simctl_int, seed):
    for op_idx in range(op_start, op_stop):
        op = ops[op_idx]
        for sim in range(sim_start, sim_stop):
-            nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, hbuf, sim, delays, simctl_int[:, sim], seed)
+            nrise, nfall = wave_eval_cpu(op, c, c_locs, c_caps, ebuf, sim, delays, simctl_int[:, sim], seed)
            a_loc = op[6]
            a_wr = op[7]
            a_wf = op[8]
@@ -370,10 +358,10 @@ class WaveSimCuda(WaveSim):
        self.delays = cuda.to_device(self.delays)
        self.simctl_int = cuda.to_device(self.simctl_int)
        self.abuf = cuda.to_device(self.abuf)
-        self.h = cuda.to_device(self.h)
-        self.h_base = cuda.to_device(self.h_base)
-        self.line_use = cuda.to_device(self.line_use)
+        self.e = cuda.to_device(self.e)
        self.error_counts = cuda.to_device(self.error_counts)
+        self.lsts = cuda.to_device(self.lsts)
+        self.overflows = cuda.to_device(self.overflows)

        self.retval_int = cuda.to_device(np.array([0], dtype=np.int32))

@@ -389,10 +377,10 @@ class WaveSimCuda(WaveSim):
        state['delays'] = np.array(self.delays)
        state['simctl_int'] = np.array(self.simctl_int)
        state['abuf'] = np.array(self.abuf)
-        state['h'] = np.array(self.h)
-        state['h_base'] = np.array(self.h_base)
-        state['line_use'] = np.array(self.line_use)
+        state['e'] = np.array(self.e)
        state['error_counts'] = np.array(self.error_counts)
+        state['lsts'] = np.array(self.lsts)
+        state['overflows'] = np.array(self.overflows)
        state['retval_int'] = np.array(self.retval_int)
        return state

@@ -406,10 +394,10 @@ class WaveSimCuda(WaveSim):
        self.delays = cuda.to_device(self.delays)
        self.simctl_int = cuda.to_device(self.simctl_int)
        self.abuf = cuda.to_device(self.abuf)
-        self.h = cuda.to_device(self.h)
-        self.h_base = cuda.to_device(self.h_base)
-        self.line_use = cuda.to_device(self.line_use)
+        self.e = cuda.to_device(self.e)
        self.error_counts = cuda.to_device(self.error_counts)
+        self.lsts = cuda.to_device(self.lsts)
+        self.overflows = cuda.to_device(self.overflows)
        self.retval_int = cuda.to_device(self.retval_int)

    def s_to_c(self):
@@ -424,7 +412,7 @@ class WaveSimCuda(WaveSim):
            if op_from > op_start: continue
            if op_to is not None and op_to <= op_start: break
            grid_dim = self._grid_dim(sims, op_stop - op_start)
-            wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.h, self.abuf, int(0),
+            wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, int(0),
                sims, self.delays, self.simctl_int, seed)
        cuda.synchronize()

@@ -433,7 +421,7 @@ class WaveSimCuda(WaveSim):
        op_start = self.level_starts[level]
        op_stop = self.level_stops[level]
        grid_dim = self._grid_dim(sims, op_stop - op_start)
-        wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.h, self.abuf, int(0),
+        wave_eval_gpu[grid_dim, self._block_dim](self.ops, op_start, op_stop, self.c, self.c_locs, self.c_caps, self.e, self.abuf, int(0),
            sims, self.delays, self.simctl_int, seed)

    def c_to_s(self, time=TMAX, sd=0.0, seed=1):
@@ -445,23 +433,38 @@ class WaveSimCuda(WaveSim):
        grid_dim = self._grid_dim(self.sims, self.s_len)
        ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.c_locs, time, self.ppi_offset, self.ppo_offset)

-    def set_base_hashes(self):
-        nitems = self.h_base.shape[0] * self.h_base.shape[1]
-        grid_dim = cdiv(nitems, 256)
-        memcpy_gpu[grid_dim, 256](self.h, self.h_base, nitems)
+    def acc_error_counts(self, sims=None):
+        sims = min(sims or self.sims, self.sims)
+        grid_dim = cdiv(self.s_len, 256)
+        acc_error_counts_gpu[grid_dim, 256](self.s, sims, self.error_counts)

-    def compare_hashes_level(self, lv):
-        self.retval_int[0] = 0
-        grid_dim = self._grid_dim(self.sims, self.line_use_counts[lv])
-        diff_hash_gpu[grid_dim, self._block_dim](self.h, self.h_base, self.line_use, self.line_use_offsets[lv],
-                                                 self.line_use_counts[lv], self.retval_int)
-        return self.retval_int[0]
+    def reset_error_counts(self):
+        self.error_counts[:] = 0
+
+    def get_error_counts(self):
+        return np.array(self.error_counts)

-    def calc_error_counts(self, sims=None):
+    def acc_overflows(self, sims=None):
        sims = min(sims or self.sims, self.sims)
        grid_dim = cdiv(self.s_len, 256)
-        calc_error_counts_gpu[grid_dim, 256](self.s, sims, self.error_counts)
-        return np.array(self.error_counts)
+        acc_overflows_gpu[grid_dim, 256](self.s, sims, self.overflows)
+
+    def reset_overflows(self):
+        self.overflows[:] = 0
+
+    def get_overflows(self):
+        return np.array(self.overflows)
+
+    def acc_lsts(self, sims=None):
+        sims = min(sims or self.sims, self.sims)
+        grid_dim = cdiv(self.s_len, 256)
+        acc_lsts_gpu[grid_dim, 256](self.s, sims, self.lsts)
+
+    def reset_lsts(self):
+        self.lsts[:] = 0.0
+
+    def get_lsts(self):
+        return np.array(self.lsts)


@cuda.jit()
@@ -473,23 +476,33 @@ def memcpy_gpu (src, dst, nitems):


@cuda.jit()
-def diff_hash_gpu(hbuf1, hbuf2, h_locs, h_locs_offset, h_locs_cnt, differs):
-    x, y = cuda.grid(2)
-    if x >= hbuf1.shape[1]: return
-    if y >= h_locs_cnt: return
-    h_loc = h_locs[h_locs_offset+y]
-    if hbuf1[h_loc, x] != hbuf2[h_loc, x]:
-        differs[0] = 1
+def acc_error_counts_gpu(s, sims, error_counts):
+    x = cuda.grid(1)
+    if x >= s.shape[1]: return
+    cnt = 0
+    for i in range(sims):
+        cnt += (s[6,x,i] != s[8,x,i])
+    error_counts[x] += cnt


@cuda.jit()
-def calc_error_counts_gpu(s, sims, error_counts):
+def acc_overflows_gpu(s, sims, overflows):
    x = cuda.grid(1)
    if x >= s.shape[1]: return
    cnt = 0
    for i in range(sims):
-        cnt += (s[6,x,i] != s[8,x,i])
-    error_counts[x] = cnt
+        cnt += s[10,x,i]
+    overflows[x] += cnt
+
+
+@cuda.jit()
+def acc_lsts_gpu(s, sims, lsts):
+    x = cuda.grid(1)
+    if x >= s.shape[1]: return
+    lst = 0
+    for i in range(sims):
+        lst = max(lst, s[5,x,i])
+    lsts[x] = max(lsts[x], lst)


@cuda.jit()
@@ -520,7 +533,7 @@ _wave_eval_gpu = cuda.jit(_wave_eval, device=True)


@cuda.jit()
-def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, hbuf, abuf, sim_start, sim_stop, delays, simctl_int, seed):
+def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, ebuf, abuf, sim_start, sim_stop, delays, simctl_int, seed):
    x, y = cuda.grid(2)
    sim = sim_start + x
    op_idx = op_start + y
@@ -532,7 +545,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, hbuf, abuf, sim_
    a_wr = op[7]
    a_wf = op[8]

-    nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, hbuf, sim, delays, simctl_int[:, sim], seed)
+    nrise, nfall = _wave_eval_gpu(op, cbuf, c_locs, c_caps, ebuf, sim, delays, simctl_int[:, sim], seed)

    # accumulate WSA into abuf
    if a_loc >= 0:
--- a/tests/test_logic_sim.py
+++ b/tests/test_logic_sim.py
@@ -1,6 +1,6 @@
 import numpy as np

-from kyupy.logic_sim import LogicSim
+from kyupy.logic_sim import LogicSim, LogicSim6V
 from kyupy import bench, logic, sim
 from kyupy.logic import mvarray, bparray, bp_to_mv, mv_to_bp

@@ -94,6 +94,30 @@ def test_4v():
        '--0XX', '--X1X', '--XXX', '--XXX'))


+def test_6v():
+    c = bench.parse('input(x, y) output(a, o, n, xo, no) a=AND2(x,y) o=OR2(x,y) n=INV1(x) xo=XOR2(x,y) no=NOR2(x,y)')
+    s = LogicSim6V(c, 36)
+    assert s.s_len == 7
+    mva = mvarray(
+        '0000101', '0101110', '0R0R1RF', '0F0F1FR', '0P0P1PN', '0N0N1NP',
+        '1001010', '1111000', '1RR10F0', '1FF10R0', '1PP10N0', '1NN10P0',
+        'R00RFRF', 'R1R1FF0', 'RRRRFPF', 'RFPNFNP', 'RPPRFRF', 'RNRNFFP',
+        'F00FRFR', 'F1F1RR0', 'FRPNRNP', 'FFFFRPR', 'FPPFRFR', 'FNFNRRP',
+        'P00PNPN', 'P1P1NN0', 'PRPRNRF', 'PFPFNFR', 'PPPPNPN', 'PNPNNNP',
+        'N00NPNP', 'N1N1PP0', 'NRRNPFP', 'NFFNPRP', 'NPPNPNP', 'NNNNPPP')
+    tests = np.copy(mva)
+    tests[2:] = logic.ZERO
+    s.s[0] = tests
+    s.s_to_c()
+    s.c_prop()
+    s.c_to_s()
+    resp = s.s[1].copy()
+
+    exp_resp = np.copy(mva)
+    exp_resp[:2] = logic.ZERO
+    np.testing.assert_allclose(resp, exp_resp)
+
+
 def test_8v():
    c = bench.parse('input(x, y) output(a, o, n, xo) a=and(x,y) o=or(x,y) n=not(x) xo=xor(x,y)')
    s = LogicSim(c, 64, m=8)
Author	SHA1	Message	Date
Stefan Holst	f59e97afa9	remove hashes, add lst, overflow, ebuf	2 years ago
Stefan Holst	f6baf9cb5e	a fast 6v sim	2 years ago
Stefan Holst	fc030c6708	allow interconnect annotations without forks	2 years ago
Stefan Holst	795cac0716	initial and final values from mvarrays	2 years ago
Stefan Holst	3a8777e0a3	none-filtering iterator for GrowingList	2 years ago