cleanup

3 years ago · f04f1b0012
3 changed files with 28 additions and 319 deletions
--- a/src/kyupy/logic_sim.py
+++ b/src/kyupy/logic_sim.py
@ -34,21 +34,6 @@ class LogicSim(sim.SimOps):
        self.s = np.zeros((2, self.s_len, 3, nbytes), dtype=np.uint8)
        self.s[:,:,1,:] = 255  # unassigned
        self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
        self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
        self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
        self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
        self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
        self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs]
        self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs]
        self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs]
        self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs]
        self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
        self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
    def __repr__(self):
        """Return a one-line summary of this simulator.

        The summary lists the circuit name, ``sims``, ``m`` and the
        human-readable size of the state array ``self.c``.
        """
        fields = (
            f'LogicSim {self.circuit.name}',
            f'sims={self.sims}',
            f'm={self.m}',
            f'state_mem={hr_bytes(self.c.nbytes)}',
        )
        return '<' + ' '.join(fields) + '>'
@ -81,15 +66,6 @@ class LogicSim(sim.SimOps):
        """Propagate the input values towards the outputs (Perform all logic operations in topological order).
        If the circuit is sequential (it contains flip-flops), one call simulates one clock cycle.
        Multiple clock cycles are simulated by an assign-propagate-capture loop:
        .. code-block:: python
           # initial state in state_bp
           for cycle in range(10):  # simulate 10 clock cycles
               sim.assign(state_bp)
               sim.propagate()
               sim.capture(state_bp)
        :param inject_cb: A callback function for manipulating intermediate signal values.
            This function is called with a line and its new logic values (in bit-parallel format) after
@ -113,7 +89,7 @@ class LogicSim(sim.SimOps):
                    elif op == sim.NOR2: self.c[o0] = ~(self.c[i0] | self.c[i1])
                    elif op == sim.XOR2: self.c[o0] = self.c[i0] ^ self.c[i1]
                    elif op == sim.XNOR2: self.c[o0] = ~(self.c[i0] ^ self.c[i1])
-                    else: print(f'unknown sim {op}')
+                    else: print(f'unknown op {op}')
                    inject_cb(o0, self.s[o0])
        elif self.m == 4:
            pass
@ -128,7 +104,7 @@ class LogicSim(sim.SimOps):
                elif op == sim.NOR2: logic.bp_or(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
                elif op == sim.XOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1])
                elif op == sim.XNOR2: logic.bp_xor(self.c[o0], self.c[i0], self.c[i1]); logic.bp_not(self.c[o0], self.c[o0])
-                else: print(f'unknown sim {op}')
+                else: print(f'unknown op {op}')
                if inject_cb is not None: inject_cb(o0, self.s[o0])
    def s_ppo_to_ppi(self):
@ -169,4 +145,4 @@ def _prop_cpu(ops, c_locs, c):
        elif op == sim.NOR2: c[o0] = ~(c[i0] | c[i1])
        elif op == sim.XOR2: c[o0] = c[i0] ^ c[i1]
        elif op == sim.XNOR2: c[o0] = ~(c[i0] ^ c[i1])
-        else: print(f'unknown sim {op}')
+        else: print(f'unknown op {op}')
--- a/src/kyupy/sim.py
+++ b/src/kyupy/sim.py
@ -145,7 +145,7 @@ class SimOps:
    :param keep_signals: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces
        memory footprint, but intermediate signal waveforms become unaccessible after a propagation.
    """
-    def __init__(self, circuit, c_caps=1, c_reuse=False, strip_forks=False):
+    def __init__(self, circuit, c_caps=1, c_caps_min=1, c_reuse=False, strip_forks=False):
        self.circuit = circuit
        dffs = [n for n in circuit.nodes if 'dff' in n.kind.lower()]
        latches = [n for n in circuit.nodes if 'latch' in n.kind.lower()]
@ -248,15 +248,15 @@ class SimOps:
        h = Heap()
        # allocate and keep memory for special fields
-        self.c_locs[self.zero_idx], self.c_caps[self.zero_idx] = h.alloc(1), 1
+        self.c_locs[self.zero_idx], self.c_caps[self.zero_idx] = h.alloc(c_caps_min), c_caps_min
-        self.c_locs[self.tmp_idx], self.c_caps[self.tmp_idx] = h.alloc(1), 1
+        self.c_locs[self.tmp_idx], self.c_caps[self.tmp_idx] = h.alloc(c_caps_min), c_caps_min
        ref_count[self.zero_idx] += 1
        ref_count[self.tmp_idx] += 1
        # allocate and keep memory for PI/PPI, keep memory for PO/PPO (allocated later)
        for i, n in enumerate(self.s_nodes):
            if len(n.outs) > 0:
-                self.c_locs[self.ppi_offset + i], self.c_caps[self.ppi_offset + i] = h.alloc(1), 1
+                self.c_locs[self.ppi_offset + i], self.c_caps[self.ppi_offset + i] = h.alloc(c_caps_min), c_caps_min
                ref_count[self.ppi_offset + i] += 1
            if len(n.ins) > 0:
                i0_idx = stems[n.ins[0]] if stems[n.ins[0]] >= 0 else n.ins[0]
@ -280,7 +280,7 @@ class SimOps:
                if ref_count[i2_idx] <= 0: free_list.append(self.c_locs[i2_idx])
                if ref_count[i3_idx] <= 0: free_list.append(self.c_locs[i3_idx])
                o_idx = op[1]
-                cap = c_caps[o_idx]
+                cap = max(c_caps_min, c_caps[o_idx])
                self.c_locs[o_idx], self.c_caps[o_idx] = h.alloc(cap), cap
            if not keep_signals:
                for loc in free_list:
@ -301,3 +301,18 @@ class SimOps:
        from collections import defaultdict
        self.prim_counts = defaultdict(int)
        for op, _, _, _, _, _ in self.ops: self.prim_counts[names[op]] += 1
        self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
        self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
        self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
        self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
        self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
        self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs]
        self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs]
        self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs]
        self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs]
        self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
        self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
--- a/src/kyupy/wave_sim.py
+++ b/src/kyupy/wave_sim.py
@ -48,14 +48,8 @@ class WaveSim(sim.SimOps):
        memory footprint, but intermediate signal waveforms become unaccessible after a propagation.
    """
    def __init__(self, circuit, delays, sims=8, c_caps=16, c_reuse=False, strip_forks=False):
-        assert c_caps > 0 and c_caps % 4 == 0
+        super().__init__(circuit, c_caps=c_caps, c_caps_min=4, c_reuse=c_reuse, strip_forks=strip_forks)
        super().__init__(circuit, c_caps=c_caps//4, c_reuse=c_reuse, strip_forks=strip_forks)
        self.sims = sims
        self.c_len *= 4
        self.c_locs[...] *= 4
        self.c_caps[...] *= 4
        self.delays = np.zeros((len(delays), self.c_locs_len, 2, 2), dtype=delays.dtype)
        self.delays[:, :delays.shape[1]] = delays
@ -87,21 +81,6 @@ class WaveSim(sim.SimOps):
        self.nbytes = sum([a.nbytes for a in (self.c, self.s, self.c_locs, self.c_caps, self.ops, self.params)])
        self.pi_s_locs = np.flatnonzero(self.c_locs[self.ppi_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
        self.po_s_locs = np.flatnonzero(self.c_locs[self.ppo_offset+np.arange(len(self.circuit.io_nodes))] >= 0)
        self.ppio_s_locs = np.arange(len(self.circuit.io_nodes), len(self.s_nodes))
        self.pippi_s_locs = np.concatenate([self.pi_s_locs, self.ppio_s_locs])
        self.poppo_s_locs = np.concatenate([self.po_s_locs, self.ppio_s_locs])
        self.pi_c_locs = self.c_locs[self.ppi_offset+self.pi_s_locs]
        self.po_c_locs = self.c_locs[self.ppo_offset+self.po_s_locs]
        self.ppi_c_locs = self.c_locs[self.ppi_offset+self.ppio_s_locs]
        self.ppo_c_locs = self.c_locs[self.ppo_offset+self.ppio_s_locs]
        self.pippi_c_locs = np.concatenate([self.pi_c_locs, self.ppi_c_locs])
        self.poppo_c_locs = np.concatenate([self.po_c_locs, self.ppo_c_locs])
    def __repr__(self):
        """Return a one-line summary of this simulator.

        The summary lists the concrete class name, the circuit name, ``sims``,
        the number of ops and levels, and the human-readable value of
        ``self.nbytes``.
        """
        cls_name = type(self).__name__
        summary = ' '.join((
            f'{cls_name} {self.circuit.name}',
            f'sims={self.sims}',
            f'ops={len(self.ops)}',
            f'levels={len(self.level_starts)}',
            f'mem={hr_bytes(self.nbytes)}',
        ))
        return f'<{summary}>'
@ -156,131 +135,6 @@ class WaveSim(sim.SimOps):
        self.s[2, self.ppio_s_locs] = self.s[8, self.ppio_s_locs]
@numba.njit
def rand_gauss_cpu(seed, sd):
    """Return a pseudo-random scaling factor centered at 1.0.

    Twelve pseudo-random fractions in [0, 1) are summed and shifted by -6,
    then scaled by ``sd``. Samples whose magnitude exceeds 0.5 are rejected
    and redrawn, so the result always lies in [0.5, 1.5].

    :param seed: Integer start value of the multiplicative generator.
    :param sd: Scale applied to the raw sample. Values <= 0 disable the
        variation and yield exactly 1.0.
    :return: ``1.0 + sample`` with ``abs(sample) <= 0.5``.
    """
    if sd <= 0.0:
        return 1.0
    max_deviation = 0.5
    while True:
        sample = -6.0
        for _ in range(12):
            # advance the generator state, then use 24 of its bits as a fraction
            seed = int(0xDEECE66D) * seed + 0xB
            sample += float((seed >> 8) & 0xffffff) / float(1 << 24)
        sample *= sd
        if abs(sample) <= max_deviation:
            return sample + 1.0
@numba.njit
 def wave_eval_cpu_old(op, cbuf, c_locs, c_caps, st_idx, line_times, param, sd=0.0, seed=0):
    """Evaluate one operation for one column of ``cbuf`` and store the resulting output waveform.

    The waveforms of the four inputs (a, b, c, d) are read from ``cbuf`` as sequences of
    times. They are consumed in time order; every consumed input time flips one bit of
    ``inputs`` and the output value is then looked up as bit ``inputs`` of ``lut``.
    Output toggles are written to the output's region of ``cbuf``, and the region is
    closed with a final marker entry. Nothing is returned; ``cbuf`` is modified in place.

    ``TMIN``, ``TMAX`` and ``TMAX_OVL`` are constants defined outside this block
    (presumably "earliest time", "end of waveform" and "end of waveform with overflow" -
    TODO confirm).

    :param op: Six integers ``(lut, z_idx, a_idx, b_idx, c_idx, d_idx)``: the look-up table
        and the line indices of the output and the four inputs.
    :param cbuf: Buffer indexed as ``cbuf[location, st_idx]``.
    :param c_locs: Maps a line index to the start location of its waveform in ``cbuf``.
    :param c_caps: Maps a line index to its capacity; only the output's entry is read.
    :param st_idx: Column of ``cbuf`` to work on; also mixed into the random seed.
    :param line_times: Indexed as ``[line, 0, polarity]`` for the delay added to an input
        time and as ``[line, 1, polarity]`` for the pulse-width threshold.
    :param param: ``param[0]`` scales every delay and threshold. If ``int(param[1])`` equals
        an input's line index, ``param[2]`` / ``param[3]`` (selected by polarity) are added
        to that input's delay and threshold.
    :param sd: Passed on to ``rand_gauss_cpu`` as the amount of random delay variation.
    :param seed: Integer combined with ``z_idx`` and ``st_idx`` to seed ``rand_gauss_cpu``.
    """
    lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
    # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>>
    overflows = int(0)
    # base seed derived from the caller's seed, the output line and the column
    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
    # start locations of the input and output waveforms in cbuf
    a_mem = c_locs[a_idx]
    b_mem = c_locs[b_idx]
    c_mem = c_locs[c_idx]
    d_mem = c_locs[d_idx]
    z_mem = c_locs[z_idx]
    z_cap = c_caps[z_idx]
    # read cursors into the four input waveforms
    a_cur = int(0)
    b_cur = int(0)
    c_cur = int(0)
    d_cur = int(0)
    # z_cur is the write cursor of the output; its lowest bit doubles as the current
    # output value. Bit 0 of lut is the output for inputs == 0.
    z_cur = lut & 1
    if z_cur == 1:
        # output starts at 1: store TMIN as the first entry
        cbuf[z_mem, st_idx] = TMIN
    # first pending time of every input, shifted by its (randomized, scaled) delay
    a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ a_mem ^ z_cur, sd) * param[0]
    if int(param[1]) == a_idx: a += param[2+z_cur]
    b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ b_mem ^ z_cur, sd) * param[0]
    if int(param[1]) == b_idx: b += param[2+z_cur]
    c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ c_mem ^ z_cur, sd) * param[0]
    if int(param[1]) == c_idx: c += param[2+z_cur]
    d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_cpu(_seed ^ d_mem ^ z_cur, sd) * param[0]
    if int(param[1]) == d_idx: d += param[2+z_cur]
    previous_t = TMIN
    current_t = min(a, b, c, d)
    inputs = int(0)
    # consume input times in ascending order until the earliest pending one reaches TMAX
    while current_t < TMAX:
        z_val = z_cur & 1
        # pick the input that owns current_t (a is preferred on ties, then b, c, d),
        # fetch its next time, and flip its bit in `inputs`
        if a == current_t:
            a_cur += 1
            a = cbuf[a_mem + a_cur, st_idx]
            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0]
            thresh = line_times[a_idx, 1, z_val] * rand_gauss_cpu(_seed ^ a_mem ^ z_val, sd) * param[0]
            if int(param[1]) == a_idx:
                a += param[2+(z_val^1)]
                thresh += param[2+z_val]
            inputs ^= 1
            next_t = a
        elif b == current_t:
            b_cur += 1
            b = cbuf[b_mem + b_cur, st_idx]
            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0]
            thresh = line_times[b_idx, 1, z_val] * rand_gauss_cpu(_seed ^ b_mem ^ z_val, sd) * param[0]
            if int(param[1]) == b_idx:
                b += param[2+(z_val^1)]
                thresh += param[2+z_val]
            inputs ^= 2
            next_t = b
        elif c == current_t:
            c_cur += 1
            c = cbuf[c_mem + c_cur, st_idx]
            c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0]
            thresh = line_times[c_idx, 1, z_val] * rand_gauss_cpu(_seed ^ c_mem ^ z_val, sd) * param[0]
            if int(param[1]) == c_idx:
                c += param[2+(z_val^1)]
                thresh += param[2+z_val]
            inputs ^= 4
            next_t = c
        else:
            d_cur += 1
            d = cbuf[d_mem + d_cur, st_idx]
            d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss_cpu(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0]
            thresh = line_times[d_idx, 1, z_val] * rand_gauss_cpu(_seed ^ d_mem ^ z_val, sd) * param[0]
            if int(param[1]) == d_idx:
                d += param[2+(z_val^1)]
                thresh += param[2+z_val]
            inputs ^= 8
            next_t = d
        # only act if the new input combination changes the output value
        if (z_cur & 1) != ((lut >> inputs) & 1):
            # we generate a toggle in z_mem, if:
            #   ( it is the first toggle in z_mem OR
            #   following toggle is earlier OR
            #   pulse is wide enough ) AND enough space in z_mem.
            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
                if z_cur < (z_cap - 1):
                    cbuf[z_mem + z_cur, st_idx] = current_t
                    previous_t = current_t
                    z_cur += 1
                else:
                    # no room left: count the overflow and step the write cursor back by one
                    overflows += 1
                    previous_t = cbuf[z_mem + z_cur - 1, st_idx]
                    z_cur -= 1
            else:
                # pulse too narrow: step the write cursor back, discarding the previous toggle
                z_cur -= 1
                previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
        current_t = min(a, b, c, d)
    # generate or propagate overflow flag
    cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
@numba.njit
 def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed=0):
    lut, z_idx, a_idx, b_idx, c_idx, d_idx = op
@ -379,6 +233,7 @@ def wave_eval_cpu(op, cbuf, c_locs, c_caps, st_idx, delays, param, sd=0.0, seed=
    # generate or propagate overflow flag
    cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
@numba.njit
 def level_eval_cpu(ops, op_start, op_stop, c, c_locs, c_caps, st_start, st_stop, delays, params, sd, seed):
    overflows = 0
@ -505,143 +360,6 @@ def wave_assign_gpu(c, s, c_locs, ppi_offset):
    c[c_loc+2, x] = TMAX
@cuda.jit(device=True)
def rand_gauss_gpu(seed, sd):
    """Return a pseudo-random scaling factor centered at 1.0 (device function).

    Each attempt sums twelve pseudo-random fractions in [0, 1), subtracts 6
    and multiplies by ``sd``. Attempts with a magnitude above 0.5 are
    discarded, so the returned value always lies in [0.5, 1.5].

    :param seed: Integer start value of the multiplicative generator.
    :param sd: Scale applied to the raw sample. Values <= 0 disable the
        variation and yield exactly 1.0.
    :return: ``1.0 + deviation`` with ``abs(deviation) <= 0.5``.
    """
    if sd <= 0.0:
        return 1.0
    limit = 0.5
    while True:
        deviation = -6.0
        for _ in range(12):
            # step the generator, then take 24 bits of its state as a fraction
            seed = int(0xDEECE66D) * seed + 0xB
            deviation += float((seed >> 8) & 0xffffff) / float(1 << 24)
        deviation *= sd
        if abs(deviation) <= limit:
            return deviation + 1.0
@cuda.jit()
 def wave_eval_gpu_old(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, line_times, param, sd, seed):
    """CUDA kernel: each thread evaluates one operation for one column of ``cbuf``.

    The thread's 2-D grid position selects the column (``st_start + x``) and the operation
    (``op_start + y``); threads outside ``[st_start, st_stop)`` or ``[op_start, op_stop)``
    return immediately. The selected operation's four input waveforms are read from
    ``cbuf`` as sequences of times and consumed in time order. Every consumed time flips
    one bit of ``inputs`` and the output value is looked up as bit ``inputs`` of the
    operation's look-up table. Output toggles are written to the output's region of
    ``cbuf``, which is closed with a final marker entry.

    ``TMIN``, ``TMAX`` and ``TMAX_OVL`` are constants defined outside this block
    (presumably "earliest time", "end of waveform" and "end of waveform with overflow" -
    TODO confirm).

    :param ops: 2-D array; row ``i`` holds ``(lut, z_idx, a_idx, b_idx, c_idx, d_idx)``.
    :param op_start: Index of the first operation handled by this launch.
    :param op_stop: Exclusive upper bound for operation indices.
    :param cbuf: Buffer indexed as ``cbuf[location, st_idx]``.
    :param c_locs: Maps a line index to the start location of its waveform in ``cbuf``.
    :param c_caps: Maps a line index to its capacity; only the output's entry is read.
    :param st_start: Index of the first column handled by this launch.
    :param st_stop: Exclusive upper bound for column indices.
    :param line_times: Indexed as ``[line, 0, polarity]`` for the delay added to an input
        time and as ``[line, 1, polarity]`` for the pulse-width threshold.
    :param param: Indexed by column; within the selected row, element 0 scales every delay
        and threshold, and if ``int(row[1])`` equals an input's line index, ``row[2]`` /
        ``row[3]`` (selected by polarity) are added to that input's delay and threshold.
    :param sd: Passed on to ``rand_gauss_gpu`` as the amount of random delay variation.
    :param seed: Integer combined with ``z_idx`` and ``st_idx`` to seed ``rand_gauss_gpu``.
    """
    x, y = cuda.grid(2)
    st_idx = st_start + x
    op_idx = op_start + y
    # threads that fall outside the requested ranges have nothing to do
    if st_idx >= st_stop: return
    if op_idx >= op_stop: return
    lut = ops[op_idx, 0]
    z_idx = ops[op_idx, 1]
    a_idx = ops[op_idx, 2]
    b_idx = ops[op_idx, 3]
    c_idx = ops[op_idx, 4]
    d_idx = ops[op_idx, 5]
    # from here on, `param` refers to the row belonging to this column
    param = param[st_idx]
    # >>> same code as wave_eval_cpu (except rand_gauss_*pu()-calls) >>>
    overflows = int(0)
    # base seed derived from the caller's seed, the output line and the column
    _seed = (seed << 4) + (z_idx << 20) + (st_idx << 1)
    # start locations of the input and output waveforms in cbuf
    a_mem = c_locs[a_idx]
    b_mem = c_locs[b_idx]
    c_mem = c_locs[c_idx]
    d_mem = c_locs[d_idx]
    z_mem = c_locs[z_idx]
    z_cap = c_caps[z_idx]
    # read cursors into the four input waveforms
    a_cur = int(0)
    b_cur = int(0)
    c_cur = int(0)
    d_cur = int(0)
    # z_cur is the write cursor of the output; its lowest bit doubles as the current
    # output value. Bit 0 of lut is the output for inputs == 0.
    z_cur = lut & 1
    if z_cur == 1:
        # output starts at 1: store TMIN as the first entry
        cbuf[z_mem, st_idx] = TMIN
    # first pending time of every input, shifted by its (randomized, scaled) delay
    a = cbuf[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ a_mem ^ z_cur, sd) * param[0]
    if int(param[1]) == a_idx: a += param[2+z_cur]
    b = cbuf[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ b_mem ^ z_cur, sd) * param[0]
    if int(param[1]) == b_idx: b += param[2+z_cur]
    c = cbuf[c_mem, st_idx] + line_times[c_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ c_mem ^ z_cur, sd) * param[0]
    if int(param[1]) == c_idx: c += param[2+z_cur]
    d = cbuf[d_mem, st_idx] + line_times[d_idx, 0, z_cur] * rand_gauss_gpu(_seed ^ d_mem ^ z_cur, sd) * param[0]
    if int(param[1]) == d_idx: d += param[2+z_cur]
    previous_t = TMIN
    current_t = min(a, b, c, d)
    inputs = int(0)
    # consume input times in ascending order until the earliest pending one reaches TMAX
    while current_t < TMAX:
        z_val = z_cur & 1
        # pick the input that owns current_t (a is preferred on ties, then b, c, d),
        # fetch its next time, and flip its bit in `inputs`
        if a == current_t:
            a_cur += 1
            a = cbuf[a_mem + a_cur, st_idx]
            a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ a_mem ^ z_val ^ 1, sd) * param[0]
            thresh = line_times[a_idx, 1, z_val] * rand_gauss_gpu(_seed ^ a_mem ^ z_val, sd) * param[0]
            if int(param[1]) == a_idx:
                a += param[2+(z_val^1)]
                thresh += param[2+z_val]
            inputs ^= 1
            next_t = a
        elif b == current_t:
            b_cur += 1
            b = cbuf[b_mem + b_cur, st_idx]
            b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ b_mem ^ z_val ^ 1, sd) * param[0]
            thresh = line_times[b_idx, 1, z_val] * rand_gauss_gpu(_seed ^ b_mem ^ z_val, sd) * param[0]
            if int(param[1]) == b_idx:
                b += param[2+(z_val^1)]
                thresh += param[2+z_val]
            inputs ^= 2
            next_t = b
        elif c == current_t:
            c_cur += 1
            c = cbuf[c_mem + c_cur, st_idx]
            c += line_times[c_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ c_mem ^ z_val ^ 1, sd) * param[0]
            thresh = line_times[c_idx, 1, z_val] * rand_gauss_gpu(_seed ^ c_mem ^ z_val, sd) * param[0]
            if int(param[1]) == c_idx:
                c += param[2+(z_val^1)]
                thresh += param[2+z_val]
            inputs ^= 4
            next_t = c
        else:
            d_cur += 1
            d = cbuf[d_mem + d_cur, st_idx]
            d += line_times[d_idx, 0, z_val ^ 1] * rand_gauss_gpu(_seed ^ d_mem ^ z_val ^ 1, sd) * param[0]
            thresh = line_times[d_idx, 1, z_val] * rand_gauss_gpu(_seed ^ d_mem ^ z_val, sd) * param[0]
            if int(param[1]) == d_idx:
                d += param[2+(z_val^1)]
                thresh += param[2+z_val]
            inputs ^= 8
            next_t = d
        # only act if the new input combination changes the output value
        if (z_cur & 1) != ((lut >> inputs) & 1):
            # we generate a toggle in z_mem, if:
            #   ( it is the first toggle in z_mem OR
            #   following toggle is earlier OR
            #   pulse is wide enough ) AND enough space in z_mem.
            if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh:
                if z_cur < (z_cap - 1):
                    cbuf[z_mem + z_cur, st_idx] = current_t
                    previous_t = current_t
                    z_cur += 1
                else:
                    # no room left: count the overflow and step the write cursor back by one
                    overflows += 1
                    previous_t = cbuf[z_mem + z_cur - 1, st_idx]
                    z_cur -= 1
            else:
                # pulse too narrow: step the write cursor back, discarding the previous toggle
                z_cur -= 1
                previous_t = cbuf[z_mem + z_cur - 1, st_idx] if z_cur > 0 else TMIN
        current_t = min(a, b, c, d)
    # generate or propagate overflow flag
    cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
@cuda.jit()
 def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, st_start, st_stop, delays, param, sd, seed):
    x, y = cuda.grid(2)