From 6bba7ac359351554535c7af757057fd3100ac61e Mon Sep 17 00:00:00 2001
From: Stefan Holst <mail@s-holst.de>
Date: Sat, 28 Nov 2020 10:39:13 +0900
Subject: [PATCH] support for stripping forks and memory re-use in wavesim.

---
 kyupy/wave_sim.py      | 209 +++++++++++++++++++++++++++++++----------
 kyupy/wave_sim_cuda.py |   4 +-
 tests/test_wave_sim.py |   8 ++
 3 files changed, 170 insertions(+), 51 deletions(-)

diff --git a/kyupy/wave_sim.py b/kyupy/wave_sim.py
index fa8d585..109f564 100644
--- a/kyupy/wave_sim.py
+++ b/kyupy/wave_sim.py
@@ -1,5 +1,7 @@
-import numpy as np
 import math
+from bisect import bisect, insort_left
+
+import numpy as np
 from . import numba
 
 
@@ -8,8 +10,74 @@ TMAX_OVL = np.float32(1.1 * 2 ** 127)  # almost np.PINF with overflow mark
 TMIN = np.float32(-2 ** 127)  # almost np.NINF for 32-bit floating point values
 
 
+class Heap:  # bookkeeping-only allocator: hands out integer locations, re-uses and merges released chunks
+    def __init__(self):
+        self.chunks = dict()  # maps start location to chunk size, for used and released chunks alike
+        self.released = list()  # start locations of released chunks, kept in ascending order (free() uses bisect)
+        self.current_size = 0  # end of the managed area; new chunks are appended here
+        self.max_size = 0  # largest value current_size has reached so far
+
+    def alloc(self, size):  # returns the start location of a chunk of exactly `size`
+        for idx, loc in enumerate(self.released):  # first fit, scanning released chunks by ascending location
+            if self.chunks[loc] == size:
+                del self.released[idx]
+                return loc
+            elif self.chunks[loc] > size:  # split chunk: front part is handed out, remainder stays released
+                chunksize = self.chunks[loc]
+                self.chunks[loc] = size
+                self.chunks[loc + size] = chunksize - size
+                self.released[idx] = loc + size  # move released pointer: loc -> loc+size
+                return loc
+        # no released chunk is large enough; append a new chunk at the end of the managed area
+        loc = self.current_size
+        self.chunks[loc] = size
+        self.current_size += size
+        self.max_size = max(self.max_size, self.current_size)
+        return loc
+
+    def free(self, loc):  # releases the chunk starting at `loc`; raises KeyError if no chunk starts there
+        size = self.chunks[loc]
+        if loc + size == self.current_size:  # end of managed area, remove chunk
+            del self.chunks[loc]
+            self.current_size -= size
+            # if the highest released chunk now ends the managed area, remove it as well
+            if len(self.released) > 0:
+                prev = self.released[-1]
+                if prev + self.chunks[prev] == self.current_size:
+                    chunksize = self.chunks[prev]
+                    del self.chunks[prev]
+                    del self.released[-1]
+                    self.current_size -= chunksize
+            return
+        released_idx = bisect(self.released, loc)  # index of the first released chunk located after loc
+        if released_idx < len(self.released) and loc + size == self.released[released_idx]:  # next chunk is free, merge
+            chunksize = size + self.chunks[loc + size]
+            del self.chunks[loc + size]
+            self.chunks[loc] = chunksize
+            size = self.chunks[loc]  # merged size, used by the merge with the previous chunk below
+            self.released[released_idx] = loc  # merged chunk takes over the next chunk's slot in the list
+        else:
+            insort_left(self.released, loc)  # put in a new release
+        if released_idx > 0:  # check if previous chunk is free
+            prev = self.released[released_idx - 1]
+            if prev + self.chunks[prev] == loc:  # previous chunk is adjacent to freed one, merge
+                chunksize = size + self.chunks[prev]
+                del self.chunks[loc]
+                self.chunks[prev] = chunksize
+                del self.released[released_idx]  # entry for loc is absorbed into prev
+
+    def __repr__(self):  # one text line per chunk in ascending location order: location, used/free, size
+        r = []
+        for loc in sorted(self.chunks.keys()):
+            size = self.chunks[loc]
+            released_idx = bisect(self.released, loc)
+            is_released = released_idx > 0 and len(self.released) > 0 and self.released[released_idx - 1] == loc
+            r.append(f'{loc:5d}: {"free" if is_released else "used"} {size}')
+        return "\n".join(r)
+
+
 class WaveSim:
-    def __init__(self, circuit, timing, sims=8, wavecaps=16):
+    def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True):
         self.circuit = circuit
         self.sims = sims
         self.overflows = 0
@@ -24,67 +92,37 @@ class WaveSim:
 
         intf_wavecap = 4  # sufficient for storing only 1 transition.
 
-        # state allocation table. maps line and interface indices to self.state memory locations
-
-        self.sat = np.zeros((len(circuit.lines) + 2 + 2 * len(self.interface), 3), dtype='int')
-        self.sat[:, 0] = -1
-        filled = 0
-        for lidx, cap in enumerate(wavecaps):
-            self.sat[lidx] = filled, cap, 0
-            filled += cap
-
+        # indices for state allocation table (sat)
         self.zero_idx = len(circuit.lines)
-        self.sat[self.zero_idx] = filled, intf_wavecap, 0
-        filled += intf_wavecap
         self.tmp_idx = self.zero_idx + 1
-        self.sat[self.tmp_idx] = filled, intf_wavecap, 0
-        filled += intf_wavecap
-
         self.ppi_offset = self.tmp_idx + 1
         self.ppo_offset = self.ppi_offset + len(self.interface)
-        for i, n in enumerate(self.interface):
-            if len(n.outs) > 0:
-                self.sat[self.ppi_offset + i] = filled, intf_wavecap, 0
-                filled += intf_wavecap
-            if len(n.ins) > 0:
-                self.sat[self.ppo_offset + i] = self.sat[n.ins[0].index]
-
-        # pad timing
-        self.timing = np.zeros((len(self.sat), 2, 2))
-        self.timing[:len(timing)] = timing
+        self.sat_length = self.ppo_offset + len(self.interface)
 
-        # allocate self.state
-        self.state = np.zeros((filled, sims), dtype='float32') + TMAX
-
-        # generate self.ops
+        # translate circuit structure into self.ops
         ops = []
         interface_dict = dict([(n, i) for i, n in enumerate(self.interface)])
         for n in circuit.topological_order():
             if n in interface_dict:
                 inp_idx = self.ppi_offset + interface_dict[n]
-                if len(n.outs) > 0 and n.outs[0] is not None:
+                if len(n.outs) > 0 and n.outs[0] is not None:  # first output of a PI/PPI
                     ops.append((0b1010, n.outs[0].index, inp_idx, self.zero_idx))
-                if 'dff' in n.kind.lower():
+                if 'dff' in n.kind.lower():  # second output of DFF is inverted
                     if len(n.outs) > 1 and n.outs[1] is not None:
                         ops.append((0b0101, n.outs[1].index, inp_idx, self.zero_idx))
-                else:
+                else:  # if not DFF, no output is inverted.
                     for o_line in n.outs[1:]:
                         if o_line is not None:
                             ops.append((0b1010, o_line.index, inp_idx, self.zero_idx))
-            else:
-                o0_idx = self.tmp_idx
-                i0_idx = self.zero_idx
-                i1_idx = self.zero_idx
-                if len(n.outs) > 0 and n.outs[0] is not None:
-                    o0_idx = n.outs[0].index
-                else:
-                    print(f'no outputs for {n}')
-                if len(n.ins) > 0 and n.ins[0] is not None: i0_idx = n.ins[0].index
-                if len(n.ins) > 1 and n.ins[1] is not None: i1_idx = n.ins[1].index
+            else:  # regular node, not PI/PPI or PO/PPO
+                o0_idx = n.outs[0].index if len(n.outs) > 0 and n.outs[0] is not None else self.tmp_idx
+                i0_idx = n.ins[0].index if len(n.ins) > 0 and n.ins[0] is not None else self.zero_idx
+                i1_idx = n.ins[1].index if len(n.ins) > 1 and n.ins[1] is not None else self.zero_idx
                 kind = n.kind.lower()
                 if kind == '__fork__':
-                    for o_line in n.outs:
-                        ops.append((0b1010, o_line.index, i0_idx, i1_idx))
+                    if not strip_forks:
+                        for o_line in n.outs:
+                            ops.append((0b1010, o_line.index, i0_idx, i1_idx))
                 elif kind.startswith('nand'):
                     ops.append((0b0111, o0_idx, i0_idx, i1_idx))
                 elif kind.startswith('nor'):
@@ -109,18 +147,91 @@ class WaveSim:
                     print('unknown gate type', kind)
         self.ops = np.asarray(ops, dtype='int32')
 
-        # generate level data
-        levels = np.zeros(len(self.sat), dtype='int32')
+        # create a map from fanout lines to stem lines for fork stripping
+        stems = np.zeros(self.sat_length, dtype='int32') - 1  # default to -1: 'no fanout line'
+        if strip_forks:
+            for f in circuit.forks.values():
+                prev_line = f.ins[0]
+                while prev_line.driver.kind == '__fork__':
+                    prev_line = prev_line.driver.ins[0]
+                stem_idx = prev_line.index
+                for ol in f.outs:
+                    stems[ol.index] = stem_idx
+
+        # calculate level (distance from PI/PPI) and reference count for each line
+        levels = np.zeros(self.sat_length, dtype='int32')
+        ref_count = np.zeros(self.sat_length, dtype='int32')
         level_starts = [0]
         current_level = 1
         for i, op in enumerate(self.ops):
-            if levels[op[2]] >= current_level or levels[op[3]] >= current_level:
+            # if we fork-strip, always take the stems for determining fan-in level
+            i0_idx = stems[op[2]] if stems[op[2]] >= 0 else op[2]
+            i1_idx = stems[op[3]] if stems[op[3]] >= 0 else op[3]
+            if levels[i0_idx] >= current_level or levels[i1_idx] >= current_level:
                 current_level += 1
                 level_starts.append(i)
-            levels[op[1]] = current_level
+            levels[op[1]] = current_level  # set level of the output line
+            ref_count[i0_idx] += 1
+            ref_count[i1_idx] += 1
         self.level_starts = np.asarray(level_starts, dtype='int32')
         self.level_stops = np.asarray(level_starts[1:] + [len(self.ops)], dtype='int32')
 
+        # state allocation table. maps line and interface indices to self.state memory locations
+        self.sat = np.zeros((self.sat_length, 3), dtype='int')
+        self.sat[:, 0] = -1
+
+        h = Heap()
+
+        # allocate memory for the zero and tmp entries; the extra reference below keeps them from ever being freed
+        self.sat[self.zero_idx] = h.alloc(intf_wavecap), intf_wavecap, 0
+        self.sat[self.tmp_idx] = h.alloc(intf_wavecap), intf_wavecap, 0
+        ref_count[self.zero_idx] += 1
+        ref_count[self.tmp_idx] += 1
+
+        # allocate PI/PPI memory and pin it with an extra reference; also pin the lines driving PO/PPO (allocated later)
+        for i, n in enumerate(self.interface):
+            if len(n.outs) > 0:
+                self.sat[self.ppi_offset + i] = h.alloc(intf_wavecap), intf_wavecap, 0
+                ref_count[self.ppi_offset + i] += 1
+            if len(n.ins) > 0:
+                i0_idx = stems[n.ins[0].index] if stems[n.ins[0].index] >= 0 else n.ins[0].index
+                ref_count[i0_idx] += 1
+
+        # allocate memory for the rest of the circuit level by level; unless keep_waveforms, unreferenced inputs are freed after each level
+        for op_start, op_stop in zip(self.level_starts, self.level_stops):
+            free_list = []
+            for op in self.ops[op_start:op_stop]:
+                # if we fork-strip, always take the stems (NOTE(review): if i0_idx == i1_idx, one location is queued twice in free_list)
+                i0_idx = stems[op[2]] if stems[op[2]] >= 0 else op[2]
+                i1_idx = stems[op[3]] if stems[op[3]] >= 0 else op[3]
+                ref_count[i0_idx] -= 1
+                ref_count[i1_idx] -= 1
+                if ref_count[i0_idx] <= 0: free_list.append(self.sat[i0_idx, 0])
+                if ref_count[i1_idx] <= 0: free_list.append(self.sat[i1_idx, 0])
+                o_idx = op[1]
+                cap = wavecaps[o_idx]
+                self.sat[o_idx] = h.alloc(cap), cap, 0
+            if not keep_waveforms:
+                for loc in free_list:
+                    h.free(loc)
+
+        # copy memory location and capacity from stems to fanout lines
+        for lidx, stem in enumerate(stems):
+            if stem >= 0:  # if at a fanout line
+                self.sat[lidx] = self.sat[stem]
+
+        # copy memory location to PO/PPO area
+        for i, n in enumerate(self.interface):
+            if len(n.ins) > 0:
+                self.sat[self.ppo_offset + i] = self.sat[n.ins[0].index]
+
+        # pad timing
+        self.timing = np.zeros((self.sat_length, 2, 2))
+        self.timing[:len(timing)] = timing
+
+        # allocate self.state
+        self.state = np.zeros((h.max_size, sims), dtype='float32') + TMAX
+
         m1 = np.array([2 ** x for x in range(7, -1, -1)], dtype='uint8')
         m0 = ~m1
         self.mask = np.rollaxis(np.vstack((m0, m1)), 1)
diff --git a/kyupy/wave_sim_cuda.py b/kyupy/wave_sim_cuda.py
index 1b6ee80..c5eb2e9 100644
--- a/kyupy/wave_sim_cuda.py
+++ b/kyupy/wave_sim_cuda.py
@@ -9,8 +9,8 @@ TMIN = np.float32(-2 ** 127)  # almost np.NINF for 32-bit floating point values
 
 
 class WaveSimCuda(WaveSim):
-    def __init__(self, circuit, timing, sims=8, wavecaps=16):
-        super().__init__(circuit, timing, sims, wavecaps)
+    def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True):
+        super().__init__(circuit, timing, sims, wavecaps, strip_forks, keep_waveforms)
 
         self.tdata = np.zeros((len(self.interface), 3, (sims - 1) // 8 + 1), dtype='uint8')
 
diff --git a/tests/test_wave_sim.py b/tests/test_wave_sim.py
index 8723cdc..35eb760 100644
--- a/tests/test_wave_sim.py
+++ b/tests/test_wave_sim.py
@@ -130,6 +130,14 @@ def test_b14(mydir):
     compare_to_logic_sim(wsim)
 
 
+def test_b14_strip_forks(mydir):  # runs compare_to_logic_sim on a b14 WaveSim built with strip_forks=True
+    c = verilog.parse(mydir / 'b14.v.gz', branchforks=True)
+    df = sdf.parse(mydir / 'b14.sdf.gz')
+    lt = df.annotation(c, pin_index)
+    wsim = WaveSim(c, lt, 8, strip_forks=True)  # 8 is passed positionally as `sims`
+    compare_to_logic_sim(wsim)
+
+
 def test_b14_cuda(mydir):
     c = verilog.parse(mydir / 'b14.v.gz', branchforks=True)
     df = sdf.parse(mydir / 'b14.sdf.gz')