From 6bba7ac359351554535c7af757057fd3100ac61e Mon Sep 17 00:00:00 2001 From: Stefan Holst Date: Sat, 28 Nov 2020 10:39:13 +0900 Subject: [PATCH] support for stripping forks and memory re-use in wavesim. --- kyupy/wave_sim.py | 209 +++++++++++++++++++++++++++++++---------- kyupy/wave_sim_cuda.py | 4 +- tests/test_wave_sim.py | 8 ++ 3 files changed, 170 insertions(+), 51 deletions(-) diff --git a/kyupy/wave_sim.py b/kyupy/wave_sim.py index fa8d585..109f564 100644 --- a/kyupy/wave_sim.py +++ b/kyupy/wave_sim.py @@ -1,5 +1,7 @@ -import numpy as np import math +from bisect import bisect, insort_left + +import numpy as np from . import numba @@ -8,8 +10,74 @@ TMAX_OVL = np.float32(1.1 * 2 ** 127) # almost np.PINF with overflow mark TMIN = np.float32(-2 ** 127) # almost np.NINF for 32-bit floating point values +class Heap: + def __init__(self): + self.chunks = dict() # map start location to chunk size + self.released = list() # chunks that were released + self.current_size = 0 + self.max_size = 0 + + def alloc(self, size): + for idx, loc in enumerate(self.released): + if self.chunks[loc] == size: + del self.released[idx] + return loc + elif self.chunks[loc] > size: # split chunk + chunksize = self.chunks[loc] + self.chunks[loc] = size + self.chunks[loc + size] = chunksize - size + self.released[idx] = loc + size # move released pointer: loc -> loc+size + return loc + # no previously released chunk; make new one + loc = self.current_size + self.chunks[loc] = size + self.current_size += size + self.max_size = max(self.max_size, self.current_size) + return loc + + def free(self, loc): + size = self.chunks[loc] + if loc + size == self.current_size: # end of managed area, remove chunk + del self.chunks[loc] + self.current_size -= size + # check and remove prev chunk if free + if len(self.released) > 0: + prev = self.released[-1] + if prev + self.chunks[prev] == self.current_size: + chunksize = self.chunks[prev] + del self.chunks[prev] + del self.released[-1] + self.current_size -= chunksize + return + released_idx = bisect(self.released, loc) + if released_idx < len(self.released) and loc + size == self.released[released_idx]: # next chunk is free, merge + chunksize = size + self.chunks[loc + size] + del self.chunks[loc + size] + self.chunks[loc] = chunksize + size = self.chunks[loc] + self.released[released_idx] = loc + else: + insort_left(self.released, loc) # put in a new release + if released_idx > 0: # check if previous chunk is free + prev = self.released[released_idx - 1] + if prev + self.chunks[prev] == loc: # previous chunk is adjacent to freed one, merge + chunksize = size + self.chunks[prev] + del self.chunks[loc] + self.chunks[prev] = chunksize + del self.released[released_idx] + + def __repr__(self): + r = [] + for loc in sorted(self.chunks.keys()): + size = self.chunks[loc] + released_idx = bisect(self.released, loc) + is_released = released_idx > 0 and len(self.released) > 0 and self.released[released_idx - 1] == loc + r.append(f'{loc:5d}: {"free" if is_released else "used"} {size}') + return "\n".join(r) + + class WaveSim: - def __init__(self, circuit, timing, sims=8, wavecaps=16): + def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True): self.circuit = circuit self.sims = sims self.overflows = 0 @@ -24,67 +92,37 @@ class WaveSim: intf_wavecap = 4 # sufficient for storing only 1 transition. - # state allocation table. maps line and interface indices to self.state memory locations - - self.sat = np.zeros((len(circuit.lines) + 2 + 2 * len(self.interface), 3), dtype='int') - self.sat[:, 0] = -1 - filled = 0 - for lidx, cap in enumerate(wavecaps): - self.sat[lidx] = filled, cap, 0 - filled += cap - + # indices for state allocation table (sat) self.zero_idx = len(circuit.lines) - self.sat[self.zero_idx] = filled, intf_wavecap, 0 - filled += intf_wavecap self.tmp_idx = self.zero_idx + 1 - self.sat[self.tmp_idx] = filled, intf_wavecap, 0 - filled += intf_wavecap - self.ppi_offset = self.tmp_idx + 1 self.ppo_offset = self.ppi_offset + len(self.interface) - for i, n in enumerate(self.interface): - if len(n.outs) > 0: - self.sat[self.ppi_offset + i] = filled, intf_wavecap, 0 - filled += intf_wavecap - if len(n.ins) > 0: - self.sat[self.ppo_offset + i] = self.sat[n.ins[0].index] - - # pad timing - self.timing = np.zeros((len(self.sat), 2, 2)) - self.timing[:len(timing)] = timing + self.sat_length = self.ppo_offset + len(self.interface) - # allocate self.state - self.state = np.zeros((filled, sims), dtype='float32') + TMAX - - # generate self.ops + # translate circuit structure into self.ops ops = [] interface_dict = dict([(n, i) for i, n in enumerate(self.interface)]) for n in circuit.topological_order(): if n in interface_dict: inp_idx = self.ppi_offset + interface_dict[n] - if len(n.outs) > 0 and n.outs[0] is not None: + if len(n.outs) > 0 and n.outs[0] is not None: # first output of a PI/PPI ops.append((0b1010, n.outs[0].index, inp_idx, self.zero_idx)) - if 'dff' in n.kind.lower(): + if 'dff' in n.kind.lower(): # second output of DFF is inverted if len(n.outs) > 1 and n.outs[1] is not None: ops.append((0b0101, n.outs[1].index, inp_idx, self.zero_idx)) - else: + else: # if not DFF, no output is inverted. for o_line in n.outs[1:]: if o_line is not None: ops.append((0b1010, o_line.index, inp_idx, self.zero_idx)) - else: - o0_idx = self.tmp_idx - i0_idx = self.zero_idx - i1_idx = self.zero_idx - if len(n.outs) > 0 and n.outs[0] is not None: - o0_idx = n.outs[0].index - else: - print(f'no outputs for {n}') - if len(n.ins) > 0 and n.ins[0] is not None: i0_idx = n.ins[0].index - if len(n.ins) > 1 and n.ins[1] is not None: i1_idx = n.ins[1].index + else: # regular node, not PI/PPI or PO/PPO + o0_idx = n.outs[0].index if len(n.outs) > 0 and n.outs[0] is not None else self.tmp_idx + i0_idx = n.ins[0].index if len(n.ins) > 0 and n.ins[0] is not None else self.zero_idx + i1_idx = n.ins[1].index if len(n.ins) > 1 and n.ins[1] is not None else self.zero_idx kind = n.kind.lower() if kind == '__fork__': - for o_line in n.outs: - ops.append((0b1010, o_line.index, i0_idx, i1_idx)) + if not strip_forks: + for o_line in n.outs: + ops.append((0b1010, o_line.index, i0_idx, i1_idx)) elif kind.startswith('nand'): ops.append((0b0111, o0_idx, i0_idx, i1_idx)) elif kind.startswith('nor'): @@ -109,18 +147,91 @@ class WaveSim: print('unknown gate type', kind) self.ops = np.asarray(ops, dtype='int32') - # generate level data - levels = np.zeros(len(self.sat), dtype='int32') + # create a map from fanout lines to stem lines for fork stripping + stems = np.zeros(self.sat_length, dtype='int32') - 1 # default to -1: 'no fanout line' + if strip_forks: + for f in circuit.forks.values(): + prev_line = f.ins[0] + while prev_line.driver.kind == '__fork__': + prev_line = prev_line.driver.ins[0] + stem_idx = prev_line.index + for ol in f.outs: + stems[ol.index] = stem_idx + + # calculate level (distance from PI/PPI) and reference count for each line + levels = np.zeros(self.sat_length, dtype='int32') + ref_count = np.zeros(self.sat_length, dtype='int32') level_starts = [0] current_level = 1 for i, op in enumerate(self.ops): - if levels[op[2]] >= current_level or levels[op[3]] >= current_level: + # if we fork-strip, always take the stems for determining fan-in level + i0_idx = stems[op[2]] if stems[op[2]] >= 0 else op[2] + i1_idx = stems[op[3]] if stems[op[3]] >= 0 else op[3] + if levels[i0_idx] >= current_level or levels[i1_idx] >= current_level: current_level += 1 level_starts.append(i) - levels[op[1]] = current_level + levels[op[1]] = current_level # set level of the output line + ref_count[i0_idx] += 1 + ref_count[i1_idx] += 1 self.level_starts = np.asarray(level_starts, dtype='int32') self.level_stops = np.asarray(level_starts[1:] + [len(self.ops)], dtype='int32') + # state allocation table. maps line and interface indices to self.state memory locations + self.sat = np.zeros((self.sat_length, 3), dtype='int') + self.sat[:, 0] = -1 + + h = Heap() + + # allocate and keep memory for special fields + self.sat[self.zero_idx] = h.alloc(intf_wavecap), intf_wavecap, 0 + self.sat[self.tmp_idx] = h.alloc(intf_wavecap), intf_wavecap, 0 + ref_count[self.zero_idx] += 1 + ref_count[self.tmp_idx] += 1 + + # allocate and keep memory for PI/PPI, keep memory for PO/PPO (allocated later) + for i, n in enumerate(self.interface): + if len(n.outs) > 0: + self.sat[self.ppi_offset + i] = h.alloc(intf_wavecap), intf_wavecap, 0 + ref_count[self.ppi_offset + i] += 1 + if len(n.ins) > 0: + i0_idx = stems[n.ins[0].index] if stems[n.ins[0].index] >= 0 else n.ins[0].index + ref_count[i0_idx] += 1 + + # allocate memory for the rest of the circuit + for op_start, op_stop in zip(self.level_starts, self.level_stops): + free_list = [] + for op in self.ops[op_start:op_stop]: + # if we fork-strip, always take the stems + i0_idx = stems[op[2]] if stems[op[2]] >= 0 else op[2] + i1_idx = stems[op[3]] if stems[op[3]] >= 0 else op[3] + ref_count[i0_idx] -= 1 + ref_count[i1_idx] -= 1 + if ref_count[i0_idx] <= 0: free_list.append(self.sat[i0_idx, 0]) + if ref_count[i1_idx] <= 0: free_list.append(self.sat[i1_idx, 0]) + o_idx = op[1] + cap = wavecaps[o_idx] + self.sat[o_idx] = h.alloc(cap), cap, 0 + if not keep_waveforms: + for loc in free_list: + h.free(loc) + + # copy memory location and capacity from stems to fanout lines + for lidx, stem in enumerate(stems): + if stem >= 0: # if at a fanout line + self.sat[lidx] = self.sat[stem] + + # copy memory location to PO/PPO area + for i, n in enumerate(self.interface): + if len(n.ins) > 0: + self.sat[self.ppo_offset + i] = self.sat[n.ins[0].index] + + # pad timing + self.timing = np.zeros((self.sat_length, 2, 2)) + self.timing[:len(timing)] = timing + + # allocate self.state + self.state = np.zeros((h.max_size, sims), dtype='float32') + TMAX + m1 = np.array([2 ** x for x in range(7, -1, -1)], dtype='uint8') m0 = ~m1 self.mask = np.rollaxis(np.vstack((m0, m1)), 1) diff --git a/kyupy/wave_sim_cuda.py b/kyupy/wave_sim_cuda.py index 1b6ee80..c5eb2e9 100644 --- a/kyupy/wave_sim_cuda.py +++ b/kyupy/wave_sim_cuda.py @@ -9,8 +9,8 @@ TMIN = np.float32(-2 ** 127) # almost np.NINF for 32-bit floating point values class WaveSimCuda(WaveSim): - def __init__(self, circuit, timing, sims=8, wavecaps=16): - super().__init__(circuit, timing, sims, wavecaps) + def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True): + super().__init__(circuit, timing, sims, wavecaps, strip_forks, keep_waveforms) self.tdata = np.zeros((len(self.interface), 3, (sims - 1) // 8 + 1), dtype='uint8') diff --git a/tests/test_wave_sim.py b/tests/test_wave_sim.py index 8723cdc..35eb760 100644 --- a/tests/test_wave_sim.py +++ b/tests/test_wave_sim.py @@ -130,6 +130,14 @@ def test_b14(mydir): compare_to_logic_sim(wsim) +def test_b14_strip_forks(mydir): + c = verilog.parse(mydir / 'b14.v.gz', branchforks=True) + df = sdf.parse(mydir / 'b14.sdf.gz') + lt = df.annotation(c, pin_index) + wsim = WaveSim(c, lt, 8, strip_forks=True) + compare_to_logic_sim(wsim) + + def test_b14_cuda(mydir): c = verilog.parse(mydir / 'b14.v.gz', branchforks=True) df = sdf.parse(mydir / 'b14.sdf.gz')