new common scheduler for simulators

2 years ago · 6801606dca
1 changed files with 305 additions and 0 deletions
--- a/src/kyupy/schedule.py
+++ b/src/kyupy/schedule.py
@ -0,0 +1,305 @@
 import math
 from bisect import bisect, insort_left
 import numpy as np
 class SimPrim:
    """Named 16-bit integer constants describing the basic logic operations known to the scheduler.

    Each constant is a 16-bit pattern. It is presumably a truth table that is indexed by the values of up to
    four inputs with input 0 as the least-significant index bit (e.g. ``BUF1`` has a 1 at every odd position);
    confirm the exact convention against the simulator kernels that consume these values.

    NOTE(review): under that reading, ``AO21``/``AOI21``/``OA21``/``OAI21`` treat input 0 as the single pin and
    ``MUX21`` treats input 0 as the select pin - verify against the pin order of the cell library.
    """
    # single-input primitives
    BUF1 = 0b1010_1010_1010_1010
    INV1 = 0b0101_0101_0101_0101
    # simple gates in 4-, 3- and 2-input variants
    NAND4 = 0b0111_1111_1111_1111
    NAND3 = 0b0111_1111_0111_1111
    NAND2 = 0b0111_0111_0111_0111
    NOR4 = 0b0000_0000_0000_0001
    NOR3 = 0b0000_0001_0000_0001
    NOR2 = 0b0001_0001_0001_0001
    AND4 = 0b1000_0000_0000_0000
    AND3 = 0b1000_0000_1000_0000
    AND2 = 0b1000_1000_1000_1000
    OR4 = 0b1111_1111_1111_1110
    OR3 = 0b1111_1110_1111_1110
    OR2 = 0b1110_1110_1110_1110
    XOR4 = 0b0110_1001_1001_0110
    XOR3 = 0b1001_0110_1001_0110
    XOR2 = 0b0110_0110_0110_0110
    XNOR4 = 0b1001_0110_0110_1001
    XNOR3 = 0b0110_1001_0110_1001
    XNOR2 = 0b1001_1001_1001_1001
    # complex gates
    AO22 = 0b1111_1000_1000_1000
    AOI22 = 0b0000_0111_0111_0111
    AO21 = 0b1110_1010_1110_1010
    AOI21 = 0b0001_0101_0001_0101
    OA22 = 0b1110_1110_1110_0000
    OAI22 = 0b0001_0001_0001_1111
    OA21 = 0b1010_1000_1010_1000
    OAI21 = 0b0101_0111_0101_0111
    MUX21 = 0b1110_0100_1110_0100
    # Maps a lower-case node-kind prefix to a 3-tuple of primitives. The scheduler picks entry 0 by default,
    # entry 1 if the fourth input is unused, and entry 2 if the third input is unused as well.
    # Prefixes are tested in insertion order with str.startswith; the first match wins.
    kind_prefixes = {
        'nand': (NAND4, NAND3, NAND2),
        'nor': (NOR4, NOR3, NOR2),
        'and': (AND4, AND3, AND2),
        'or': (OR4, OR3, OR2),
        'xor': (XOR4, XOR3, XOR2),
        'xnor': (XNOR4, XNOR3, XNOR2),
        'not': (INV1, INV1, INV1),
        'inv': (INV1, INV1, INV1),
        'ibuf': (INV1, INV1, INV1),
        '__const1__': (INV1, INV1, INV1),
        'tieh': (INV1, INV1, INV1),
        'buf': (BUF1, BUF1, BUF1),
        'nbuf': (BUF1, BUF1, BUF1),
        'delln': (BUF1, BUF1, BUF1),
        '__const0__': (BUF1, BUF1, BUF1),
        'tiel': (BUF1, BUF1, BUF1),
        'ao22': (AO22, AO22, AO22),
        'aoi22': (AOI22, AOI22, AOI22),
        'ao21': (AO21, AO21, AO21),
        'aoi21': (AOI21, AOI21, AOI21),
        'oa22': (OA22, OA22, OA22),
        'oai22': (OAI22, OAI22, OAI22),
        'oa21': (OA21, OA21, OA21),
        'oai21': (OAI21, OAI21, OAI21),
        'mux21': (MUX21, MUX21, MUX21),
    }

    @classmethod
    def names(cls):
        """Return a dict mapping every integer constant defined directly on this class to its attribute name."""
        return {v: k for k, v in cls.__dict__.items() if isinstance(v, int)}
 class Heap:
    """Bookkeeping for a linear memory area that is handed out in chunks.

    Only locations and sizes are tracked; no actual memory is held. Released chunks are kept in a sorted
    list, are re-used (and split if necessary) by later allocations, and are merged with adjacent released
    chunks. ``max_size`` records the largest extent the managed area ever reached.
    """
    def __init__(self):
        self.chunks = {}  # start location -> size, for used and released chunks alike
        self.released = []  # start locations of released chunks, kept in ascending order
        self.current_size = 0
        self.max_size = 0

    def alloc(self, size):
        """Reserve a chunk of ``size`` units and return its start location.

        The first released chunk that is large enough is taken; any surplus stays released.
        If none fits, the managed area grows at its end.
        """
        for pos, start in enumerate(self.released):
            available = self.chunks[start]
            if available < size:
                continue  # too small, try the next released chunk
            if available == size:  # exact fit: chunk is simply no longer released
                self.released.pop(pos)
            else:  # split: front part is handed out, remainder stays released
                remainder = start + size
                self.chunks[start] = size
                self.chunks[remainder] = available - size
                self.released[pos] = remainder
            return start
        # nothing suitable was released before; append a fresh chunk
        start = self.current_size
        self.chunks[start] = size
        self.current_size += size
        if self.current_size > self.max_size:
            self.max_size = self.current_size
        return start

    def free(self, loc):
        """Release the chunk that starts at ``loc``.

        A chunk at the end of the managed area shrinks the area (together with a released chunk that
        directly precedes it). Any other chunk is recorded as released and merged with released neighbors.
        """
        length = self.chunks[loc]
        if loc + length == self.current_size:  # last chunk of the managed area: drop it
            self.current_size -= self.chunks.pop(loc)
            if self.released:
                tail = self.released[-1]
                if tail + self.chunks[tail] == self.current_size:  # released chunk is now last: drop it too
                    self.released.pop()
                    self.current_size -= self.chunks.pop(tail)
            return
        pos = bisect(self.released, loc)
        follower = loc + length
        if pos < len(self.released) and self.released[pos] == follower:
            # following chunk is released: absorb it and take over its entry in the released list
            length += self.chunks.pop(follower)
            self.chunks[loc] = length
            self.released[pos] = loc
        else:
            insort_left(self.released, loc)  # ends up at index pos
        if pos > 0:
            before = self.released[pos - 1]
            if before + self.chunks[before] == loc:  # preceding released chunk is adjacent: let it absorb this one
                merged = length + self.chunks[before]
                del self.chunks[loc]
                self.chunks[before] = merged
                del self.released[pos]

    def __repr__(self):
        rows = []
        for loc in sorted(self.chunks):
            pos = bisect(self.released, loc)
            state = "free" if pos > 0 and self.released[pos - 1] == loc else "used"
            rows.append(f'{loc:5d}: {state} {self.chunks[loc]}')
        return "\n".join(rows)
 class Schedule:
    """A static scheduler that translates a Circuit into a topologically sorted list of basic logic operations (self.ops) and
    a value allocation table (self.vat) for use in simulators.

    Each row of ``self.ops`` is ``(primitive, output index, input 0, input 1, input 2, input 3)``; unused inputs
    refer to ``self.zero_idx``. Each row of ``self.vat`` holds ``(memory location, capacity, 0)`` for a line or
    interface entry; the location is -1 for entries without memory. ``self.state_length`` is the total amount of
    memory required.

    :param circuit: The circuit to create a schedule for.
    :param strip_forks: If enabled, the scheduler will not include fork nodes to save simulation time.
        Stripping forks will cause interconnect delay annotations of lines read by fork nodes to be ignored.
    :param keep_signals: If disabled, memory of intermediate signal waveforms will be re-used. This greatly reduces
        memory footprint, but intermediate signal waveforms become inaccessible after a propagation.
    :param signal_caps: Capacity (number of memory units) to allocate for each line. Either a single int that is
        used for every line, or a sequence that is indexed by line index.
    """
    def __init__(self, circuit, strip_forks=False, keep_signals=True, signal_caps=1):
        self.circuit = circuit
        self.interface = list(circuit.interface) + [n for n in circuit.nodes if 'dff' in n.kind.lower()]
        if isinstance(signal_caps, int):
            signal_caps = [signal_caps] * len(circuit.lines)

        # indices for value allocation table (vat): lines first, then two special entries, then PI/PPI and PO/PPO
        self.zero_idx = len(circuit.lines)
        self.tmp_idx = self.zero_idx + 1
        self.ppi_offset = self.tmp_idx + 1
        self.ppo_offset = self.ppi_offset + len(self.interface)
        self.vat_length = self.ppo_offset + len(self.interface)

        # translate circuit structure into self.ops
        ops = []
        interface_dict = {n: i for i, n in enumerate(self.interface)}
        for n in circuit.topological_order():
            if n in interface_dict:
                inp_idx = self.ppi_offset + interface_dict[n]
                if len(n.outs) > 0 and n.outs[0] is not None:  # first output of a PI/PPI
                    ops.append((SimPrim.BUF1, n.outs[0].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx))
                if 'dff' in n.kind.lower():  # second output of DFF is inverted
                    if len(n.outs) > 1 and n.outs[1] is not None:
                        ops.append((SimPrim.INV1, n.outs[1].index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx))
                else:  # if not DFF, no output is inverted.
                    for o_line in n.outs[1:]:
                        if o_line is not None:
                            ops.append((SimPrim.BUF1, o_line.index, inp_idx, self.zero_idx, self.zero_idx, self.zero_idx))
                continue
            # regular node, not PI/PPI or PO/PPO
            o0_idx = n.outs[0].index if len(n.outs) > 0 and n.outs[0] is not None else self.tmp_idx
            i0_idx = n.ins[0].index if len(n.ins) > 0 and n.ins[0] is not None else self.zero_idx
            i1_idx = n.ins[1].index if len(n.ins) > 1 and n.ins[1] is not None else self.zero_idx
            i2_idx = n.ins[2].index if len(n.ins) > 2 and n.ins[2] is not None else self.zero_idx
            i3_idx = n.ins[3].index if len(n.ins) > 3 and n.ins[3] is not None else self.zero_idx
            kind = n.kind.lower()
            if kind == '__fork__':
                if not strip_forks:
                    for o_line in n.outs:
                        if o_line is not None:
                            ops.append((SimPrim.BUF1, o_line.index, i0_idx, i1_idx, i2_idx, i3_idx))
                continue
            sp = None
            for prefix, prims in SimPrim.kind_prefixes.items():
                if kind.startswith(prefix):
                    # pick the 4-, 3- or 2-input variant depending on which trailing inputs are unused
                    sp = prims[0]
                    if i3_idx == self.zero_idx:
                        sp = prims[1]
                        if i2_idx == self.zero_idx:
                            sp = prims[2]
                    break
            if sp is None:
                print('unknown gate type', kind)  # node is skipped; its output line gets no memory
            else:
                ops.append((sp, o0_idx, i0_idx, i1_idx, i2_idx, i3_idx))
        self.ops = np.asarray(ops, dtype='int32')

        # create a map from fanout lines to stem lines for fork stripping
        stems = np.full(self.vat_length, -1, dtype='int32')  # default to -1: 'no fanout line'
        if strip_forks:
            for f in circuit.forks.values():
                prev_line = f.ins[0]
                while prev_line.driver.kind == '__fork__':
                    prev_line = prev_line.driver.ins[0]
                stem_idx = prev_line.index
                for ol in f.outs:
                    # Unconnected outputs must be skipped: indexing a numpy array with None
                    # addresses the whole array and would overwrite every entry of the map.
                    if ol is not None:
                        stems[ol] = stem_idx

        # calculate level (distance from PI/PPI) and reference count for each line
        levels = np.zeros(self.vat_length, dtype='int32')
        ref_count = np.zeros(self.vat_length, dtype='int32')
        level_starts = [0]
        current_level = 1
        for i, op in enumerate(self.ops):
            # if we fork-strip, always take the stems for determining fan-in level
            in_idxs = [stems[x] if stems[x] >= 0 else x for x in op[2:6]]
            if any(levels[x] >= current_level for x in in_idxs):
                # an input is produced in the current level, so this op starts the next one
                current_level += 1
                level_starts.append(i)
            levels[op[1]] = current_level  # set level of the output line
            for x in in_idxs:
                ref_count[x] += 1
        self.level_starts = np.asarray(level_starts, dtype='int32')
        self.level_stops = np.asarray(level_starts[1:] + [len(self.ops)], dtype='int32')

        # value allocation table. maps line and interface indices to memory locations
        self.vat = np.zeros((self.vat_length, 3), dtype='int')
        self.vat[:, 0] = -1
        h = Heap()

        # allocate and keep memory for special fields
        self.vat[self.zero_idx] = h.alloc(1), 1, 0
        self.vat[self.tmp_idx] = h.alloc(1), 1, 0
        ref_count[self.zero_idx] += 1  # the extra reference prevents the memory from ever being released
        ref_count[self.tmp_idx] += 1

        # allocate and keep memory for PI/PPI, keep memory for PO/PPO (allocated later)
        for i, n in enumerate(self.interface):
            if len(n.outs) > 0:
                self.vat[self.ppi_offset + i] = h.alloc(1), 1, 0
                ref_count[self.ppi_offset + i] += 1
            if len(n.ins) > 0 and n.ins[0] is not None:  # unconnected inputs have nothing to keep
                i0_idx = stems[n.ins[0]] if stems[n.ins[0]] >= 0 else n.ins[0]
                ref_count[i0_idx] += 1

        # allocate memory for the rest of the circuit
        for op_start, op_stop in zip(self.level_starts, self.level_stops):
            free_list = []
            for op in self.ops[op_start:op_stop]:
                # if we fork-strip, always take the stems
                in_idxs = [stems[x] if stems[x] >= 0 else x for x in op[2:6]]
                for x in in_idxs:
                    ref_count[x] -= 1
                free_list.extend(self.vat[x, 0] for x in in_idxs if ref_count[x] <= 0)
                o_idx = op[1]
                if o_idx == self.tmp_idx:
                    # Nodes without an output line write to the dummy entry, which already has its memory
                    # and lies outside the range covered by signal_caps.
                    continue
                cap = signal_caps[o_idx]
                self.vat[o_idx] = h.alloc(cap), cap, 0
            if not keep_signals:
                # Release inputs only after the whole level has been allocated. A location may have been
                # collected more than once (an op reading the same line on several inputs) and is -1 for
                # lines that never received memory; release every real location exactly once.
                for loc in dict.fromkeys(free_list):
                    if loc >= 0:
                        h.free(loc)

        # copy memory location and capacity from stems to fanout lines
        for lidx, stem in enumerate(stems):
            if stem >= 0:  # if at a fanout line
                self.vat[lidx] = self.vat[stem]

        # copy memory location to PO/PPO area
        for i, n in enumerate(self.interface):
            if len(n.ins) > 0 and n.ins[0] is not None:
                self.vat[self.ppo_offset + i] = self.vat[n.ins[0]]

        self.state_length = h.max_size