You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
521 lines
20 KiB
521 lines
20 KiB
import math |
|
from bisect import bisect, insort_left |
|
|
|
import numpy as np |
|
from . import numba |
|
|
|
|
|
TMAX = np.float32(2 ** 127) # almost np.PINF for 32-bit floating point values |
|
TMAX_OVL = np.float32(1.1 * 2 ** 127) # almost np.PINF with overflow mark |
|
TMIN = np.float32(-2 ** 127) # almost np.NINF for 32-bit floating point values |
|
|
|
|
|
class Heap: |
|
def __init__(self): |
|
self.chunks = dict() # map start location to chunk size |
|
self.released = list() # chunks that were released |
|
self.current_size = 0 |
|
self.max_size = 0 |
|
|
|
def alloc(self, size): |
|
for idx, loc in enumerate(self.released): |
|
if self.chunks[loc] == size: |
|
del self.released[idx] |
|
return loc |
|
elif self.chunks[loc] > size: # split chunk |
|
chunksize = self.chunks[loc] |
|
self.chunks[loc] = size |
|
self.chunks[loc + size] = chunksize - size |
|
self.released[idx] = loc + size # move released pointer: loc -> loc+size |
|
return loc |
|
# no previously released chunk; make new one |
|
loc = self.current_size |
|
self.chunks[loc] = size |
|
self.current_size += size |
|
self.max_size = max(self.max_size, self.current_size) |
|
return loc |
|
|
|
def free(self, loc): |
|
size = self.chunks[loc] |
|
if loc + size == self.current_size: # end of managed area, remove chunk |
|
del self.chunks[loc] |
|
self.current_size -= size |
|
# check and remove prev chunk if free |
|
if len(self.released) > 0: |
|
prev = self.released[-1] |
|
if prev + self.chunks[prev] == self.current_size: |
|
chunksize = self.chunks[prev] |
|
del self.chunks[prev] |
|
del self.released[-1] |
|
self.current_size -= chunksize |
|
return |
|
released_idx = bisect(self.released, loc) |
|
if released_idx < len(self.released) and loc + size == self.released[released_idx]: # next chunk is free, merge |
|
chunksize = size + self.chunks[loc + size] |
|
del self.chunks[loc + size] |
|
self.chunks[loc] = chunksize |
|
size = self.chunks[loc] |
|
self.released[released_idx] = loc |
|
else: |
|
insort_left(self.released, loc) # put in a new release |
|
if released_idx > 0: # check if previous chunk is free |
|
prev = self.released[released_idx - 1] |
|
if prev + self.chunks[prev] == loc: # previous chunk is adjacent to freed one, merge |
|
chunksize = size + self.chunks[prev] |
|
del self.chunks[loc] |
|
self.chunks[prev] = chunksize |
|
del self.released[released_idx] |
|
|
|
def __repr__(self): |
|
r = [] |
|
for loc in sorted(self.chunks.keys()): |
|
size = self.chunks[loc] |
|
released_idx = bisect(self.released, loc) |
|
is_released = released_idx > 0 and len(self.released) > 0 and self.released[released_idx - 1] == loc |
|
r.append(f'{loc:5d}: {"free" if is_released else "used"} {size}') |
|
return "\n".join(r) |
|
|
|
|
|
class WaveSim: |
|
def __init__(self, circuit, timing, sims=8, wavecaps=16, strip_forks=False, keep_waveforms=True): |
|
self.circuit = circuit |
|
self.sims = sims |
|
self.overflows = 0 |
|
self.interface = list(circuit.interface) + [n for n in circuit.nodes if 'dff' in n.kind.lower()] |
|
|
|
self.lst_eat_valid = False |
|
|
|
self.cdata = np.zeros((len(self.interface), sims, 7), dtype='float32') |
|
|
|
if type(wavecaps) is int: |
|
wavecaps = [wavecaps] * len(circuit.lines) |
|
|
|
intf_wavecap = 4 # sufficient for storing only 1 transition. |
|
|
|
# indices for state allocation table (sat) |
|
self.zero_idx = len(circuit.lines) |
|
self.tmp_idx = self.zero_idx + 1 |
|
self.ppi_offset = self.tmp_idx + 1 |
|
self.ppo_offset = self.ppi_offset + len(self.interface) |
|
self.sat_length = self.ppo_offset + len(self.interface) |
|
|
|
# translate circuit structure into self.ops |
|
ops = [] |
|
interface_dict = dict([(n, i) for i, n in enumerate(self.interface)]) |
|
for n in circuit.topological_order(): |
|
if n in interface_dict: |
|
inp_idx = self.ppi_offset + interface_dict[n] |
|
if len(n.outs) > 0 and n.outs[0] is not None: # first output of a PI/PPI |
|
ops.append((0b1010, n.outs[0].index, inp_idx, self.zero_idx)) |
|
if 'dff' in n.kind.lower(): # second output of DFF is inverted |
|
if len(n.outs) > 1 and n.outs[1] is not None: |
|
ops.append((0b0101, n.outs[1].index, inp_idx, self.zero_idx)) |
|
else: # if not DFF, no output is inverted. |
|
for o_line in n.outs[1:]: |
|
if o_line is not None: |
|
ops.append((0b1010, o_line.index, inp_idx, self.zero_idx)) |
|
else: # regular node, not PI/PPI or PO/PPO |
|
o0_idx = n.outs[0].index if len(n.outs) > 0 and n.outs[0] is not None else self.tmp_idx |
|
i0_idx = n.ins[0].index if len(n.ins) > 0 and n.ins[0] is not None else self.zero_idx |
|
i1_idx = n.ins[1].index if len(n.ins) > 1 and n.ins[1] is not None else self.zero_idx |
|
kind = n.kind.lower() |
|
if kind == '__fork__': |
|
if not strip_forks: |
|
for o_line in n.outs: |
|
ops.append((0b1010, o_line.index, i0_idx, i1_idx)) |
|
elif kind.startswith('nand'): |
|
ops.append((0b0111, o0_idx, i0_idx, i1_idx)) |
|
elif kind.startswith('nor'): |
|
ops.append((0b0001, o0_idx, i0_idx, i1_idx)) |
|
elif kind.startswith('and'): |
|
ops.append((0b1000, o0_idx, i0_idx, i1_idx)) |
|
elif kind.startswith('or'): |
|
ops.append((0b1110, o0_idx, i0_idx, i1_idx)) |
|
elif kind.startswith('xor'): |
|
ops.append((0b0110, o0_idx, i0_idx, i1_idx)) |
|
elif kind.startswith('xnor'): |
|
ops.append((0b1001, o0_idx, i0_idx, i1_idx)) |
|
elif kind.startswith('not') or kind.startswith('inv'): |
|
ops.append((0b0101, o0_idx, i0_idx, i1_idx)) |
|
elif kind.startswith('buf') or kind.startswith('nbuf'): |
|
ops.append((0b1010, o0_idx, i0_idx, i1_idx)) |
|
elif kind.startswith('__const1__') or kind.startswith('tieh'): |
|
ops.append((0b0101, o0_idx, i0_idx, i1_idx)) |
|
elif kind.startswith('__const0__') or kind.startswith('tiel'): |
|
ops.append((0b1010, o0_idx, i0_idx, i1_idx)) |
|
else: |
|
print('unknown gate type', kind) |
|
self.ops = np.asarray(ops, dtype='int32') |
|
|
|
# create a map from fanout lines to stem lines for fork stripping |
|
stems = np.zeros(self.sat_length, dtype='int32') - 1 # default to -1: 'no fanout line' |
|
if strip_forks: |
|
for f in circuit.forks.values(): |
|
prev_line = f.ins[0] |
|
while prev_line.driver.kind == '__fork__': |
|
prev_line = prev_line.driver.ins[0] |
|
stem_idx = prev_line.index |
|
for ol in f.outs: |
|
stems[ol.index] = stem_idx |
|
|
|
# calculate level (distance from PI/PPI) and reference count for each line |
|
levels = np.zeros(self.sat_length, dtype='int32') |
|
ref_count = np.zeros(self.sat_length, dtype='int32') |
|
level_starts = [0] |
|
current_level = 1 |
|
for i, op in enumerate(self.ops): |
|
# if we fork-strip, always take the stems for determining fan-in level |
|
i0_idx = stems[op[2]] if stems[op[2]] >= 0 else op[2] |
|
i1_idx = stems[op[3]] if stems[op[3]] >= 0 else op[3] |
|
if levels[i0_idx] >= current_level or levels[i1_idx] >= current_level: |
|
current_level += 1 |
|
level_starts.append(i) |
|
levels[op[1]] = current_level # set level of the output line |
|
ref_count[i0_idx] += 1 |
|
ref_count[i1_idx] += 1 |
|
self.level_starts = np.asarray(level_starts, dtype='int32') |
|
self.level_stops = np.asarray(level_starts[1:] + [len(self.ops)], dtype='int32') |
|
|
|
# state allocation table. maps line and interface indices to self.state memory locations |
|
self.sat = np.zeros((self.sat_length, 3), dtype='int') |
|
self.sat[:, 0] = -1 |
|
|
|
h = Heap() |
|
|
|
# allocate and keep memory for special fields |
|
self.sat[self.zero_idx] = h.alloc(intf_wavecap), intf_wavecap, 0 |
|
self.sat[self.tmp_idx] = h.alloc(intf_wavecap), intf_wavecap, 0 |
|
ref_count[self.zero_idx] += 1 |
|
ref_count[self.tmp_idx] += 1 |
|
|
|
# allocate and keep memory for PI/PPI, keep memory for PO/PPO (allocated later) |
|
for i, n in enumerate(self.interface): |
|
if len(n.outs) > 0: |
|
self.sat[self.ppi_offset + i] = h.alloc(intf_wavecap), intf_wavecap, 0 |
|
ref_count[self.ppi_offset + i] += 1 |
|
if len(n.ins) > 0: |
|
i0_idx = stems[n.ins[0].index] if stems[n.ins[0].index] >= 0 else n.ins[0].index |
|
ref_count[i0_idx] += 1 |
|
|
|
# allocate memory for the rest of the circuit |
|
for op_start, op_stop in zip(self.level_starts, self.level_stops): |
|
free_list = [] |
|
for op in self.ops[op_start:op_stop]: |
|
# if we fork-strip, always take the stems |
|
i0_idx = stems[op[2]] if stems[op[2]] >= 0 else op[2] |
|
i1_idx = stems[op[3]] if stems[op[3]] >= 0 else op[3] |
|
ref_count[i0_idx] -= 1 |
|
ref_count[i1_idx] -= 1 |
|
if ref_count[i0_idx] <= 0: free_list.append(self.sat[i0_idx, 0]) |
|
if ref_count[i1_idx] <= 0: free_list.append(self.sat[i1_idx, 0]) |
|
o_idx = op[1] |
|
cap = wavecaps[o_idx] |
|
self.sat[o_idx] = h.alloc(cap), cap, 0 |
|
if not keep_waveforms: |
|
for loc in free_list: |
|
h.free(loc) |
|
|
|
# copy memory location and capacity from stems to fanout lines |
|
for lidx, stem in enumerate(stems): |
|
if stem >= 0: # if at a fanout line |
|
self.sat[lidx] = self.sat[stem] |
|
|
|
# copy memory location to PO/PPO area |
|
for i, n in enumerate(self.interface): |
|
if len(n.ins) > 0: |
|
self.sat[self.ppo_offset + i] = self.sat[n.ins[0].index] |
|
|
|
# pad timing |
|
self.timing = np.zeros((self.sat_length, 2, 2)) |
|
self.timing[:len(timing)] = timing |
|
|
|
# allocate self.state |
|
self.state = np.zeros((h.max_size, sims), dtype='float32') + TMAX |
|
|
|
m1 = np.array([2 ** x for x in range(7, -1, -1)], dtype='uint8') |
|
m0 = ~m1 |
|
self.mask = np.rollaxis(np.vstack((m0, m1)), 1) |
|
|
|
def get_line_delay(self, line, polarity): |
|
return self.timing[line, 0, polarity] |
|
|
|
def set_line_delay(self, line, polarity, delay): |
|
self.timing[line, 0, polarity] = delay |
|
|
|
def assign(self, vectors, time=0.0, offset=0): |
|
nvectors = min(vectors.nvectors - offset, self.sims) |
|
for i, node in enumerate(self.interface): |
|
ppi_loc = self.sat[self.ppi_offset + i, 0] |
|
if ppi_loc < 0: continue |
|
for p in range(nvectors): |
|
vector = p + offset |
|
a = vectors.bits[i, :, vector // 8] |
|
m = self.mask[vector % 8] |
|
toggle = 0 |
|
if a[0] & m[1]: |
|
self.state[ppi_loc, p] = TMIN |
|
toggle += 1 |
|
if (len(a) > 2) and (a[2] & m[1]) and ((a[0] & m[1]) == (a[1] & m[1])): |
|
self.state[ppi_loc + toggle, p] = time |
|
toggle += 1 |
|
self.state[ppi_loc + toggle, p] = TMAX |
|
|
|
def propagate(self, sims=None, sd=0.0, seed=1): |
|
if sims is None: |
|
sims = self.sims |
|
else: |
|
sims = min(sims, self.sims) |
|
for op_start, op_stop in zip(self.level_starts, self.level_stops): |
|
self.overflows += level_eval(self.ops, op_start, op_stop, self.state, self.sat, 0, sims, |
|
self.timing, sd, seed) |
|
self.lst_eat_valid = False |
|
|
|
def wave(self, line, vector): |
|
if line < 0: |
|
return [TMAX] |
|
mem, wcap, _ = self.sat[line] |
|
if mem < 0: |
|
return [TMAX] |
|
return self.state[mem:mem + wcap, vector] |
|
|
|
def wave_ppi(self, i, vector): |
|
return self.wave(self.ppi_offset + i, vector) |
|
|
|
def wave_ppo(self, o, vector): |
|
return self.wave(self.ppo_offset + o, vector) |
|
|
|
def capture(self, time=TMAX, sd=0, seed=1, probabilities=None, offset=0): |
|
for i, node in enumerate(self.interface): |
|
if len(node.ins) == 0: continue |
|
for p in range(self.sims): |
|
self.cdata[i, p] = self.capture_wave(self.ppo_offset + i, p, time, sd, seed) |
|
if probabilities is not None: |
|
assert offset < probabilities.shape[1] |
|
cap_dim = min(probabilities.shape[1] - offset, self.sims) |
|
probabilities[:, offset:cap_dim + offset] = self.cdata[:, 0:cap_dim, 0] |
|
self.lst_eat_valid = True |
|
return self.cdata |
|
|
|
def reassign(self, time=0.0): |
|
for i, node in enumerate(self.interface): |
|
ppi_loc = self.sat[self.ppi_offset + i, 0] |
|
ppo_loc = self.sat[self.ppo_offset + i, 0] |
|
if ppi_loc < 0 or ppo_loc < 0: continue |
|
for sidx in range(self.sims): |
|
ival = self.val(self.ppi_offset + i, sidx, TMAX) > 0.5 |
|
oval = self.cdata[i, sidx, 1] > 0.5 |
|
toggle = 0 |
|
if ival: |
|
self.state[ppi_loc, sidx] = TMIN |
|
toggle += 1 |
|
if ival != oval: |
|
self.state[ppi_loc + toggle, sidx] = time |
|
toggle += 1 |
|
self.state[ppi_loc + toggle, sidx] = TMAX |
|
|
|
def eat(self, line, vector): |
|
eat = TMAX |
|
for t in self.wave(line, vector): |
|
if t >= TMAX: break |
|
if t <= TMIN: continue |
|
eat = min(eat, t) |
|
return eat |
|
|
|
def lst(self, line, vector): |
|
lst = TMIN |
|
for t in self.wave(line, vector): |
|
if t >= TMAX: break |
|
if t <= TMIN: continue |
|
lst = max(lst, t) |
|
return lst |
|
|
|
def lst_ppo(self, o, vector): |
|
if not self.lst_eat_valid: |
|
self.capture() |
|
return self.cdata[o, vector, 5] |
|
|
|
def toggles(self, line, vector): |
|
tog = 0 |
|
for t in self.wave(line, vector): |
|
if t >= TMAX: break |
|
if t <= TMIN: continue |
|
tog += 1 |
|
return tog |
|
|
|
def _vals(self, idx, vector, times, sd=0.0): |
|
s_sqrt2 = sd * math.sqrt(2) |
|
m = 0.5 |
|
accs = [0.0] * len(times) |
|
values = [0] * len(times) |
|
for t in self.wave(idx, vector): |
|
if t >= TMAX: break |
|
for idx, time in enumerate(times): |
|
if t < time: |
|
values[idx] = values[idx] ^ 1 |
|
m = -m |
|
if t <= TMIN: continue |
|
if s_sqrt2 > 0: |
|
for idx, time in enumerate(times): |
|
accs[idx] += m * (1 + math.erf((t - time) / s_sqrt2)) |
|
if (m < 0) and (s_sqrt2 > 0): |
|
for idx, time in enumerate(times): |
|
accs[idx] += 1 |
|
if s_sqrt2 == 0: |
|
return values |
|
else: |
|
return accs |
|
|
|
def vals(self, line, vector, times, sd=0): |
|
return self._vals(line, vector, times, sd) |
|
|
|
def val(self, line, vector, time=TMAX, sd=0): |
|
return self.capture_wave(line, vector, time, sd)[0] |
|
|
|
def vals_ppo(self, o, vector, times, sd=0): |
|
return self._vals(self.ppo_offset + o, vector, times, sd) |
|
|
|
def val_ppo(self, o, vector, time=TMAX, sd=0): |
|
if not self.lst_eat_valid: |
|
self.capture(time, sd) |
|
return self.cdata[o, vector, 0] |
|
|
|
def capture_wave(self, line, vector, time=TMAX, sd=0.0, seed=1): |
|
s_sqrt2 = sd * math.sqrt(2) |
|
m = 0.5 |
|
acc = 0.0 |
|
eat = TMAX |
|
lst = TMIN |
|
tog = 0 |
|
ovl = 0 |
|
val = int(0) |
|
final = int(0) |
|
for t in self.wave(line, vector): |
|
if t >= TMAX: |
|
if t == TMAX_OVL: |
|
ovl = 1 |
|
break |
|
m = -m |
|
final ^= 1 |
|
if t < time: |
|
val ^= 1 |
|
if t <= TMIN: continue |
|
if s_sqrt2 > 0: |
|
acc += m * (1 + math.erf((t - time) / s_sqrt2)) |
|
eat = min(eat, t) |
|
lst = max(lst, t) |
|
tog += 1 |
|
if s_sqrt2 > 0: |
|
if m < 0: |
|
acc += 1 |
|
if acc >= 0.99: |
|
val = 1 |
|
elif acc > 0.01: |
|
seed = (seed << 4) + (vector << 20) + (line-self.ppo_offset << 1) |
|
seed = int(0xDEECE66D) * seed + 0xB |
|
seed = int(0xDEECE66D) * seed + 0xB |
|
rnd = float((seed >> 8) & 0xffffff) / float(1 << 24) |
|
val = rnd < acc |
|
else: |
|
val = 0 |
|
else: |
|
acc = val |
|
|
|
return acc, val, final, (val != final), eat, lst, ovl |
|
|
|
|
|
@numba.njit |
|
def level_eval(ops, op_start, op_stop, state, sat, st_start, st_stop, line_times, sd, seed): |
|
overflows = 0 |
|
for op_idx in range(op_start, op_stop): |
|
op = ops[op_idx] |
|
for st_idx in range(st_start, st_stop): |
|
overflows += wave_eval(op, state, sat, st_idx, line_times, sd, seed) |
|
return overflows |
|
|
|
|
|
@numba.njit |
|
def rand_gauss(seed, sd): |
|
clamp = 0.5 |
|
if sd <= 0.0: |
|
return 1.0 |
|
while True: |
|
x = -6.0 |
|
for i in range(12): |
|
seed = int(0xDEECE66D) * seed + 0xB |
|
x += float((seed >> 8) & 0xffffff) / float(1 << 24) |
|
x *= sd |
|
if abs(x) <= clamp: |
|
break |
|
return x + 1.0 |
|
|
|
|
|
@numba.njit |
|
def wave_eval(op, state, sat, st_idx, line_times, sd=0.0, seed=0): |
|
lut, z_idx, a_idx, b_idx = op |
|
overflows = int(0) |
|
|
|
_seed = (seed << 4) + (z_idx << 20) + (st_idx << 1) |
|
|
|
a_mem = sat[a_idx, 0] |
|
b_mem = sat[b_idx, 0] |
|
z_mem, z_cap, _ = sat[z_idx] |
|
|
|
a_cur = int(0) |
|
b_cur = int(0) |
|
z_cur = lut & 1 |
|
if z_cur == 1: |
|
state[z_mem, st_idx] = TMIN |
|
|
|
a = state[a_mem, st_idx] + line_times[a_idx, 0, z_cur] * rand_gauss(_seed ^ a_mem ^ z_cur, sd) |
|
b = state[b_mem, st_idx] + line_times[b_idx, 0, z_cur] * rand_gauss(_seed ^ b_mem ^ z_cur, sd) |
|
|
|
previous_t = TMIN |
|
|
|
current_t = min(a, b) |
|
inputs = int(0) |
|
|
|
while current_t < TMAX: |
|
z_val = z_cur & 1 |
|
if b < a: |
|
b_cur += 1 |
|
b = state[b_mem + b_cur, st_idx] |
|
b += line_times[b_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ b_mem ^ z_val ^ 1, sd) |
|
thresh = line_times[b_idx, 1, z_val] * rand_gauss(_seed ^ b_mem ^ z_val, sd) |
|
inputs ^= 2 |
|
next_t = b |
|
else: |
|
a_cur += 1 |
|
a = state[a_mem + a_cur, st_idx] |
|
a += line_times[a_idx, 0, z_val ^ 1] * rand_gauss(_seed ^ a_mem ^ z_val ^ 1, sd) |
|
thresh = line_times[a_idx, 1, z_val] * rand_gauss(_seed ^ a_mem ^ z_val, sd) |
|
inputs ^= 1 |
|
next_t = a |
|
|
|
if (z_cur & 1) != ((lut >> inputs) & 1): |
|
# we generate a toggle in z_mem, if: |
|
# ( it is the first toggle in z_mem OR |
|
# following toggle is earlier OR |
|
# pulse is wide enough ) AND enough space in z_mem. |
|
if z_cur == 0 or next_t < current_t or (current_t - previous_t) > thresh: |
|
if z_cur < (z_cap - 1): |
|
state[z_mem + z_cur, st_idx] = current_t |
|
previous_t = current_t |
|
z_cur += 1 |
|
else: |
|
overflows += 1 |
|
previous_t = state[z_mem + z_cur - 1, st_idx] |
|
z_cur -= 1 |
|
else: |
|
z_cur -= 1 |
|
if z_cur > 0: |
|
previous_t = state[z_mem + z_cur - 1, st_idx] |
|
else: |
|
previous_t = TMIN |
|
current_t = min(a, b) |
|
|
|
if overflows > 0: |
|
state[z_mem + z_cur, st_idx] = TMAX_OVL |
|
else: |
|
state[z_mem + z_cur, st_idx] = a if a > b else b # propagate overflow flags by storing biggest TMAX from input |
|
|
|
return overflows
|
|
|