From d2400ccab12b774c9af39f6b6a911cdc4ed01b10 Mon Sep 17 00:00:00 2001 From: stefan Date: Wed, 27 May 2026 22:48:13 +0900 Subject: [PATCH] starting ffr-based transient fault sim --- README.md | 31 ++++++++++++++++ kyupy | 2 +- sim_transient_faults.py | 78 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 110 insertions(+), 1 deletion(-) create mode 100755 sim_transient_faults.py diff --git a/README.md b/README.md index c7d0947..98884c2 100644 --- a/README.md +++ b/README.md @@ -57,3 +57,34 @@ nix build github:s-holst/benchmark-circuits#picorv32-sky130 nix build github:s-holst/benchmark-circuits#jpeg_core-sky130 ``` +## Transient Fault Simulation + +Goal: Classify fault effects into: masked, non-SDC, SDC. + +- *Transient fault:* A single line-flip fault that is only active for a single clock cycle. +- *Single line-flip fault:* Logic value on a single signal in the circuit gets inverted. This is equivalent to a stuck-at 0(1) if the fault-free value is 1(0). +- *Trace length $l$:* Number of clock cycles considered in simulation. +- *Fault sites $f$:* Number of distinct fault locations considered in simulation. +- Number of distinct faults (size of the fault set) is: $l\cdot f$. + +Approach: +- PPSFP for fault injection cycle. Generates $l\cdot f$ responses (=system states right after fault injection). +- System states are simulated for the next clock cycles as necessary (fault-free propagation of erroneous states). +- System states are classified into: + - *error-free:* state is the same as in fault-free operation (fault effect disappeared) + - *erroneous non-SDC:* surrounding system (testbench) detected the error (criteria: trap signal, OOB memory access, ...) + - *erroneous potential-SDC:* state differs from fault-free operation, but remains undetected by surrounding system (testbench). +- Once a system state becomes *error-free*: Stop further simulation, original fault is *masked*. 
def main():
    """Simulate single-cycle line-flip faults at FFR stems and log their effects.

    Command line: ``sim_transient_faults.py [-t TLIB] netlist npy``

    Steps:
      1. Load the gate-level netlist. ``netlist`` is either an existing file
         or, as a fallback, the name of a package of
         ``github:s-holst/benchmark-circuits`` that is built with ``nix``.
      2. Collect one fault site per fanout-free region (the first output
         line of each stem) and shuffle them.
      3. Load the patterns from ``npy`` and map every code through a lookup
         table (zero-filling unknowns). The result must have one row per
         entry of ``c.s_nodes`` and one column per pattern.
      4. For the first 200 fault sites and every pattern batch, run a
         fault-free propagation followed by one with the fault line set, and
         log how many patterns show a differing response, a trap, or an
         out-of-bounds memory access.

    Raises:
        FileNotFoundError: The nix package contains no ``*/nl/*.nl.v`` file.
        ValueError: The pattern array does not match the circuit.
        subprocess.CalledProcessError: ``nix build`` returned non-zero.
    """
    parser = argparse.ArgumentParser(description='Transient Fault Simulator.')
    # TODO: optional '-o' "[path/]stem for output files" is not implemented yet.
    parser.add_argument(
        '-t', '--tlib', default='SKY130',
        help=f'techlib of circuit. default: SKY130, available: {sorted(techlib_by_name.keys())}.')
    parser.add_argument(
        'netlist',
        help='gate-level verilog file or nix package to import. See "nix flake show github:s-holst/benchmark-circuits" for available packages.')
    parser.add_argument('npy', help='npy with imported patterns to simulate.')
    args = parser.parse_args()

    # Report an unknown techlib as a regular CLI error, not a KeyError traceback.
    try:
        tlib = techlib_by_name[args.tlib]
    except KeyError:
        parser.error(f'unknown techlib {args.tlib!r}, available: {sorted(techlib_by_name.keys())}.')

    nl_path = Path(args.netlist)
    if not nl_path.exists():  # fallback to published nix package.
        # Build argv as a list so that a package name is never re-split on
        # whitespace.
        nix_cmd = [
            'nix', 'build',
            f'github:s-holst/benchmark-circuits#{args.netlist}',
            '--print-out-paths', '--no-link',
        ]
        benchmark_path = Path(subprocess.check_output(nix_cmd, text=True).strip())
        nl_path = next(benchmark_path.glob('*/nl/*.nl.v'), None)
        if nl_path is None:
            raise FileNotFoundError(
                f'no netlist matching */nl/*.nl.v found in {benchmark_path}')

    log.info(f'Loading {nl_path.absolute()}')
    c = verilog.load(nl_path, tlib=tlib)
    c.resolve_tlib_cells(tlib)  # resolve every cell into kyupy simulation primitives
    log.info(c)
    for kind, cnt in sorted(c.stats.items()):
        log.info(f' {kind:10s} {cnt}')

    # Locations of the signals used below to spot traps and memory accesses.
    # NOTE(review): these port names look specific to the picorv32 benchmark
    # mentioned in the README -- confirm before running other circuits.
    trap_loc = c.io_locs('trap')
    mem_valid_loc = c.io_locs('mem_valid')
    mem_addr_locs = c.io_locs('mem_addr')
    log.info(f'{trap_loc=} {mem_valid_loc=} {mem_addr_locs=}')

    # One injection site per fanout-free region: the first output of its stem.
    fault_sites = np.array(
        [stem.outs[0] for stem, _ in c.fanout_free_regions()
         if len(stem.outs) > 0 and stem.outs[0] is not None],
        dtype=np.uint32)
    np.random.shuffle(fault_sites)  # unseeded: the selected subset differs per run

    log.info(f'Number of injection sites (FFR stems): {len(fault_sites)}')

    # Lookup table indexed by the stored pattern code: codes 3, 5 and 7 become
    # ONE, everything else becomes ZERO.
    to_binary = np.array(
        [logic.ZERO, logic.ZERO, logic.ZERO, logic.ONE,
         logic.ZERO, logic.ONE, logic.ZERO, logic.ONE],
        dtype=np.uint8)
    patterns = to_binary[np.load(args.npy)]  # zero-fill unknowns
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if patterns.ndim != 2 or len(patterns) != len(c.s_nodes):
        raise ValueError('number of bits in patterns does not match the number of circuit PPIOs.')
    pcount = patterns.shape[1]
    log.info(f'Pattern count: {pcount}')

    max_sims = 10240        # upper bound on patterns simulated per batch
    max_fault_sites = 200   # only the first few (shuffled) sites are simulated

    sim = LogicSim2V(c, sims=min(pcount, max_sims))
    log.info(sim)
    good = sim.allocate()  # buffer holding the fault-free responses of a batch

    for fault_site in fault_sites[:max_fault_sites]:
        for bo, bs in batchrange(pcount, sim.sims):
            # Fault-free reference run for patterns bo .. bo+bs-1.
            sim.s_assign[:, :bs] = patterns[:, bo:bo+bs]
            sim.s_to_c()
            sim.c_prop()
            sim.c_to_s()
            good[:, :bs] = sim.s_result[:, :bs]

            # Same stimuli, this time with the fault line passed to c_prop.
            sim.c_prop(fault_line=fault_site)
            sim.c_to_s()

            # Per pattern: number of response bits differing from the reference.
            error_counts = (sim.s_result[:, :bs] != good[:, :bs]).sum(axis=0)
            traps = sim.s_result[trap_loc, :bs] == logic.ONE
            mem_valids = sim.s_result[mem_valid_loc, :bs] == logic.ONE
            mem_addrs = logic.packbits(sim.s_result[mem_addr_locs, :bs].T, dtype=np.uint32)

            # A valid access outside all three known address windows is
            # counted as out-of-bounds.
            mem_rom_access = mem_valids & (mem_addrs <= 0xffff)
            mem_ram_access = mem_valids & (mem_addrs >= 0x20000) & (mem_addrs <= 0x23fff)
            mem_io_access = mem_valids & (mem_addrs == 0x1000_0000)
            mem_oob_access = mem_valids & ~mem_rom_access & ~mem_ram_access & ~mem_io_access
            log.info(f'flips@{fault_site} erroneous={np.sum(error_counts != 0)} traps={np.sum(traps)} oob={np.sum(mem_oob_access)}')