@ -13,10 +13,11 @@ Two simulators are available: :py:class:`WaveSim` runs on the CPU, and the deriv
@@ -13,10 +13,11 @@ Two simulators are available: :py:class:`WaveSim` runs on the CPU, and the deriv
"""
import math
from collections import defaultdict
import numpy as np
from . import numba , cuda , sim , cdiv , eng
from . import log , numba , cuda , sim , cdiv , eng
TMAX = np . float32 ( 2 * * 127 )
@ -59,8 +60,8 @@ class WaveSim(sim.SimOps):
@@ -59,8 +60,8 @@ class WaveSim(sim.SimOps):
self . delays = np . zeros ( ( len ( delays ) , self . c_locs_len , 2 , 2 ) , dtype = delays . dtype )
self . delays [ : , : delays . shape [ 1 ] ] = delays
self . c = np . zeros ( ( self . c_len , sims ) , dtype = np . float32 ) + TMAX
self . s = np . zeros ( ( 11 , self . s_len , sims ) , dtype = np . float32 )
self . c = np . full ( ( self . c_len , self . sims ) , TMAX , dtype = np . float32 )
self . s = np . zeros ( ( 11 , self . s_len , self . sims ) , dtype = np . float32 )
""" Information about the logic values and transitions around the sequential elements (flip-flops) and ports.
The first 3 values are read by : py : func : ` s_to_c ` .
@ -98,7 +99,26 @@ class WaveSim(sim.SimOps):
@@ -98,7 +99,26 @@ class WaveSim(sim.SimOps):
self . simctl_int [ 0 ] = range ( sims ) # unique seed for each sim by default, zero this to pick same delays for all sims.
self . simctl_int [ 1 ] = 2 # random picking by default.
self . nbytes = sum ( [ a . nbytes for a in ( self . c , self . s , self . c_locs , self . c_caps , self . ops , self . simctl_int ) ] )
# flat array for line use information
line_use = defaultdict ( list )
for lidx in range ( len ( self . circuit . lines ) ) :
if self . line_use_start [ lidx ] < 0 : continue
if self . line_use_stop [ lidx ] < 0 :
log . warn ( f ' line { lidx } never read? ' )
for i in range ( self . line_use_start [ lidx ] , self . line_use_stop [ lidx ] ) :
line_use [ i ] . append ( lidx )
self . line_use_counts = np . array ( [ len ( line_use [ i ] ) for i in range ( len ( self . levels ) ) ] , dtype = np . int32 )
self . line_use_offsets = np . zeros_like ( self . line_use_counts )
self . line_use_offsets [ 1 : ] = self . line_use_counts . cumsum ( ) [ : - 1 ]
self . line_use = np . hstack ( [ line_use [ i ] for i in range ( len ( self . levels ) ) ] )
self . h = np . zeros ( ( self . c_locs_len , sims ) , dtype = np . float32 ) # hashes of generated waveforms
self . h_base = np . zeros_like ( self . h ) # base hashes to compare to
self . error_counts = np . zeros ( self . s_len , dtype = np . uint32 ) # number of capture errors by PPO
self . nbytes = sum ( [ a . nbytes for a in ( self . c , self . s , self . h , self . c_locs , self . c_caps , self . ops , self . simctl_int ) ] )
def __repr__ ( self ) :
dev = ' GPU ' if hasattr ( self . c , ' copy_to_host ' ) else ' CPU '
@ -124,7 +144,7 @@ class WaveSim(sim.SimOps):
@@ -124,7 +144,7 @@ class WaveSim(sim.SimOps):
"""
sims = min ( sims or self . sims , self . sims )
for op_start , op_stop in zip ( self . level_starts , self . level_stops ) :
level_eval_cpu ( self . ops , op_start , op_stop , self . c , self . c_locs , self . c_caps , self . abuf , 0 , sims , self . delays , self . simctl_int , seed )
level_eval_cpu ( self . ops , op_start , op_stop , self . c , self . c_locs , self . c_caps , self . h , self . abuf , 0 , sims , self . delays , self . simctl_int , seed )
def c_to_s ( self , time = TMAX , sd = 0.0 , seed = 1 ) :
""" Simulates a capture operation at all sequential elements and primary outputs.
@ -152,7 +172,7 @@ class WaveSim(sim.SimOps):
@@ -152,7 +172,7 @@ class WaveSim(sim.SimOps):
self . s [ 2 , self . ppio_s_locs ] = self . s [ 8 , self . ppio_s_locs ]
def _wave_eval ( op , cbuf , c_locs , c_caps , sim , delays , simctl_int , seed = 0 ) :
def _wave_eval ( op , cbuf , c_locs , c_caps , hbuf , sim , delays , simctl_int , seed ) :
overflows = int ( 0 )
lut = op [ 0 ]
@ -182,6 +202,8 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
@@ -182,6 +202,8 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
z_mem = c_locs [ z_idx ]
z_cap = c_caps [ z_idx ]
h = np . float32 ( 0 )
a_cur = int ( 0 )
b_cur = int ( 0 )
c_cur = int ( 0 )
@ -229,6 +251,7 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
@@ -229,6 +251,7 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
next_t = cbuf [ d_mem + d_cur , sim ] + delays [ d_idx , ( d_cur & 1 ) ^ 1 , z_val ^ 1 ]
if ( z_cur & 1 ) != ( ( lut >> inputs ) & 1 ) :
h + = h * 3 + max ( current_t , - 10 ) # hash based on generated transitions before filtering
# we generate an edge in z_mem, if ...
if ( z_cur == 0 # it is the first edge in z_mem ...
or next_t < current_t # -OR- the next edge on SAME input is EARLIER (need current edge to filter BOTH in next iteration) ...
@ -240,8 +263,8 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
@@ -240,8 +263,8 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
z_cur + = 1
else :
overflows + = 1
previous_t = cbuf [ z_mem + z_cur - 1 , sim ]
z_cur - = 1
previous_t = cbuf [ z_mem + z_cur , sim ]
else :
z_cur - = 1
previous_t = cbuf [ z_mem + z_cur - 1 , sim ] if z_cur > 0 else TMIN
@ -258,6 +281,8 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
@@ -258,6 +281,8 @@ def _wave_eval(op, cbuf, c_locs, c_caps, sim, delays, simctl_int, seed=0):
# generate or propagate overflow flag
cbuf [ z_mem + z_cur , sim ] = TMAX_OVL if overflows > 0 else max ( a , b , c , d )
hbuf [ z_idx , sim ] = h
nrise = max ( 0 , ( z_cur + 1 ) / / 2 - ( cbuf [ z_mem , sim ] == TMIN ) )
nfall = z_cur / / 2
@ -268,11 +293,11 @@ wave_eval_cpu = numba.njit(_wave_eval)
@@ -268,11 +293,11 @@ wave_eval_cpu = numba.njit(_wave_eval)
@numba . njit
def level_eval_cpu ( ops , op_start , op_stop , c , c_locs , c_caps , abuf , sim_start , sim_stop , delays , simctl_int , seed ) :
def level_eval_cpu ( ops , op_start , op_stop , c , c_locs , c_caps , hbuf , abuf , sim_start , sim_stop , delays , simctl_int , seed ) :
for op_idx in range ( op_start , op_stop ) :
op = ops [ op_idx ]
for sim in range ( sim_start , sim_stop ) :
nrise , nfall = wave_eval_cpu ( op , c , c_locs , c_caps , sim , delays , simctl_int [ : , sim ] , seed )
nrise , nfall = wave_eval_cpu ( op , c , c_locs , c_caps , hbuf , sim , delays , simctl_int [ : , sim ] , seed )
a_loc = op [ 6 ]
a_wr = op [ 7 ]
a_wf = op [ 8 ]
@ -345,12 +370,18 @@ class WaveSimCuda(WaveSim):
@@ -345,12 +370,18 @@ class WaveSimCuda(WaveSim):
self . delays = cuda . to_device ( self . delays )
self . simctl_int = cuda . to_device ( self . simctl_int )
self . abuf = cuda . to_device ( self . abuf )
self . h = cuda . to_device ( self . h )
self . h_base = cuda . to_device ( self . h_base )
self . line_use = cuda . to_device ( self . line_use )
self . error_counts = cuda . to_device ( self . error_counts )
self . retval_int = cuda . to_device ( np . array ( [ 0 ] , dtype = np . int32 ) )
self . _block_dim = ( 32 , 16 )
def __getstate__ ( self ) :
state = self . __dict__ . copy ( )
state [ ' c ' ] = np . array ( self . c )
del state [ ' c ' ]
state [ ' s ' ] = np . array ( self . s )
state [ ' ops ' ] = np . array ( self . ops )
state [ ' c_locs ' ] = np . array ( self . c_locs )
@ -358,11 +389,16 @@ class WaveSimCuda(WaveSim):
@@ -358,11 +389,16 @@ class WaveSimCuda(WaveSim):
state [ ' delays ' ] = np . array ( self . delays )
state [ ' simctl_int ' ] = np . array ( self . simctl_int )
state [ ' abuf ' ] = np . array ( self . abuf )
state [ ' h ' ] = np . array ( self . h )
state [ ' h_base ' ] = np . array ( self . h_base )
state [ ' line_use ' ] = np . array ( self . line_use )
state [ ' error_counts ' ] = np . array ( self . error_counts )
state [ ' retval_int ' ] = np . array ( self . retval_int )
return state
def __setstate__ ( self , state ) :
self . __dict__ . update ( state )
self . c = cuda . to_device ( self . c )
self . c = cuda . to_device ( np . full ( ( self . c_len , self . sims ) , TMAX , dtype = np . float32 ) )
self . s = cuda . to_device ( self . s )
self . ops = cuda . to_device ( self . ops )
self . c_locs = cuda . to_device ( self . c_locs )
@ -370,6 +406,11 @@ class WaveSimCuda(WaveSim):
@@ -370,6 +406,11 @@ class WaveSimCuda(WaveSim):
self . delays = cuda . to_device ( self . delays )
self . simctl_int = cuda . to_device ( self . simctl_int )
self . abuf = cuda . to_device ( self . abuf )
self . h = cuda . to_device ( self . h )
self . h_base = cuda . to_device ( self . h_base )
self . line_use = cuda . to_device ( self . line_use )
self . error_counts = cuda . to_device ( self . error_counts )
self . retval_int = cuda . to_device ( self . retval_int )
def s_to_c ( self ) :
grid_dim = self . _grid_dim ( self . sims , self . s_len )
@ -383,10 +424,18 @@ class WaveSimCuda(WaveSim):
@@ -383,10 +424,18 @@ class WaveSimCuda(WaveSim):
if op_from > op_start : continue
if op_to is not None and op_to < = op_start : break
grid_dim = self . _grid_dim ( sims , op_stop - op_start )
wave_eval_gpu [ grid_dim , self . _block_dim ] ( self . ops , op_start , op_stop , self . c , self . c_locs , self . c_caps , self . abuf , int ( 0 ) ,
wave_eval_gpu [ grid_dim , self . _block_dim ] ( self . ops , op_start , op_stop , self . c , self . c_locs , self . c_caps , self . h , self . abuf , int ( 0 ) ,
sims , self . delays , self . simctl_int , seed )
cuda . synchronize ( )
def c_prop_level ( self , level , sims = None , seed = 1 ) :
sims = min ( sims or self . sims , self . sims )
op_start = self . level_starts [ level ]
op_stop = self . level_stops [ level ]
grid_dim = self . _grid_dim ( sims , op_stop - op_start )
wave_eval_gpu [ grid_dim , self . _block_dim ] ( self . ops , op_start , op_stop , self . c , self . c_locs , self . c_caps , self . h , self . abuf , int ( 0 ) ,
sims , self . delays , self . simctl_int , seed )
def c_to_s ( self , time = TMAX , sd = 0.0 , seed = 1 ) :
grid_dim = self . _grid_dim ( self . sims , self . s_len )
wave_capture_gpu [ grid_dim , self . _block_dim ] ( self . c , self . s , self . c_locs , self . c_caps , self . ppo_offset ,
@ -396,6 +445,52 @@ class WaveSimCuda(WaveSim):
@@ -396,6 +445,52 @@ class WaveSimCuda(WaveSim):
grid_dim = self . _grid_dim ( self . sims , self . s_len )
ppo_to_ppi_gpu [ grid_dim , self . _block_dim ] ( self . s , self . c_locs , time , self . ppi_offset , self . ppo_offset )
def set_base_hashes ( self ) :
nitems = self . h_base . shape [ 0 ] * self . h_base . shape [ 1 ]
grid_dim = cdiv ( nitems , 256 )
memcpy_gpu [ grid_dim , 256 ] ( self . h , self . h_base , nitems )
def compare_hashes_level ( self , lv ) :
self . retval_int [ 0 ] = 0
grid_dim = self . _grid_dim ( self . sims , self . line_use_counts [ lv ] )
diff_hash_gpu [ grid_dim , self . _block_dim ] ( self . h , self . h_base , self . line_use , self . line_use_offsets [ lv ] ,
self . line_use_counts [ lv ] , self . retval_int )
return self . retval_int [ 0 ]
def calc_error_counts ( self , sims = None ) :
sims = min ( sims or self . sims , self . sims )
grid_dim = cdiv ( self . s_len , 256 )
calc_error_counts_gpu [ grid_dim , 256 ] ( self . s , sims , self . error_counts )
return np . array ( self . error_counts )
@cuda . jit ( )
def memcpy_gpu ( src , dst , nitems ) :
tid = cuda . grid ( 1 )
stride = cuda . gridDim . x * cuda . blockDim . x
for i in range ( tid , nitems , stride ) :
dst . flat [ i ] = src . flat [ i ]
@cuda . jit ( )
def diff_hash_gpu ( hbuf1 , hbuf2 , h_locs , h_locs_offset , h_locs_cnt , differs ) :
x , y = cuda . grid ( 2 )
if x > = hbuf1 . shape [ 1 ] : return
if y > = h_locs_cnt : return
h_loc = h_locs [ h_locs_offset + y ]
if hbuf1 [ h_loc , x ] != hbuf2 [ h_loc , x ] :
differs [ 0 ] = 1
@cuda . jit ( )
def calc_error_counts_gpu ( s , sims , error_counts ) :
x = cuda . grid ( 1 )
if x > = s . shape [ 1 ] : return
cnt = 0
for i in range ( sims ) :
cnt + = ( s [ 6 , x , i ] != s [ 8 , x , i ] )
error_counts [ x ] = cnt
@cuda . jit ( )
def wave_assign_gpu ( c , s , c_locs , ppi_offset ) :
@ -425,7 +520,7 @@ _wave_eval_gpu = cuda.jit(_wave_eval, device=True)
@@ -425,7 +520,7 @@ _wave_eval_gpu = cuda.jit(_wave_eval, device=True)
@cuda . jit ( )
def wave_eval_gpu ( ops , op_start , op_stop , cbuf , c_locs , c_caps , abuf , sim_start , sim_stop , delays , simctl_int , seed ) :
def wave_eval_gpu ( ops , op_start , op_stop , cbuf , c_locs , c_caps , hbuf , abuf , sim_start , sim_stop , delays , simctl_int , seed ) :
x , y = cuda . grid ( 2 )
sim = sim_start + x
op_idx = op_start + y
@ -437,7 +532,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, abuf, sim_start,
@@ -437,7 +532,7 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, c_locs, c_caps, abuf, sim_start,
a_wr = op [ 7 ]
a_wf = op [ 8 ]
nrise , nfall = _wave_eval_gpu ( op , cbuf , c_locs , c_caps , sim , delays , simctl_int [ : , sim ] , seed )
nrise , nfall = _wave_eval_gpu ( op , cbuf , c_locs , c_caps , hbuf , sim , delays , simctl_int [ : , sim ] , seed )
# accumulate WSA into abuf
if a_loc > = 0 :