Browse Source

more on-gpu code, bump python requirement

devel
Stefan Holst 2 years ago
parent
commit
7bfc02e683
  1. 5
      setup.py
  2. 1
      src/kyupy/sim.py
  3. 129
      src/kyupy/wave_sim.py

5
setup.py

@ -14,7 +14,7 @@ setup(
url='https://github.com/s-holst/kyupy', url='https://github.com/s-holst/kyupy',
author='Stefan Holst', author='Stefan Holst',
author_email='mail@s-holst.de', author_email='mail@s-holst.de',
python_requires='>=3.6', python_requires='>=3.8',
install_requires=[ install_requires=[
'numpy>=1.17.0', 'numpy>=1.17.0',
'lark-parser>=0.8.0' 'lark-parser>=0.8.0'
@ -33,9 +33,8 @@ setup(
'Operating System :: OS Independent', 'Operating System :: OS Independent',
'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3 :: Only', 'Programming Language :: Python :: 3 :: Only',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
], ],
) )

1
src/kyupy/sim.py

@ -158,6 +158,7 @@ class SimOps:
def __init__(self, circuit, c_caps=1, c_reuse=False, strip_forks=False): def __init__(self, circuit, c_caps=1, c_reuse=False, strip_forks=False):
self.circuit = circuit self.circuit = circuit
self.s_nodes = list(circuit.io_nodes) + [n for n in circuit.nodes if 'dff' in n.kind.lower()] self.s_nodes = list(circuit.io_nodes) + [n for n in circuit.nodes if 'dff' in n.kind.lower()]
self.s_len = len(self.s_nodes)
keep_signals = not c_reuse keep_signals = not c_reuse
if isinstance(c_caps, int): if isinstance(c_caps, int):

129
src/kyupy/wave_sim.py

@ -353,17 +353,9 @@ class WaveSimCuda(WaveSim):
self._block_dim = (32, 16) self._block_dim = (32, 16)
# TODO implement on GPU
def s_to_c(self): def s_to_c(self):
s = np.array(self.s) grid_dim = self._grid_dim(self.sims, self.s_len)
c = np.array(self.c) wave_assign_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppi_offset)
sins = np.moveaxis(s[self.pippi_s_locs], -1, 0)
cond = (sins[2] != 0) + 2*(sins[0] != 0) # choices order: 0 R F 1
c[self.pippi_c_locs] = np.choose(cond, [TMAX, sins[1], TMIN, TMIN])
c[self.pippi_c_locs+1] = np.choose(cond, [TMAX, TMAX, sins[1], TMAX])
c[self.pippi_c_locs+2] = TMAX
self.s[:,:,:] = s
self.c[:,:] = c
def _grid_dim(self, x, y): def _grid_dim(self, x, y):
gx = math.ceil(x / self._block_dim[0]) gx = math.ceil(x / self._block_dim[0])
@ -378,20 +370,39 @@ class WaveSimCuda(WaveSim):
sims, self.timing, self.params, sd, seed) sims, self.timing, self.params, sd, seed)
cuda.synchronize() cuda.synchronize()
# TODO implement on GPU
def c_to_s(self, time=TMAX, sd=0.0, seed=1): def c_to_s(self, time=TMAX, sd=0.0, seed=1):
s = np.array(self.s) grid_dim = self._grid_dim(self.sims, self.s_len)
c = np.array(self.c) wave_capture_gpu[grid_dim, self._block_dim](self.c, self.s, self.vat, self.ppo_offset,
vat = np.array(self.vat) time, sd * math.sqrt(2), seed)
for s_loc, (c_loc, c_len, _) in zip(self.poppo_s_locs, vat[self.ppo_offset+self.poppo_s_locs]):
for vector in range(self.sims):
s[s_loc, vector, 3:] = wave_capture_cpu(c, c_loc, c_len, vector, time=time, sd=sd, seed=seed)
self.s[:,:,:] = s
self.c[:,:] = c
# TODO implement on GPU
#def s_ppo_to_ppi(self, time=0.0):
def s_ppo_to_ppi(self, time=0.0):
grid_dim = self._grid_dim(self.sims, self.s_len)
ppo_to_ppi_gpu[grid_dim, self._block_dim](self.s, self.vat, time, self.ppi_offset, self.ppo_offset)
@cuda.jit()
def wave_assign_gpu(c, s, vat, ppi_offset):
x, y = cuda.grid(2)
if y >= len(s): return
c_loc, c_len, _ = vat[ppi_offset + y]
if c_loc < 0: return
if x >= c.shape[-1]: return
value = int(s[y, x, 2] >= 0.5) | (2*int(s[y, x, 0] >= 0.5))
ttime = s[y, x, 1]
if value == 0:
c[c_loc, x] = TMAX
c[c_loc+1, x] = TMAX
elif value == 1:
c[c_loc, x] = ttime
c[c_loc+1, x] = TMAX
elif value == 2:
c[c_loc, x] = TMIN
c[c_loc+1, x] = ttime
else:
c[c_loc, x] = TMIN
c[c_loc+1, x] = TMAX
c[c_loc+2, x] = TMAX
@cuda.jit(device=True) @cuda.jit(device=True)
def rand_gauss_gpu(seed, sd): def rand_gauss_gpu(seed, sd):
@ -526,4 +537,76 @@ def wave_eval_gpu(ops, op_start, op_stop, cbuf, vat, st_start, st_stop, line_tim
current_t = min(a, b, c, d) current_t = min(a, b, c, d)
# generate overflow flag or propagate from input # generate overflow flag or propagate from input
cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d) cbuf[z_mem + z_cur, st_idx] = TMAX_OVL if overflows > 0 else max(a, b, c, d)
@cuda.jit()
def wave_capture_gpu(c, s, vat, ppo_offset, time, s_sqrt2, seed):
x, y = cuda.grid(2)
if ppo_offset + y >= len(vat): return
line, tdim, _ = vat[ppo_offset + y]
if line < 0: return
if x >= c.shape[-1]: return
vector = x
m = 0.5
acc = 0.0
eat = TMAX
lst = TMIN
tog = 0
ovl = 0
val = int(0)
final = int(0)
for tidx in range(tdim):
t = c[line + tidx, vector]
if t >= TMAX:
if t == TMAX_OVL:
ovl = 1
break
m = -m
final ^= 1
if t < time:
val ^= 1
if t <= TMIN: continue
if s_sqrt2 > 0:
acc += m * (1 + math.erf((t - time) / s_sqrt2))
eat = min(eat, t)
lst = max(lst, t)
tog += 1
if s_sqrt2 > 0:
if m < 0:
acc += 1
if acc >= 0.99:
val = 1
elif acc > 0.01:
seed = (seed << 4) + (vector << 20) + (y << 1)
seed = int(0xDEECE66D) * seed + 0xB
seed = int(0xDEECE66D) * seed + 0xB
rnd = float((seed >> 8) & 0xffffff) / float(1 << 24)
val = rnd < acc
else:
val = 0
else:
acc = val
s[y, vector, 3] = (c[line, vector] <= TMIN)
s[y, vector, 4] = eat
s[y, vector, 5] = lst
s[y, vector, 6] = final
s[y, vector, 7] = acc
s[y, vector, 8] = val
s[y, vector, 9] = 0 # TODO
s[y, vector, 10] = ovl
@cuda.jit()
def ppo_to_ppi_gpu(s, vat, time, ppi_offset, ppo_offset):
x, y = cuda.grid(2)
if y >= s.shape[0]: return
if x >= s.shape[1]: return
if vat[ppi_offset + y, 0] < 0: return
if vat[ppo_offset + y, 0] < 0: return
s[y, x, 0] = s[y, x, 2]
s[y, x, 1] = time
s[y, x, 2] = s[y, x, 8]

Loading…
Cancel
Save