Thank you Andrew!
Here is what I see when I type Ctrl-C for my process that’s ground to a halt and has been doing absolutely nothing for 8 hours:
Process Process-31:
Process Process-27:
Process Process-24:
Process Process-13:
Process Process-22:
Process Process-16:
Process Process-18:
Process Process-21:
Process Process-19:
Process Process-12:
Process Process-15:
Traceback (most recent call last):
File "./stanmaster_syntheticdata.py", line 1212, in <module>
runFunctionsInParallel(fns, names=names, parallel=defaults['server']['parallel'], maxAtOnce=MAX_AT_ONCE, # Four cores, or 8, it seems, per estimate
File "/home/meuser/bin/cpblUtilities/parallel.py", line 134, in runFunctionsInParallel
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
return cRunFunctionsInParallel(*args, **kwargs).launch_jobs()
File "/usr/lib/python3.8/multiprocessing/process.py", line 318, in _bootstrap
util._exit_function()
File "/home/meuser/bin/cpblUtilities/parallel.py", line 342, in launch_jobs
File "/usr/lib/python3.8/multiprocessing/util.py", line 357, in _exit_function
p.join()
File "/usr/lib/python3.8/multiprocessing/process.py", line 318, in _bootstrap
util._exit_function()
File "/usr/lib/python3.8/multiprocessing/process.py", line 318, in _bootstrap
util._exit_function()
File "/usr/lib/python3.8/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
File "/usr/lib/python3.8/multiprocessing/util.py", line 357, in _exit_function
p.join()
File "/usr/lib/python3.8/multiprocessing/util.py", line 357, in _exit_function
p.join()
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 47, in wait
return self.poll(os.WNOHANG if timeout == 0.0 else 0)
File "/usr/lib/python3.8/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
File "/usr/lib/python3.8/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 27, in poll
pid, sts = os.waitpid(self.pid, flag)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 47, in wait
return self.poll(os.WNOHANG if timeout == 0.0 else 0)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 47, in wait
return self.poll(os.WNOHANG if timeout == 0.0 else 0)
Traceback (most recent call last):
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 27, in poll
pid, sts = os.waitpid(self.pid, flag)
KeyboardInterrupt
self.updateStatus()
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 27, in poll
pid, sts = os.waitpid(self.pid, flag)
Traceback (most recent call last):
File "/home/meuser/bin/cpblUtilities/parallel.py", line 365, in updateStatus
KeyboardInterrupt
KeyboardInterrupt
File "/usr/lib/python3.8/multiprocessing/process.py", line 318, in _bootstrap
util._exit_function()
File "/usr/lib/python3.8/multiprocessing/process.py", line 318, in _bootstrap
util._exit_function()
File "/usr/lib/python3.8/multiprocessing/util.py", line 357, in _exit_function
p.join()
File "/usr/lib/python3.8/multiprocessing/process.py", line 318, in _bootstrap
util._exit_function()
File "/usr/lib/python3.8/multiprocessing/util.py", line 357, in _exit_function
p.join()
File "/usr/lib/python3.8/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
File "/usr/lib/python3.8/multiprocessing/util.py", line 357, in _exit_function
p.join()
File "/usr/lib/python3.8/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 47, in wait
return self.poll(os.WNOHANG if timeout == 0.0 else 0)
File "/usr/lib/python3.8/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
self.status[ii] = self.jobs[ii].status()
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 47, in wait
return self.poll(os.WNOHANG if timeout == 0.0 else 0)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 27, in poll
pid, sts = os.waitpid(self.pid, flag)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 27, in poll
pid, sts = os.waitpid(self.pid, flag)
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
if not self.queue.empty():
File "/usr/lib/python3.8/multiprocessing/queues.py", line 123, in empty
return not self._poll()
File "/usr/lib/python3.8/multiprocessing/connection.py", line 257, in poll
Traceback (most recent call last):
return self._poll(timeout)
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/connection.py", line 424, in _poll
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 318, in _bootstrap
util._exit_function()
File "/usr/lib/python3.8/multiprocessing/util.py", line 357, in _exit_function
p.join()
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 318, in _bootstrap
util._exit_function()
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 47, in wait
return self.poll(os.WNOHANG if timeout == 0.0 else 0)
File "/usr/lib/python3.8/multiprocessing/process.py", line 318, in _bootstrap
util._exit_function()
File "/usr/lib/python3.8/multiprocessing/util.py", line 357, in _exit_function
p.join()
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 27, in poll
pid, sts = os.waitpid(self.pid, flag)
File "/usr/lib/python3.8/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
File "/usr/lib/python3.8/multiprocessing/util.py", line 357, in _exit_function
p.join()
r = wait([self], timeout)
File "/usr/lib/python3.8/multiprocessing/process.py", line 318, in _bootstrap
util._exit_function()
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 47, in wait
return self.poll(os.WNOHANG if timeout == 0.0 else 0)
File "/usr/lib/python3.8/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 27, in poll
pid, sts = os.waitpid(self.pid, flag)
File "/usr/lib/python3.8/multiprocessing/process.py", line 318, in _bootstrap
util._exit_function()
KeyboardInterrupt
File "/usr/lib/python3.8/multiprocessing/connection.py", line 925, in wait
File "/usr/lib/python3.8/multiprocessing/util.py", line 357, in _exit_function
p.join()
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 47, in wait
return self.poll(os.WNOHANG if timeout == 0.0 else 0)
File "/usr/lib/python3.8/multiprocessing/util.py", line 357, in _exit_function
p.join()
KeyboardInterrupt
File "/usr/lib/python3.8/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 27, in poll
pid, sts = os.waitpid(self.pid, flag)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 47, in wait
return self.poll(os.WNOHANG if timeout == 0.0 else 0)
File "/usr/lib/python3.8/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 27, in poll
pid, sts = os.waitpid(self.pid, flag)
KeyboardInterrupt
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 47, in wait
return self.poll(os.WNOHANG if timeout == 0.0 else 0)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 27, in poll
pid, sts = os.waitpid(self.pid, flag)
KeyboardInterrupt
KeyboardInterrupt
selector.register(obj, selectors.EVENT_READ)
File "/usr/lib/python3.8/selectors.py", line 352, in register
key = super().register(fileobj, events, data)
File "/usr/lib/python3.8/selectors.py", line 238, in register
key = SelectorKey(fileobj, self._fileobj_lookup(fileobj), events, data)
File "/usr/lib/python3.8/selectors.py", line 225, in _fileobj_lookup
return _fileobj_to_fd(fileobj)
File "/usr/lib/python3.8/selectors.py", line 33, in _fileobj_to_fd
if isinstance(fileobj, int):
KeyboardInterrupt
And even after I’ve killed python, htop
shows a couple of hundred zombie processes, which look like my command line call (by zombie I mean zero cpu use):
/usr/bin/python3 ./stanmaster_syntheticdata.py estimate --experiment=fig8 --sampleN=302 --allow-cheating --max-at-once=11
The actual set-up of multiprocessing is like this:
fns,names =[],[]
for ii,row in dfe.iterrows():
pdict = row[pcols].to_dict(into=OrderedDict)
print('{}\n{}\nN={}\nmodel={}\n{}'.format('='*80 , row, SAMPLE_N, MODELN, '='*80))
fns+= [[run_synthetic_dataset, [pdict], dict( model=MODELN, N=SAMPLE_N, recreate_data=False, ologit = False, stata=False, nIter=1000, allow_cheating= ALLOW_CHEATING)]]
names +=['V{}{}-stanmixture-{}'.format(list(row[pcols].values), row['hashname'], SAMPLE_N)]
runFunctionsInParallel(fns, names=names, parallel=defaults['server']['parallel'], maxAtOnce=MAX_AT_ONCE, # Four cores, or 8, it seems, per estimate
)
And that run_synthetic_dataset
calls both build and sample:
posterior = stan.build(stancode, data=datadict)
fit = posterior.sample(num_chains=4, num_samples=nIter)
It’s just that the model happens to have already been built earlier; i.e., every process calls build.
I think the multiprocessing jobs are actually never returning, i.e. if I set it to have max 11 jobs, then 11 estimates run but do not return.
Running with “num_chains=1” in .sample makes no difference (except for fewer samples).
I hope I understood and tried your two suggestions.
I may try launching the jobs each through os.system().
Thank you!
Chris