jobserver: don't release the very last token in wait_all().

After waiting for children to exit, we would release our own token, and
then the caller would immediately try to obtain a token again.  This
accounted for tokens correctly, but would pass tokens around the call
tree in unexpected ways.

For example, imagine we had only one token.  We call 'redo a1 a2', and
a1 calls 'redo b1 b2', and b1 calls 'redo c1'.  When c1 exits, it
releases its token, then tries to re-acquire it before exiting.  This
also includes 'redo b1 b2' and 'redo a1 a2' in the race for the token,
which means b1 might get suspended while *either* a2 or b2 starts
running.

This never caused a deadlock, even if a2 or b2 depends on b1, because
if they tried to build b1, they would notice it is locked, give up
their token, and wait for the lock.  c1 (and then b1) could then obtain
the token and immediately terminate, allowing progress to continue.

But this is not really the way we expect things to happen.  "Obviously"
what we want here is a straightforward stack unwinding: c1 should finish,
then b1, then b2, then a1, then a2.

The not-very-obvious symptom of this bug is that redo's unit tests
seemed to run in the wrong order when using -j1 --no-log.  (--log would
hide the problem by rearranging logs back into the right order!)
This commit is contained in:
Avery Pennarun 2018-12-31 16:53:13 -05:00
commit e247a72300
2 changed files with 20 additions and 14 deletions

View file

@ -550,7 +550,7 @@ def run(targets, shouldbuildfunc):
while locked or jobserver.running(): while locked or jobserver.running():
state.commit() state.commit()
jobserver.wait_all() jobserver.wait_all()
assert jobserver._mytokens == 0 # pylint: disable=protected-access assert jobserver._mytokens <= 1 # pylint: disable=protected-access
jobserver.ensure_token_or_cheat('self', cheat) jobserver.ensure_token_or_cheat('self', cheat)
# at this point, we don't have any children holding any tokens, so # at this point, we don't have any children holding any tokens, so
# it's okay to block below. # it's okay to block below.

View file

@ -287,18 +287,17 @@ def _wait(want_token, max_delay):
_debug("done: %r\n" % pd.name) _debug("done: %r\n" % pd.name)
# redo subprocesses are expected to die without releasing their # redo subprocesses are expected to die without releasing their
# tokens, so things are less likely to get confused if they # tokens, so things are less likely to get confused if they
# die abnormally. That means a token has 'disappeared' and we # die abnormally. Since a child has died, that means a token has
# now need to recreate it. # 'disappeared' and we now need to recreate it.
b = _try_read(_cheatfds[0], 1) b = _try_read(_cheatfds[0], 1)
_debug('GOT cheatfd\n') if b:
if b is None: # someone exited with _cheats > 0, so we need to compensate
# by *not* re-creating a token now.
_debug('EAT cheatfd %r\n' % b)
else:
_create_tokens(1) _create_tokens(1)
if has_token(): if has_token():
_release_except_mine() _release_except_mine()
else:
# someone exited with _cheats > 0, so we need to compensate
# by *not* re-creating a token now.
pass
os.close(fd) os.close(fd)
del _waitfds[fd] del _waitfds[fd]
rv = os.waitpid(pd.pid, 0) rv = os.waitpid(pd.pid, 0)
@ -347,7 +346,6 @@ def _ensure_token(reason, max_delay=None):
break break
assert _mytokens < 1 assert _mytokens < 1
b = _try_read(_tokenfds[0], 1) b = _try_read(_tokenfds[0], 1)
_debug('GOT tokenfd\n')
if b == '': if b == '':
raise Exception('unexpected EOF on token read') raise Exception('unexpected EOF on token read')
if b: if b:
@ -407,10 +405,16 @@ def wait_all():
_debug("%d,%d -> wait_all\n" % (_mytokens, _cheats)) _debug("%d,%d -> wait_all\n" % (_mytokens, _cheats))
assert state.is_flushed() assert state.is_flushed()
while 1: while 1:
while _mytokens >= 1: while _mytokens >= 2:
release_mine() _release(1)
if not running(): if not running():
break break
# We should only release our last token if we have remaining
# children. A terminating redo process should try to terminate while
# holding a token, and if we have no children left, we might be
# about to terminate.
if _mytokens >= 1:
release_mine()
_debug("wait_all: wait()\n") _debug("wait_all: wait()\n")
_wait(want_token=0, max_delay=None) _wait(want_token=0, max_delay=None)
_debug("wait_all: empty list\n") _debug("wait_all: empty list\n")
@ -418,6 +422,8 @@ def wait_all():
# If we're the toplevel and we're sure no child processes remain, # If we're the toplevel and we're sure no child processes remain,
# then we know we're totally idle. Self-test to ensure no tokens # then we know we're totally idle. Self-test to ensure no tokens
# mysteriously got created/destroyed. # mysteriously got created/destroyed.
if _mytokens >= 1:
release_mine()
tokens = _try_read_all(_tokenfds[0], 8192) tokens = _try_read_all(_tokenfds[0], 8192)
cheats = _try_read_all(_cheatfds[0], 8192) cheats = _try_read_all(_cheatfds[0], 8192)
_debug('toplevel: GOT %d tokens and %d cheats\n' _debug('toplevel: GOT %d tokens and %d cheats\n'
@ -426,8 +432,8 @@ def wait_all():
raise Exception('on exit: expected %d tokens; found %r-%r' raise Exception('on exit: expected %d tokens; found %r-%r'
% (_toplevel, len(tokens), len(cheats))) % (_toplevel, len(tokens), len(cheats)))
os.write(_tokenfds[1], tokens) os.write(_tokenfds[1], tokens)
# note: when we return, we have *no* tokens, not even our own! # note: when we return, we may have *no* tokens, not even our own!
# If caller wants to continue, they have to obtain one right away. # If caller wants to continue, they might have to obtain one first.
def force_return_tokens(): def force_return_tokens():