apenwarr-redo/redo/builder.py
Avery Pennarun e247a72300 jobserver: don't release the very last token in wait_all().
After waiting for children to exit, we would release our own token, and
then the caller would immediately try to obtain a token again.  This
accounted for tokens correctly, but would pass tokens around the call
tree in unexpected ways.

For example, imagine we had only one token.  We call 'redo a1 a2', and
a1 calls 'redo b1 b2', and b1 calls 'redo c1'.  When c1 exits, it
releases its token, then tries to re-acquire it before exiting.  This
also includes 'redo b1 b2' and 'redo a1 a2' in the race for the token,
which means b1 might get suspended while *either* a2 or b2 starts
running.

This never caused a deadlock, even if a2 or b2 depends on b1, because
if they tried to build b1, they would notice it is locked, give up
their token, and wait for the lock.  c1 (and then b1) could then obtain
the token and immediately terminate, allowing progress to continue.

But this is not really the way we expect things to happen.  "Obviously"
what we want here is a straightforward stack unwinding: c1 should finish,
then b1, then b2, then a1, then b2.

The not-very-obvious symptom of this bug is that redo's unit tests
seemed to run in the wrong order when using -j1 --no-log.  (--log would
hide the problem by rearranging logs back into the right order!)
2018-12-31 19:02:55 -05:00

605 lines
24 KiB
Python

"""Code for parallel-building a set of targets, if needed."""
import errno, os, stat, signal, sys, tempfile, time
from . import cycles, env, jobserver, logs, state, paths
from .helpers import unlink, close_on_exec
from .logs import debug2, err, warn, meta
def _nice(t):
return state.relpath(t, env.v.STARTDIR)
def _try_stat(filename):
try:
return os.lstat(filename)
except OSError, e:
if e.errno == errno.ENOENT:
return None
else:
raise
log_reader_pid = None
def close_stdin():
f = open('/dev/null')
os.dup2(f.fileno(), 0)
f.close()
def start_stdin_log_reader(status, details, pretty, color,
debug_locks, debug_pids):
"""Redirect stderr to a redo-log instance with the given options.
Then we automatically run logs.setup() to send the right data format
to that redo-log instance.
After this, be sure to run await_log_reader() before exiting.
"""
global log_reader_pid
r, w = os.pipe() # main pipe to redo-log
ar, aw = os.pipe() # ack pipe from redo-log --ack-fd
sys.stdout.flush()
sys.stderr.flush()
pid = os.fork()
if pid:
# parent
log_reader_pid = pid
os.close(r)
os.close(aw)
b = os.read(ar, 8)
if not b:
# subprocess died without sending us anything: that's bad.
err('failed to start redo-log subprocess; cannot continue.\n')
os._exit(99)
assert b == 'REDO-OK\n'
# now we know the subproc is running and will report our errors
# to stderr, so it's okay to lose our own stderr.
os.close(ar)
os.dup2(w, 1)
os.dup2(w, 2)
os.close(w)
logs.setup(tty=sys.stderr, pretty=False, color=False)
else:
# child
try:
os.close(ar)
os.close(w)
os.dup2(r, 0)
os.close(r)
# redo-log sends to stdout (because if you ask for logs, that's
# the output you wanted!). But redo itself sends logs to stderr
# (because they're incidental to the thing you asked for).
# To make these semantics work, we point redo-log's stdout at
# our stderr when we launch it.
os.dup2(2, 1)
argv = [
'redo-log',
'--recursive', '--follow',
'--ack-fd', str(aw),
('--status' if status and os.isatty(2) else '--no-status'),
('--details' if details else '--no-details'),
('--pretty' if pretty else '--no-pretty'),
('--debug-locks' if debug_locks else '--no-debug-locks'),
('--debug-pids' if debug_pids else '--no-debug-pids'),
]
if color != 1:
argv.append('--color' if color >= 2 else '--no-color')
argv.append('-')
os.execvp(argv[0], argv)
except Exception, e: # pylint: disable=broad-except
sys.stderr.write('redo-log: exec: %s\n' % e)
finally:
os._exit(99)
def await_log_reader():
"""Await the redo-log instance we redirected stderr to, if any."""
if not env.v.LOG:
return
if log_reader_pid > 0:
# never actually close fd#1 or fd#2; insanity awaits.
# replace it with something else instead.
# Since our stdout/stderr are attached to redo-log's stdin,
# this will notify redo-log that it's time to die (after it finishes
# reading the logs)
out = open('/dev/tty', 'w')
os.dup2(out.fileno(), 1)
os.dup2(out.fileno(), 2)
os.waitpid(log_reader_pid, 0)
class ImmediateReturn(Exception):
def __init__(self, rv):
Exception.__init__(self, "immediate return with exit code %d" % rv)
self.rv = rv
class _BuildJob(object):
def __init__(self, t, sf, lock, shouldbuildfunc, donefunc):
self.t = t # original target name, not relative to env.v.BASE
self.sf = sf
self.tmpname = None
self.lock = lock
self.shouldbuildfunc = shouldbuildfunc
self.donefunc = donefunc
self.before_t = _try_stat(self.t)
# attributes of the running process
self.outfile = None
def start(self):
"""Actually start running this job in a subproc, if needed."""
assert self.lock.owned
try:
try:
is_target, dirty = self.shouldbuildfunc(self.t)
except cycles.CyclicDependencyError:
err('cyclic dependency while checking %s\n' % _nice(self.t))
raise ImmediateReturn(208)
if not dirty:
# target doesn't need to be built; skip the whole task
if is_target:
meta('unchanged', state.target_relpath(self.t))
return self._finalize(0)
except ImmediateReturn, e:
return self._finalize(e.rv)
if env.v.NO_OOB or dirty == True: # pylint: disable=singleton-comparison
self._start_self()
else:
self._start_deps_unlocked(dirty)
def _start_self(self):
"""Run jobserver.start() to build this object's target file."""
assert self.lock.owned
t = self.t
sf = self.sf
newstamp = sf.read_stamp()
if (sf.is_generated and
newstamp != state.STAMP_MISSING and
(sf.is_override or state.detect_override(sf.stamp, newstamp))):
state.warn_override(_nice(t))
if not sf.is_override:
warn('%s - old: %r\n' % (_nice(t), sf.stamp))
warn('%s - new: %r\n' % (_nice(t), newstamp))
sf.set_override()
sf.save()
# fall through and treat it the same as a static file
if (os.path.exists(t) and not os.path.isdir(t + '/.')
and (sf.is_override or not sf.is_generated)):
# an existing source file that was not generated by us.
# This step is mentioned by djb in his notes.
# For example, a rule called default.c.do could be used to try
# to produce hello.c, but we don't want that to happen if
# hello.c was created by the end user.
debug2("-- static (%r)\n" % t)
sf.set_static()
sf.save()
return self._finalize(0)
sf.zap_deps1()
(dodir, dofile, _, basename, ext) = paths.find_do_file(sf)
if not dofile:
if os.path.exists(t):
sf.set_static()
sf.save()
return self._finalize(0)
else:
err('no rule to redo %r\n' % t)
return self._finalize(1)
# There is no good place for us to pre-create a temp file for
# stdout. The target dir might not exist yet, or it might currently
# exist but get wiped by the .do script. Other dirs, like the one
# containing the .do file, might be mounted readonly. We can put it
# in the system temp dir, but then we can't necessarily rename it to
# the target filename because it might cross filesystem boundaries.
# Also, if redo is interrupted, it would leave a temp file lying
# around. To avoid all this, use mkstemp() to create a temp file
# wherever it wants to, and immediately unlink it, but keep a file
# handle open. When the .do script finishes, we can copy the
# content out of that nameless file handle into a file in the same
# dir as the target (which by definition must now exist, if you
# wanted the target to exist).
#
# On the other hand, the $3 temp filename can be hardcoded to be in
# the target directory, even if that directory does not exist.
# It's not *redo*'s job to create that file. The .do file will
# create it, if it wants, and it's the .do file's job to first ensure
# that the directory exists.
tmpbase = os.path.join(dodir, basename + ext)
self.tmpname = tmpbase + '.redo.tmp'
unlink(self.tmpname)
ffd, fname = tempfile.mkstemp(prefix='redo.', suffix='.tmp')
close_on_exec(ffd, True)
os.unlink(fname)
self.outfile = os.fdopen(ffd, 'w+')
# this will run in the dofile's directory, so use only basenames here
arg1 = basename + ext # target name (including extension)
arg2 = basename # target name (without extension)
argv = ['sh', '-e',
dofile,
arg1,
arg2,
# $3 temp output file name
state.relpath(os.path.abspath(self.tmpname), dodir),
]
if env.v.VERBOSE:
argv[1] += 'v'
if env.v.XTRACE:
argv[1] += 'x'
firstline = open(os.path.join(dodir, dofile)).readline().strip()
if firstline.startswith('#!/'):
argv[0:2] = firstline[2:].split(' ')
# make sure to create the logfile *before* writing the meta() about it.
# that way redo-log won't trace into an obsolete logfile.
if env.v.LOG:
open(state.logname(self.sf.id), 'w')
dof = state.File(name=os.path.join(dodir, dofile))
dof.set_static()
dof.save()
state.commit()
meta('do', state.target_relpath(t))
def call_subproc():
self._subproc(dodir, basename, ext, argv)
def call_exited(t, rv):
self._subproc_exited(t, rv, argv)
jobserver.start(t, call_subproc, call_exited)
def _start_deps_unlocked(self, dirty):
"""Run jobserver.start to build objects needed to check deps.
Out-of-band redo of some sub-objects. This happens when we're not
quite sure if t needs to be built or not (because some children
look dirty, but might turn out to be clean thanks to redo-stamp
checksums). We have to call redo-unlocked to figure it all out.
Note: redo-unlocked will handle all the updating of sf, so we don't
have to do it here, nor call _record_new_state. However, we have to
hold onto the lock because otherwise we would introduce a race
condition; that's why it's called redo-unlocked, because it doesn't
grab a lock.
"""
# FIXME: redo-unlocked is kind of a weird hack.
# Maybe we should just start jobs to build the necessary deps
# directly from this process, and when done, reconsider building
# the target we started with. But that makes this one process's
# build recursive, where currently it's flat.
here = os.getcwd()
def _fix(p):
return state.relpath(os.path.join(env.v.BASE, p), here)
argv = (['redo-unlocked', _fix(self.sf.name)] +
list(set(_fix(d.name) for d in dirty)))
meta('check', state.target_relpath(self.t))
state.commit()
def subtask():
os.environ['REDO_DEPTH'] = env.v.DEPTH + ' '
# python ignores SIGPIPE
signal.signal(signal.SIGPIPE, signal.SIG_DFL)
os.execvp(argv[0], argv)
assert 0
# returns only if there's an exception
def job_exited(t, rv):
return self._finalize(rv)
jobserver.start(self.t, jobfunc=subtask, donefunc=job_exited)
def _subproc(self, dodir, basename, ext, argv):
"""The function by jobserver.start to exec the build script.
This is run in the *child* process.
"""
# careful: REDO_PWD was the PWD relative to the STARTPATH at the time
# we *started* building the current target; but that target ran
# redo-ifchange, and it might have done it from a different directory
# than we started it in. So os.getcwd() might be != REDO_PWD right
# now.
assert state.is_flushed()
newp = os.path.realpath(dodir)
os.environ['REDO_PWD'] = state.relpath(newp, env.v.STARTDIR)
os.environ['REDO_TARGET'] = basename + ext
os.environ['REDO_DEPTH'] = env.v.DEPTH + ' '
cycles.add(self.lock.fid)
if dodir:
os.chdir(dodir)
os.dup2(self.outfile.fileno(), 1)
os.close(self.outfile.fileno())
close_on_exec(1, False)
if env.v.LOG:
cur_inode = str(os.fstat(2).st_ino)
if not env.v.LOG_INODE or cur_inode == env.v.LOG_INODE:
# .do script has *not* redirected stderr, which means we're
# using redo-log's log saving mode. That means subprocs
# should be logged to their own file. If the .do script
# *does* redirect stderr, that redirection should be inherited
# by subprocs, so we'd do nothing.
logf = open(state.logname(self.sf.id), 'w')
new_inode = str(os.fstat(logf.fileno()).st_ino)
os.environ['REDO_LOG'] = '1' # .do files can check this
os.environ['REDO_LOG_INODE'] = new_inode
os.dup2(logf.fileno(), 2)
close_on_exec(2, False)
logf.close()
else:
if 'REDO_LOG_INODE' in os.environ:
del os.environ['REDO_LOG_INODE']
os.environ['REDO_LOG'] = ''
signal.signal(signal.SIGPIPE, signal.SIG_DFL) # python ignores SIGPIPE
if env.v.VERBOSE or env.v.XTRACE:
logs.write('* %s' % ' '.join(argv).replace('\n', ' '))
os.execvp(argv[0], argv)
# FIXME: it would be nice to log the exit code to logf.
# But that would have to happen in the parent process, which doesn't
# have logf open.
assert 0
# returns only if there's an exception
def _subproc_exited(self, t, rv, argv):
"""Called by the jobserver when our subtask exits.
This is run in the *parent* process.
"""
try:
state.check_sane()
rv = self._record_new_state(t, rv, argv)
state.commit()
finally:
self._finalize(rv)
def _record_new_state(self, t, rv, argv):
"""After a subtask finishes, handle its changes to the output file.
This is run in the *parent* process.
This includes renaming temp files into place and detecting mistakes
(like writing directly to $1 instead of $3). We also have to record
the new file stamp data for the completed target.
"""
outfile = self.outfile
before_t = self.before_t
after_t = _try_stat(t)
st1 = os.fstat(outfile.fileno())
st2 = _try_stat(self.tmpname)
if (after_t and
(not before_t or before_t.st_mtime != after_t.st_mtime) and
not stat.S_ISDIR(after_t.st_mode)):
err('%s modified %s directly!\n' % (argv[2], t))
err('...you should update $3 (a temp file) or stdout, not $1.\n')
rv = 206
elif st2 and st1.st_size > 0:
err('%s wrote to stdout *and* created $3.\n' % argv[2])
err('...you should write status messages to stderr, not stdout.\n')
rv = 207
if rv == 0:
# FIXME: race condition here between updating stamp/is_generated
# and actually renaming the files into place. There needs to
# be some kind of two-stage commit, I guess.
if st1.st_size > 0 and not st2:
# script wrote to stdout. Copy its contents to the tmpfile.
unlink(self.tmpname)
try:
newf = open(self.tmpname, 'w')
except IOError, e:
dnt = os.path.dirname(os.path.abspath(t))
if not os.path.exists(dnt):
# This could happen, so report a simple error message
# that gives a hint for how to fix your .do script.
err('%s: target dir %r does not exist!\n' % (t, dnt))
else:
# This could happen for, eg. a permissions error on
# the target directory.
err('%s: copy stdout: %s\n' % (t, e))
rv = 209
else:
self.outfile.seek(0)
while 1:
b = self.outfile.read(1024*1024)
if not b:
break
newf.write(b)
newf.close()
st2 = _try_stat(self.tmpname)
if st2:
# either $3 file was created *or* stdout was written to.
# therefore tmpfile now exists.
try:
# Atomically replace the target file
os.rename(self.tmpname, t)
except OSError, e:
# This could happen for, eg. a permissions error on
# the target directory.
err('%s: rename %s: %s\n' % (t, self.tmpname, e))
rv = 209
else: # no output generated at all; that's ok
unlink(t)
sf = self.sf
sf.refresh()
sf.is_generated = True
sf.is_override = False
if sf.is_checked() or sf.is_changed():
# it got checked during the run; someone ran redo-stamp.
# update_stamp would call set_changed(); we don't want that,
# so only use read_stamp.
sf.stamp = sf.read_stamp()
else:
sf.csum = None
sf.update_stamp()
sf.set_changed()
else:
unlink(self.tmpname)
sf = self.sf
sf.set_failed()
sf.zap_deps2()
sf.save()
outfile.close()
meta('done', '%d %s' % (rv, state.target_relpath(self.t)))
return rv
def _finalize(self, rv):
"""After a target is built, report completion and unlock.
This is run in the *parent* process.
Note: you usually need to call _record_new_state() first.
"""
try:
self.donefunc(self.t, rv)
assert self.lock.owned
finally:
self.lock.unlock()
def run(targets, shouldbuildfunc):
"""Build the given list of targets, if necessary.
Builds in parallel using whatever jobserver tokens can be obtained.
Args:
targets: a list of target filenames to consider building.
shouldbuildfunc: a function(target) which determines whether the given
target needs to be built, as of the time it is called.
Returns:
0 if all necessary targets returned exit code zero; nonzero otherwise.
"""
retcode = [0] # a list so that it can be reassigned from done()
if env.v.SHUFFLE:
import random
random.shuffle(targets)
locked = []
def job_exited(t, rv):
if rv:
retcode[0] = 1
if env.v.TARGET and not env.v.UNLOCKED:
me = os.path.join(env.v.STARTDIR,
os.path.join(env.v.PWD, env.v.TARGET))
myfile = state.File(name=me)
selflock = state.Lock(state.LOG_LOCK_MAGIC + myfile.id)
else:
selflock = myfile = me = None
def cheat():
if not selflock:
return 0
selflock.trylock()
if not selflock.owned:
# redo-log already owns it: let's cheat.
# Give ourselves one extra token so that the "foreground" log
# can always make progress.
return 1
else:
# redo-log isn't watching us (yet)
selflock.unlock()
return 0
# In the first cycle, we just build as much as we can without worrying
# about any lock contention. If someone else has it locked, we move on.
seen = {}
lock = None
for t in targets:
if not t:
err('cannot build the empty target ("").\n')
retcode[0] = 204
break
assert state.is_flushed()
if t in seen:
continue
seen[t] = 1
if not jobserver.has_token():
state.commit()
jobserver.ensure_token_or_cheat(t, cheat)
if retcode[0] and not env.v.KEEP_GOING:
break
if not state.check_sane():
err('.redo directory disappeared; cannot continue.\n')
retcode[0] = 205
break
f = state.File(name=t)
lock = state.Lock(f.id)
if env.v.UNLOCKED:
lock.owned = True
else:
lock.trylock()
if not lock.owned:
meta('locked', state.target_relpath(t))
locked.append((f.id, t, f.name))
else:
# We had to create f before we had a lock, because we need f.id
# to make the lock. But someone may have updated the state
# between then and now.
# FIXME: separate obtaining the fid from creating the File.
# FIXME: maybe integrate locking into the File object?
f.refresh()
_BuildJob(t, f, lock,
shouldbuildfunc=shouldbuildfunc,
donefunc=job_exited).start()
state.commit()
assert state.is_flushed()
lock = None
del lock
# Now we've built all the "easy" ones. Go back and just wait on the
# remaining ones one by one. There's no reason to do it any more
# efficiently, because if these targets were previously locked, that
# means someone else was building them; thus, we probably won't need to
# do anything. The only exception is if we're invoked as redo instead
# of redo-ifchange; then we have to redo it even if someone else already
# did. But that should be rare.
while locked or jobserver.running():
state.commit()
jobserver.wait_all()
assert jobserver._mytokens <= 1 # pylint: disable=protected-access
jobserver.ensure_token_or_cheat('self', cheat)
# at this point, we don't have any children holding any tokens, so
# it's okay to block below.
if retcode[0] and not env.v.KEEP_GOING:
break
if locked:
if not state.check_sane():
err('.redo directory disappeared; cannot continue.\n')
retcode[0] = 205
break
fid, t, _ = locked.pop(0)
lock = state.Lock(fid)
backoff = 0.01
lock.trylock()
while not lock.owned:
# Don't spin with 100% CPU while we fight for the lock.
import random
time.sleep(random.random() * min(backoff, 1.0))
backoff *= 2
# after printing this line, redo-log will recurse into t,
# whether it's us building it, or someone else.
meta('waiting', state.target_relpath(t))
try:
lock.check()
except cycles.CyclicDependencyError:
err('cyclic dependency while building %s\n' % _nice(t))
retcode[0] = 208
return retcode[0]
# this sequence looks a little silly, but the idea is to
# give up our personal token while we wait for the lock to
# be released; but we should never run ensure_token() while
# holding a lock, or we could cause deadlocks.
jobserver.release_mine()
lock.waitlock()
# now t is definitely free, so we get to decide whether
# to build it.
lock.unlock()
jobserver.ensure_token_or_cheat(t, cheat)
lock.trylock()
assert lock.owned
meta('unlocked', state.target_relpath(t))
if state.File(name=t).is_failed():
err('%s: failed in another thread\n' % _nice(t))
retcode[0] = 2
lock.unlock()
else:
_BuildJob(t, state.File(fid=fid), lock,
shouldbuildfunc=shouldbuildfunc,
donefunc=job_exited).start()
lock = None
state.commit()
return retcode[0]