From 3dbdfbc06f68c47035b832b3fade26b1b0ff014c Mon Sep 17 00:00:00 2001 From: Avery Pennarun Date: Fri, 18 Jan 2019 00:06:18 +0000 Subject: [PATCH] Better handling if parent closes REDO_CHEATFDS or MAKEFLAGS fds. Silently recover if REDO_CHEATFDS file descriptors are closed, because they aren't completely essential and MAKEFLAGS-related warnings already get printed if all file descriptors have been closed. If MAKEFLAGS --jobserver-auth flags are closed, improve the error message so that a) it's a normal error instead of an exception and b) we link to documentation about why it happens. Also write some more detailed documentation about what's going on here. --- docs/FAQInterop.md | 2 - docs/FAQParallel.md | 53 +++++++++++++++++++++ docs/extra_style.css | 2 +- redo/builder.py | 27 ++++------- redo/cmd_ifchange.py | 6 +-- redo/cmd_redo.py | 4 +- redo/helpers.py | 14 +++++- redo/jobserver.py | 33 +++++++------ redo/state.py | 12 ++--- t/204-makeflags/all.do | 16 +++++++ t/204-makeflags/clean.do | 1 + t/204-makeflags/closefds.py | 12 +++++ t/204-makeflags/noflags.do | 1 + t/{204-readonly => 205-readonly}/.gitignore | 0 t/{204-readonly => 205-readonly}/all.do | 0 t/{204-readonly => 205-readonly}/clean.do | 0 16 files changed, 136 insertions(+), 47 deletions(-) create mode 100644 t/204-makeflags/all.do create mode 100644 t/204-makeflags/clean.do create mode 100644 t/204-makeflags/closefds.py create mode 100644 t/204-makeflags/noflags.do rename t/{204-readonly => 205-readonly}/.gitignore (100%) rename t/{204-readonly => 205-readonly}/all.do (100%) rename t/{204-readonly => 205-readonly}/clean.do (100%) diff --git a/docs/FAQInterop.md b/docs/FAQInterop.md index 551c4ab..8070e24 100644 --- a/docs/FAQInterop.md +++ b/docs/FAQInterop.md @@ -51,5 +51,3 @@ dependencies are too generous, we end up calling 'make all' more often than necessary. But 'make all' probably runs pretty fast when there's nothing to do, so that's not so bad. - - diff --git a/docs/FAQParallel.md b/docs/FAQParallel.md index b642e2b..e80d898 100644 --- a/docs/FAQParallel.md +++ b/docs/FAQParallel.md @@ -156,3 +156,56 @@ Yes. Put this in your .do script: The child makes will then not have access to the jobserver, so will build serially instead. + + + +# What does the "broken --jobserver-auth" error mean? + +redo (and GNU make) use the `MAKEFLAGS` environment variable to pass +information about the parallel build environment from one process to the +next. Inside `MAKEFLAGS` is a string that looks like either +`--jobserver-auth=X,Y` or `--jobserver-fds=X,Y`, depending on the version of +make. + +If redo finds one of these strings, but the file descriptors named by `X` +and `Y` are *not* available in the subprocess, that means some ill-behaved +parent process has closed them. This prevents parallelism from working, so +redo aborts to let you know something is seriously wrong. + +GNU make will intentionally close these file descriptors if you write a +Makefile rule that contains *neither* the exact string `$(MAKE)` nor a +leading `+` character. So you might have had a Makefile rule that looked +like this: + + subdir/all: + $(MAKE) -C subdir all + +and that worked as expected: the sub-make inherited your parallelism +settings. But people are sometimes surprised to find that this doesn't work +as expected: + + subdir/all: + make -C subdir all + +In that case, the sub-make does *not* inherit the jobserver file +descriptors, so it runs serially. If for some reason you don't want to use +`$(MAKE)` but you do want parallelism, you need to write something like this +instead: + + subdir/all: + +make -C subdir all + +And similarly, if you recurse into redo instead of make, you need the same +trick: + + subdir/all: + +redo subdir/all + +There are a few other programs that also close file descriptors. For +example, if your .do file starts with `#!/usr/bin/env xonsh`, you might +run into [a bug in xonsh where it closes file descriptors +incorrectly](https://github.com/xonsh/xonsh/issues/2984). + +If you really can't stop your program from closing file descriptors that it +shouldn't, you can work around the problem by unsetting `MAKEFLAGS`. This +will let your program build, but will disable parallelism. diff --git a/docs/extra_style.css b/docs/extra_style.css index 6d814df..4a07667 100644 --- a/docs/extra_style.css +++ b/docs/extra_style.css @@ -1,7 +1,7 @@ p code { border: 0; padding: 0 2px 0 2px; - font-size: 100%; + font-size: 80%; white-space: nowrap; color: #a04040; letter-spacing: -1px; diff --git a/redo/builder.py b/redo/builder.py index e55955f..94788aa 100644 --- a/redo/builder.py +++ b/redo/builder.py @@ -1,7 +1,6 @@ """Code for parallel-building a set of targets, if needed.""" import errno, os, stat, signal, sys, tempfile, time -from . import cycles, env, jobserver, logs, state, paths -from .helpers import unlink, close_on_exec +from . import cycles, env, helpers, jobserver, logs, paths, state from .logs import debug2, err, warn, meta @@ -110,12 +109,6 @@ def await_log_reader(): os.waitpid(log_reader_pid, 0) -class ImmediateReturn(Exception): - def __init__(self, rv): - Exception.__init__(self, "immediate return with exit code %d" % rv) - self.rv = rv - - class _BuildJob(object): def __init__(self, t, sf, lock, shouldbuildfunc, donefunc): self.t = t # original target name, not relative to env.v.BASE @@ -137,13 +130,13 @@ class _BuildJob(object): is_target, dirty = self.shouldbuildfunc(self.t) except cycles.CyclicDependencyError: err('cyclic dependency while checking %s\n' % _nice(self.t)) - raise ImmediateReturn(208) + raise helpers.ImmediateReturn(208) if not dirty: # target doesn't need to be built; skip the whole task if is_target: meta('unchanged', state.target_relpath(self.t)) return self._finalize(0) - except ImmediateReturn, e: + except helpers.ImmediateReturn, e: return self._finalize(e.rv) if env.v.NO_OOB or dirty == True: # pylint: disable=singleton-comparison @@ -209,9 +202,9 @@ class _BuildJob(object): # that the directory exists. tmpbase = os.path.join(dodir, basename + ext) self.tmpname = tmpbase + '.redo.tmp' - unlink(self.tmpname) + helpers.unlink(self.tmpname) ffd, fname = tempfile.mkstemp(prefix='redo.', suffix='.tmp') - close_on_exec(ffd, True) + helpers.close_on_exec(ffd, True) os.unlink(fname) self.outfile = os.fdopen(ffd, 'w+') # this will run in the dofile's directory, so use only basenames here @@ -303,7 +296,7 @@ class _BuildJob(object): os.chdir(dodir) os.dup2(self.outfile.fileno(), 1) os.close(self.outfile.fileno()) - close_on_exec(1, False) + helpers.close_on_exec(1, False) if env.v.LOG: cur_inode = str(os.fstat(2).st_ino) if not env.v.LOG_INODE or cur_inode == env.v.LOG_INODE: @@ -317,7 +310,7 @@ class _BuildJob(object): os.environ['REDO_LOG'] = '1' # .do files can check this os.environ['REDO_LOG_INODE'] = str(new_inode) os.dup2(logf.fileno(), 2) - close_on_exec(2, False) + helpers.close_on_exec(2, False) logf.close() else: if 'REDO_LOG_INODE' in os.environ: @@ -375,7 +368,7 @@ class _BuildJob(object): # be some kind of two-stage commit, I guess. if st1.st_size > 0 and not st2: # script wrote to stdout. Copy its contents to the tmpfile. - unlink(self.tmpname) + helpers.unlink(self.tmpname) try: newf = open(self.tmpname, 'w') except IOError, e: @@ -410,7 +403,7 @@ class _BuildJob(object): err('%s: rename %s: %s\n' % (t, self.tmpname, e)) rv = 209 else: # no output generated at all; that's ok - unlink(t) + helpers.unlink(t) sf = self.sf sf.refresh() sf.is_generated = True @@ -425,7 +418,7 @@ class _BuildJob(object): sf.update_stamp() sf.set_changed() else: - unlink(self.tmpname) + helpers.unlink(self.tmpname) sf = self.sf sf.set_failed() sf.zap_deps2() diff --git a/redo/cmd_ifchange.py b/redo/cmd_ifchange.py index 37609cf..aaa0e41 100644 --- a/redo/cmd_ifchange.py +++ b/redo/cmd_ifchange.py @@ -1,13 +1,13 @@ """redo-ifchange: build the given targets if they have changed.""" import os, sys, traceback -from . import env, builder, deps, jobserver, logs, state +from . import env, builder, deps, helpers, jobserver, logs, state from .logs import debug2, err def should_build(t): f = state.File(name=t) if f.is_failed(): - raise builder.ImmediateReturn(32) + raise helpers.ImmediateReturn(32) dirty = deps.isdirty(f, depth='', max_changed=env.v.RUNID, already_checked=[]) return f.is_generated, dirty == [f] and deps.DIRTY or dirty @@ -56,7 +56,7 @@ def main(): traceback.print_exc(100, sys.stderr) err('unexpected error: %r\n' % e) rv = 1 - except KeyboardInterrupt: + except (KeyboardInterrupt, helpers.ImmediateReturn): if env.is_toplevel: builder.await_log_reader() sys.exit(200) diff --git a/redo/cmd_redo.py b/redo/cmd_redo.py index 5c0a73d..fd2cd50 100644 --- a/redo/cmd_redo.py +++ b/redo/cmd_redo.py @@ -15,7 +15,7 @@ # limitations under the License. # import sys, os, traceback -from . import builder, env, jobserver, logs, options, state +from . import builder, env, helpers, jobserver, logs, options, state from .atoi import atoi from .logs import warn, err @@ -121,7 +121,7 @@ def main(): if env.is_toplevel: builder.await_log_reader() sys.exit(retcode) - except KeyboardInterrupt: + except (KeyboardInterrupt, helpers.ImmediateReturn): if env.is_toplevel: builder.await_log_reader() sys.exit(200) diff --git a/redo/helpers.py b/redo/helpers.py index 70fbfc9..55d190c 100644 --- a/redo/helpers.py +++ b/redo/helpers.py @@ -2,8 +2,10 @@ import os, errno, fcntl -def join(between, l): - return between.join(l) +class ImmediateReturn(Exception): + def __init__(self, rv): + Exception.__init__(self, "immediate return with exit code %d" % rv) + self.rv = rv def unlink(f): @@ -25,3 +27,11 @@ def close_on_exec(fd, yes): if yes: fl |= fcntl.FD_CLOEXEC fcntl.fcntl(fd, fcntl.F_SETFD, fl) + + +def fd_exists(fd): + try: + fcntl.fcntl(fd, fcntl.F_GETFD) + except IOError: + return False + return True diff --git a/redo/jobserver.py b/redo/jobserver.py index 8df3b24..ab6984f 100644 --- a/redo/jobserver.py +++ b/redo/jobserver.py @@ -73,9 +73,8 @@ # simpler :) # import sys, os, errno, select, fcntl, signal -from . import env, state, logs +from . import env, helpers, logs, state from .atoi import atoi -from .helpers import close_on_exec _toplevel = 0 _mytokens = 1 @@ -228,16 +227,15 @@ def setup(maxjobs): a = atoi(a) b = atoi(b) if a <= 0 or b <= 0: - raise ValueError('invalid --jobserver-auth: %r' % arg) - try: - fcntl.fcntl(a, fcntl.F_GETFL) - fcntl.fcntl(b, fcntl.F_GETFL) - except IOError, e: - if e.errno == errno.EBADF: - raise ValueError('broken --jobserver-auth from make; ' + - 'prefix your Makefile rule with a "+"') - else: - raise + logs.err('invalid --jobserver-auth: %r\n' % arg) + raise helpers.ImmediateReturn(200) + if not helpers.fd_exists(a) or not helpers.fd_exists(b): + logs.err('broken --jobserver-auth from parent process:\n') + logs.err(' using GNU make? prefix your Makefile rule with "+"\n') + logs.err( + ' otherwise, see ' + + 'https://redo.rtfd.io/en/latest/FAQParallel/#MAKEFLAGS\n') + raise helpers.ImmediateReturn(200) if maxjobs == 1: # user requested exactly one token, which means they want to # serialize us, even if the parent redo is running in parallel. @@ -254,6 +252,7 @@ def setup(maxjobs): _tokenfds = (a, b) cheats = os.getenv('REDO_CHEATFDS', '') if not maxjobs else '' + _cheatfds = None if cheats: (a, b) = cheats.split(',', 1) a = atoi(a) @@ -261,7 +260,13 @@ def setup(maxjobs): if a <= 0 or b <= 0: raise ValueError('invalid REDO_CHEATFDS: %r' % cheats) _cheatfds = (a, b) - else: + if not helpers.fd_exists(a) or not helpers.fd_exists(b): + # This can happen if we're called by a parent process who closes + # all "unknown" file descriptors (which is anti-social behaviour, + # but oh well, we'll warn about it if they close the jobserver + # fds in MAKEFLAGS, so just ignore it if it also happens here). + _cheatfds = None + if not _cheatfds: _cheatfds = _make_pipe(102) os.environ['REDO_CHEATFDS'] = ('%d,%d' % (_cheatfds[0], _cheatfds[1])) @@ -534,7 +539,7 @@ def start(reason, jobfunc, donefunc): finally: _debug('exit: %d\n' % rv) os._exit(rv) - close_on_exec(r, True) + helpers.close_on_exec(r, True) os.close(w) pd = Job(reason, pid, donefunc) _waitfds[r] = pd diff --git a/redo/state.py b/redo/state.py index 409a64f..4ad5f53 100644 --- a/redo/state.py +++ b/redo/state.py @@ -1,7 +1,7 @@ """Code for manipulating redo's state database.""" import sys, os, errno, stat, fcntl, sqlite3 from . import cycles, env -from .helpers import unlink, close_on_exec, join +from .helpers import unlink, close_on_exec from .logs import warn, debug2, debug3 SCHEMA_VER = 2 @@ -192,7 +192,7 @@ def relpath(t, base): while bparts: tparts.insert(0, '..') bparts.pop(0) - return join('/', tparts) + return '/'.join(tparts) # Return a relative path for t that will work after we do @@ -247,7 +247,7 @@ class File(object): # initialized, which we should fix, and then re-enable warning. # pylint: disable=attribute-defined-outside-init def _init_from_idname(self, fid, name, allow_add): - q = ('select %s from Files ' % join(', ', _file_cols)) + q = ('select %s from Files ' % ', '.join(_file_cols)) if fid != None: q += 'where rowid=?' l = [fid] @@ -294,7 +294,7 @@ class File(object): self._init_from_idname(self.id, None, allow_add=False) def save(self): - cols = join(', ', ['%s=?'%i for i in _file_cols[2:]]) + cols = ', '.join(['%s=?'%i for i in _file_cols[2:]]) _write('update Files set ' ' %s ' ' where rowid=?' % cols, @@ -395,7 +395,7 @@ class File(object): q = ('select Deps.mode, Deps.source, %s ' ' from Files ' ' join Deps on Files.rowid = Deps.source ' - ' where target=?' % join(', ', _file_cols[1:])) + ' where target=?' % ', '.join(_file_cols[1:])) for row in db().execute(q, [self.id]).fetchall(): mode = row[0] cols = row[1:] @@ -465,7 +465,7 @@ class File(object): def files(): - q = ('select %s from Files order by name' % join(', ', _file_cols)) + q = ('select %s from Files order by name' % ', '.join(_file_cols)) for cols in db().execute(q).fetchall(): yield File(cols=cols) diff --git a/t/204-makeflags/all.do b/t/204-makeflags/all.do new file mode 100644 index 0000000..de78104 --- /dev/null +++ b/t/204-makeflags/all.do @@ -0,0 +1,16 @@ +# Make sure we can survive if a process closes all file descriptors, +# including any jobserver file descriptors, as long as they also +# unset MAKEFLAGS. +redo-ifchange ../../redo/py + +# If we leave MAKEFLAGS set, then it's fair game to complain that the +# advertised file descriptors are gone, because GNU make also complains. +# (Although they only warn while we abort. They can't abort so that +# they don't break backward compat, but we have no such constraint, because +# redo has always failed for that case.) +# +# On the other hand, we shouldn't have to unset REDO_CHEATFDS, both for +# backward compatibility, and because REDO_CHEATFDS is undocumented. +# redo should recover silently from that problem. +unset MAKEFLAGS +../../redo/py ./closefds.py redo noflags diff --git a/t/204-makeflags/clean.do b/t/204-makeflags/clean.do new file mode 100644 index 0000000..be320e6 --- /dev/null +++ b/t/204-makeflags/clean.do @@ -0,0 +1 @@ +rm -f *~ .*~ diff --git a/t/204-makeflags/closefds.py b/t/204-makeflags/closefds.py new file mode 100644 index 0000000..2f57b10 --- /dev/null +++ b/t/204-makeflags/closefds.py @@ -0,0 +1,12 @@ +import subprocess, sys, os + +# subprocess.call(close_fds=True) is unfortunately not a good idea, +# because some versions (Debian's python version?) try to close inordinately +# many file descriptors, like 0..1000000, which takes a very long time. +# +# We happen to know that redo doesn't need such huge fd values, so we'll +# just cheat and use a smaller range. +os.closerange(3, 1024) +rv = subprocess.call(sys.argv[1:]) +sys.exit(rv) + diff --git a/t/204-makeflags/noflags.do b/t/204-makeflags/noflags.do new file mode 100644 index 0000000..27ba77d --- /dev/null +++ b/t/204-makeflags/noflags.do @@ -0,0 +1 @@ +true diff --git a/t/204-readonly/.gitignore b/t/205-readonly/.gitignore similarity index 100% rename from t/204-readonly/.gitignore rename to t/205-readonly/.gitignore diff --git a/t/204-readonly/all.do b/t/205-readonly/all.do similarity index 100% rename from t/204-readonly/all.do rename to t/205-readonly/all.do diff --git a/t/204-readonly/clean.do b/t/205-readonly/clean.do similarity index 100% rename from t/204-readonly/clean.do rename to t/205-readonly/clean.do