From 61f3e4672e16e5742ea0ffde7d00c058e1d89e36 Mon Sep 17 00:00:00 2001 From: Avery Pennarun Date: Wed, 2 Jan 2019 14:18:51 -0500 Subject: [PATCH] Workaround for completely broken file locking on Windows 10 WSL. WSL (Windows Services for Linux) provides a Linux-kernel-compatible ABI for userspace processes, but the current version doesn't not implement fcntl() locks at all; it just always returns success. See https://github.com/Microsoft/WSL/issues/1927. This causes us three kinds of problem: 1. sqlite3 in WAL mode gives "OperationalError: locking protocol". 1b. Other sqlite3 journal modes also don't work when used by multiple processes. 2. redo parallelism doesn't work, because we can't prevent the same target from being build several times simultaneously. 3. "redo-log -f" doesn't work, since it can't tell whether the log file it's tailing is "done" or not. To fix #1, we switch the sqlite3 journal back to PERSIST instead of WAL. We originally changed to WAL in commit 5156feae9d to reduce deadlocks on MacOS. That was never adequately explained, but PERSIST still acts weird on MacOS, so we'll only switch to PERSIST when we detect that locking is definitely broken. Sigh. To (mostly) fix #2, we disable any -j value > 1 when locking is broken. This prevents basic forms of parallelism, but doesn't stop you from re-entrantly starting other instances of redo. To fix that properly, we need to switch to a different locking mechanism entirely, which is tough in python. flock() locks probably work, for example, but python's locks lie and just use fcntl locks for those. To fix #3, we always force --no-log mode when we find that locking is broken. --- redo/builder.py | 6 ++-- redo/cmd_always.py | 4 ++- redo/cmd_ifchange.py | 4 ++- redo/cmd_ifcreate.py | 4 ++- redo/cmd_log.py | 4 ++- redo/cmd_ood.py | 4 ++- redo/cmd_redo.py | 27 +++++++++-------- redo/cmd_sources.py | 4 ++- redo/cmd_stamp.py | 4 ++- redo/cmd_targets.py | 4 ++- redo/cmd_unlocked.py | 4 ++- redo/cmd_whichdo.py | 4 ++- redo/env.py | 17 ++++++++--- redo/logs.py | 4 +-- redo/state.py | 54 ++++++++++++++++++++++++++++++++- t/010-jobserver/all.do | 5 +++ t/010-jobserver/paralleltest.do | 2 +- t/010-jobserver/serialtest.do | 1 - 18 files changed, 122 insertions(+), 34 deletions(-) diff --git a/redo/builder.py b/redo/builder.py index 93a0818..e55955f 100644 --- a/redo/builder.py +++ b/redo/builder.py @@ -60,7 +60,7 @@ def start_stdin_log_reader(status, details, pretty, color, os.dup2(w, 1) os.dup2(w, 2) os.close(w) - logs.setup(tty=sys.stderr, pretty=False, color=False) + logs.setup(tty=sys.stderr, parent_logs=True, pretty=False, color=False) else: # child try: @@ -313,9 +313,9 @@ class _BuildJob(object): # *does* redirect stderr, that redirection should be inherited # by subprocs, so we'd do nothing. logf = open(state.logname(self.sf.id), 'w') - new_inode = str(os.fstat(logf.fileno()).st_ino) + new_inode = os.fstat(logf.fileno()).st_ino os.environ['REDO_LOG'] = '1' # .do files can check this - os.environ['REDO_LOG_INODE'] = new_inode + os.environ['REDO_LOG_INODE'] = str(new_inode) os.dup2(logf.fileno(), 2) close_on_exec(2, False) logf.close() diff --git a/redo/cmd_always.py b/redo/cmd_always.py index 51fdd14..b690b03 100644 --- a/redo/cmd_always.py +++ b/redo/cmd_always.py @@ -6,7 +6,9 @@ from . import env, logs, state def main(): try: env.inherit() - logs.setup(tty=sys.stderr, pretty=env.v.PRETTY, color=env.v.COLOR) + logs.setup( + tty=sys.stderr, parent_logs=env.v.LOG, + pretty=env.v.PRETTY, color=env.v.COLOR) me = os.path.join(env.v.STARTDIR, os.path.join(env.v.PWD, env.v.TARGET)) diff --git a/redo/cmd_ifchange.py b/redo/cmd_ifchange.py index b417b7f..37609cf 100644 --- a/redo/cmd_ifchange.py +++ b/redo/cmd_ifchange.py @@ -26,7 +26,9 @@ def main(): status=True, details=True, pretty=True, color=True, debug_locks=False, debug_pids=False) else: - logs.setup(tty=sys.stderr, pretty=env.v.PRETTY, color=env.v.COLOR) + logs.setup( + tty=sys.stderr, parent_logs=env.v.LOG, + pretty=env.v.PRETTY, color=env.v.COLOR) if env.v.TARGET and not env.v.UNLOCKED: me = os.path.join(env.v.STARTDIR, os.path.join(env.v.PWD, env.v.TARGET)) diff --git a/redo/cmd_ifcreate.py b/redo/cmd_ifcreate.py index 86d1a28..88b844b 100644 --- a/redo/cmd_ifcreate.py +++ b/redo/cmd_ifcreate.py @@ -7,7 +7,9 @@ from .logs import err def main(): try: env.inherit() - logs.setup(tty=sys.stderr, pretty=env.v.PRETTY, color=env.v.COLOR) + logs.setup( + tty=sys.stderr, parent_logs=env.v.LOG, + pretty=env.v.PRETTY, color=env.v.COLOR) me = os.path.join(env.v.STARTDIR, os.path.join(env.v.PWD, env.v.TARGET)) diff --git a/redo/cmd_log.py b/redo/cmd_log.py index 402a5f4..8cd9085 100644 --- a/redo/cmd_log.py +++ b/redo/cmd_log.py @@ -230,7 +230,9 @@ def main(): state.init(targets) if opt.status < 2 and not os.isatty(2): opt.status = False - logs.setup(tty=sys.stdout, pretty=opt.pretty, color=opt.color) + logs.setup( + tty=sys.stdout, parent_logs=False, + pretty=opt.pretty, color=opt.color) if opt.debug_locks: env.v.DEBUG_LOCKS = 1 if opt.debug_pids: diff --git a/redo/cmd_ood.py b/redo/cmd_ood.py index fec0064..ece5441 100644 --- a/redo/cmd_ood.py +++ b/redo/cmd_ood.py @@ -23,7 +23,9 @@ def main(): sys.exit(1) state.init([]) - logs.setup(tty=sys.stderr, pretty=env.v.PRETTY, color=env.v.COLOR) + logs.setup( + tty=sys.stderr, parent_logs=env.v.LOG, + pretty=env.v.PRETTY, color=env.v.COLOR) cwd = os.getcwd() for f in state.files(): if f.is_target(): diff --git a/redo/cmd_redo.py b/redo/cmd_redo.py index 2890c49..5c0a73d 100644 --- a/redo/cmd_redo.py +++ b/redo/cmd_redo.py @@ -64,16 +64,12 @@ def main(): os.environ['REDO_DEBUG_LOCKS'] = '1' if opt.debug_pids: os.environ['REDO_DEBUG_PIDS'] = '1' - - # This is slightly tricky: the log and pretty options default to true. We - # want to inherit that 'true' value from parent processes *unless* someone - # explicitly specifies the reverse. - if opt.no_log: - os.environ['REDO_LOG'] = '0' - if opt.no_pretty: - os.environ['REDO_PRETTY'] = '0' - if opt.no_color: - os.environ['REDO_COLOR'] = '0' + # These might get overridden in subprocesses in builder.py + def _set_defint(name, val): + os.environ[name] = os.environ.get(name, str(int(val))) + _set_defint('REDO_LOG', opt.log) + _set_defint('REDO_PRETTY', opt.pretty) + _set_defint('REDO_COLOR', opt.color) try: state.init(targets) @@ -85,10 +81,17 @@ def main(): if env.is_toplevel and env.v.LOG: builder.start_stdin_log_reader( status=opt.status, details=opt.details, - pretty=opt.pretty, color=opt.color, + pretty=env.v.PRETTY, color=env.v.COLOR, debug_locks=opt.debug_locks, debug_pids=opt.debug_pids) else: - logs.setup(tty=sys.stderr, pretty=env.v.PRETTY, color=env.v.COLOR) + logs.setup( + tty=sys.stderr, parent_logs=env.v.LOG, + pretty=env.v.PRETTY, color=env.v.COLOR) + if (env.is_toplevel or j > 1) and env.v.LOCKS_BROKEN: + warn('detected broken fcntl locks; parallelism disabled.\n') + warn(' ...details: https://github.com/Microsoft/WSL/issues/1927\n') + if j > 1: + j = 1 for t in targets: if os.path.exists(t): diff --git a/redo/cmd_sources.py b/redo/cmd_sources.py index b9fba09..2229452 100644 --- a/redo/cmd_sources.py +++ b/redo/cmd_sources.py @@ -9,7 +9,9 @@ def main(): sys.exit(1) state.init([]) - logs.setup(tty=sys.stderr, pretty=env.v.PRETTY, color=env.v.COLOR) + logs.setup( + tty=sys.stderr, parent_logs=env.v.LOG, + pretty=env.v.PRETTY, color=env.v.COLOR) cwd = os.getcwd() for f in state.files(): diff --git a/redo/cmd_stamp.py b/redo/cmd_stamp.py index 48f12bf..8929803 100644 --- a/redo/cmd_stamp.py +++ b/redo/cmd_stamp.py @@ -15,7 +15,9 @@ def main(): sys.exit(1) env.inherit() - logs.setup(tty=sys.stderr, pretty=env.v.PRETTY, color=env.v.COLOR) + logs.setup( + tty=sys.stderr, parent_logs=env.v.LOG, + pretty=env.v.PRETTY, color=env.v.COLOR) # hashlib is only available in python 2.5 or higher, but the 'sha' # module produces a DeprecationWarning in python 2.6 or higher. We want diff --git a/redo/cmd_targets.py b/redo/cmd_targets.py index 887e202..1a73dc5 100644 --- a/redo/cmd_targets.py +++ b/redo/cmd_targets.py @@ -9,7 +9,9 @@ def main(): sys.exit(1) state.init([]) - logs.setup(tty=sys.stderr, pretty=env.v.PRETTY, color=env.v.COLOR) + logs.setup( + tty=sys.stderr, parent_logs=env.v.LOG, + pretty=env.v.PRETTY, color=env.v.COLOR) cwd = os.getcwd() for f in state.files(): diff --git a/redo/cmd_unlocked.py b/redo/cmd_unlocked.py index 81da28e..c5b7565 100644 --- a/redo/cmd_unlocked.py +++ b/redo/cmd_unlocked.py @@ -9,7 +9,9 @@ def main(): sys.exit(1) env.inherit() - logs.setup(tty=sys.stderr, pretty=env.v.PRETTY, color=env.v.COLOR) + logs.setup( + tty=sys.stderr, parent_logs=env.v.LOG, + pretty=env.v.PRETTY, color=env.v.COLOR) target = sys.argv[1] deps = sys.argv[2:] diff --git a/redo/cmd_whichdo.py b/redo/cmd_whichdo.py index ad580c2..170ebe6 100644 --- a/redo/cmd_whichdo.py +++ b/redo/cmd_whichdo.py @@ -10,7 +10,9 @@ def main(): sys.exit(1) env.init_no_state() - logs.setup(tty=sys.stderr, pretty=env.v.PRETTY, color=env.v.COLOR) + logs.setup( + tty=sys.stderr, parent_logs=env.v.LOG, + pretty=env.v.PRETTY, color=env.v.COLOR) want = sys.argv[1] if not want: diff --git a/redo/env.py b/redo/env.py index 2d8a338..104f532 100644 --- a/redo/env.py +++ b/redo/env.py @@ -31,14 +31,14 @@ class Env(object): self.DEBUG = atoi(_get('REDO_DEBUG', '')) self.DEBUG_LOCKS = _get_bool('REDO_DEBUG_LOCKS', '') self.DEBUG_PIDS = _get_bool('REDO_DEBUG_PIDS', '') + self.LOCKS_BROKEN = _get_bool('REDO_LOCKS_BROKEN', '') self.VERBOSE = _get_bool('REDO_VERBOSE', '') self.XTRACE = _get_bool('REDO_XTRACE', '') self.KEEP_GOING = _get_bool('REDO_KEEP_GOING', '') - self.LOG = _get_int('REDO_LOG', 1) # defaults on + self.LOG = _get_int('REDO_LOG', '') self.LOG_INODE = _get('REDO_LOG_INODE', '') - self.COLOR = _get_int('REDO_COLOR', 1) # defaults on - # subprocesses mustn't pretty-print if a parent is running redo-log - self.PRETTY = (not self.LOG) and _get_int('REDO_PRETTY', 1) + self.COLOR = _get_int('REDO_COLOR', '') + self.PRETTY = _get_int('REDO_PRETTY', '') self.SHUFFLE = _get_bool('REDO_SHUFFLE', '') self.STARTDIR = _get('REDO_STARTDIR', '') self.RUNID = _get_int('REDO_RUNID', '') or None @@ -115,3 +115,12 @@ def init(targets): os.environ['REDO_STARTDIR'] = os.getcwd() inherit() + + +def mark_locks_broken(): + """If file locking is broken, update the environment accordingly.""" + os.environ['REDO_LOCKS_BROKEN'] = '1' + # FIXME: redo-log doesn't work when fcntl locks are broken. + # We can probably work around that someday. + os.environ['REDO_LOG'] = '0' + inherit() diff --git a/redo/logs.py b/redo/logs.py index 3d3a4df..9d01359 100644 --- a/redo/logs.py +++ b/redo/logs.py @@ -110,9 +110,9 @@ class PrettyLog(object): _log = None -def setup(tty, pretty, color): +def setup(tty, parent_logs, pretty, color): global _log - if pretty: + if pretty and not parent_logs: _check_tty(tty, color=color) _log = PrettyLog(tty=tty) else: diff --git a/redo/state.py b/redo/state.py index 00dd254..409a64f 100644 --- a/redo/state.py +++ b/redo/state.py @@ -17,7 +17,12 @@ LOG_LOCK_MAGIC = 0x10000000 # fid offset for "log locks" def _connect(dbfile): _db = sqlite3.connect(dbfile, timeout=TIMEOUT) _db.execute("pragma synchronous = off") - _db.execute("pragma journal_mode = WAL") + # Some old/broken versions of pysqlite on MacOS work badly with journal + # mode PERSIST. But WAL fails on Windows WSL due to WSL's totally broken + # locking. On WSL, at least PERSIST works in single-threaded mode, so + # if we're careful we can use it, more or less. + jmode = 'PERSIST' if env.v.LOCKS_BROKEN else 'WAL' + _db.execute("pragma journal_mode = %s" % (jmode,)) _db.text_factory = str return _db @@ -50,6 +55,8 @@ def db(): _lockfile = os.open(os.path.join(env.v.BASE, '.redo/locks'), os.O_RDWR | os.O_CREAT, 0666) close_on_exec(_lockfile, True) + if env.is_toplevel and detect_broken_locks(): + env.mark_locks_broken() must_create = not os.path.exists(dbfile) if not must_create: @@ -110,6 +117,8 @@ def db(): def init(targets): env.init(targets) db() + if env.is_toplevel and detect_broken_locks(): + env.mark_locks_broken() _wrote = 0 @@ -530,3 +539,46 @@ class Lock(object): % self.fid) fcntl.lockf(_lockfile, fcntl.LOCK_UN, 1, self.fid) self.owned = False + + +def detect_broken_locks(): + """Detect Windows WSL's completely broken fcntl() locks. + + Symptom: locking a file always returns success, even if other processes + also think they have it locked. See + https://github.com/Microsoft/WSL/issues/1927 for more details. + + Bug exists at least in WSL "4.4.0-17134-Microsoft #471-Microsoft". + + Returns true if broken, false otherwise. + """ + pl = Lock(0) + # We wait for the lock here, just in case others are doing + # this test at the same time. + pl.waitlock(shared=False) + pid = os.fork() + if pid: + # parent + _, rv = os.waitpid(pid, 0) + ok = os.WIFEXITED(rv) and not os.WEXITSTATUS(rv) + return not ok + else: + # child + try: + # Doesn't actually unlock, since child process doesn't own it + pl.unlock() + del pl + cl = Lock(0) + # parent is holding lock, which should prevent us from getting it. + owned = cl.trylock() + if owned: + # Got the lock? Yikes, the locking system is broken! + os._exit(1) + else: + # Failed to get the lock? Good, the parent owns it. + os._exit(0) + except Exception: # pylint: disable=broad-except + import traceback + traceback.print_exc() + finally: + os._exit(99) diff --git a/t/010-jobserver/all.do b/t/010-jobserver/all.do index 85cc9b1..15d171a 100644 --- a/t/010-jobserver/all.do +++ b/t/010-jobserver/all.do @@ -6,6 +6,11 @@ redo -j1 serialtest # Capture log output to parallel.log to hide the (intentional since we're # testing it) scary warning from redo about overriding the jobserver. +if [ -n "$REDO_LOCKS_BROKEN" ]; then + echo "Locks are broken on this OS; skipping parallel tests." >&2 + exit 0 +fi + echo 'parallel test...' >&2 if ! redo -j10 paralleltest 2>parallel.log; then cat parallel.log >&2 diff --git a/t/010-jobserver/paralleltest.do b/t/010-jobserver/paralleltest.do index 0fb3203..db7513d 100644 --- a/t/010-jobserver/paralleltest.do +++ b/t/010-jobserver/paralleltest.do @@ -3,5 +3,5 @@ # sharing with other tests, we can't be sure that parallel2 will run while # parallel is running, and the race condition will make this test at least # be flakey instead of pass, which means there's a bug.) -rm -f *.sub *.spin *.x *.log parallel *.start *.end +rm -f *.sub *.spin *.x parallel *.start *.end redo parallel parallel2 diff --git a/t/010-jobserver/serialtest.do b/t/010-jobserver/serialtest.do index 4c2bb7e..b0cb5cf 100644 --- a/t/010-jobserver/serialtest.do +++ b/t/010-jobserver/serialtest.do @@ -1,4 +1,3 @@ # Test that -j1 really serializes all sub-redo processes. rm -f *.sub *.spin *.x first second *.start *.end redo first second -