From 95680ed7eff8659dc18d5e5ec22fc40470a2ee89 Mon Sep 17 00:00:00 2001 From: Avery Pennarun Date: Tue, 14 Dec 2010 02:25:17 -0800 Subject: [PATCH] Switch to using a separate lockfile per target. The previous method, using fcntl byterange locks, was very efficient and avoided unnecessary filesystem metadata churn (i.e. creating/deleting inodes). Unfortunately, MacOS X (at least version 10.6.5) apparently has a race condition in its fcntl locking that makes it unusably unreliable (http://apenwarr.ca/log/?m=201012#13). My tests indicate that if you only ever lock a *single* byterange on a file, the race condition doesn't cause a problem. So let's just use one lockfile per target. Now "redo -j20 test" passes for me on both MacOS and Linux. This doesn't measurably affect the speed on Linux, at least, in my tests. The bad news: it's hard to safely *delete* those lockfiles when we're done with them, so they tend to accumulate in the .redo dir. --- state.py | 18 ++++++++---------- t/curse/default.n2.do | 5 +++++ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/state.py b/state.py index 458a2af..10f7260 100644 --- a/state.py +++ b/state.py @@ -19,9 +19,8 @@ def _connect(dbfile): _db = None -_lockfile = None def db(): - global _db, _lockfile + global _db if _db: return _db @@ -35,10 +34,6 @@ def db(): else: raise - _lockfile = os.open(os.path.join(vars.BASE, '.redo/locks'), - os.O_RDWR | os.O_CREAT, 0666) - close_on_exec(_lockfile, True) - must_create = not os.path.exists(dbfile) if not must_create: _db = _connect(dbfile) @@ -296,19 +291,22 @@ class Lock: def __init__(self, fid): self.owned = False self.fid = fid - assert(_lockfile >= 0) + self.lockfile = os.open(os.path.join(vars.BASE, '.redo/lock.%d' % fid), + os.O_RDWR | os.O_CREAT, 0666) + close_on_exec(self.lockfile, True) assert(_locks.get(fid,0) == 0) _locks[fid] = 1 def __del__(self): _locks[self.fid] = 0 + os.close(self.lockfile) if self.owned: self.unlock() def trylock(self): assert(not self.owned) 
try: - fcntl.lockf(_lockfile, fcntl.LOCK_EX|fcntl.LOCK_NB, 1, self.fid) + fcntl.lockf(self.lockfile, fcntl.LOCK_EX|fcntl.LOCK_NB, 0, 0) except IOError, e: if e.errno in (errno.EAGAIN, errno.EACCES): pass # someone else has it locked @@ -319,12 +317,12 @@ class Lock: def waitlock(self): assert(not self.owned) - fcntl.lockf(_lockfile, fcntl.LOCK_EX, 1, self.fid) + fcntl.lockf(self.lockfile, fcntl.LOCK_EX, 0, 0) self.owned = True def unlock(self): if not self.owned: raise Exception("can't unlock %r - we don't own it" % self.lockname) - fcntl.lockf(_lockfile, fcntl.LOCK_UN, 1, self.fid) + fcntl.lockf(self.lockfile, fcntl.LOCK_UN, 0, 0) self.owned = False diff --git a/t/curse/default.n2.do b/t/curse/default.n2.do index ad037fa..2ab2dc5 100644 --- a/t/curse/default.n2.do +++ b/t/curse/default.n2.do @@ -1,4 +1,9 @@ echo n2-$1 echo $1 >>$1.count echo $1 >>in.countall + +# we deliberately use 'redo' here instead of redo-ifchange, because this *heavily* +# stresses redo's locking when building in parallel. We end up with 100 +# different targets that all not only depend on this file, but absolutely must +# acquire the lock on this file, build it atomically, and release the lock. redo countall