Switch to using a separate lockfile per target.

The previous method, using fcntl byterange locks, was very efficient and
avoided unnecessary filesystem metadata churn (i.e. creating/deleting
inodes).  Unfortunately, MacOS X (at least version 10.6.5) apparently has a
race condition in its fcntl locking that makes it unusably unreliable
(http://apenwarr.ca/log/?m=201012#13).

My tests indicate that if you only ever lock a *single* byterange on a file,
the race condition doesn't cause a problem.  So let's just use one lockfile
per target.  Now "redo -j20 test" passes for me on both MacOS and Linux.

This doesn't measurably affect the speed on Linux, at least, in my tests.

The bad news: it's hard to safely *delete* those lockfiles when we're done
with them, so they tend to accumulate in the .redo dir.
This commit is contained in:
Avery Pennarun 2010-12-14 02:25:17 -08:00
commit 95680ed7ef
2 changed files with 13 additions and 10 deletions

View file

@ -19,9 +19,8 @@ def _connect(dbfile):
_db = None _db = None
_lockfile = None
def db(): def db():
global _db, _lockfile global _db
if _db: if _db:
return _db return _db
@ -35,10 +34,6 @@ def db():
else: else:
raise raise
_lockfile = os.open(os.path.join(vars.BASE, '.redo/locks'),
os.O_RDWR | os.O_CREAT, 0666)
close_on_exec(_lockfile, True)
must_create = not os.path.exists(dbfile) must_create = not os.path.exists(dbfile)
if not must_create: if not must_create:
_db = _connect(dbfile) _db = _connect(dbfile)
@ -296,19 +291,22 @@ class Lock:
def __init__(self, fid): def __init__(self, fid):
self.owned = False self.owned = False
self.fid = fid self.fid = fid
assert(_lockfile >= 0) self.lockfile = os.open(os.path.join(vars.BASE, '.redo/lock.%d' % fid),
os.O_RDWR | os.O_CREAT, 0666)
close_on_exec(self.lockfile, True)
assert(_locks.get(fid,0) == 0) assert(_locks.get(fid,0) == 0)
_locks[fid] = 1 _locks[fid] = 1
def __del__(self): def __del__(self):
_locks[self.fid] = 0 _locks[self.fid] = 0
os.close(self.lockfile)
if self.owned: if self.owned:
self.unlock() self.unlock()
def trylock(self): def trylock(self):
assert(not self.owned) assert(not self.owned)
try: try:
fcntl.lockf(_lockfile, fcntl.LOCK_EX|fcntl.LOCK_NB, 1, self.fid) fcntl.lockf(self.lockfile, fcntl.LOCK_EX|fcntl.LOCK_NB, 0, 0)
except IOError, e: except IOError, e:
if e.errno in (errno.EAGAIN, errno.EACCES): if e.errno in (errno.EAGAIN, errno.EACCES):
pass # someone else has it locked pass # someone else has it locked
@ -319,12 +317,12 @@ class Lock:
def waitlock(self): def waitlock(self):
assert(not self.owned) assert(not self.owned)
fcntl.lockf(_lockfile, fcntl.LOCK_EX, 1, self.fid) fcntl.lockf(self.lockfile, fcntl.LOCK_EX, 0, 0)
self.owned = True self.owned = True
def unlock(self): def unlock(self):
if not self.owned: if not self.owned:
raise Exception("can't unlock %r - we don't own it" raise Exception("can't unlock %r - we don't own it"
% self.lockname) % self.lockname)
fcntl.lockf(_lockfile, fcntl.LOCK_UN, 1, self.fid) fcntl.lockf(self.lockfile, fcntl.LOCK_UN, 0, 0)
self.owned = False self.owned = False

View file

@ -1,4 +1,9 @@
echo n2-$1 echo n2-$1
echo $1 >>$1.count echo $1 >>$1.count
echo $1 >>in.countall echo $1 >>in.countall
# we deliberately use 'redo' here instead of redo-ifchange, because this *heavily*
# stresses redo's locking when building in parallel. We end up with 100
# different targets that all not only depend on this file, but absolutely must
# acquire the lock on this file, build it atomically, and release the lock.
redo countall redo countall