From 95680ed7eff8659dc18d5e5ec22fc40470a2ee89 Mon Sep 17 00:00:00 2001 From: Avery Pennarun Date: Tue, 14 Dec 2010 02:25:17 -0800 Subject: [PATCH] Switch to using a separate lockfile per target. The previous method, using fcntl byterange locks, was very efficient and avoided unnecessary filesystem metadata churn (i.e. creating/deleting inodes). Unfortunately, MacOS X (at least version 10.6.5) apparently has a race condition in its fcntl locking that makes it unusably unreliable (http://apenwarr.ca/log/?m=201012#13). My tests indicate that if you only ever lock a *single* byterange on a file, the race condition doesn't cause a problem. So let's just use one lockfile per target. Now "redo -j20 test" passes for me on both MacOS and Linux. This doesn't measurably affect the speed on Linux, at least, in my tests. The bad news: it's hard to safely *delete* those lockfiles when we're done with them, so they tend to accumulate in the .redo dir. --- state.py | 18 ++++++++---------- t/curse/default.n2.do | 5 +++++ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/state.py b/state.py index 458a2af..10f7260 100644 --- a/state.py +++ b/state.py @@ -19,9 +19,8 @@ def _connect(dbfile): _db = None -_lockfile = None def db(): - global _db, _lockfile + global _db if _db: return _db @@ -35,10 +34,6 @@ def db(): else: raise - _lockfile = os.open(os.path.join(vars.BASE, '.redo/locks'), - os.O_RDWR | os.O_CREAT, 0666) - close_on_exec(_lockfile, True) - must_create = not os.path.exists(dbfile) if not must_create: _db = _connect(dbfile) @@ -296,19 +291,22 @@ class Lock: def __init__(self, fid): self.owned = False self.fid = fid - assert(_lockfile >= 0) + self.lockfile = os.open(os.path.join(vars.BASE, '.redo/lock.%d' % fid), + os.O_RDWR | os.O_CREAT, 0666) + close_on_exec(self.lockfile, True) assert(_locks.get(fid,0) == 0) _locks[fid] = 1 def __del__(self): _locks[self.fid] = 0 + os.close(self.lockfile) if self.owned: self.unlock() def trylock(self): assert(not self.owned) 
try: - fcntl.lockf(_lockfile, fcntl.LOCK_EX|fcntl.LOCK_NB, 1, self.fid) + fcntl.lockf(self.lockfile, fcntl.LOCK_EX|fcntl.LOCK_NB, 0, 0) except IOError, e: if e.errno in (errno.EAGAIN, errno.EACCES): pass # someone else has it locked @@ -319,12 +317,12 @@ class Lock: def waitlock(self): assert(not self.owned) - fcntl.lockf(_lockfile, fcntl.LOCK_EX, 1, self.fid) + fcntl.lockf(self.lockfile, fcntl.LOCK_EX, 0, 0) self.owned = True def unlock(self): if not self.owned: raise Exception("can't unlock %r - we don't own it" % self.lockname) - fcntl.lockf(_lockfile, fcntl.LOCK_UN, 1, self.fid) + fcntl.lockf(self.lockfile, fcntl.LOCK_UN, 0, 0) self.owned = False diff --git a/t/curse/default.n2.do b/t/curse/default.n2.do index ad037fa..2ab2dc5 100644 --- a/t/curse/default.n2.do +++ b/t/curse/default.n2.do @@ -1,4 +1,9 @@ echo n2-$1 echo $1 >>$1.count echo $1 >>in.countall + +# we deliberately use 'redo' here instead of redo-ifchange, because this *heavily* +# stresses redo's locking when building in parallel. We end up with 100 +# different targets that all not only depend on this file, but absolutely must +# acquire the lock on this file, build it atomically, and release the lock. redo countall