Switch to using a separate lockfile per target.

The previous method, using fcntl byterange locks, was very efficient and
avoided unnecessary filesystem metadata churn (i.e. creating/deleting
inodes).  Unfortunately, MacOS X (at least version 10.6.5) apparently has a
race condition in its fcntl locking that makes it unusably unreliable
(http://apenwarr.ca/log/?m=201012#13).

My tests indicate that if you only ever lock a *single* byterange on a file,
the race condition doesn't cause a problem.  So let's just use one lockfile
per target.  Now "redo -j20 test" passes for me on both MacOS and Linux.

This doesn't measurably affect the speed on Linux, at least, in my tests.

The bad news: it's hard to safely *delete* those lockfiles when we're done
with them, so they tend to accumulate in the .redo dir.
This commit is contained in:
Avery Pennarun 2010-12-14 02:25:17 -08:00
commit 95680ed7ef
2 changed files with 13 additions and 10 deletions

View file

@ -19,9 +19,8 @@ def _connect(dbfile):
_db = None _db = None
_lockfile = None
def db(): def db():
global _db, _lockfile global _db
if _db: if _db:
return _db return _db
@ -35,10 +34,6 @@ def db():
else: else:
raise raise
_lockfile = os.open(os.path.join(vars.BASE, '.redo/locks'),
os.O_RDWR | os.O_CREAT, 0666)
close_on_exec(_lockfile, True)
must_create = not os.path.exists(dbfile) must_create = not os.path.exists(dbfile)
if not must_create: if not must_create:
_db = _connect(dbfile) _db = _connect(dbfile)
@ -296,19 +291,22 @@ class Lock:
def __init__(self, fid): def __init__(self, fid):
self.owned = False self.owned = False
self.fid = fid self.fid = fid
assert(_lockfile >= 0) self.lockfile = os.open(os.path.join(vars.BASE, '.redo/lock.%d' % fid),
os.O_RDWR | os.O_CREAT, 0666)
close_on_exec(self.lockfile, True)
assert(_locks.get(fid,0) == 0) assert(_locks.get(fid,0) == 0)
_locks[fid] = 1 _locks[fid] = 1
def __del__(self): def __del__(self):
_locks[self.fid] = 0 _locks[self.fid] = 0
os.close(self.lockfile)
if self.owned: if self.owned:
self.unlock() self.unlock()
def trylock(self): def trylock(self):
assert(not self.owned) assert(not self.owned)
try: try:
fcntl.lockf(_lockfile, fcntl.LOCK_EX|fcntl.LOCK_NB, 1, self.fid) fcntl.lockf(self.lockfile, fcntl.LOCK_EX|fcntl.LOCK_NB, 0, 0)
except IOError, e: except IOError, e:
if e.errno in (errno.EAGAIN, errno.EACCES): if e.errno in (errno.EAGAIN, errno.EACCES):
pass # someone else has it locked pass # someone else has it locked
@ -319,12 +317,12 @@ class Lock:
def waitlock(self): def waitlock(self):
assert(not self.owned) assert(not self.owned)
fcntl.lockf(_lockfile, fcntl.LOCK_EX, 1, self.fid) fcntl.lockf(self.lockfile, fcntl.LOCK_EX, 0, 0)
self.owned = True self.owned = True
def unlock(self): def unlock(self):
if not self.owned: if not self.owned:
raise Exception("can't unlock %r - we don't own it" raise Exception("can't unlock %r - we don't own it"
% self.lockname) % self.lockname)
fcntl.lockf(_lockfile, fcntl.LOCK_UN, 1, self.fid) fcntl.lockf(self.lockfile, fcntl.LOCK_UN, 0, 0)
self.owned = False self.owned = False

View file

@ -1,4 +1,9 @@
echo n2-$1 echo n2-$1
echo $1 >>$1.count echo $1 >>$1.count
echo $1 >>in.countall echo $1 >>in.countall
# we deliberately use 'redo' here instead of redo-ifchange, because this *heavily*
# stresses redo's locking when building in parallel. We end up with 100
# different targets that all not only depend on this file, but absolutely must
# acquire the lock on this file, build it atomically, and release the lock.
redo countall redo countall