From 84169c5d27cef1cdf17e59f1fee9e3e56747cb04 Mon Sep 17 00:00:00 2001 From: Avery Pennarun Date: Fri, 10 Dec 2010 02:58:13 -0800 Subject: [PATCH] Change locking stuff from fifos to fcntl.lockf(). This should reduce filesystem grinding a bit, and makes the code simpler. It's also theoretically a bit more portable, since I'm guessing fifo semantics aren't the same on win32 if we ever get there. Also, a major problem with the old fifo-based system is that if a redo process died without cleaning up after itself, it wouldn't delete its lockfiles, so we had to wipe them all at the beginning of each build. Now we don't; in theory, you can now have multiple copies of redo poking at the same tree at the same time and not stepping on each other. --- builder.py | 49 ++++++++++++++++-------------- state.py | 88 ++++++++++++++++++++---------------------------------- 2 files changed, 58 insertions(+), 79 deletions(-) diff --git a/builder.py b/builder.py index 8896ea3..16695cf 100644 --- a/builder.py +++ b/builder.py @@ -43,8 +43,9 @@ def _try_stat(filename): class BuildJob: - def __init__(self, t, lock, shouldbuildfunc, donefunc): - self.t = t + def __init__(self, t, sf, lock, shouldbuildfunc, donefunc): + self.t = t # original target name, not relative to vars.BASE + self.sf = sf self.tmpname = '%s.redo.tmp' % t self.lock = lock self.shouldbuildfunc = shouldbuildfunc @@ -54,13 +55,13 @@ class BuildJob: def start(self): assert(self.lock.owned) t = self.t - f = state.File(name=t) + sf = self.sf tmpname = self.tmpname if not self.shouldbuildfunc(t): # target doesn't need to be built; skip the whole task return self._after2(0) if (os.path.exists(t) and not os.path.exists(t + '/.') - and not f.is_generated): + and not sf.is_generated): # an existing source file that was not generated by us. # This step is mentioned by djb in his notes. # For example, a rule called default.c.do could be used to try @@ -70,16 +71,16 @@ class BuildJob: # of redo? 
That would make it easy for someone to override a # file temporarily, and could be undone by deleting the file. debug2("-- static (%r)\n" % t) - f.set_static() - f.save() + sf.set_static() + sf.save() return self._after2(0) - f.zap_deps() - (dofile, basename, ext) = _find_do_file(f) + sf.zap_deps() + (dofile, basename, ext) = _find_do_file(sf) if not dofile: if os.path.exists(t): - f.is_generated = False - f.set_static() - f.save() + sf.is_generated = False + sf.set_static() + sf.save() return self._after2(0) else: err('no rule to make %r\n' % t) @@ -100,8 +101,8 @@ class BuildJob: if vars.VERBOSE or vars.XTRACE: log_('\n') log('%s\n' % _nice(t)) self.argv = argv - f.is_generated = True - f.save() + sf.is_generated = True + sf.save() dof = state.File(name=dofile) dof.set_static() dof.save() @@ -165,14 +166,14 @@ class BuildJob: os.rename(tmpname, t) else: unlink(tmpname) - sf = state.File(name=t) + sf = self.sf sf.is_generated=True sf.update_stamp() sf.set_changed() sf.save() else: unlink(tmpname) - sf = state.File(name=t) + sf = self.sf sf.stamp = None sf.set_changed() sf.save() @@ -219,18 +220,19 @@ def main(targets, shouldbuildfunc): err('.redo directory disappeared; cannot continue.\n') retcode[0] = 205 break - lock = state.Lock(t) + f = state.File(name=t) + lock = state.Lock(f.id) lock.trylock() if not lock.owned: if vars.DEBUG_LOCKS: log('%s (locked...)\n' % _nice(t)) - locked.append(t) + locked.append((f.id,t)) else: - BuildJob(t, lock, shouldbuildfunc, done).start() + BuildJob(t, f, lock, shouldbuildfunc, done).start() # Now we've built all the "easy" ones. Go back and just wait on the - # remaining ones one by one. This is technically non-optimal; we could - # use select.select() to wait on more than one at a time. But it should + # remaining ones one by one. This is non-optimal; we could go faster if + # we could wait on multiple locks at once. But it should # be rare enough that it doesn't matter, and the logic is easier this way. 
while locked or jwack.running(): state.commit() @@ -244,8 +246,8 @@ def main(targets, shouldbuildfunc): err('.redo directory disappeared; cannot continue.\n') retcode[0] = 205 break - t = locked.pop(0) - lock = state.Lock(t) + fid,t = locked.pop(0) + lock = state.Lock(fid) lock.waitlock() assert(lock.owned) if vars.DEBUG_LOCKS: @@ -255,6 +257,7 @@ def main(targets, shouldbuildfunc): retcode[0] = 2 lock.unlock() else: - BuildJob(t, lock, shouldbuildfunc, done).start() + BuildJob(t, state.File(id=fid), lock, + shouldbuildfunc, done).start() state.commit() return retcode[0] diff --git a/state.py b/state.py index e75a5b0..dc3a7c5 100644 --- a/state.py +++ b/state.py @@ -1,4 +1,4 @@ -import sys, os, errno, glob, stat, sqlite3 +import sys, os, errno, glob, stat, fcntl, sqlite3 import vars from helpers import unlink, err, debug2, debug3, close_on_exec import helpers @@ -14,10 +14,12 @@ def _connect(dbfile): _db = None +_lockfile = None def db(): - global _db + global _db, _lockfile if _db: return _db + dbdir = '%s/.redo' % vars.BASE dbfile = '%s/db.sqlite3' % dbdir try: @@ -27,6 +29,11 @@ def db(): pass # if it exists, that's okay else: raise + + _lockfile = os.open(os.path.join(vars.BASE, '.redo/locks'), + os.O_RDWR | os.O_CREAT, 0666) + close_on_exec(_lockfile, True) + must_create = not os.path.exists(dbfile) if not must_create: _db = _connect(dbfile) @@ -60,7 +67,9 @@ def db(): " mode not null, " " primary key (target,source))") _db.execute("insert into Schema (version) values (?)", [SCHEMA_VER]) - _db.execute("insert into Runid default values") # eat the '0' runid + # eat the '0' runid and File id + _db.execute("insert into Runid default values") + _db.execute("insert into Files (name) values (?)", ['']) if not vars.RUNID: _db.execute("insert into Runid default values") @@ -72,13 +81,7 @@ def db(): def init(): - # FIXME: just wiping out all the locks is kind of cheating. 
But we - # only do this from the toplevel redo process, so unless the user - # deliberately starts more than one redo on the same repository, it's - # sort of ok. db() - for f in glob.glob('%s/.redo/lock*' % vars.BASE): - os.unlink(f) _wrote = 0 @@ -128,16 +131,8 @@ def relpath(t, base): return '/'.join(tparts) -def xx_sname(typ, t): - # FIXME: t.replace(...) is non-reversible and non-unique here! - tnew = relpath(t, vars.BASE) - v = vars.BASE + ('/.redo/%s^%s' % (typ, tnew.replace('/', '^'))) - if vars.DEBUG >= 3: - debug3('sname: (%r) %r -> %r\n' % (os.getcwd(), t, tnew)) - return v - - class File(object): + # use this mostly to avoid accidentally assigning to typos __slots__ = ['id', 'name', 'is_generated', 'checked_runid', 'changed_runid', 'stamp', 'csum'] @@ -247,62 +242,43 @@ class File(object): else: # a "unique identifier" stamp for a regular file return str((st.st_ctime, st.st_mtime, st.st_size, st.st_ino)) - +# FIXME: I really want to use fcntl F_SETLK, F_SETLKW, etc here. But python +# doesn't do the lockdata structure in a portable way, so we have to use +# fcntl.lockf() instead. Usually this is just a wrapper for fcntl, so it's +# ok, but it doesn't have F_GETLK, so we can't report which pid owns the lock. +# That makes debugging a bit harder. When we someday port to C, we can do that. 
class Lock: - def __init__(self, t): + def __init__(self, fid): + assert(_lockfile >= 0) self.owned = False - self.rfd = self.wfd = None - self.lockname = xx_sname('lock', t) + self.fid = fid def __del__(self): if self.owned: self.unlock() def trylock(self): + assert(not self.owned) try: - os.mkfifo(self.lockname, 0600) - self.owned = True - self.rfd = os.open(self.lockname, os.O_RDONLY|os.O_NONBLOCK) - self.wfd = os.open(self.lockname, os.O_WRONLY) - close_on_exec(self.rfd, True) - close_on_exec(self.wfd, True) - except OSError, e: - if e.errno == errno.EEXIST: - pass + fcntl.lockf(_lockfile, fcntl.LOCK_EX|fcntl.LOCK_NB, 1, self.fid) + except IOError, e: + if e.errno in (errno.EAGAIN, errno.EACCES): + pass # someone else has it locked else: raise + else: + self.owned = True def waitlock(self): - while not self.owned: - self.wait() - self.trylock() - assert(self.owned) + assert(not self.owned) + fcntl.lockf(_lockfile, fcntl.LOCK_EX, 1, self.fid) + self.owned = True def unlock(self): if not self.owned: raise Exception("can't unlock %r - we don't own it" % self.lockname) - unlink(self.lockname) - # ping any connected readers - os.close(self.rfd) - os.close(self.wfd) - self.rfd = self.wfd = None + fcntl.lockf(_lockfile, fcntl.LOCK_UN, 1, self.fid) self.owned = False - - def wait(self): - if self.owned: - raise Exception("can't wait on %r - we own it" % self.lockname) - try: - # open() will finish only when a writer exists and does close() - fd = os.open(self.lockname, os.O_RDONLY) - try: - os.read(fd, 1) - finally: - os.close(fd) - except OSError, e: - if e.errno == errno.ENOENT: - pass # it's not even unlocked or was unlocked earlier - else: - raise