Fetch revisions in chunks of 1000.
[jelmer/subvertpy.git] / logwalker.py
index 39fc311783ca6f4ab43d95fb1e02271d98b196e8..80cb025bf6bb934284bd260cffdad92b0351df6d 100644 (file)
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+"""Cache of the Subversion history log."""
 
-from bzrlib.errors import NoSuchRevision, BzrError, NotBranchError
-from bzrlib.progress import ProgressBar, DummyProgress
-from bzrlib.trace import mutter
+from bzrlib import urlutils
+from bzrlib.errors import NoSuchRevision
+import bzrlib.ui as ui
+from copy import copy
 
-import os
-
-from svn.core import SubversionException
+from svn.core import SubversionException, Pool
 from transport import SvnRaTransport
 import svn.core
 
-try:
-    import sqlite3
-except ImportError:
-    from pysqlite2 import dbapi2 as sqlite3
+import base64
+
+from cache import sqlite3
 
-shelves = {}
+LOG_CHUNK_LIMIT = 1000
 
 def _escape_commit_message(message):
     """Replace xml-incompatible control characters."""
+    if message is None:
+        return None
     import re
     # FIXME: RBC 20060419 this should be done by the revision
     # serialiser not by commit. Then we can also add an unescaper
@@ -50,96 +51,91 @@ def _escape_commit_message(message):
     return message
 
 
-class NotSvnBranchPath(BzrError):
-    _fmt = """{%(branch_path)s} is not a valid Svn branch path"""
-
-    def __init__(self, branch_path):
-        BzrError.__init__(self)
-        self.branch_path = branch_path
-
-
 class LogWalker(object):
     """Easy way to access the history of a Subversion repository."""
-    def __init__(self, scheme, transport=None, cache_dir=None, last_revnum=None, pb=None):
+    def __init__(self, transport, cache_db=None, limit=None):
         """Create a new instance.
 
-        :param scheme:  Branching scheme to use.
         :param transport:   SvnRaTransport to use to access the repository.
-        :param cache_dir:   Optional cache directory to use. Doesn't cache if 
-                            not set.
-        :param last_revnum: Last known revnum in the repository. Will be 
-                            determined if not specified.
-        :param pb:          Progress bar to report progress to.
+        :param cache_db:    Optional sql database connection to use. Doesn't 
+                            cache if not set.
         """
         assert isinstance(transport, SvnRaTransport)
 
-        if last_revnum is None:
-            last_revnum = transport.get_latest_revnum()
-
-        self.last_revnum = last_revnum
+        self.url = transport.base
+        self._transport = None
 
-        self.transport = transport.clone()
-        self.scheme = scheme
-
-        if not cache_dir is None:
-            cache_file = os.path.join(cache_dir, 'log-v3')
-            if not shelves.has_key(cache_file):
-                shelves[cache_file] = sqlite3.connect(cache_file)
-            self.db = shelves[cache_file]
+        if limit is not None:
+            self._limit = limit
         else:
+            self._limit = LOG_CHUNK_LIMIT
+
+        if cache_db is None:
             self.db = sqlite3.connect(":memory:")
+        else:
+            self.db = cache_db
 
         self.db.executescript("""
-          create table if not exists revision(revno integer unique, author text, message blob, date text);
+          create table if not exists revision(revno integer unique, author text, message text, date text);
           create unique index if not exists revision_revno on revision (revno);
           create table if not exists changed_path(rev integer, action text, path text, copyfrom_path text, copyfrom_rev integer);
-          create index if not exists path_rev_path on changed_path(rev, path);
+          create index if not exists path_rev on changed_path(rev);
+          create unique index if not exists path_rev_path on changed_path(rev, path);
+          create unique index if not exists path_rev_path_action on changed_path(rev, path, action);
         """)
         self.db.commit()
         self.saved_revnum = self.db.execute("SELECT MAX(revno) FROM revision").fetchone()[0]
         if self.saved_revnum is None:
             self.saved_revnum = 0
 
-    def fetch_revisions(self, to_revnum, pb=None):
+    def _get_transport(self):
+        if self._transport is not None:
+            return self._transport
+        self._transport = SvnRaTransport(self.url)
+        return self._transport
+
+    def fetch_revisions(self, to_revnum=None):
         """Fetch information about all revisions in the remote repository
         until to_revnum.
 
         :param to_revnum: End of range to fetch information for
-        :param pb: Optional progress bar to use
         """
+        to_revnum = max(self._get_transport().get_latest_revnum(), to_revnum)
+
+        pb = ui.ui_factory.nested_progress_bar()
+
         def rcvr(orig_paths, rev, author, date, message, pool):
             pb.update('fetching svn revision info', rev, to_revnum)
-            paths = {}
             if orig_paths is None:
                 orig_paths = {}
             for p in orig_paths:
                 copyfrom_path = orig_paths[p].copyfrom_path
-                if copyfrom_path:
+                if copyfrom_path is not None:
                     copyfrom_path = copyfrom_path.strip("/")
 
                 self.db.execute(
-                     "insert into changed_path (rev, path, action, copyfrom_path, copyfrom_rev) values (?, ?, ?, ?, ?)", 
+                     "replace into changed_path (rev, path, action, copyfrom_path, copyfrom_rev) values (?, ?, ?, ?, ?)", 
                      (rev, p.strip("/"), orig_paths[p].action, copyfrom_path, orig_paths[p].copyfrom_rev))
 
+            if message is not None:
+                message = base64.b64encode(message)
+
             self.db.execute("replace into revision (revno, author, date, message) values (?,?,?,?)", (rev, author, date, message))
 
             self.saved_revnum = rev
-
-        to_revnum = max(self.last_revnum, to_revnum)
-
-        # Don't bother for only a few revisions
-        if abs(self.saved_revnum-to_revnum) < 10:
-            pb = DummyProgress()
-        else:
-            pb = ProgressBar()
+            if self.saved_revnum % 1000 == 0:
+                self.db.commit()
 
         try:
             try:
-                mutter('getting log %r:%r' % (self.saved_revnum, to_revnum))
-                self.transport.get_log(["/"], self.saved_revnum, to_revnum, 
-                               0, True, True, rcvr)
+                while self.saved_revnum < to_revnum:
+                    pool = Pool()
+                    self._get_transport().get_log("/", self.saved_revnum, 
+                                             to_revnum, self._limit, True, 
+                                             True, rcvr, pool)
+                    pool.destroy()
             finally:
-                pb.clear()
+                pb.finished()
         except SubversionException, (_, num):
             if num == svn.core.SVN_ERR_FS_NO_SUCH_REVISION:
                 raise NoSuchRevision(branch=self, 
@@ -147,126 +143,129 @@ class LogWalker(object):
             raise
         self.db.commit()
 
-    def follow_history(self, branch_path, revnum):
+    def follow_path(self, path, revnum):
         """Return iterator over all the revisions between revnum and 
-        0 that touch branch_path.
-        
-        :param branch_path:   Branch path to start reporting (in revnum)
+        0 named path or inside path.
+
+        :param path:   Branch path to start reporting (in revnum)
         :param revnum:        Start revision.
+
+        :return: An iterator that yields tuples with (path, paths, revnum)
+        where paths is a dictionary with all changes that happened in path 
+        in revnum.
         """
         assert revnum >= 0
 
-        if revnum == 0 and branch_path in (None, ""):
+        if revnum == 0 and path == "":
             return
 
-        if not branch_path is None and not self.scheme.is_branch(branch_path):
-            raise NotSvnBranchPath(branch_path)
-
-        if branch_path:
-            branch_path = branch_path.strip("/")
+        recurse = (path != "")
 
-        if revnum > self.saved_revnum:
-            self.fetch_revisions(revnum)
+        path = path.strip("/")
 
-        continue_revnum = None
-        for i in range(revnum+1):
-            i = revnum - i
+        while revnum >= 0:
+            assert revnum > 0 or path == ""
+            revpaths = self.get_revision_paths(revnum, path, recurse=recurse)
 
-            if i == 0:
-                continue
+            if revpaths != {}:
+                yield (path, copy(revpaths), revnum)
 
-            if not (continue_revnum is None or continue_revnum == i):
+            if path == "":
+                revnum -= 1
                 continue
 
-            continue_revnum = None
-
-            changed_paths = {}
-            revpaths = self._get_revision_paths(i)
-            for p in revpaths:
-                if (branch_path is None or 
-                    p == branch_path or
-                    branch_path == "" or
-                    p.startswith(branch_path+"/")):
-
-                    try:
-                        (bp, rp) = self.scheme.unprefix(p)
-                        if not changed_paths.has_key(bp):
-                            changed_paths[bp] = {}
-                        changed_paths[bp][p] = revpaths[p]
-                    except NotBranchError:
-                        pass
-
-            assert branch_path is None or len(changed_paths) <= 1
-
-            for bp in changed_paths:
-                yield (bp, changed_paths[bp], i)
-
-            if (not branch_path is None and 
-                branch_path in revpaths and 
-                not revpaths[branch_path][1] is None):
-                # In this revision, this branch was copied from 
-                # somewhere else
-                # FIXME: What if copyfrom_path is not a branch path?
-                continue_revnum = revpaths[branch_path][2]
-                branch_path = revpaths[branch_path][1]
-
-    def find_branches(self, revnum):
-        """Find all branches that were changed in the specified revision number.
-
-        :param revnum: Revision to search for branches.
+            if revpaths.has_key(path):
+                if revpaths[path][1] is None:
+                    if revpaths[path][0] in ('A', 'R'):
+                        # this path didn't exist before this revision
+                        return
+                else:
+                    # In this revision, this path was copied from 
+                    # somewhere else
+                    revnum = revpaths[path][2]
+                    path = revpaths[path][1]
+                    assert path == "" or revnum > 0
+                    continue
+            revnum -= 1
+            for p in sorted(revpaths.keys()):
+                if path.startswith(p+"/") and revpaths[p][0] in ('A', 'R'):
+                    assert revpaths[p][1]
+                    path = path.replace(p, revpaths[p][1])
+                    revnum = revpaths[p][2]
+                    break
+
+    def get_revision_paths(self, revnum, path=None, recurse=False):
+        """Obtain dictionary with all the changes in a particular revision.
+
+        :param revnum: Subversion revision number
+        :param path: optional path under which to return all entries
+        :param recurse: Report changes to parents as well
+        :returns: dictionary with paths as keys and 
+                  (action, copyfrom_path, copyfrom_rev) as values.
         """
-        created_branches = {}
 
+        if revnum == 0:
+            assert path is None or path == ""
+            return {'': ('A', None, -1)}
+                
         if revnum > self.saved_revnum:
             self.fetch_revisions(revnum)
 
-        for i in range(revnum+1):
-            if i == 0:
-                paths = {'': ('A', None, None)}
-            else:
-                paths = self._get_revision_paths(i)
-            for p in paths:
-                if self.scheme.is_branch(p):
-                    if paths[p][0] in ('R', 'D'):
-                        del created_branches[p]
-                        yield (p, i, False)
-
-                    if paths[p][0] in ('A', 'R'): 
-                        created_branches[p] = i
-
-        for p in created_branches:
-            yield (p, i, True)
+        query = "select path, action, copyfrom_path, copyfrom_rev from changed_path where rev="+str(revnum)
+        if path is not None and path != "":
+            query += " and (path='%s' or path like '%s/%%'" % (path, path)
+            if recurse:
+                query += " or ('%s' LIKE path || '/%%')" % path
+            query += ")"
 
-    def _get_revision_paths(self, revnum):
         paths = {}
-        for p, act, cf, cr in self.db.execute("select path, action, copyfrom_path, copyfrom_rev from changed_path where rev="+str(revnum)):
-            paths[p] = (act, cf, cr)
+        for p, act, cf, cr in self.db.execute(query):
+            paths[p.encode("utf-8")] = (act, cf, cr)
         return paths
 
-    def get_revision_info(self, revnum, pb=None):
+    def get_revision_info(self, revnum):
         """Obtain basic information for a specific revision.
 
         :param revnum: Revision number.
         :returns: Tuple with author, log message and date of the revision.
         """
+        assert revnum >= 0
+        if revnum == 0:
+            return (None, None, None)
         if revnum > self.saved_revnum:
-            self.fetch_revisions(revnum, pb)
+            self.fetch_revisions(revnum)
         (author, message, date) = self.db.execute("select author, message, date from revision where revno="+ str(revnum)).fetchone()
-        paths = self._get_revision_paths(revnum)
-        if author is None:
-            author = None
-        return (author, _escape_commit_message(message), date, paths)
+        if message is not None:
+            message = _escape_commit_message(base64.b64decode(message))
+        return (author, message, date)
 
-    
-    def find_latest_change(self, path, revnum):
+    def find_latest_change(self, path, revnum, include_parents=False,
+                           include_children=False):
         """Find latest revision that touched path.
 
         :param path: Path to check for changes
         :param revnum: First revision to check
         """
-        while revnum > 0 and not self.touches_path(path, revnum):
-            revnum = revnum - 1
-        return revnum
+        assert isinstance(path, basestring)
+        assert isinstance(revnum, int) and revnum >= 0
+        if revnum > self.saved_revnum:
+            self.fetch_revisions(revnum)
+
+        extra = ""
+        if include_children:
+            extra += " or path like '%s/%%'" % path.strip("/")
+        if include_parents:
+            extra += " or ('%s' like (path || '/%%') and (action = 'R' or action = 'A'))" % path.strip("/")
+        query = "select rev from changed_path where (path='%s'%s) and rev <= %d order by rev desc limit 1" % (path.strip("/"), extra, revnum)
+
+        row = self.db.execute(query).fetchone()
+        if row is None and path == "":
+            return 0
+
+        if row is None:
+            return None
+
+        return row[0]
 
     def touches_path(self, path, revnum):
         """Check whether path was changed in specified revision.
@@ -276,23 +275,89 @@ class LogWalker(object):
         """
         if revnum > self.saved_revnum:
             self.fetch_revisions(revnum)
-        return (path in self._get_revision_paths(revnum))
+        if revnum == 0:
+            return (path == "")
+        return (self.db.execute("select 1 from changed_path where path='%s' and rev=%d" % (path, revnum)).fetchone() is not None)
 
     def find_children(self, path, revnum):
         """Find all children of path in revnum."""
-        # TODO: Find children by walking history, or use 
-        # cache?
-        mutter("svn ls -r %d '%r'" % (revnum, path))
-
+        path = path.strip("/")
+        transport = self._get_transport()
+        ft = transport.check_path(path, revnum)
+        if ft == svn.core.svn_node_file:
+            return []
+        assert ft == svn.core.svn_node_dir
+
+        class TreeLister(svn.delta.Editor):
+            def __init__(self, base):
+                self.files = []
+                self.base = base
+
+            def set_target_revision(self, revnum):
+                """See Editor.set_target_revision()."""
+                pass
+
+            def open_root(self, revnum, baton):
+                """See Editor.open_root()."""
+                return path
+
+            def add_directory(self, path, parent_baton, copyfrom_path, copyfrom_revnum, pool):
+                """See Editor.add_directory()."""
+                self.files.append(urlutils.join(self.base, path))
+                return path
+
+            def change_dir_prop(self, id, name, value, pool):
+                pass
+
+            def change_file_prop(self, id, name, value, pool):
+                pass
+
+            def add_file(self, path, parent_id, copyfrom_path, copyfrom_revnum, baton):
+                self.files.append(urlutils.join(self.base, path))
+                return path
+
+            def close_dir(self, id):
+                pass
+
+            def close_file(self, path, checksum):
+                pass
+
+            def close_edit(self):
+                pass
+
+            def abort_edit(self):
+                pass
+
+            def apply_textdelta(self, file_id, base_checksum):
+                pass
+        pool = Pool()
+        editor = TreeLister(path)
+        edit, baton = svn.delta.make_editor(editor, pool)
+        old_base = transport.base
         try:
-            (dirents, _, _) = self.transport.get_dir(
-                "/" + path.encode('utf8'), revnum)
-        except SubversionException, (_, num):
-            if num == svn.core.SVN_ERR_FS_NOT_DIRECTORY:
-                return
-            raise
+            root_repos = transport.get_repos_root()
+            transport.reparent(urlutils.join(root_repos, path))
+            reporter = transport.do_update(revnum,  True, edit, baton, pool)
+            reporter.set_path("", revnum, True, None, pool)
+            reporter.finish_report(pool)
+        finally:
+            transport.reparent(old_base)
+        return editor.files
+
+    def get_previous(self, path, revnum):
+        """Return the (path, revnum) pair the specified pair was derived from.
 
-        for p in dirents:
-            yield os.path.join(path, p)
-            for c in self.find_children(os.path.join(path, p), revnum):
-                yield c
+        :param path:  Path to check
+        :param revnum:  Revision to check
+        """
+        assert revnum >= 0
+        if revnum > self.saved_revnum:
+            self.fetch_revisions(revnum)
+        if revnum == 0:
+            return (None, -1)
+        row = self.db.execute("select action, copyfrom_path, copyfrom_rev from changed_path where path='%s' and rev=%d" % (path, revnum)).fetchone()
+        if row[2] == -1:
+            if row[0] == 'A':
+                return (None, -1)
+            return (path, revnum-1)
+        return (row[1], row[2])