[object_store] Avoid leaking previous pack when adding known pack
[jelmer/dulwich.git] / dulwich / object_store.py
index fdc0a654cdb9a5f3933d2d0c5f0391cd49562c8d..9fb7b94bcf0194aff4efba457a0df98ec6e9839c 100644 (file)
@@ -1,31 +1,35 @@
 # object_store.py -- Object store for git objects
-# Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
+# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@samba.org>
+#                         and others
 #
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License
-# as published by the Free Software Foundation; either version 2
-# or (at your option) a later version of the License.
+# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
+# General Public License as published by the Free Software Foundation; version 2.0
+# or (at your option) any later version. You can redistribute it and/or
+# modify it under the terms of either of these two licenses.
 #
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# You should have received a copy of the licenses; if not, see
+# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
+# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
+# License, Version 2.0.
 #
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
-# MA  02110-1301, USA.
 
 
 """Git object store interfaces and implementation."""
 
-
+from io import BytesIO
 import errno
-import itertools
+from itertools import chain
 import os
 import stat
+import sys
 import tempfile
-import urllib2
+import time
 
 from dulwich.diff_tree import (
     tree_changes,
@@ -50,12 +54,15 @@ from dulwich.objects import (
 from dulwich.pack import (
     Pack,
     PackData,
-    ThinPackData,
+    PackInflater,
     iter_sha1,
-    load_pack_index,
-    write_pack,
-    write_pack_data,
+    write_pack_header,
     write_pack_index_v2,
+    write_pack_object,
+    write_pack_objects,
+    compute_file_sha,
+    PackIndexer,
+    PackStreamCopier,
     )
 
 INFODIR = 'info'
@@ -66,9 +73,9 @@ class BaseObjectStore(object):
     """Object store interface."""
 
     def determine_wants_all(self, refs):
-        return [sha for (ref, sha) in refs.iteritems()
-                if not sha in self and not ref.endswith("^{}") and
-                   not sha == ZERO_SHA]
+        return [sha for (ref, sha) in refs.items()
+                if sha not in self and not ref.endswith(b"^{}") and
+                not sha == ZERO_SHA]
 
     def iter_shas(self, shas):
         """Iterate over the objects for the specified shas.
@@ -109,7 +116,7 @@ class BaseObjectStore(object):
     def __getitem__(self, sha):
         """Obtain an object by SHA1."""
         type_num, uncomp = self.get_raw(sha)
-        return ShaFile.from_raw_string(type_num, uncomp)
+        return ShaFile.from_raw_string(type_num, uncomp, sha=sha)
 
     def __iter__(self):
         """Iterate over the SHAs that are present in this store."""
@@ -124,15 +131,15 @@ class BaseObjectStore(object):
     def add_objects(self, objects):
         """Add a set of objects to this object store.
 
-        :param objects: Iterable over a list of objects.
+        :param objects: Iterable over (object, path) tuples.
         """
         raise NotImplementedError(self.add_objects)
 
     def tree_changes(self, source, target, want_unchanged=False):
         """Find the differences between the contents of two trees
 
-        :param object_store: Object store to use for retrieving tree contents
-        :param tree: SHA1 of the root tree
+        :param source: SHA1 of the source tree
+        :param target: SHA1 of the target tree
         :param want_unchanged: Whether unchanged files should be reported
         :return: Iterator over tuples with
             (oldpath, newpath), (oldmode, newmode), (oldsha, newsha)
@@ -158,18 +165,22 @@ class BaseObjectStore(object):
                 yield entry
 
     def find_missing_objects(self, haves, wants, progress=None,
-                             get_tagged=None):
+                             get_tagged=None,
+                             get_parents=lambda commit: commit.parents):
         """Find the missing objects required for a set of revisions.
 
         :param haves: Iterable over SHAs already in common.
         :param wants: Iterable over SHAs of objects to fetch.
         :param progress: Simple progress function that will be called with
             updated progress strings.
-        :param get_tagged: Function that returns a dict of pointed-to sha -> tag
-            sha for including tags.
+        :param get_tagged: Function that returns a dict of pointed-to sha ->
+            tag sha for including tags.
+        :param get_parents: Optional function for getting the parents of a
+            commit.
         :return: Iterator over (sha, path) pairs.
         """
-        finder = MissingObjectFinder(self, haves, wants, progress, get_tagged)
+        finder = MissingObjectFinder(self, haves, wants, progress, get_tagged,
+                                     get_parents=get_parents)
         return iter(finder.next, None)
 
     def find_common_revisions(self, graphwalker):
@@ -179,22 +190,14 @@ class BaseObjectStore(object):
         :return: List of SHAs that are in common
         """
         haves = []
-        sha = graphwalker.next()
+        sha = next(graphwalker)
         while sha:
             if sha in self:
                 haves.append(sha)
                 graphwalker.ack(sha)
-            sha = graphwalker.next()
+            sha = next(graphwalker)
         return haves
 
-    def get_graph_walker(self, heads):
-        """Obtain a graph walker for this object store.
-
-        :param heads: Local heads to start search with
-        :return: GraphWalker object
-        """
-        return ObjectStoreGraphWalker(heads, lambda sha: self[sha].parents)
-
     def generate_pack_contents(self, have, want, progress=None):
         """Iterate over the contents of a pack file.
 
@@ -209,8 +212,8 @@ class BaseObjectStore(object):
 
         :param sha: The object SHA to peel.
         :return: The fully-peeled SHA1 of a tag object, after peeling all
-            intermediate tags; if the original ref does not point to a tag, this
-            will equal the original SHA1.
+            intermediate tags; if the original ref does not point to a tag,
+            this will equal the original SHA1.
         """
         obj = self[sha]
         obj_class = object_class(obj.type_name)
@@ -219,39 +222,106 @@ class BaseObjectStore(object):
             obj = self[sha]
         return obj
 
+    def _collect_ancestors(self, heads, common=set(),
+                           get_parents=lambda commit: commit.parents):
+        """Collect all ancestors of heads up to (excluding) those in common.
+
+        :param heads: commits to start from
+        :param common: commits to end at, or empty set to walk repository
+            completely
+        :param get_parents: Optional function for getting the parents of a
+            commit.
+        :return: a tuple (A, B) where A is the set of commits reachable
+            from heads but not present in common, and B is the set of
+            common (shared) elements that are directly reachable from heads
+        """
+        bases = set()
+        commits = set()
+        queue = []
+        queue.extend(heads)
+        while queue:
+            e = queue.pop(0)
+            if e in common:
+                bases.add(e)
+            elif e not in commits:
+                commits.add(e)
+                cmt = self[e]
+                queue.extend(get_parents(cmt))
+        return (commits, bases)
+
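A runnable sketch of the `(commits, bases)` split, using a hypothetical `make_commit` helper to build a linear history `c1 <- c2 <- c3` in a `MemoryObjectStore`:

```python
from dulwich.object_store import MemoryObjectStore
from dulwich.objects import Commit, Tree

store = MemoryObjectStore()
tree = Tree()
store.add_object(tree)

def make_commit(parents):
    # Hypothetical helper: builds a minimal commit pointing at the empty tree.
    c = Commit()
    c.tree = tree.id
    c.parents = parents
    c.author = c.committer = b"Example <example@example.com>"
    c.author_time = c.commit_time = 0
    c.author_timezone = c.commit_timezone = 0
    c.message = b"example"
    store.add_object(c)
    return c.id

c1 = make_commit([])
c2 = make_commit([c1])
c3 = make_commit([c2])

commits, bases = store._collect_ancestors([c3], common={c1})
assert commits == {c3, c2}  # reachable from c3 but not in common
assert bases == {c1}        # common commits hit during the walk
```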
+    def close(self):
+        """Close any files opened by this object store."""
+        # Default implementation is a NO-OP
+
 
 class PackBasedObjectStore(BaseObjectStore):
 
     def __init__(self):
-        self._pack_cache = None
+        self._pack_cache = {}
+
+    @property
+    def alternates(self):
+        return []
 
     def contains_packed(self, sha):
-        """Check if a particular object is present by SHA1 and is packed."""
+        """Check if a particular object is present by SHA1 and is packed.
+
+        This does not check alternates.
+        """
         for pack in self.packs:
             if sha in pack:
                 return True
         return False
 
-    def _load_packs(self):
-        raise NotImplementedError(self._load_packs)
+    def __contains__(self, sha):
+        """Check if a particular object is present by SHA1.
+
+        This method makes no distinction between loose and packed objects.
+        """
+        if self.contains_packed(sha) or self.contains_loose(sha):
+            return True
+        for alternate in self.alternates:
+            if sha in alternate:
+                return True
+        return False
 
     def _pack_cache_stale(self):
         """Check whether the pack cache is stale."""
         raise NotImplementedError(self._pack_cache_stale)
 
-    def _add_known_pack(self, pack):
+    def _add_known_pack(self, base_name, pack):
         """Add a newly appeared pack to the cache by path.
 
         """
-        if self._pack_cache is not None:
-            self._pack_cache.append(pack)
+        prev_pack = self._pack_cache.get(base_name)
+        if prev_pack is not pack:
+            self._pack_cache[base_name] = pack
+            if prev_pack:
+                prev_pack.close()
+
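This guard is the commit's headline fix: replacing the cache entry for a base name now closes the pack being evicted, so its file handles are no longer leaked. A minimal sketch of the semantics, using a hypothetical `DummyPack` stand-in for a real `Pack`:

```python
from dulwich.object_store import PackBasedObjectStore

class DummyPack(object):
    """Hypothetical stand-in; a real Pack holds file handles that close() releases."""
    def __init__(self):
        self.closed = False
    def close(self):
        self.closed = True

store = PackBasedObjectStore()
old, new = DummyPack(), DummyPack()
store._add_known_pack('pack-abc', old)
store._add_known_pack('pack-abc', new)  # evicts and closes the previous pack
assert old.closed and not new.closed
store._add_known_pack('pack-abc', new)  # re-adding the same object is a no-op
assert not new.closed
```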
+    def _flush_pack_cache(self):
+        pack_cache = self._pack_cache
+        self._pack_cache = {}
+        while pack_cache:
+            (name, pack) = pack_cache.popitem()
+            pack.close()
+
+    def close(self):
+        self._flush_pack_cache()
 
     @property
     def packs(self):
         """List with pack objects."""
         if self._pack_cache is None or self._pack_cache_stale():
-            self._pack_cache = self._load_packs()
-        return self._pack_cache
+            self._update_pack_cache()
+
+        return self._pack_cache.values()
+
+    def _iter_alternate_objects(self):
+        """Iterate over the SHAs of all the objects in alternate stores."""
+        for alternate in self.alternates:
+            for alternate_object in alternate:
+                yield alternate_object
 
     def _iter_loose_objects(self):
         """Iterate over the SHAs of all loose objects."""
@@ -263,26 +333,62 @@ class PackBasedObjectStore(BaseObjectStore):
     def _remove_loose_object(self, sha):
         raise NotImplementedError(self._remove_loose_object)
 
+    def _remove_pack(self, name):
+        raise NotImplementedError(self._remove_pack)
+
     def pack_loose_objects(self):
         """Pack loose objects.
-        
+
         :return: Number of objects packed
         """
         objects = set()
         for sha in self._iter_loose_objects():
             objects.add((self._get_loose_object(sha), None))
-        self.add_objects(objects)
+        self.add_objects(list(objects))
         for obj, path in objects:
             self._remove_loose_object(obj.id)
         return len(objects)
 
+    def repack(self):
+        """Repack the packs in this repository.
+
+        Note that this implementation is fairly naive and currently keeps all
+        objects in memory while it repacks.
+        """
+        loose_objects = set()
+        for sha in self._iter_loose_objects():
+            loose_objects.add(self._get_loose_object(sha))
+        objects = {(obj, None) for obj in loose_objects}
+        old_packs = {p.name(): p for p in self.packs}
+        for name, pack in old_packs.items():
+            objects.update((obj, None) for obj in pack.iterobjects())
+        self._flush_pack_cache()
+
+        # The name of the consolidated pack might match the name of a
+        # pre-existing pack. Take care not to remove the newly created
+        # consolidated pack.
+
+        consolidated = self.add_objects(objects)
+        old_packs.pop(consolidated.name(), None)
+
+        for obj in loose_objects:
+            self._remove_loose_object(obj.id)
+        for name, pack in old_packs.items():
+            self._remove_pack(pack)
+        self._update_pack_cache()
+        return len(objects)
+
     def __iter__(self):
         """Iterate over the SHAs that are present in this store."""
-        iterables = self.packs + [self._iter_loose_objects()]
-        return itertools.chain(*iterables)
+        iterables = (list(self.packs) + [self._iter_loose_objects()] +
+                     [self._iter_alternate_objects()])
+        return chain(*iterables)
 
     def contains_loose(self, sha):
-        """Check if a particular object is present by SHA1 and is loose."""
+        """Check if a particular object is present by SHA1 and is loose.
+
+        This does not check alternates.
+        """
         return self._get_loose_object(sha) is not None
 
     def get_raw(self, name):
@@ -309,20 +415,31 @@ class PackBasedObjectStore(BaseObjectStore):
         ret = self._get_loose_object(hexsha)
         if ret is not None:
             return ret.type_num, ret.as_raw_string()
+        for alternate in self.alternates:
+            try:
+                return alternate.get_raw(hexsha)
+            except KeyError:
+                pass
         raise KeyError(hexsha)
 
     def add_objects(self, objects):
         """Add a set of objects to this object store.
 
-        :param objects: Iterable over objects, should support __len__.
+        :param objects: Iterable over (object, path) tuples, should support
+            __len__.
         :return: Pack object of the objects written.
         """
         if len(objects) == 0:
             # Don't bother writing an empty pack file
             return
-        f, commit = self.add_pack()
-        write_pack_data(f, objects, len(objects))
-        return commit()
+        f, commit, abort = self.add_pack()
+        try:
+            write_pack_objects(f, objects)
+        except:
+            abort()
+            raise
+        else:
+            return commit()
 
 
 class DiskObjectStore(PackBasedObjectStore):
@@ -337,29 +454,96 @@ class DiskObjectStore(PackBasedObjectStore):
         self.path = path
         self.pack_dir = os.path.join(self.path, PACKDIR)
         self._pack_cache_time = 0
+        self._pack_cache = {}
+        self._alternates = None
+
+    def __repr__(self):
+        return "<%s(%r)>" % (self.__class__.__name__, self.path)
 
-    def _load_packs(self):
-        pack_files = []
+    @property
+    def alternates(self):
+        if self._alternates is not None:
+            return self._alternates
+        self._alternates = []
+        for path in self._read_alternate_paths():
+            self._alternates.append(DiskObjectStore(path))
+        return self._alternates
+
+    def _read_alternate_paths(self):
+        try:
+            f = GitFile(os.path.join(self.path, INFODIR, "alternates"), 'rb')
+        except (OSError, IOError) as e:
+            if e.errno == errno.ENOENT:
+                return
+            raise
+        with f:
+            for l in f.readlines():
+                l = l.rstrip(b"\n")
+                if l.startswith(b"#"):
+                    continue
+                if os.path.isabs(l):
+                    yield l.decode(sys.getfilesystemencoding())
+                else:
+                    yield os.path.join(
+                        self.path, l.decode(sys.getfilesystemencoding()))
+
+    def add_alternate_path(self, path):
+        """Add an alternate path to this object store.
+        """
+        try:
+            os.mkdir(os.path.join(self.path, INFODIR))
+        except OSError as e:
+            if e.errno != errno.EEXIST:
+                raise
+        alternates_path = os.path.join(self.path, INFODIR, "alternates")
+        with GitFile(alternates_path, 'wb') as f:
+            try:
+                orig_f = open(alternates_path, 'rb')
+            except (OSError, IOError) as e:
+                if e.errno != errno.ENOENT:
+                    raise
+            else:
+                with orig_f:
+                    f.write(orig_f.read())
+            f.write(path.encode(sys.getfilesystemencoding()) + b"\n")
+
+        if not os.path.isabs(path):
+            path = os.path.join(self.path, path)
+        self.alternates.append(DiskObjectStore(path))
+
+    def _update_pack_cache(self):
         try:
-            self._pack_cache_time = os.stat(self.pack_dir).st_mtime
             pack_dir_contents = os.listdir(self.pack_dir)
-            for name in pack_dir_contents:
-                # TODO: verify that idx exists first
-                if name.startswith("pack-") and name.endswith(".pack"):
-                    filename = os.path.join(self.pack_dir, name)
-                    pack_files.append((os.stat(filename).st_mtime, filename))
-        except OSError, e:
+        except OSError as e:
             if e.errno == errno.ENOENT:
-                return []
+                self._pack_cache_time = 0
+                self.close()
+                return
             raise
-        pack_files.sort(reverse=True)
-        suffix_len = len(".pack")
-        return [Pack(f[:-suffix_len]) for _, f in pack_files]
+        self._pack_cache_time = max(
+                os.stat(self.pack_dir).st_mtime, time.time())
+        pack_files = set()
+        for name in pack_dir_contents:
+            if name.startswith("pack-") and name.endswith(".pack"):
+                # verify that idx exists first (otherwise the pack was not yet
+                # fully written)
+                idx_name = os.path.splitext(name)[0] + ".idx"
+                if idx_name in pack_dir_contents:
+                    pack_name = name[:-len(".pack")]
+                    pack_files.add(pack_name)
+
+        # Open newly appeared pack files
+        for f in pack_files:
+            if f not in self._pack_cache:
+                self._pack_cache[f] = Pack(os.path.join(self.pack_dir, f))
+        # Remove disappeared pack files
+        for f in set(self._pack_cache) - pack_files:
+            self._pack_cache.pop(f).close()
 
     def _pack_cache_stale(self):
         try:
-            return os.stat(self.pack_dir).st_mtime > self._pack_cache_time
-        except OSError, e:
+            return os.stat(self.pack_dir).st_mtime >= self._pack_cache_time
+        except OSError as e:
             if e.errno == errno.ENOENT:
                 return True
             raise
@@ -373,13 +557,13 @@ class DiskObjectStore(PackBasedObjectStore):
             if len(base) != 2:
                 continue
             for rest in os.listdir(os.path.join(self.path, base)):
-                yield base+rest
+                yield (base+rest).encode(sys.getfilesystemencoding())
 
     def _get_loose_object(self, sha):
         path = self._get_shafile_path(sha)
         try:
             return ShaFile.from_path(path)
-        except (OSError, IOError), e:
+        except (OSError, IOError) as e:
             if e.errno == errno.ENOENT:
                 return None
             raise
@@ -387,38 +571,101 @@ class DiskObjectStore(PackBasedObjectStore):
     def _remove_loose_object(self, sha):
         os.remove(self._get_shafile_path(sha))
 
-    def move_in_thin_pack(self, path):
+    def _remove_pack(self, pack):
+        os.remove(pack.data.path)
+        os.remove(pack.index.path)
+
+    def _get_pack_basepath(self, entries):
+        suffix = iter_sha1(entry[0] for entry in entries)
+        # TODO: Handle self.pack_dir being bytes
+        suffix = suffix.decode('ascii')
+        return os.path.join(self.pack_dir, "pack-" + suffix)
+
+    def _complete_thin_pack(self, f, path, copier, indexer):
         """Move a specific file containing a pack into the pack directory.
 
         :note: The file should be on the same file system as the
             packs directory.
 
+        :param f: Open file object for the pack.
         :param path: Path to the pack file.
+        :param copier: A PackStreamCopier to use for writing pack data.
+        :param indexer: A PackIndexer for indexing the pack.
         """
-        data = ThinPackData(self.get_raw, path)
-
-        # Write index for the thin pack (do we really need this?)
-        temppath = os.path.join(self.pack_dir,
-            sha_to_hex(urllib2.randombytes(20))+".tempidx")
-        data.create_index_v2(temppath)
-        p = Pack.from_objects(data, load_pack_index(temppath))
+        entries = list(indexer)
+
+        # Update the header with the new number of objects.
+        f.seek(0)
+        write_pack_header(f, len(entries) + len(indexer.ext_refs()))
+
+        # Must flush before reading (http://bugs.python.org/issue3207)
+        f.flush()
+
+        # Rescan the rest of the pack, computing the SHA with the new header.
+        new_sha = compute_file_sha(f, end_ofs=-20)
+
+        # Must reposition before writing (http://bugs.python.org/issue3207)
+        f.seek(0, os.SEEK_CUR)
+
+        # Complete the pack.
+        for ext_sha in indexer.ext_refs():
+            assert len(ext_sha) == 20
+            type_num, data = self.get_raw(ext_sha)
+            offset = f.tell()
+            crc32 = write_pack_object(f, type_num, data, sha=new_sha)
+            entries.append((ext_sha, offset, crc32))
+        pack_sha = new_sha.digest()
+        f.write(pack_sha)
+        f.close()
+
+        # Move the pack in.
+        entries.sort()
+        pack_base_name = self._get_pack_basepath(entries)
+        if sys.platform == 'win32':
+            try:
+                os.rename(path, pack_base_name + '.pack')
+            except WindowsError:
+                os.remove(pack_base_name + '.pack')
+                os.rename(path, pack_base_name + '.pack')
+        else:
+            os.rename(path, pack_base_name + '.pack')
 
+        # Write the index.
+        index_file = GitFile(pack_base_name + '.idx', 'wb')
         try:
-            # Write a full pack version
-            temppath = os.path.join(self.pack_dir,
-                sha_to_hex(urllib2.randombytes(20))+".temppack")
-            write_pack(temppath, ((o, None) for o in p.iterobjects()), len(p))
+            write_pack_index_v2(index_file, entries, pack_sha)
+            index_file.close()
         finally:
-            p.close()
-
-        pack_sha = load_pack_index(temppath+".idx").objects_sha1()
-        newbasename = os.path.join(self.pack_dir, "pack-%s" % pack_sha)
-        os.rename(temppath+".pack", newbasename+".pack")
-        os.rename(temppath+".idx", newbasename+".idx")
-        final_pack = Pack(newbasename)
-        self._add_known_pack(final_pack)
+            index_file.abort()
+
+        # Add the pack to the store and return it.
+        final_pack = Pack(pack_base_name)
+        final_pack.check_length_and_checksum()
+        self._add_known_pack(pack_base_name, final_pack)
         return final_pack
 
+    def add_thin_pack(self, read_all, read_some):
+        """Add a new thin pack to this object store.
+
+        Thin packs are packs that contain deltas with parents that exist
+        outside the pack. They should never be placed in the object store
+        directly, and always indexed and completed as they are copied.
+
+        :param read_all: Read function that blocks until the number of
+            requested bytes are read.
+        :param read_some: Read function that returns at least one byte, but may
+            not return the number of bytes requested.
+        :return: A Pack object pointing at the now-completed thin pack in the
+            objects/pack directory.
+        """
+        fd, path = tempfile.mkstemp(dir=self.path, prefix='tmp_pack_')
+        with os.fdopen(fd, 'w+b') as f:
+            indexer = PackIndexer(f, resolve_ext_ref=self.get_raw)
+            copier = PackStreamCopier(read_all, read_some, f,
+                                      delta_iter=indexer)
+            copier.verify()
+            return self._complete_thin_pack(f, path, copier, indexer)
+
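A hedged usage sketch: given the raw bytes of a thin pack received from a remote (here the hypothetical `thin_pack_bytes`), a `BytesIO` supplies `read_all`, and passing `None` for `read_some` assumes `PackStreamCopier` falls back to `read_all` in that case:

```python
from io import BytesIO

# `thin_pack_bytes` is assumed to be a complete thin pack whose delta
# bases already exist in `store` (a DiskObjectStore).
f = BytesIO(thin_pack_bytes)
pack = store.add_thin_pack(f.read, None)  # None: read_some falls back to read_all
```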
     def move_in_pack(self, path):
         """Move a specific file containing a pack into the pack directory.
 
@@ -427,47 +674,26 @@ class DiskObjectStore(PackBasedObjectStore):
 
         :param path: Path to the pack file.
         """
-        p = PackData(path)
-        entries = p.sorted_entries()
-        basename = os.path.join(self.pack_dir,
-            "pack-%s" % iter_sha1(entry[0] for entry in entries))
-        f = GitFile(basename+".idx", "wb")
-        try:
-            write_pack_index_v2(f, entries, p.get_stored_checksum())
-        finally:
-            f.close()
-        p.close()
+        with PackData(path) as p:
+            entries = p.sorted_entries()
+            basename = self._get_pack_basepath(entries)
+            with GitFile(basename+".idx", "wb") as f:
+                write_pack_index_v2(f, entries, p.get_stored_checksum())
         os.rename(path, basename + ".pack")
         final_pack = Pack(basename)
-        self._add_known_pack(final_pack)
+        self._add_known_pack(basename, final_pack)
         return final_pack
 
-    def add_thin_pack(self):
-        """Add a new thin pack to this object store.
-
-        Thin packs are packs that contain deltas with parents that exist
-        in a different pack.
-        """
-        fd, path = tempfile.mkstemp(dir=self.pack_dir, suffix=".pack")
-        f = os.fdopen(fd, 'wb')
-        def commit():
-            os.fsync(fd)
-            f.close()
-            if os.path.getsize(path) > 0:
-                return self.move_in_thin_pack(path)
-            else:
-                os.remove(path)
-                return None
-        return f, commit
-
     def add_pack(self):
         """Add a new pack to this object store.
 
-        :return: Fileobject to write to and a commit function to
-            call when the pack is finished.
+        :return: Fileobject to write to, a commit function to call
+            when the pack is finished, and an abort function.
         """
         fd, path = tempfile.mkstemp(dir=self.pack_dir, suffix=".pack")
         f = os.fdopen(fd, 'wb')
+
         def commit():
             os.fsync(fd)
             f.close()
@@ -476,33 +702,34 @@ class DiskObjectStore(PackBasedObjectStore):
             else:
                 os.remove(path)
                 return None
-        return f, commit
+
+        def abort():
+            f.close()
+            os.remove(path)
+        return f, commit, abort
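The new three-tuple protocol mirrors what `add_objects` above already does: write, then either `commit()` or `abort()`. A sketch of direct use against an existing store:

```python
from dulwich.objects import Blob
from dulwich.pack import write_pack_objects

blob = Blob.from_string(b"pack me")
f, commit, abort = store.add_pack()  # store: an existing DiskObjectStore
try:
    write_pack_objects(f, [(blob, None)])
except BaseException:
    abort()          # removes the temporary .pack file
    raise
else:
    pack = commit()  # writes the index and moves the pack in; None if empty
```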
 
     def add_object(self, obj):
         """Add a single object to this object store.
 
         :param obj: Object to add
         """
-        dir = os.path.join(self.path, obj.id[:2])
+        path = self._get_shafile_path(obj.id)
+        dir = os.path.dirname(path)
         try:
             os.mkdir(dir)
-        except OSError, e:
+        except OSError as e:
             if e.errno != errno.EEXIST:
                 raise
-        path = os.path.join(dir, obj.id[2:])
         if os.path.exists(path):
-            return # Already there, no need to write again
-        f = GitFile(path, 'wb')
-        try:
+            return  # Already there, no need to write again
+        with GitFile(path, 'wb') as f:
             f.write(obj.as_legacy_object())
-        finally:
-            f.close()
 
     @classmethod
     def init(cls, path):
         try:
             os.mkdir(path)
-        except OSError, e:
+        except OSError as e:
             if e.errno != errno.EEXIST:
                 raise
         os.mkdir(os.path.join(path, "info"))
@@ -517,9 +744,17 @@ class MemoryObjectStore(BaseObjectStore):
         super(MemoryObjectStore, self).__init__()
         self._data = {}
 
+    def _to_hexsha(self, sha):
+        if len(sha) == 40:
+            return sha
+        elif len(sha) == 20:
+            return sha_to_hex(sha)
+        else:
+            raise ValueError("Invalid sha %r" % (sha,))
+
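With this helper the memory store accepts either SHA form; a small runnable sketch (`hex_to_sha` is the converter in `dulwich.objects`):

```python
from dulwich.object_store import MemoryObjectStore
from dulwich.objects import Blob, hex_to_sha

store = MemoryObjectStore()
blob = Blob.from_string(b"example data")
store.add_object(blob)
assert store.contains_loose(blob.id)              # 40-byte hex sha
assert store.contains_loose(hex_to_sha(blob.id))  # 20-byte binary sha
```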
     def contains_loose(self, sha):
         """Check if a particular object is present by SHA1 and is loose."""
-        return sha in self._data
+        return self._to_hexsha(sha) in self._data
 
     def contains_packed(self, sha):
         """Check if a particular object is present by SHA1 and is packed."""
@@ -527,7 +762,7 @@ class MemoryObjectStore(BaseObjectStore):
 
     def __iter__(self):
         """Iterate over the SHAs that are present in this store."""
-        return self._data.iterkeys()
+        return iter(self._data.keys())
 
     @property
     def packs(self):
@@ -540,28 +775,98 @@ class MemoryObjectStore(BaseObjectStore):
         :param name: sha for the object.
         :return: tuple with numeric type and object contents.
         """
-        return self[name].as_raw_string()
+        obj = self[self._to_hexsha(name)]
+        return obj.type_num, obj.as_raw_string()
 
     def __getitem__(self, name):
-        return self._data[name]
+        return self._data[self._to_hexsha(name)].copy()
 
     def __delitem__(self, name):
         """Delete an object from this store, for testing only."""
-        del self._data[name]
+        del self._data[self._to_hexsha(name)]
 
     def add_object(self, obj):
         """Add a single object to this object store.
 
         """
-        self._data[obj.id] = obj
+        self._data[obj.id] = obj.copy()
 
     def add_objects(self, objects):
         """Add a set of objects to this object store.
 
-        :param objects: Iterable over a list of objects.
+        :param objects: Iterable over (object, path) tuples.
         """
         for obj, path in objects:
-            self._data[obj.id] = obj
+            self.add_object(obj)
+
+    def add_pack(self):
+        """Add a new pack to this object store.
+
+        Because this object store doesn't support packs, we extract and add the
+        individual objects.
+
+        :return: Fileobject to write to, a commit function to call
+            when the pack is finished, and an abort function.
+        """
+        f = BytesIO()
+
+        def commit():
+            p = PackData.from_file(BytesIO(f.getvalue()), f.tell())
+            f.close()
+            for obj in PackInflater.for_pack_data(p, self.get_raw):
+                self.add_object(obj)
+
+        def abort():
+            pass
+        return f, commit, abort
+
+    def _complete_thin_pack(self, f, indexer):
+        """Complete a thin pack by adding external references.
+
+        :param f: Open file object for the pack.
+        :param indexer: A PackIndexer for indexing the pack.
+        """
+        entries = list(indexer)
+
+        # Update the header with the new number of objects.
+        f.seek(0)
+        write_pack_header(f, len(entries) + len(indexer.ext_refs()))
+
+        # Rescan the rest of the pack, computing the SHA with the new header.
+        new_sha = compute_file_sha(f, end_ofs=-20)
+
+        # Complete the pack.
+        for ext_sha in indexer.ext_refs():
+            assert len(ext_sha) == 20
+            type_num, data = self.get_raw(ext_sha)
+            write_pack_object(f, type_num, data, sha=new_sha)
+        pack_sha = new_sha.digest()
+        f.write(pack_sha)
+
+    def add_thin_pack(self, read_all, read_some):
+        """Add a new thin pack to this object store.
+
+        Thin packs are packs that contain deltas with parents that exist
+        outside the pack. Because this object store doesn't support packs, we
+        extract and add the individual objects.
+
+        :param read_all: Read function that blocks until the number of
+            requested bytes are read.
+        :param read_some: Read function that returns at least one byte, but may
+            not return the number of bytes requested.
+        """
+        f, commit, abort = self.add_pack()
+        try:
+            indexer = PackIndexer(f, resolve_ext_ref=self.get_raw)
+            copier = PackStreamCopier(read_all, read_some, f,
+                                      delta_iter=indexer)
+            copier.verify()
+            self._complete_thin_pack(f, indexer)
+        except:
+            abort()
+            raise
+        else:
+            commit()
 
 
 class ObjectImporter(object):
@@ -647,23 +952,71 @@ class ObjectStoreIterator(ObjectIterator):
 
 
 def tree_lookup_path(lookup_obj, root_sha, path):
-    """Lookup an object in a Git tree.
+    """Look up an object in a Git tree.
 
     :param lookup_obj: Callback for retrieving object by SHA1
     :param root_sha: SHA1 of the root tree
     :param path: Path to look up
+    :return: A tuple of (mode, SHA) of the resulting path.
     """
-    parts = path.split("/")
-    sha = root_sha
-    mode = None
-    for p in parts:
-        obj = lookup_obj(sha)
-        if not isinstance(obj, Tree):
-            raise NotTreeError(sha)
-        if p == '':
-            continue
-        mode, sha = obj[p]
-    return mode, sha
+    tree = lookup_obj(root_sha)
+    if not isinstance(tree, Tree):
+        raise NotTreeError(root_sha)
+    return tree.lookup_path(lookup_obj, path)
+
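Behaviour is unchanged by the delegation to `Tree.lookup_path`; a self-contained sketch resolving a nested path:

```python
from dulwich.object_store import MemoryObjectStore, tree_lookup_path
from dulwich.objects import Blob, Tree

store = MemoryObjectStore()
blob = Blob.from_string(b"hello")
subtree = Tree()
subtree.add(b"README", 0o100644, blob.id)
root = Tree()
root.add(b"docs", 0o040000, subtree.id)
for obj in (blob, subtree, root):
    store.add_object(obj)

mode, sha = tree_lookup_path(store.__getitem__, root.id, b"docs/README")
assert (mode, sha) == (0o100644, blob.id)
```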
+
+def _collect_filetree_revs(obj_store, tree_sha, kset):
+    """Collect SHA1s of files and directories for specified tree.
+
+    :param obj_store: Object store to get objects by SHA from
+    :param tree_sha: tree reference to walk
+    :param kset: set to fill with references to files and directories
+    """
+    filetree = obj_store[tree_sha]
+    for name, mode, sha in filetree.iteritems():
+        if not S_ISGITLINK(mode) and sha not in kset:
+            kset.add(sha)
+            if stat.S_ISDIR(mode):
+                _collect_filetree_revs(obj_store, sha, kset)
+
+
+def _split_commits_and_tags(obj_store, lst, ignore_unknown=False):
+    """Split object id list into three lists with commit, tag, and other SHAs.
+
+    Commits referenced by tags are included into commits
+    list as well. Only SHA1s known in this repository will get
+    through, and unless ignore_unknown argument is True, KeyError
+    is thrown for SHA1 missing in the repository
+
+    :param obj_store: Object store to get objects by SHA1 from
+    :param lst: Collection of commit and tag SHAs
+    :param ignore_unknown: True to skip SHA1 missing in the repository
+        silently.
+    :return: A tuple of (commits, tags, others) SHA1s
+    """
+    commits = set()
+    tags = set()
+    others = set()
+    for e in lst:
+        try:
+            o = obj_store[e]
+        except KeyError:
+            if not ignore_unknown:
+                raise
+        else:
+            if isinstance(o, Commit):
+                commits.add(e)
+            elif isinstance(o, Tag):
+                tags.add(e)
+                tagged = o.object[1]
+                c, t, o = _split_commits_and_tags(
+                    obj_store, [tagged], ignore_unknown=ignore_unknown)
+                commits |= c
+                tags |= t
+                others |= o
+            else:
+                others.add(e)
+    return (commits, tags, others)
 
 
 class MissingObjectFinder(object):
@@ -676,16 +1029,54 @@ class MissingObjectFinder(object):
     :param progress: Optional function to report progress to.
     :param get_tagged: Function that returns a dict of pointed-to sha -> tag
         sha for including tags.
+    :param get_parents: Optional function for getting the parents of a commit.
     """
 
     def __init__(self, object_store, haves, wants, progress=None,
-                 get_tagged=None):
-        haves = set(haves)
-        self.sha_done = haves
-        self.objects_to_send = set([(w, None, False) for w in wants
-                                    if w not in haves])
+                 get_tagged=None, get_parents=lambda commit: commit.parents):
         self.object_store = object_store
+        self._get_parents = get_parents
+        # process Commits and Tags differently
+        # Note: while haves may list commits/tags not available locally,
+        # and such SHAs will be filtered out by _split_commits_and_tags,
+        # wants must list only known SHAs; otherwise
+        # _split_commits_and_tags raises KeyError
+        have_commits, have_tags, have_others = (
+            _split_commits_and_tags(object_store, haves, True))
+        want_commits, want_tags, want_others = (
+            _split_commits_and_tags(object_store, wants, False))
+        # all_ancestors is a set of commits that shall not be sent
+        # (complete repository up to 'haves')
+        all_ancestors = object_store._collect_ancestors(
+            have_commits, get_parents=self._get_parents)[0]
+        # missing_commits - complete set of commits between haves and wants
+        # common_commits - commits from all_ancestors that we encounter
+        # while traversing the parent hierarchy of wants
+        missing_commits, common_commits = object_store._collect_ancestors(
+            want_commits, all_ancestors, get_parents=self._get_parents)
+        self.sha_done = set()
+        # Now, fill sha_done with commits and the revisions of files and
+        # directories known to exist both locally and on the target.
+        # These commits and files won't get selected for fetch
+        for h in common_commits:
+            self.sha_done.add(h)
+            cmt = object_store[h]
+            _collect_filetree_revs(object_store, cmt.tree, self.sha_done)
+        # record tags we have as visited, too
+        for t in have_tags:
+            self.sha_done.add(t)
+
+        missing_tags = want_tags.difference(have_tags)
+        missing_others = want_others.difference(have_others)
+        # in fact, what we 'want' is commits, tags, and others
+        # we've found missing
+        wants = missing_commits.union(missing_tags)
+        wants = wants.union(missing_others)
+
+        self.objects_to_send = set([(w, None, False) for w in wants])
+
         if progress is None:
             self.progress = lambda x: None
         else:
@@ -696,36 +1087,32 @@ class MissingObjectFinder(object):
         self.objects_to_send.update([e for e in entries
                                      if not e[0] in self.sha_done])
 
-    def parse_tree(self, tree):
-        self.add_todo([(sha, name, not stat.S_ISDIR(mode))
-                       for name, mode, sha in tree.iteritems()
-                       if not S_ISGITLINK(mode)])
-
-    def parse_commit(self, commit):
-        self.add_todo([(commit.tree, "", False)])
-        self.add_todo([(p, None, False) for p in commit.parents])
-
-    def parse_tag(self, tag):
-        self.add_todo([(tag.object[1], None, False)])
-
     def next(self):
-        if not self.objects_to_send:
-            return None
-        (sha, name, leaf) = self.objects_to_send.pop()
+        while True:
+            if not self.objects_to_send:
+                return None
+            (sha, name, leaf) = self.objects_to_send.pop()
+            if sha not in self.sha_done:
+                break
         if not leaf:
             o = self.object_store[sha]
             if isinstance(o, Commit):
-                self.parse_commit(o)
+                self.add_todo([(o.tree, "", False)])
             elif isinstance(o, Tree):
-                self.parse_tree(o)
+                self.add_todo([(s, n, not stat.S_ISDIR(m))
+                               for n, m, s in o.iteritems()
+                               if not S_ISGITLINK(m)])
             elif isinstance(o, Tag):
-                self.parse_tag(o)
+                self.add_todo([(o.object[1], None, False)])
         if sha in self._tagged:
             self.add_todo([(self._tagged[sha], None, True)])
         self.sha_done.add(sha)
-        self.progress("counting objects: %d\r" % len(self.sha_done))
+        self.progress(("counting objects: %d\r" %
+                       len(self.sha_done)).encode('ascii'))
         return (sha, name)
 
+    __next__ = next
+
 
 class ObjectStoreGraphWalker(object):
     """Graph walker that finds what commits are missing from an object store.
@@ -746,6 +1133,8 @@ class ObjectStoreGraphWalker(object):
 
     def ack(self, sha):
         """Ack that a revision and its ancestors are present in the source."""
+        if len(sha) != 40:
+            raise ValueError("unexpected sha %r received" % sha)
         ancestors = set([sha])
 
         # stop if we run out of heads to remove
@@ -757,8 +1146,10 @@ class ObjectStoreGraphWalker(object):
             # collect all ancestors
             new_ancestors = set()
             for a in ancestors:
-                if a in self.parents:
-                    new_ancestors.update(self.parents[a])
+                ps = self.parents.get(a)
+                if ps is not None:
+                    new_ancestors.update(ps)
+                self.parents[a] = None
 
             # no more ancestors; stop
             if not new_ancestors:
@@ -772,6 +1163,57 @@ class ObjectStoreGraphWalker(object):
             ret = self.heads.pop()
             ps = self.get_parents(ret)
             self.parents[ret] = ps
-            self.heads.update(ps)
+            self.heads.update(
+                [p for p in ps if p not in self.parents])
             return ret
         return None
+
+    __next__ = next
+
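The walker is constructed the same way the removed `get_graph_walker` above did; a sketch of the negotiation loop it supports, with `remote_has` as a hypothetical predicate for the other side:

```python
walker = ObjectStoreGraphWalker(
    [head_sha], lambda sha: store[sha].parents)
sha = next(walker)        # yields local heads first, then their ancestors
while sha is not None:
    if remote_has(sha):   # hypothetical: does the remote have this sha?
        walker.ack(sha)   # prunes sha and all of its ancestors from the walk
    sha = next(walker)
```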
+
+def commit_tree_changes(object_store, tree, changes):
+    """Commit a specified set of changes to a tree structure.
+
+    This will apply a set of changes on top of an existing tree, storing new
+    objects in object_store.
+
+    changes are a list of tuples with (path, mode, object_sha).
+    Paths can be both blobs and trees. Setting the mode and
+    object sha to None deletes the path.
+
+    This method works especially well if there are only a small
+    number of changes to a big tree. For a large number of changes
+    to a large tree, use e.g. commit_tree.
+
+    :param object_store: Object store to store new objects in
+        and retrieve old ones from.
+    :param tree: Original tree root
+    :param changes: changes to apply
+    :return: New tree root object
+    """
+    # TODO(jelmer): Save up the objects and add them using .add_objects
+    # rather than with individual calls to .add_object.
+    nested_changes = {}
+    for (path, new_mode, new_sha) in changes:
+        try:
+            (dirname, subpath) = path.split(b'/', 1)
+        except ValueError:
+            if new_sha is None:
+                del tree[path]
+            else:
+                tree[path] = (new_mode, new_sha)
+        else:
+            nested_changes.setdefault(dirname, []).append(
+                (subpath, new_mode, new_sha))
+    for name, subchanges in nested_changes.items():
+        try:
+            orig_subtree = object_store[tree[name][1]]
+        except KeyError:
+            orig_subtree = Tree()
+        subtree = commit_tree_changes(object_store, orig_subtree, subchanges)
+        if len(subtree) == 0:
+            del tree[name]
+        else:
+            tree[name] = (stat.S_IFDIR, subtree.id)
+    object_store.add_object(tree)
+    return tree
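
A runnable sketch of applying a small change set on top of an existing root tree; a `(path, None, None)` entry deletes that path, and emptied subtrees are pruned:

```python
from dulwich.object_store import MemoryObjectStore, commit_tree_changes
from dulwich.objects import Blob, Tree

store = MemoryObjectStore()
old = Blob.from_string(b"old contents")
sub = Tree()
sub.add(b"unused.txt", 0o100644, old.id)
root = Tree()
root.add(b"old", 0o040000, sub.id)
for obj in (old, sub, root):
    store.add_object(obj)

new = Blob.from_string(b"new contents")
store.add_object(new)
new_root = commit_tree_changes(store, store[root.id], [
    (b"docs/README", 0o100644, new.id),   # add a blob (creates docs/)
    (b"old/unused.txt", None, None),      # delete a path; empty tree pruned
])
assert b"docs" in new_root
assert b"old" not in new_root
```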