test_object_store: use temp dirs instead of 'foo'
[jelmer/dulwich-libgit2.git] / dulwich / pack.py
index 52c56ae64d34f4a31af4a342455d8922e369e18f..791d1c434fdf3b1eb6ec9d2239d7978807d83cda 100644
--- a/dulwich/pack.py
+++ b/dulwich/pack.py
@@ -1,6 +1,6 @@
 # pack.py -- For dealing wih packed git objects.
 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
-# Copryight (C) 2008 Jelmer Vernooij <jelmer@samba.org>
+# Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
 # 
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
@@ -35,7 +35,13 @@ try:
 except ImportError:
     from misc import defaultdict
 
-from itertools import imap, izip
+import difflib
+import errno
+from itertools import (
+    chain,
+    imap,
+    izip,
+    )
 import mmap
 import os
 import struct
@@ -45,12 +51,12 @@ except ImportError:
     from dulwich.misc import unpack_from
 import sys
 import zlib
-import difflib
 
 from dulwich.errors import (
     ApplyDeltaError,
     ChecksumMismatch,
     )
+from dulwich.file import GitFile
 from dulwich.lru_cache import (
     LRUSizeCache,
     )
@@ -59,62 +65,124 @@ from dulwich.objects import (
     hex_to_sha,
     sha_to_hex,
     )
-from dulwich.misc import make_sha
+from dulwich.misc import (
+    make_sha,
+    )
 
 supports_mmap_offset = (sys.version_info[0] >= 3 or
         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
 
 
-def take_msb_bytes(map, offset):
+def take_msb_bytes(read):
+    """Read bytes marked with most significant bit.
+    
+    :param read: Read function
+    """
     ret = []
     while len(ret) == 0 or ret[-1] & 0x80:
-        ret.append(ord(map[offset]))
-        offset += 1
+        ret.append(ord(read(1)))
     return ret
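
As a sanity check, the new callback style can be exercised with any read
function; a minimal sketch, using StringIO as a stand-in for a pack file
(the byte values are made up):

    from cStringIO import StringIO
    from dulwich.pack import take_msb_bytes

    f = StringIO("\x95\x0b")  # first byte has the MSB set, second does not
    assert take_msb_bytes(f.read) == [0x95, 0x0b]
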
 
 
-def read_zlib(data, offset, dec_size):
+def read_zlib_chunks(read, buffer_size=4096):
+    """Read chunks of zlib data from a buffer.
+
+    :param read: Read function
+    :param buffer_size: Size of the chunks to read
+    :return: Tuple with list of uncompressed chunks, length of the
+        compressed data and any unused read data
+    """
     obj = zlib.decompressobj()
-    x = ""
+    ret = []
     fed = 0
     while obj.unused_data == "":
-        base = offset+fed
-        add = data[base:base+1024]
-        if len(add) < 1024:
+        add = read(buffer_size)
+        if len(add) < buffer_size:
             add += "Z"
         fed += len(add)
-        x += obj.decompress(add)
-    assert len(x) == dec_size
+        ret.append(obj.decompress(add))
     comp_len = fed-len(obj.unused_data)
-    return x, comp_len
+    return ret, comp_len, obj.unused_data
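
A hedged sketch of the new return contract, assuming the read callback is
positioned at the start of a zlib stream (as unpack_object arranges):

    from cStringIO import StringIO
    from dulwich.pack import read_zlib_chunks
    import zlib

    data = "hello" * 100
    comp = zlib.compress(data)
    f = StringIO(comp + "trailing pack bytes")
    chunks, comp_len, unused = read_zlib_chunks(f.read)
    assert "".join(chunks) == data
    assert comp_len == len(comp)
    # 'unused' is the data read past the end of the zlib stream (on short
    # reads it also picks up the "Z" flush sentinel appended above).
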
 
 
 def iter_sha1(iter):
+    """Return the hexdigest of the SHA1 over a set of names.
+    
+    :param iter: Iterator over string objects
+    :return: 40-byte hex sha1 digest
+    """
     sha1 = make_sha()
     for name in iter:
         sha1.update(name)
     return sha1.hexdigest()
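
The digest depends only on the concatenated bytes, not on how they are split
across names; a quick sketch:

    from dulwich.pack import iter_sha1

    assert iter_sha1(["abc", "def"]) == iter_sha1(["abcdef"])
    assert len(iter_sha1([])) == 40  # hex digest of the empty input
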
 
 
-MAX_MMAP_SIZE = 1024 * 1024 * 1024
+def load_pack_index(path):
+    """Load an index file by path.
+
+    :param path: Path to the index file
+    :return: A PackIndex loaded from the given path
+    """
+    f = GitFile(path, 'rb')
+    return load_pack_index_file(path, f)
+
+
+def _load_file_contents(f, size=None):
+    """Load the contents of a file-like object, preferring mmap.
+
+    :param f: File-like object
+    :param size: Expected size, if known
+    :return: Tuple with contents and size
+    """
+    fileno = getattr(f, 'fileno', None)
+    # Attempt to use mmap if possible
+    if fileno is not None:
+        fd = f.fileno()
+        if size is None:
+            size = os.fstat(fd).st_size
+        try:
+            contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
+        except mmap.error:
+            # Perhaps a socket?
+            pass
+        else:
+            return contents, size
+    contents = f.read()
+    size = len(contents)
+    return contents, size
 
-def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
-    """Simple wrapper for mmap() which always supports the offset parameter.
 
-    :param f: File object.
-    :param offset: Offset in the file, from the beginning of the file.
-    :param size: Size of the mmap'ed area
-    :param access: Access mechanism.
-    :return: MMAP'd area.
+def load_pack_index_file(path, f):
+    """Load an index file from a file-like object.
+
+    :param path: Path for the index file
+    :param f: File-like object
+    :return: A PackIndex loaded from the given file
     """
-    if offset+size > MAX_MMAP_SIZE and not supports_mmap_offset:
-        raise AssertionError("%s is larger than 256 meg, and this version "
-            "of Python does not support the offset argument to mmap().")
-    if supports_mmap_offset:
-        return mmap.mmap(f.fileno(), size, access=access, offset=offset), 0
+    contents, size = _load_file_contents(f)
+    if contents[:4] == '\377tOc':
+        version = struct.unpack(">L", contents[4:8])[0]
+        if version == 2:
+            return PackIndex2(path, file=f, contents=contents,
+                size=size)
+        else:
+            raise KeyError("Unknown pack index format %d" % version)
     else:
-        mem = mmap.mmap(f.fileno(), size+offset, access=access)
-        return mem, offset
+        return PackIndex1(path, file=f, contents=contents, size=size)
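
The version sniffing above keys off the v2 magic; the same check in
isolation, with header bytes constructed by hand for illustration:

    import struct

    header = '\377tOc' + struct.pack(">L", 2)
    assert header[:4] == '\377tOc'                   # v2 (and later) magic
    assert struct.unpack(">L", header[4:8])[0] == 2  # format version
    # A v1 index has no magic; it starts directly with the fan-out table.
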
+
+
+def bisect_find_sha(start, end, sha, unpack_name):
+    """Find a SHA in a data blob with sorted SHAs.
+    
+    :param start: Start index of range to search
+    :param end: End index of range to search
+    :param sha: Sha to find
+    :param unpack_name: Callback to retrieve SHA by index
+    :return: Index of the SHA, or None if it wasn't found
+    """
+    assert start <= end
+    while start <= end:
+        i = (start + end)/2
+        file_sha = unpack_name(i)
+        x = cmp(file_sha, sha)
+        if x < 0:
+            start = i + 1
+        elif x > 0:
+            end = i - 1
+        else:
+            return i
+    return None
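
Since the callback just maps an index to a name, the helper can be exercised
against any sorted sequence; a minimal sketch with short stand-in strings
instead of 20-byte SHAs:

    from dulwich.pack import bisect_find_sha

    shas = ["aaaa", "bbbb", "cccc", "dddd"]  # already sorted
    assert bisect_find_sha(0, len(shas) - 1, "cccc", shas.__getitem__) == 2
    assert bisect_find_sha(0, len(shas) - 1, "zzzz", shas.__getitem__) is None
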
 
 
 class PackIndex(object):
@@ -131,7 +199,7 @@ class PackIndex(object):
     the start and end offset and then bisect in to find if the value is present.
     """
   
-    def __init__(self, filename):
+    def __init__(self, filename, file=None, contents=None, size=None):
         """Create a pack index object.
     
         Provide it with the name of the index file to consider, and it will map
@@ -140,32 +208,30 @@ class PackIndex(object):
         self._filename = filename
         # Take the size now, so it can be checked each time we map the file to
         # ensure that it hasn't changed.
-        self._size = os.path.getsize(filename)
-        self._file = open(filename, 'r')
-        self._contents, map_offset = simple_mmap(self._file, 0, self._size)
-        assert map_offset == 0
-        if self._contents[:4] != '\377tOc':
-            self.version = 1
-            self._fan_out_table = self._read_fan_out_table(0)
+        if file is None:
+            self._file = GitFile(filename, 'rb')
+        else:
+            self._file = file
+        if contents is None:
+            self._contents, self._size = _load_file_contents(file, size)
         else:
-            (self.version, ) = unpack_from(">L", self._contents, 4)
-            assert self.version in (2,), "Version was %d" % self.version
-            self._fan_out_table = self._read_fan_out_table(8)
-            self._name_table_offset = 8 + 0x100 * 4
-            self._crc32_table_offset = self._name_table_offset + 20 * len(self)
-            self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
+            self._contents, self._size = (contents, size)
   
     def __eq__(self, other):
-        if type(self) != type(other):
+        if not isinstance(other, PackIndex):
             return False
     
         if self._fan_out_table != other._fan_out_table:
             return False
     
-        for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
+        for (name1, _, _), (name2, _, _) in izip(self.iterentries(),
+                                                 other.iterentries()):
             if name1 != name2:
                 return False
         return True
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
   
     def close(self):
         self._file.close()
@@ -179,36 +245,22 @@ class PackIndex(object):
     
         :return: Tuple with object name (SHA), offset in pack file and 
               CRC32 checksum (if known)."""
-        if self.version == 1:
-            (offset, name) = unpack_from(">L20s", self._contents, 
-                (0x100 * 4) + (i * 24))
-            return (name, offset, None)
-        else:
-            return (self._unpack_name(i), self._unpack_offset(i), 
-                    self._unpack_crc32_checksum(i))
+        raise NotImplementedError(self._unpack_entry)
   
     def _unpack_name(self, i):
-        if self.version == 1:
-            offset = (0x100 * 4) + (i * 24) + 4
-        else:
-            offset = self._name_table_offset + i * 20
-        return self._contents[offset:offset+20]
+        """Unpack the i-th name from the index file."""
+        raise NotImplementedError(self._unpack_name)
   
     def _unpack_offset(self, i):
-        if self.version == 1:
-            offset = (0x100 * 4) + (i * 24)
-        else:
-            offset = self._pack_offset_table_offset + i * 4
-        return unpack_from(">L", self._contents, offset)[0]
-  
+        """Unpack the i-th object offset from the index file."""
+        raise NotImplementedError(self._unpack_offset)
+
     def _unpack_crc32_checksum(self, i):
-        if self.version == 1:
-            return None
-        else:
-            return unpack_from(">L", self._contents, 
-                                      self._crc32_table_offset + i * 4)[0]
+        """Unpack the crc32 checksum for the i-th object from the index file."""
+        raise NotImplementedError(self._unpack_crc32_checksum)
   
     def __iter__(self):
+        """Iterate over the SHAs in this pack."""
         return imap(sha_to_hex, self._itersha())
   
     def _itersha(self):
@@ -216,12 +268,17 @@ class PackIndex(object):
             yield self._unpack_name(i)
   
     def objects_sha1(self):
+        """Return the hex SHA1 over all the shas of all objects in this pack.
+        
+        :note: This is used for the filename of the pack.
+        """
         return iter_sha1(self._itersha())
   
     def iterentries(self):
         """Iterate over the entries in this pack index.
        
-        Will yield tuples with object name, offset in packfile and crc32 checksum.
+        Will yield tuples with object name, offset in packfile and crc32
+        checksum.
         """
         for i in range(len(self)):
             yield self._unpack_entry(i)
@@ -229,34 +286,42 @@ class PackIndex(object):
     def _read_fan_out_table(self, start_offset):
         ret = []
         for i in range(0x100):
-            ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
+            ret.append(struct.unpack(">L",
+                self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
         return ret
   
     def check(self):
         """Check that the stored checksum matches the actual checksum."""
+        # TODO: Check pack contents, too
         return self.calculate_checksum() == self.get_stored_checksum()
   
     def calculate_checksum(self):
-        f = open(self._filename, 'r')
-        try:
-            return make_sha(self._contents[:-20]).digest()
-        finally:
-            f.close()
+        """Calculate the SHA1 checksum over this pack index.
+
+        :return: This is a 20-byte binary digest
+        """
+        return make_sha(self._contents[:-20]).digest()
 
     def get_pack_checksum(self):
-        """Return the SHA1 checksum stored for the corresponding packfile."""
+        """Return the SHA1 checksum stored for the corresponding packfile.
+        
+        :return: 20-byte binary digest
+        """
         return str(self._contents[-40:-20])
   
     def get_stored_checksum(self):
-        """Return the SHA1 checksum stored for this index."""
+        """Return the SHA1 checksum stored for this index.
+        
+        :return: 20-byte binary digest
+        """
         return str(self._contents[-20:])
   
     def object_index(self, sha):
         """Return the index in to the corresponding packfile for the object.
     
-        Given the name of an object it will return the offset that object lives
-        at within the corresponding pack file. If the pack file doesn't have the
-        object then None will be returned.
+        Given the name of an object it will return the offset that object
+        lives at within the corresponding pack file. If the pack file doesn't
+        have the object then a KeyError is raised.
         """
         if len(sha) == 40:
             sha = hex_to_sha(sha)
@@ -274,20 +339,75 @@ class PackIndex(object):
         else:
             start = self._fan_out_table[idx-1]
         end = self._fan_out_table[idx]
-        assert start <= end
-        while start <= end:
-            i = (start + end)/2
-            file_sha = self._unpack_name(i)
-            if file_sha < sha:
-                start = i + 1
-            elif file_sha > sha:
-                end = i - 1
-            else:
-                return self._unpack_offset(i)
-        return None
+        i = bisect_find_sha(start, end, sha, self._unpack_name)
+        if i is None:
+            raise KeyError(sha)
+        return self._unpack_offset(i)
+
+
+class PackIndex1(PackIndex):
+    """Version 1 Pack Index."""
+
+    def __init__(self, filename, file=None, contents=None, size=None):
+        PackIndex.__init__(self, filename, file, contents, size)
+        self.version = 1
+        self._fan_out_table = self._read_fan_out_table(0)
+
+    def _unpack_entry(self, i):
+        (offset, name) = unpack_from(">L20s", self._contents, 
+            (0x100 * 4) + (i * 24))
+        return (name, offset, None)
+
+    def _unpack_name(self, i):
+        offset = (0x100 * 4) + (i * 24) + 4
+        return self._contents[offset:offset+20]
+  
+    def _unpack_offset(self, i):
+        offset = (0x100 * 4) + (i * 24)
+        return unpack_from(">L", self._contents, offset)[0]
+  
+    def _unpack_crc32_checksum(self, i):
+        # Not stored in v1 index files
+        return None 
+  
+
+class PackIndex2(PackIndex):
+    """Version 2 Pack Index."""
+
+    def __init__(self, filename, file=None, contents=None, size=None):
+        PackIndex.__init__(self, filename, file, contents, size)
+        assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
+        (self.version, ) = unpack_from(">L", self._contents, 4)
+        assert self.version == 2, "Version was %d" % self.version
+        self._fan_out_table = self._read_fan_out_table(8)
+        self._name_table_offset = 8 + 0x100 * 4
+        self._crc32_table_offset = self._name_table_offset + 20 * len(self)
+        self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
+
+    def _unpack_entry(self, i):
+        return (self._unpack_name(i), self._unpack_offset(i), 
+                self._unpack_crc32_checksum(i))
+
+    def _unpack_name(self, i):
+        offset = self._name_table_offset + i * 20
+        return self._contents[offset:offset+20]
+  
+    def _unpack_offset(self, i):
+        offset = self._pack_offset_table_offset + i * 4
+        return unpack_from(">L", self._contents, offset)[0]
+  
+    def _unpack_crc32_checksum(self, i):
+        return unpack_from(">L", self._contents, 
+                          self._crc32_table_offset + i * 4)[0]
+  
 
 
 def read_pack_header(f):
+    """Read the header of a pack file.
+
+    :param f: File-like object to read from
+    """
     header = f.read(12)
     assert header[:4] == "PACK"
     (version,) = unpack_from(">L", header, 4)
@@ -296,45 +416,52 @@ def read_pack_header(f):
     return (version, num_objects)
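
The 12-byte header is magic, version and object count, all big-endian; a
sketch with a hand-built header:

    from cStringIO import StringIO
    from dulwich.pack import read_pack_header
    import struct

    header = "PACK" + struct.pack(">L", 2) + struct.pack(">L", 42)
    assert read_pack_header(StringIO(header)) == (2, 42)
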
 
 
-def read_pack_tail(f):
-    return (f.read(20),)
+def chunks_length(chunks):
+    """Return the total length of a sequence of chunks."""
+    return sum(imap(len, chunks))
 
 
-def unpack_object(map, offset=0):
-    bytes = take_msb_bytes(map, offset)
+def unpack_object(read):
+    """Unpack a Git object.
+
+    :param read: Read function
+    :return: Tuple with type number, uncompressed data as chunks, compressed
+        size and tail (unused) data
+    """
+    bytes = take_msb_bytes(read)
     type = (bytes[0] >> 4) & 0x07
     size = bytes[0] & 0x0f
     for i, byte in enumerate(bytes[1:]):
         size += (byte & 0x7f) << ((i * 7) + 4)
     raw_base = len(bytes)
     if type == 6: # offset delta
-        bytes = take_msb_bytes(map, raw_base + offset)
+        bytes = take_msb_bytes(read)
+        raw_base += len(bytes)
         assert not (bytes[-1] & 0x80)
         delta_base_offset = bytes[0] & 0x7f
         for byte in bytes[1:]:
             delta_base_offset += 1
             delta_base_offset <<= 7
             delta_base_offset += (byte & 0x7f)
-        raw_base+=len(bytes)
-        uncomp, comp_len = read_zlib(map, offset + raw_base, size)
-        assert size == len(uncomp)
-        return type, (delta_base_offset, uncomp), comp_len+raw_base
+        uncomp, comp_len, unused = read_zlib_chunks(read)
+        assert size == chunks_length(uncomp)
+        return type, (delta_base_offset, uncomp), comp_len+raw_base, unused
     elif type == 7: # ref delta
-        basename = map[offset+raw_base:offset+raw_base+20]
-        uncomp, comp_len = read_zlib(map, offset+raw_base+20, size)
-        assert size == len(uncomp)
-        return type, (basename, uncomp), comp_len+raw_base+20
+        basename = read(20)
+        raw_base += 20
+        uncomp, comp_len, unused = read_zlib_chunks(read)
+        assert size == chunks_length(uncomp)
+        return type, (basename, uncomp), comp_len+raw_base, unused
     else:
-        uncomp, comp_len = read_zlib(map, offset+raw_base, size)
-        assert len(uncomp) == size
-        return type, uncomp, comp_len+raw_base
+        uncomp, comp_len, unused = read_zlib_chunks(read)
+        assert chunks_length(uncomp) == size
+        return type, uncomp, comp_len+raw_base, unused
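
A sketch of the entry format this parses for a small undeltified object: one
type/size byte (type 3 is a blob; sizes under 16 fit in the low nibble)
followed by a zlib stream. The payload is made up:

    from cStringIO import StringIO
    from dulwich.pack import unpack_object
    import zlib

    raw = "hello"
    entry = chr((3 << 4) | len(raw)) + zlib.compress(raw)
    f = StringIO(entry)
    type_num, chunks, comp_len, unused = unpack_object(f.read)
    assert type_num == 3
    assert "".join(chunks) == raw
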
 
 
-def compute_object_size((num, obj)):
+def _compute_object_size((num, obj)):
+    """Compute the size of an unresolved object for use with LRUSizeCache."""
     if num in (6, 7):
-        return len(obj[1])
-    assert isinstance(obj, str)
-    return len(obj)
+        return chunks_length(obj[1])
+    return chunks_length(obj)
 
 
 class PackData(object):
@@ -359,58 +486,74 @@ class PackData(object):
     buffer from the start of the deflated object on. This is bad, but until I
     get mmap sorted out it will have to do.
   
-    Currently there are no integrity checks done. Also no attempt is made to try
-    and detect the delta case, or a request for an object at the wrong position.
-    It will all just throw a zlib or KeyError.
+    Currently there are no integrity checks done. Also no attempt is made to
+    try and detect the delta case, or a request for an object at the wrong
+    position.  It will all just throw a zlib or KeyError.
     """
   
-    def __init__(self, filename):
-        """Create a PackData object that represents the pack in the given filename.
+    def __init__(self, filename, file=None, size=None):
+        """Create a PackData object that represents the pack in the given
+        filename.
     
-        The file must exist and stay readable until the object is disposed of. It
-        must also stay the same size. It will be mapped whenever needed.
+        The file must exist and stay readable until the object is disposed of.
+        It must also stay the same size. It will be mapped whenever needed.
     
         Currently there is a restriction on the size of the pack as the python
         mmap implementation is flawed.
         """
         self._filename = filename
-        assert os.path.exists(filename), "%s is not a packfile" % filename
-        self._size = os.path.getsize(filename)
+        self._size = size
         self._header_size = 12
-        assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (filename, self._size, self._header_size)
-        self._read_header()
-        self._offset_cache = LRUSizeCache(1024*1024*100, 
-            compute_size=compute_object_size)
-  
-    def _read_header(self):
-        f = open(self._filename, 'rb')
-        try:
-            (version, self._num_objects) = \
-                    read_pack_header(f)
-            f.seek(self._size-20)
-            (self._stored_checksum,) = read_pack_tail(f)
-        finally:
-            f.close()
+        if file is None:
+            self._file = GitFile(self._filename, 'rb')
+        else:
+            self._file = file
+        (version, self._num_objects) = read_pack_header(self._file)
+        self._offset_cache = LRUSizeCache(1024*1024*20, 
+            compute_size=_compute_object_size)
+
+    @classmethod
+    def from_file(cls, file, size):
+        return cls(str(file), file=file, size=size)
+
+    @classmethod
+    def from_path(cls, path):
+        return cls(filename=path)
+
+    def close(self):
+        self._file.close()
+
+    def _get_size(self):
+        if self._size is not None:
+            return self._size
+        self._size = os.path.getsize(self._filename)
+        assert self._size >= self._header_size, (
+            "%s is too small for a packfile (%d < %d)" %
+            (self._filename, self._size, self._header_size))
+        return self._size
   
     def __len__(self):
         """Returns the number of objects in this pack."""
         return self._num_objects
   
     def calculate_checksum(self):
-        """Calculate the checksum for this pack."""
-        f = open(self._filename, 'rb')
-        try:
-            map, map_offset = simple_mmap(f, 0, self._size - 20)
-            return make_sha(map[map_offset:self._size-20]).digest()
-        finally:
-            f.close()
+        """Calculate the checksum for this pack.
+
+        :return: 20-byte binary SHA1 digest
+        """
+        s = make_sha()
+        self._file.seek(0)
+        todo = self._get_size() - 20
+        while todo > 0:
+            x = self._file.read(min(todo, 1<<16))
+            s.update(x)
+            todo -= len(x)
+        return s.digest()
 
     def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
         """Resolve an object, possibly resolving deltas when necessary.
         
         :return: Tuple with object type and contents.
         """
-        if not type in (6, 7): # Not a delta
+        if type not in (6, 7): # Not a delta
             return type, obj
 
         if get_offset is None:
@@ -419,95 +562,163 @@ class PackData(object):
         if type == 6: # offset delta
             (delta_offset, delta) = obj
             assert isinstance(delta_offset, int)
-            assert isinstance(delta, str)
             base_offset = offset-delta_offset
             type, base_obj = get_offset(base_offset)
             assert isinstance(type, int)
         elif type == 7: # ref delta
             (basename, delta) = obj
             assert isinstance(basename, str) and len(basename) == 20
-            assert isinstance(delta, str)
             type, base_obj = get_ref(basename)
             assert isinstance(type, int)
             # Can't be a ofs delta, as we wouldn't know the base offset
             assert type != 6
             base_offset = None
-        type, base_text = self.resolve_object(base_offset, type, base_obj, get_ref)
+        type, base_chunks = self.resolve_object(base_offset, type, base_obj,
+            get_ref)
         if base_offset is not None:
-            self._offset_cache[base_offset] = type, base_text
-        ret = (type, apply_delta(base_text, delta))
-        return ret
+            self._offset_cache[base_offset] = type, base_chunks
+        return (type, apply_delta(base_chunks, delta))
   
-    def iterobjects(self):
-        offset = self._header_size
-        f = open(self._filename, 'rb')
-        num = len(self)
-        map, _ = simple_mmap(f, 0, self._size)
-        for i in range(num):
-            (type, obj, total_size) = unpack_object(map, offset)
-            crc32 = zlib.crc32(map[offset:offset+total_size]) & 0xffffffff
-            yield offset, type, obj, crc32
-            offset += total_size
-        f.close()
+    def iterobjects(self, progress=None):
+        """Iterate over the objects in this pack data file.
+
+        :param progress: Progress function, called with current and
+            total object count
+        """
+
+        class ObjectIterator(object):
+            
+            def __init__(self, pack):
+                self.i = 0
+                self.offset = pack._header_size
+                self.num = len(pack)
+                self.map = pack._file
+
+            def __iter__(self):
+                return self
+
+            def __len__(self):
+                return self.num
+            
+            def next(self):
+                if self.i == self.num:
+                    raise StopIteration
+                self.map.seek(self.offset)
+                (type, obj, total_size, unused) = unpack_object(self.map.read)
+                self.map.seek(self.offset)
+                crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff
+                ret = (self.offset, type, obj, crc32)
+                self.offset += total_size
+                if progress:
+                    progress(self.i, self.num)
+                self.i += 1
+                return ret
+        return ObjectIterator(self)
   
-    def iterentries(self, ext_resolve_ref=None):
+    def iterentries(self, ext_resolve_ref=None, progress=None):
+        """Yield entries summarizing the contents of this pack.
+
+        :param ext_resolve_ref: Optional function to resolve base
+            objects (in case this is a thin pack)
+        :param progress: Progress function, called with current and
+            total object count.
+
+        This will yield tuples with (sha, offset, crc32)
+        """
         found = {}
         postponed = defaultdict(list)
         class Postpone(Exception):
             """Raised to postpone delta resolving."""
           
         def get_ref_text(sha):
+            assert len(sha) == 20
             if sha in found:
-                return found[sha]
+                return self.get_object_at(found[sha])
             if ext_resolve_ref:
                 try:
                     return ext_resolve_ref(sha)
                 except KeyError:
                     pass
             raise Postpone, (sha, )
-        todo = list(self.iterobjects())
-        while todo:
-            (offset, type, obj, crc32) = todo.pop(0)
+        extra = []
+        todo = chain(self.iterobjects(progress=progress), extra)
+        for (offset, type, obj, crc32) in todo:
             assert isinstance(offset, int)
             assert isinstance(type, int)
-            assert isinstance(obj, tuple) or isinstance(obj, str)
             try:
-                type, obj = self.resolve_object(offset, type, obj, get_ref_text)
+                type, obj = self.resolve_object(offset, type, obj,
+                    get_ref_text)
             except Postpone, (sha, ):
                 postponed[sha].append((offset, type, obj))
             else:
-                shafile = ShaFile.from_raw_string(type, obj)
+                shafile = ShaFile.from_raw_chunks(type, obj)
                 sha = shafile.sha().digest()
-                found[sha] = (type, obj)
+                found[sha] = offset
                 yield sha, offset, crc32
-                todo += postponed.get(sha, [])
+                extra.extend(postponed.get(sha, []))
         if postponed:
             raise KeyError([sha_to_hex(h) for h in postponed.keys()])
   
-    def sorted_entries(self, resolve_ext_ref=None):
-        ret = list(self.iterentries(resolve_ext_ref))
+    def sorted_entries(self, resolve_ext_ref=None, progress=None):
+        """Return entries in this pack, sorted by SHA.
+
+        :param resolve_ext_ref: Optional function to resolve base
+            objects (in case this is a thin pack)
+        :param progress: Progress function, called with current and
+            total object count.
+        :return: List of tuples with (sha, offset, crc32)
+        """
+        ret = list(self.iterentries(resolve_ext_ref, progress=progress))
         ret.sort()
         return ret
   
-    def create_index_v1(self, filename, resolve_ext_ref=None):
-        entries = self.sorted_entries(resolve_ext_ref)
+    def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
+        """Create a version 1 file for this data file.
+
+        :param filename: Index filename.
+        :param resolve_ext_ref: Function to use for resolving externally
+            referenced SHA1s (for thin packs)
+        :param progress: Progress report function
+        """
+        entries = self.sorted_entries(resolve_ext_ref, progress=progress)
         write_pack_index_v1(filename, entries, self.calculate_checksum())
   
-    def create_index_v2(self, filename, resolve_ext_ref=None):
-        entries = self.sorted_entries(resolve_ext_ref)
+    def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
+        """Create a version 2 index file for this data file.
+
+        :param filename: Index filename.
+        :param resolve_ext_ref: Function to use for resolving externally
+            referenced SHA1s (for thin packs)
+        :param progress: Progress report function
+        """
+        entries = self.sorted_entries(resolve_ext_ref, progress=progress)
         write_pack_index_v2(filename, entries, self.calculate_checksum())
+
+    def create_index(self, filename, resolve_ext_ref=None, progress=None,
+                     version=2):
+        """Create an index file for this data file.
+
+        :param filename: Index filename.
+        :param resolve_ext_ref: Function to use for resolving externally
+            referenced SHA1s (for thin packs)
+        :param progress: Progress report function
+        :param version: Index format version to write (1 or 2)
+        """
+        if version == 1:
+            self.create_index_v1(filename, resolve_ext_ref, progress)
+        elif version == 2:
+            self.create_index_v2(filename, resolve_ext_ref, progress)
+        else:
+            raise ValueError("unknown index format %d" % version)
   
     def get_stored_checksum(self):
-        return self._stored_checksum
+        """Return the expected checksum stored in this pack."""
+        self._file.seek(self._get_size()-20)
+        return self._file.read(20)
   
     def check(self):
+        """Check the consistency of this pack."""
         return (self.calculate_checksum() == self.get_stored_checksum())
   
     def get_object_at(self, offset):
         """Given an offset in to the packfile return the object that is there.
     
-        Using the associated index the location of an object can be looked up, and
-        then the packfile can be asked directly for that object using this
+        Using the associated index the location of an object can be looked up,
+        and then the packfile can be asked directly for that object using this
         function.
         """
         if offset in self._offset_cache:
@@ -515,16 +726,38 @@ class PackData(object):
         assert isinstance(offset, long) or isinstance(offset, int),\
                 "offset was %r" % offset
         assert offset >= self._header_size
-        f = open(self._filename, 'rb')
-        try:
-            map, map_offset = simple_mmap(f, offset, self._size-offset)
-            ret = unpack_object(map, map_offset)[:2]
-            return ret
-        finally:
-            f.close()
+        self._file.seek(offset)
+        return unpack_object(self._file.read)[:2]
+
+
+class SHA1Reader(object):
+    """Wrapper around a file-like object that remembers the SHA1 of 
+    the data read from it."""
+
+    def __init__(self, f):
+        self.f = f
+        self.sha1 = make_sha("")
+
+    def read(self, num=None):
+        data = self.f.read(num)
+        self.sha1.update(data)
+        return data
+
+    def check_sha(self):
+        stored = self.f.read(20)
+        if stored != self.sha1.digest():
+            raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
+
+    def close(self):
+        return self.f.close()
+
+    def tell(self):
+        return self.f.tell()
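
A sketch of the intended use: read a payload through the wrapper, then let
check_sha compare the running digest against the stored trailer (the payload
is illustrative; hashlib stands in for make_sha):

    from cStringIO import StringIO
    from dulwich.pack import SHA1Reader
    import hashlib

    payload = "some index data"
    f = SHA1Reader(StringIO(payload + hashlib.sha1(payload).digest()))
    f.read(len(payload))
    f.check_sha()  # raises ChecksumMismatch on a bad trailer
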
 
 
 class SHA1Writer(object):
+    """Wrapper around a file-like object that remembers the SHA1 of 
+    the data written to it."""
     
     def __init__(self, f):
         self.f = f
@@ -556,11 +789,11 @@ def write_pack_object(f, type, object):
     :param o: Object to write
     :return: Tuple with offset at which the object was written, and crc32
     """
-    ret = f.tell()
+    offset = f.tell()
     packed_data_hdr = ""
-    if type == 6: # ref delta
+    if type == 6: # offset delta
         (delta_base_offset, object) = object
-    elif type == 7: # offset delta
+    elif type == 7: # ref delta
         (basename, object) = object
     size = len(object)
     c = (type << 4) | (size & 15)
@@ -583,11 +816,17 @@ def write_pack_object(f, type, object):
         packed_data_hdr += basename
     packed_data = packed_data_hdr + zlib.compress(object)
     f.write(packed_data)
-    return (f.tell(), (zlib.crc32(packed_data) & 0xffffffff))
+    return (offset, (zlib.crc32(packed_data) & 0xffffffff))
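
With the fix above the function reports where the object starts rather than
where it ends; a minimal sketch writing one blob (type 3) to an in-memory
file:

    from cStringIO import StringIO
    from dulwich.pack import write_pack_object

    f = StringIO()
    offset, crc32 = write_pack_object(f, 3, "hello")
    assert offset == 0  # start offset, as the index needs
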
 
 
 def write_pack(filename, objects, num_objects):
-    f = open(filename + ".pack", 'w')
+    """Write a new pack data file.
+
+    :param filename: Path to the new pack file (without .pack extension)
+    :param objects: Iterable over (object, path) tuples to write
+    :param num_objects: Number of objects to write
+    """
+    f = GitFile(filename + ".pack", 'wb')
     try:
         entries, data_sum = write_pack_data(f, objects, num_objects)
     finally:
@@ -610,7 +849,7 @@ def write_pack_data(f, objects, num_objects, window=10):
     # This helps us find good objects to diff against us
     magic = []
     for obj, path in recency:
-        magic.append( (obj.type, path, 1, -len(obj.as_raw_string()[1]), obj) )
+        magic.append( (obj.type_num, path, 1, -obj.raw_length(), obj) )
     magic.sort()
     # Build a map of objects and their index in magic - so we can find preceeding objects
     # to diff against
@@ -625,14 +864,15 @@ def write_pack_data(f, objects, num_objects, window=10):
     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
     for o, path in recency:
         sha1 = o.sha().digest()
-        orig_t, raw = o.as_raw_string()
+        orig_t = o.type_num
+        raw = o.as_raw_string()
         winner = raw
         t = orig_t
         #for i in range(offs[o]-window, window):
         #    if i < 0 or i >= len(offs): continue
         #    b = magic[i][4]
-        #    if b.type != orig_t: continue
-        #    _, base = b.as_raw_string()
+        #    if b.type_num != orig_t: continue
+        #    base = b.as_raw_string()
         #    delta = create_delta(base, raw)
         #    if len(delta) < len(winner):
         #        winner = delta
@@ -646,11 +886,11 @@ def write_pack_index_v1(filename, entries, pack_checksum):
     """Write a new pack index file.
 
     :param filename: The filename of the new pack index file.
-    :param entries: List of tuples with object name (sha), offset_in_pack,  and
-            crc32_checksum.
+    :param entries: List of tuples with object name (sha), offset_in_pack,
+        and crc32_checksum.
     :param pack_checksum: Checksum of the pack file.
     """
-    f = open(filename, 'w')
+    f = GitFile(filename, 'wb')
     f = SHA1Writer(f)
     fan_out_table = defaultdict(lambda: 0)
     for (name, offset, entry_checksum) in entries:
@@ -667,7 +907,11 @@ def write_pack_index_v1(filename, entries, pack_checksum):
 
 
 def create_delta(base_buf, target_buf):
-    """Use python difflib to work out how to transform base_buf to target_buf"""
+    """Use python difflib to work out how to transform base_buf to target_buf.
+
+    :param base_buf: Base buffer
+    :param target_buf: Target buffer
+    """
     assert isinstance(base_buf, str)
     assert isinstance(target_buf, str)
     out_buf = ""
@@ -698,12 +942,12 @@ def create_delta(base_buf, target_buf):
             o = i1
             for i in range(4):
                 if o & 0xff << i*8:
-                    scratch += chr(o >> i)
+                    scratch += chr((o >> i*8) & 0xff)
                     op |= 1 << i
             s = i2 - i1
             for i in range(2):
                 if s & 0xff << i*8:
-                    scratch += chr(s >> i)
+                    scratch += chr((s >> i*8) & 0xff)
                     op |= 1 << (4+i)
             out_buf += chr(op)
             out_buf += scratch
@@ -728,8 +972,10 @@ def apply_delta(src_buf, delta):
     :param src_buf: Source buffer
     :param delta: Delta instructions
     """
-    assert isinstance(src_buf, str), "was %r" % (src_buf,)
-    assert isinstance(delta, str)
+    if not isinstance(src_buf, str):
+        src_buf = "".join(src_buf)
+    if not isinstance(delta, str):
+        delta = "".join(delta)
     out = []
     index = 0
     delta_length = len(delta)
@@ -779,8 +1025,7 @@ def apply_delta(src_buf, delta):
     if index != delta_length:
         raise ApplyDeltaError("delta not empty: %r" % delta[index:])
 
-    out = ''.join(out)
-    if dest_size != len(out):
+    if dest_size != chunks_length(out):
         raise ApplyDeltaError("dest size incorrect")
 
     return out
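
With the size-encoding fix above, deltas from create_delta round-trip through
apply_delta, which now returns chunks rather than a single string; a sketch,
assuming the insert path of create_delta (outside this hunk) is unchanged:

    from dulwich.pack import apply_delta, create_delta

    base = "the quick brown fox"
    target = "the quick brown fox jumps"
    delta = create_delta(base, target)
    assert "".join(apply_delta(base, delta)) == target
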
@@ -790,11 +1035,11 @@ def write_pack_index_v2(filename, entries, pack_checksum):
     """Write a new pack index file.
 
     :param filename: The filename of the new pack index file.
-    :param entries: List of tuples with object name (sha), offset_in_pack,  and
-            crc32_checksum.
+    :param entries: List of tuples with object name (sha), offset_in_pack, and
+        crc32_checksum.
     :param pack_checksum: Checksum of the pack file.
     """
-    f = open(filename, 'w')
+    f = GitFile(filename, 'wb')
     f = SHA1Writer(f)
     f.write('\377tOc') # Magic!
     f.write(struct.pack(">L", 2))
@@ -819,6 +1064,7 @@ def write_pack_index_v2(filename, entries, pack_checksum):
 
 
 class Pack(object):
+    """A Git pack object."""
 
     def __init__(self, basename):
         self._basename = basename
@@ -827,16 +1073,25 @@ class Pack(object):
         self._data = None
         self._idx = None
 
+    @classmethod
+    def from_objects(cls, data, idx):
+        """Create a new pack object from pack data and index objects."""
+        ret = cls("")
+        ret._data = data
+        ret._idx = idx
+        return ret
+
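
This allows assembling a Pack from already-opened components instead of a
basename; a hedged sketch (the pack-abc paths are hypothetical placeholders):

    from dulwich.pack import Pack, PackData, load_pack_index

    data = PackData.from_path("pack-abc.pack")  # hypothetical path
    index = load_pack_index("pack-abc.idx")     # hypothetical path
    pack = Pack.from_objects(data, index)
    assert len(pack) == len(index)
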
     def name(self):
         """The SHA over the SHAs of the objects in this pack."""
-        return self.idx.objects_sha1()
+        return self.index.objects_sha1()
 
     @property
     def data(self):
+        """The pack data object being used."""
         if self._data is None:
             self._data = PackData(self._data_path)
-            assert len(self.idx) == len(self._data)
-            idx_stored_checksum = self.idx.get_pack_checksum()
+            assert len(self.index) == len(self._data)
+            idx_stored_checksum = self.index.get_pack_checksum()
             data_stored_checksum = self._data.get_stored_checksum()
             if idx_stored_checksum != data_stored_checksum:
                 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum), 
@@ -844,32 +1099,37 @@ class Pack(object):
         return self._data
 
     @property
-    def idx(self):
+    def index(self):
+        """The index being used.
+
+        :note: This may be an in-memory index
+        """
         if self._idx is None:
-            self._idx = PackIndex(self._idx_path)
+            self._idx = load_pack_index(self._idx_path)
         return self._idx
 
     def close(self):
         if self._data is not None:
             self._data.close()
-        self.idx.close()
+        self.index.close()
 
     def __eq__(self, other):
-        return type(self) == type(other) and self.idx == other.idx
+        return type(self) == type(other) and self.index == other.index
 
     def __len__(self):
         """Number of entries in this pack."""
-        return len(self.idx)
+        return len(self.index)
 
     def __repr__(self):
-        return "Pack(%r)" % self._basename
+        return "%s(%r)" % (self.__class__.__name__, self._basename)
 
     def __iter__(self):
         """Iterate over all the sha1s of the objects in this pack."""
-        return iter(self.idx)
+        return iter(self.index)
 
     def check(self):
-        if not self.idx.check():
+        """Check the integrity of this pack."""
+        if not self.index.check():
             return False
         if not self.data.check():
             return False
@@ -880,18 +1140,22 @@ class Pack(object):
 
     def __contains__(self, sha1):
         """Check whether this pack contains a particular SHA1."""
-        return (self.idx.object_index(sha1) is not None)
+        try:
+            self.index.object_index(sha1)
+            return True
+        except KeyError:
+            return False
 
     def get_raw(self, sha1, resolve_ref=None):
-        offset = self.idx.object_index(sha1)
-        if offset is None:
-            raise KeyError(sha1)
-
-        type, obj = self.data.get_object_at(offset)
-        if isinstance(offset, long):
+        offset = self.index.object_index(sha1)
+        obj_type, obj = self.data.get_object_at(offset)
+        if type(offset) is long:
           offset = int(offset)
-        assert isinstance(offset, int)
-        return self.data.resolve_object(offset, type, obj, resolve_ref)
+        if resolve_ref is None:
+            resolve_ref = self.get_raw
+        kind, chunks = self.data.resolve_object(offset, obj_type, obj,
+            resolve_ref)
+        return kind, "".join(chunks)
 
     def __getitem__(self, sha1):
         """Retrieve the specified SHA1."""
@@ -899,19 +1163,16 @@ class Pack(object):
         return ShaFile.from_raw_string(type, uncomp)
 
     def iterobjects(self, get_raw=None):
+        """Iterate over the objects in this pack."""
         if get_raw is None:
-            def get_raw(x):
-                raise KeyError(x)
+            get_raw = self.get_raw
         for offset, type, obj, crc32 in self.data.iterobjects():
             assert isinstance(offset, int)
-            yield ShaFile.from_raw_string(
-                    *self.data.resolve_object(offset, type, obj, get_raw))
-
+            type, obj = self.data.resolve_object(offset, type, obj, get_raw)
+            yield ShaFile.from_raw_chunks(type, obj)
 
-def load_packs(path):
-    if not os.path.exists(path):
-        return
-    for name in os.listdir(path):
-        if name.startswith("pack-") and name.endswith(".pack"):
-            yield Pack(os.path.join(path, name[:-len(".pack")]))
 
+try:
+    from dulwich._pack import apply_delta, bisect_find_sha
+except ImportError:
+    pass