diff --git a/dulwich/pack.py b/dulwich/pack.py
index ef26670..5d66502 100644
--- a/dulwich/pack.py
+++ b/dulwich/pack.py
@@ -1,14 +1,11 @@
 # pack.py -- For dealing wih packed git objects.
 # Copyright (C) 2007 James Westby
 # Copryight (C) 2008 Jelmer Vernooij
-# The code is loosely based on that in the sha1_file.c file from git itself,
-# which is Copyright (C) Linus Torvalds, 2005 and distributed under the
-# GPL version 2.
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
 # as published by the Free Software Foundation; version 2
-# of the License.
+# of the License or (at your option) a later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -33,24 +30,38 @@ match for the object name. You then use the pointer got from this as a pointer
 in to the corresponding packfile.
 """
 
-from collections import defaultdict
-import hashlib
-from itertools import imap, izip
+try:
+    from collections import defaultdict
+except ImportError:
+    from misc import defaultdict
+
+from itertools import chain, imap, izip
 import mmap
 import os
-import sha
 import struct
+try:
+    from struct import unpack_from
+except ImportError:
+    from dulwich.misc import unpack_from
 import sys
 import zlib
-
-from objects import (
-    ShaFile,
-    hex_to_sha,
-    sha_to_hex,
-    )
-from errors import ApplyDeltaError
-
-supports_mmap_offset = (sys.version_info[0] >= 3 or
+import difflib
+
+from dulwich.errors import (
+    ApplyDeltaError,
+    ChecksumMismatch,
+    )
+from dulwich.lru_cache import (
+    LRUSizeCache,
+    )
+from dulwich.objects import (
+    ShaFile,
+    hex_to_sha,
+    sha_to_hex,
+    )
+from dulwich.misc import make_sha
+
+supports_mmap_offset = (sys.version_info[0] >= 3 or
         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
 
 
@@ -64,27 +75,29 @@ def take_msb_bytes(map, offset):
 
 def read_zlib(data, offset, dec_size):
     obj = zlib.decompressobj()
-    x = ""
+    ret = []
    fed = 0
     while obj.unused_data == "":
         base = offset+fed
         add = data[base:base+1024]
+        if len(add) < 1024:
+            add += "Z"
         fed += len(add)
-        x += obj.decompress(add)
+        ret.append(obj.decompress(add))
+    x = "".join(ret)
     assert len(x) == dec_size
     comp_len = fed-len(obj.unused_data)
     return x, comp_len
 
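read_zlib above decompresses an embedded zlib stream without knowing its compressed length: it feeds the decompressor fixed-size slices until unused_data becomes non-empty, and the leftover tells it where the stream ended. A standalone sketch of the same technique (inflate_prefix is an illustrative name, not part of the patch; the patch instead pads short reads with a "Z" byte, where this sketch just raises):

    import zlib

    def inflate_prefix(data, start, expected_size):
        # Feed 1 kB slices until the decompressor reports trailing
        # (unused) bytes; that marks the end of the compressed stream.
        obj = zlib.decompressobj()
        out = []
        fed = 0
        while obj.unused_data == "":
            chunk = data[start + fed:start + fed + 1024]
            if not chunk:
                raise AssertionError("zlib stream ended prematurely")
            fed += len(chunk)
            out.append(obj.decompress(chunk))
        text = "".join(out)
        assert len(text) == expected_size
        # Compressed length = bytes fed minus the unconsumed tail.
        return text, fed - len(obj.unused_data)
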
""" - if offset+size > MAX_MMAP_SIZE and not supports_mmap_offset: - raise AssertionError("%s is larger than 256 meg, and this version " - "of Python does not support the offset argument to mmap().") - if supports_mmap_offset: - return mmap.mmap(f.fileno(), size, access=access, offset=offset) - else: - class ArraySkipper(object): - - def __init__(self, array, offset): - self.array = array - self.offset = offset - - def __getslice__(self, i, j): - return self.array[i+self.offset:j+self.offset] + mem = mmap.mmap(f.fileno(), size+offset, access=access) + return mem, offset - def __getitem__(self, i): - return self.array[i+self.offset] - - def __len__(self): - return len(self.array) - self.offset - - def __str__(self): - return str(self.array[self.offset:]) - - mem = mmap.mmap(f.fileno(), size+offset, access=access) - if offset == 0: - return mem - return ArraySkipper(mem, offset) +def load_pack_index(filename): + f = open(filename, 'r') + if f.read(4) == '\377tOc': + version = struct.unpack(">L", f.read(4))[0] + if version == 2: + f.seek(0) + return PackIndex2(filename, file=f) + else: + raise KeyError("Unknown pack index format %d" % version) + else: + f.seek(0) + return PackIndex1(filename, file=f) -def resolve_object(offset, type, obj, get_ref, get_offset): - """Resolve an object, possibly resolving deltas when necessary.""" - if not type in (6, 7): # Not a delta - return type, obj - if type == 6: # offset delta - (delta_offset, delta) = obj - assert isinstance(delta_offset, int) - assert isinstance(delta, str) - offset = offset-delta_offset - type, base_obj = get_offset(offset) - assert isinstance(type, int) - elif type == 7: # ref delta - (basename, delta) = obj - assert isinstance(basename, str) and len(basename) == 20 - assert isinstance(delta, str) - type, base_obj = get_ref(basename) - assert isinstance(type, int) - type, base_text = resolve_object(offset, type, base_obj, get_ref, get_offset) - return type, apply_delta(base_text, delta) +def bisect_find_sha(start, end, sha, unpack_name): + assert start <= end + while start <= end: + i = (start + end)/2 + file_sha = unpack_name(i) + x = cmp(file_sha, sha) + if x < 0: + start = i + 1 + elif x > 0: + end = i - 1 + else: + return i + return None class PackIndex(object): - """An index in to a packfile. - - Given a sha id of an object a pack index can tell you the location in the - packfile of that object if it has it. + """An index in to a packfile. + + Given a sha id of an object a pack index can tell you the location in the + packfile of that object if it has it. + + To do the loop it opens the file, and indexes first 256 4 byte groups + with the first byte of the sha id. The value in the four byte group indexed + is the end of the group that shares the same starting byte. Subtract one + from the starting byte and index again to find the start of the group. + The values are sorted by sha id within the group, so do the math to find + the start and end offset and then bisect in to find if the value is present. + """ + + def __init__(self, filename, file=None): + """Create a pack index object. + + Provide it with the name of the index file to consider, and it will map + it whenever required. + """ + self._filename = filename + # Take the size now, so it can be checked each time we map the file to + # ensure that it hasn't changed. 
 
 class PackIndex(object):
-    """An index in to a packfile.
-
-    Given a sha id of an object a pack index can tell you the location in the
-    packfile of that object if it has it.
+    """An index in to a packfile.
+
+    Given a sha id of an object a pack index can tell you the location in the
+    packfile of that object if it has it.
+
+    To do the loop it opens the file, and indexes first 256 4 byte groups
+    with the first byte of the sha id. The value in the four byte group indexed
+    is the end of the group that shares the same starting byte. Subtract one
+    from the starting byte and index again to find the start of the group.
+    The values are sorted by sha id within the group, so do the math to find
+    the start and end offset and then bisect in to find if the value is present.
+    """
+
+    def __init__(self, filename, file=None):
+        """Create a pack index object.
+
+        Provide it with the name of the index file to consider, and it will map
+        it whenever required.
+        """
+        self._filename = filename
+        # Take the size now, so it can be checked each time we map the file to
+        # ensure that it hasn't changed.
+        self._size = os.path.getsize(filename)
+        if file is None:
+            self._file = open(filename, 'r')
+        else:
+            self._file = file
+        self._contents, map_offset = simple_mmap(self._file, 0, self._size)
+        assert map_offset == 0
+
+    def __eq__(self, other):
+        if not isinstance(other, PackIndex):
+            return False
+
+        if self._fan_out_table != other._fan_out_table:
+            return False
+
+        for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
+            if name1 != name2:
+                return False
+        return True
+
+    def close(self):
+        self._file.close()
+
+    def __len__(self):
+        """Return the number of entries in this pack index."""
+        return self._fan_out_table[-1]
+
+    def _unpack_entry(self, i):
+        """Unpack the i-th entry in the index file.
+
+        :return: Tuple with object name (SHA), offset in pack file and
+            CRC32 checksum (if known)."""
+        raise NotImplementedError(self._unpack_entry)
+
+    def _unpack_name(self, i):
+        """Unpack the i-th name from the index file."""
+        raise NotImplementedError(self._unpack_name)
+
+    def _unpack_offset(self, i):
+        """Unpack the i-th object offset from the index file."""
+        raise NotImplementedError(self._unpack_offset)
+
+    def _unpack_crc32_checksum(self, i):
+        """Unpack the crc32 checksum for the i-th object from the index file."""
+        raise NotImplementedError(self._unpack_crc32_checksum)
+
+    def __iter__(self):
+        return imap(sha_to_hex, self._itersha())
+
+    def _itersha(self):
+        for i in range(len(self)):
+            yield self._unpack_name(i)
+
+    def objects_sha1(self):
+        """Return the hex SHA1 over all the shas of all objects in this pack.
+
+        :note: This is used for the filename of the pack.
+        """
+        return iter_sha1(self._itersha())
+
+    def iterentries(self):
+        """Iterate over the entries in this pack index.
+
+        Will yield tuples with object name, offset in packfile and crc32 checksum.
+        """
+        for i in range(len(self)):
+            yield self._unpack_entry(i)
+
+    def _read_fan_out_table(self, start_offset):
+        ret = []
+        for i in range(0x100):
+            ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
+        return ret
+
+    def check(self):
+        """Check that the stored checksum matches the actual checksum."""
+        return self.calculate_checksum() == self.get_stored_checksum()
+
+    def calculate_checksum(self):
+        return make_sha(self._contents[:-20]).digest()
+
+    def get_pack_checksum(self):
+        """Return the SHA1 checksum stored for the corresponding packfile."""
+        return str(self._contents[-40:-20])
+
+    def get_stored_checksum(self):
+        """Return the SHA1 checksum stored for this index."""
+        return str(self._contents[-20:])
+
+    def object_index(self, sha):
+        """Return the index in to the corresponding packfile for the object.
+
+        Given the name of an object it will return the offset that object lives
+        at within the corresponding pack file. If the pack file doesn't have the
+        object then None will be returned.
+        """
+        if len(sha) == 40:
+            sha = hex_to_sha(sha)
+        return self._object_index(sha)
+
+    def _object_index(self, sha):
+        """See object_index.
+
+        :param sha: A *binary* SHA string. (20 characters long)_
+        """
+        assert len(sha) == 20
+        idx = ord(sha[0])
+        if idx == 0:
+            start = 0
+        else:
+            start = self._fan_out_table[idx-1]
+        end = self._fan_out_table[idx]
+        i = bisect_find_sha(start, end, sha, self._unpack_name)
+        if i is None:
+            raise KeyError(sha)
+        return self._unpack_offset(i)
+
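_object_index above does what the class docstring describes: the 256-entry fan-out table narrows the search to the group of names sharing the object's first byte, and bisect_find_sha then binary-searches inside that group. The same idea over plain in-memory lists (fan_out, names and offsets are stand-ins for the mmap'd tables; not dulwich API):

    def find_sha_offset(fan_out, names, offsets, sha):
        # fan_out[b] holds the count of names whose first byte is <= b,
        # so the group for sha[0] occupies indexes [start, end) of the
        # sorted name list.
        idx = ord(sha[0])
        if idx == 0:
            start = 0
        else:
            start = fan_out[idx - 1]
        end = fan_out[idx]
        lo, hi = start, end - 1
        while lo <= hi:
            mid = (lo + hi) // 2
            if names[mid] < sha:
                lo = mid + 1
            elif names[mid] > sha:
                hi = mid - 1
            else:
                return offsets[mid]
        raise KeyError(sha)
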
-    To do the loop it opens the file, and indexes first 256 4 byte groups
-    with the first byte of the sha id. The value in the four byte group indexed
-    is the end of the group that shares the same starting byte. Subtract one
-    from the starting byte and index again to find the start of the group.
-    The values are sorted by sha id within the group, so do the math to find
-    the start and end offset and then bisect in to find if the value is present.
-    """
-    def __init__(self, filename):
-        """Create a pack index object.
+class PackIndex1(PackIndex):
+    """Version 1 Pack Index."""
 
-        Provide it with the name of the index file to consider, and it will map
-        it whenever required.
-        """
-        self._filename = filename
-        # Take the size now, so it can be checked each time we map the file to
-        # ensure that it hasn't changed.
-        self._size = os.path.getsize(filename)
-        self._file = open(filename, 'r')
-        self._contents = simple_mmap(self._file, 0, self._size)
-        if self._contents[:4] != '\377tOc':
+    def __init__(self, filename, file=None):
+        PackIndex.__init__(self, filename, file)
         self.version = 1
         self._fan_out_table = self._read_fan_out_table(0)
-        else:
-            (self.version, ) = struct.unpack_from(">L", self._contents, 4)
-            assert self.version in (2,), "Version was %d" % self.version
+
+    def _unpack_entry(self, i):
+        (offset, name) = unpack_from(">L20s", self._contents,
+            (0x100 * 4) + (i * 24))
+        return (name, offset, None)
+
+    def _unpack_name(self, i):
+        offset = (0x100 * 4) + (i * 24) + 4
+        return self._contents[offset:offset+20]
+
+    def _unpack_offset(self, i):
+        offset = (0x100 * 4) + (i * 24)
+        return unpack_from(">L", self._contents, offset)[0]
+
+    def _unpack_crc32_checksum(self, i):
+        # Not stored in v1 index files
+        return None
+
+
+class PackIndex2(PackIndex):
+    """Version 2 Pack Index."""
+
+    def __init__(self, filename, file=None):
+        PackIndex.__init__(self, filename, file)
+        assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
+        (self.version, ) = unpack_from(">L", self._contents, 4)
+        assert self.version == 2, "Version was %d" % self.version
         self._fan_out_table = self._read_fan_out_table(8)
         self._name_table_offset = 8 + 0x100 * 4
         self._crc32_table_offset = self._name_table_offset + 20 * len(self)
         self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
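The offsets computed in the two __init__ methods above encode the on-disk layouts: a v1 index packs each entry into one 24-byte (offset, name) record directly after the 0x100*4-byte fan-out table, while v2 puts an 8-byte header and fan-out first, then separate name, CRC32 and offset tables. A sketch of the record arithmetic (helper names are invented, nothing here is part of the patch):

    def v1_entry_pos(i):
        # fan-out table (0x100 * 4 bytes), then 24-byte ">L20s" records
        return (0x100 * 4) + i * 24

    def v2_entry_pos(num_objects, i):
        names = 8 + 0x100 * 4               # magic + version, then fan-out
        crc32s = names + 20 * num_objects   # one 20-byte name per object
        offsets = crc32s + 4 * num_objects  # one ">L" CRC32 per object
        return (names + 20 * i, crc32s + 4 * i, offsets + 4 * i)
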
 
-    def __eq__(self, other):
-        if type(self) != type(other):
-            return False
-
-        if self._fan_out_table != other._fan_out_table:
-            return False
-
-        for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
-            if name1 != name2:
-                return False
-        return True
-
-    def close(self):
-        self._file.close()
-
-    def __len__(self):
-        """Return the number of entries in this pack index."""
-        return self._fan_out_table[-1]
-
-    def _unpack_entry(self, i):
-        """Unpack the i-th entry in the index file.
-
-        :return: Tuple with object name (SHA), offset in pack file and
-            CRC32 checksum (if known)."""
-        if self.version == 1:
-            (offset, name) = struct.unpack_from(">L20s", self._contents,
-                (0x100 * 4) + (i * 24))
-            return (name, offset, None)
-        else:
+    def _unpack_entry(self, i):
         return (self._unpack_name(i), self._unpack_offset(i),
                 self._unpack_crc32_checksum(i))
-
-    def _unpack_name(self, i):
-        if self.version == 1:
-            return self._unpack_entry(i)[0]
-        else:
-            return struct.unpack_from("20s", self._contents,
-                self._name_table_offset + i * 20)[0]
-
-    def _unpack_offset(self, i):
-        if self.version == 1:
-            return self._unpack_entry(i)[1]
-        else:
-            return struct.unpack_from(">L", self._contents,
-                self._pack_offset_table_offset + i * 4)[0]
-
-    def _unpack_crc32_checksum(self, i):
-        if self.version == 1:
-            return None
-        else:
-            return struct.unpack_from(">L", self._contents,
-                self._crc32_table_offset + i * 4)[0]
-
-    def __iter__(self):
-        return imap(sha_to_hex, self._itersha())
-
-    def _itersha(self):
-        for i in range(len(self)):
-            yield self._unpack_name(i)
-
-    def objects_sha1(self):
-        return iter_sha1(self._itersha())
-
-    def iterentries(self):
-        """Iterate over the entries in this pack index.
-
-        Will yield tuples with object name, offset in packfile and crc32 checksum.
-        """
-        for i in range(len(self)):
-            yield self._unpack_entry(i)
-
-    def _read_fan_out_table(self, start_offset):
-        ret = []
-        for i in range(0x100):
-            ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
-        return ret
-
-    def check(self):
-        """Check that the stored checksum matches the actual checksum."""
-        return self.calculate_checksum() == self.get_stored_checksums()[1]
-
-    def calculate_checksum(self):
-        f = open(self._filename, 'r')
-        try:
-            return hashlib.sha1(self._contents[:-20]).digest()
-        finally:
-            f.close()
-
-    def get_stored_checksums(self):
-        """Return the SHA1 checksums stored for the corresponding packfile and
-        this header file itself."""
-        return str(self._contents[-40:-20]), str(self._contents[-20:])
-
-    def object_index(self, sha):
-        """Return the index in to the corresponding packfile for the object.
-
-        Given the name of an object it will return the offset that object lives
-        at within the corresponding pack file. If the pack file doesn't have the
-        object then None will be returned.
- """ - size = os.path.getsize(self._filename) - assert size == self._size, "Pack index %s has changed size, I don't " \ - "like that" % self._filename - if len(sha) == 40: - sha = hex_to_sha(sha) - return self._object_index(sha) - - def _object_index(self, sha): - """See object_index""" - idx = ord(sha[0]) - if idx == 0: - start = 0 - else: - start = self._fan_out_table[idx-1] - end = self._fan_out_table[idx] - assert start <= end - while start <= end: - i = (start + end)/2 - file_sha = self._unpack_name(i) - if file_sha < sha: - start = i + 1 - elif file_sha > sha: - end = i - 1 - else: - return self._unpack_offset(i) - return None + + def _unpack_name(self, i): + offset = self._name_table_offset + i * 20 + return self._contents[offset:offset+20] + + def _unpack_offset(self, i): + offset = self._pack_offset_table_offset + i * 4 + return unpack_from(">L", self._contents, offset)[0] + + def _unpack_crc32_checksum(self, i): + return unpack_from(">L", self._contents, + self._crc32_table_offset + i * 4)[0] + def read_pack_header(f): header = f.read(12) assert header[:4] == "PACK" - (version,) = struct.unpack_from(">L", header, 4) + (version,) = unpack_from(">L", header, 4) assert version in (2, 3), "Version was %d" % version - (num_objects,) = struct.unpack_from(">L", header, 8) + (num_objects,) = unpack_from(">L", header, 8) return (version, num_objects) @@ -324,15 +352,15 @@ def read_pack_tail(f): return (f.read(20),) -def unpack_object(map): - bytes = take_msb_bytes(map, 0) +def unpack_object(map, offset=0): + bytes = take_msb_bytes(map, offset) type = (bytes[0] >> 4) & 0x07 size = bytes[0] & 0x0f for i, byte in enumerate(bytes[1:]): - size += (byte & 0x7f) << ((i * 7) + 4) + size += (byte & 0x7f) << ((i * 7) + 4) raw_base = len(bytes) if type == 6: # offset delta - bytes = take_msb_bytes(map, raw_base) + bytes = take_msb_bytes(map, raw_base + offset) assert not (bytes[-1] & 0x80) delta_base_offset = bytes[0] & 0x7f for byte in bytes[1:]: @@ -340,177 +368,220 @@ def unpack_object(map): delta_base_offset <<= 7 delta_base_offset += (byte & 0x7f) raw_base+=len(bytes) - uncomp, comp_len = read_zlib(map, raw_base, size) + uncomp, comp_len = read_zlib(map, offset + raw_base, size) assert size == len(uncomp) return type, (delta_base_offset, uncomp), comp_len+raw_base elif type == 7: # ref delta - basename = map[raw_base:raw_base+20] - uncomp, comp_len = read_zlib(map, raw_base+20, size) + basename = map[offset+raw_base:offset+raw_base+20] + uncomp, comp_len = read_zlib(map, offset+raw_base+20, size) assert size == len(uncomp) return type, (basename, uncomp), comp_len+raw_base+20 else: - uncomp, comp_len = read_zlib(map, raw_base, size) + uncomp, comp_len = read_zlib(map, offset+raw_base, size) assert len(uncomp) == size return type, uncomp, comp_len+raw_base -class PackData(object): - """The data contained in a packfile. - - Pack files can be accessed both sequentially for exploding a pack, and - directly with the help of an index to retrieve a specific object. - - The objects within are either complete or a delta aginst another. - - The header is variable length. If the MSB of each byte is set then it - indicates that the subsequent byte is still part of the header. - For the first byte the next MS bits are the type, which tells you the type - of object, and whether it is a delta. The LS byte is the lowest bits of the - size. For each subsequent byte the LS 7 bits are the next MS bits of the - size, i.e. the last byte of the header contains the MS bits of the size. 
-class PackData(object):
-    """The data contained in a packfile.
-
-    Pack files can be accessed both sequentially for exploding a pack, and
-    directly with the help of an index to retrieve a specific object.
-
-    The objects within are either complete or a delta aginst another.
-
-    The header is variable length. If the MSB of each byte is set then it
-    indicates that the subsequent byte is still part of the header.
-    For the first byte the next MS bits are the type, which tells you the type
-    of object, and whether it is a delta. The LS byte is the lowest bits of the
-    size. For each subsequent byte the LS 7 bits are the next MS bits of the
-    size, i.e. the last byte of the header contains the MS bits of the size.
+def compute_object_size((num, obj)):
+    if num in (6, 7):
+        return len(obj[1])
+    assert isinstance(obj, str)
+    return len(obj)
 
-    For the complete objects the data is stored as zlib deflated data.
-    The size in the header is the uncompressed object size, so to uncompress
-    you need to just keep feeding data to zlib until you get an object back,
-    or it errors on bad data. This is done here by just giving the complete
-    buffer from the start of the deflated object on. This is bad, but until I
-    get mmap sorted out it will have to do.
-
-    Currently there are no integrity checks done. Also no attempt is made to try
-    and detect the delta case, or a request for an object at the wrong position.
-    It will all just throw a zlib or KeyError.
-    """
-
-    def __init__(self, filename):
-        """Create a PackData object that represents the pack in the given filename.
-
-        The file must exist and stay readable until the object is disposed of. It
-        must also stay the same size. It will be mapped whenever needed.
-
-        Currently there is a restriction on the size of the pack as the python
-        mmap implementation is flawed.
+class PackData(object):
+    """The data contained in a packfile.
+
+    Pack files can be accessed both sequentially for exploding a pack, and
+    directly with the help of an index to retrieve a specific object.
+
+    The objects within are either complete or a delta aginst another.
+
+    The header is variable length. If the MSB of each byte is set then it
+    indicates that the subsequent byte is still part of the header.
+    For the first byte the next MS bits are the type, which tells you the type
+    of object, and whether it is a delta. The LS byte is the lowest bits of the
+    size. For each subsequent byte the LS 7 bits are the next MS bits of the
+    size, i.e. the last byte of the header contains the MS bits of the size.
+
+    For the complete objects the data is stored as zlib deflated data.
+    The size in the header is the uncompressed object size, so to uncompress
+    you need to just keep feeding data to zlib until you get an object back,
+    or it errors on bad data. This is done here by just giving the complete
+    buffer from the start of the deflated object on. This is bad, but until I
+    get mmap sorted out it will have to do.
+
+    Currently there are no integrity checks done. Also no attempt is made to try
+    and detect the delta case, or a request for an object at the wrong position.
+    It will all just throw a zlib or KeyError.
""" - self._filename = filename - assert os.path.exists(filename), "%s is not a packfile" % filename - self._size = os.path.getsize(filename) - self._header_size = 12 - assert self._size >= self._header_size, "%s is too small for a packfile" % filename - self._read_header() - - def _read_header(self): - f = open(self._filename, 'rb') - try: - (version, self._num_objects) = \ - read_pack_header(f) - f.seek(self._size-20) - (self._stored_checksum,) = read_pack_tail(f) - finally: - f.close() - - def __len__(self): - """Returns the number of objects in this pack.""" - return self._num_objects - - def calculate_checksum(self): - f = open(self._filename, 'rb') - try: - map = simple_mmap(f, 0, self._size) - return hashlib.sha1(map[:-20]).digest() - finally: - f.close() - - def iterobjects(self): - offset = self._header_size - f = open(self._filename, 'rb') - for i in range(len(self)): - map = simple_mmap(f, offset, self._size-offset) - (type, obj, total_size) = unpack_object(map) - yield offset, type, obj - offset += total_size - f.close() + + def __init__(self, filename): + """Create a PackData object that represents the pack in the given filename. + + The file must exist and stay readable until the object is disposed of. It + must also stay the same size. It will be mapped whenever needed. + + Currently there is a restriction on the size of the pack as the python + mmap implementation is flawed. + """ + self._filename = filename + assert os.path.exists(filename), "%s is not a packfile" % filename + self._size = os.path.getsize(filename) + self._header_size = 12 + assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (filename, self._size, self._header_size) + self._file = open(self._filename, 'rb') + self._read_header() + self._offset_cache = LRUSizeCache(1024*1024*20, + compute_size=compute_object_size) - def iterentries(self, ext_resolve_ref=None): - found = {} - at = {} - postponed = defaultdict(list) - class Postpone(Exception): - """Raised to postpone delta resolving.""" + def close(self): + self._file.close() + + def _read_header(self): + (version, self._num_objects) = read_pack_header(self._file) + self._file.seek(self._size-20) + (self._stored_checksum,) = read_pack_tail(self._file) + + def __len__(self): + """Returns the number of objects in this pack.""" + return self._num_objects + + def calculate_checksum(self): + """Calculate the checksum for this pack.""" + map, map_offset = simple_mmap(self._file, 0, self._size - 20) + try: + return make_sha(map[map_offset:self._size-20]).digest() + finally: + map.close() + + def resolve_object(self, offset, type, obj, get_ref, get_offset=None): + """Resolve an object, possibly resolving deltas when necessary. - def get_ref_text(sha): - if sha in found: - return found[sha] - if ext_resolve_ref: + :return: Tuple with object type and contents. 
+ """ + if type not in (6, 7): # Not a delta + return type, obj + + if get_offset is None: + get_offset = self.get_object_at + + if type == 6: # offset delta + (delta_offset, delta) = obj + assert isinstance(delta_offset, int) + assert isinstance(delta, str) + base_offset = offset-delta_offset + type, base_obj = get_offset(base_offset) + assert isinstance(type, int) + elif type == 7: # ref delta + (basename, delta) = obj + assert isinstance(basename, str) and len(basename) == 20 + assert isinstance(delta, str) + type, base_obj = get_ref(basename) + assert isinstance(type, int) + # Can't be a ofs delta, as we wouldn't know the base offset + assert type != 6 + base_offset = None + type, base_text = self.resolve_object(base_offset, type, base_obj, get_ref) + if base_offset is not None: + self._offset_cache[base_offset] = type, base_text + ret = (type, apply_delta(base_text, delta)) + return ret + + def iterobjects(self, progress=None): + offset = self._header_size + num = len(self) + map, _ = simple_mmap(self._file, 0, self._size) + try: + for i in range(num): + (type, obj, total_size) = unpack_object(map, offset) + crc32 = zlib.crc32(map[offset:offset+total_size]) & 0xffffffff + yield offset, type, obj, crc32 + offset += total_size + if progress: + progress(i, num) + finally: + map.close() + + def iterentries(self, ext_resolve_ref=None, progress=None): + found = {} + postponed = defaultdict(list) + class Postpone(Exception): + """Raised to postpone delta resolving.""" + + def get_ref_text(sha): + assert len(sha) == 20 + if sha in found: + return found[sha] + if ext_resolve_ref: + try: + return ext_resolve_ref(sha) + except KeyError: + pass + raise Postpone, (sha, ) + extra = [] + todo = chain(self.iterobjects(progress=progress), extra) + for (offset, type, obj, crc32) in todo: + assert isinstance(offset, int) + assert isinstance(type, int) + assert isinstance(obj, tuple) or isinstance(obj, str) try: - return ext_resolve_ref(sha) - except KeyError: - pass - raise Postpone, (sha, ) - todo = list(self.iterobjects()) - while todo: - (offset, type, obj) = todo.pop(0) - at[offset] = (type, obj) - assert isinstance(offset, int) - assert isinstance(type, int) - assert isinstance(obj, tuple) or isinstance(obj, str) - try: - type, obj = resolve_object(offset, type, obj, get_ref_text, - at.__getitem__) - except Postpone, (sha, ): - postponed[sha].append((offset, type, obj)) - else: - shafile = ShaFile.from_raw_string(type, obj) - sha = shafile.sha().digest() - found[sha] = (type, obj) - yield sha, offset, shafile.crc32() - todo += postponed.get(sha, []) - if postponed: - raise KeyError([sha_to_hex(h) for h in postponed.keys()]) - - def sorted_entries(self, resolve_ext_ref=None): - ret = list(self.iterentries(resolve_ext_ref)) - ret.sort() - return ret - - def create_index_v1(self, filename): - entries = self.sorted_entries() - write_pack_index_v1(filename, entries, self.calculate_checksum()) - - def create_index_v2(self, filename): - entries = self.sorted_entries() - write_pack_index_v2(filename, entries, self.calculate_checksum()) - - def get_stored_checksum(self): - return self._stored_checksum - - def check(self): - return (self.calculate_checksum() == self.get_stored_checksum()) - - def get_object_at(self, offset): - """Given an offset in to the packfile return the object that is there. - - Using the associated index the location of an object can be looked up, and - then the packfile can be asked directly for that object using this - function. 
- """ - assert isinstance(offset, long) or isinstance(offset, int),\ - "offset was %r" % offset - assert offset >= self._header_size - size = os.path.getsize(self._filename) - assert size == self._size, "Pack data %s has changed size, I don't " \ - "like that" % self._filename - f = open(self._filename, 'rb') - try: - map = simple_mmap(f, offset, size-offset) - return unpack_object(map)[:2] - finally: - f.close() + type, obj = self.resolve_object(offset, type, obj, get_ref_text) + except Postpone, (sha, ): + postponed[sha].append((offset, type, obj)) + else: + shafile = ShaFile.from_raw_string(type, obj) + sha = shafile.sha().digest() + found[sha] = (type, obj) + yield sha, offset, crc32 + extra.extend(postponed.get(sha, [])) + if postponed: + raise KeyError([sha_to_hex(h) for h in postponed.keys()]) + + def sorted_entries(self, resolve_ext_ref=None, progress=None): + ret = list(self.iterentries(resolve_ext_ref, progress=progress)) + ret.sort() + return ret + + def create_index_v1(self, filename, resolve_ext_ref=None, progress=None): + entries = self.sorted_entries(resolve_ext_ref, progress=progress) + write_pack_index_v1(filename, entries, self.calculate_checksum()) + + def create_index_v2(self, filename, resolve_ext_ref=None, progress=None): + entries = self.sorted_entries(resolve_ext_ref, progress=progress) + write_pack_index_v2(filename, entries, self.calculate_checksum()) + + def get_stored_checksum(self): + return self._stored_checksum + + def check(self): + return (self.calculate_checksum() == self.get_stored_checksum()) + + def get_object_at(self, offset): + """Given an offset in to the packfile return the object that is there. + + Using the associated index the location of an object can be looked up, and + then the packfile can be asked directly for that object using this + function. 
+ """ + if offset in self._offset_cache: + return self._offset_cache[offset] + assert isinstance(offset, long) or isinstance(offset, int),\ + "offset was %r" % offset + assert offset >= self._header_size + map, map_offset = simple_mmap(self._file, offset, self._size-offset) + try: + ret = unpack_object(map, map_offset)[:2] + return ret + finally: + map.close() class SHA1Writer(object): def __init__(self, f): self.f = f - self.sha1 = hashlib.sha1("") + self.sha1 = make_sha("") def write(self, data): self.sha1.update(data) @@ -536,8 +607,10 @@ def write_pack_object(f, type, object): :param f: File to write to :param o: Object to write + :return: Tuple with offset at which the object was written, and crc32 """ ret = f.tell() + packed_data_hdr = "" if type == 6: # ref delta (delta_base_offset, object) = object elif type == 7: # offset delta @@ -546,10 +619,10 @@ def write_pack_object(f, type, object): c = (type << 4) | (size & 15) size >>= 4 while size: - f.write(chr(c | 0x80)) + packed_data_hdr += (chr(c | 0x80)) c = size & 0x7f size >>= 7 - f.write(chr(c)) + packed_data_hdr += chr(c) if type == 6: # offset delta ret = [delta_base_offset & 0x7f] delta_base_offset >>= 7 @@ -557,42 +630,67 @@ def write_pack_object(f, type, object): delta_base_offset -= 1 ret.insert(0, 0x80 | (delta_base_offset & 0x7f)) delta_base_offset >>= 7 - f.write("".join([chr(x) for x in ret])) + packed_data_hdr += "".join([chr(x) for x in ret]) elif type == 7: # ref delta assert len(basename) == 20 - f.write(basename) - f.write(zlib.compress(object)) - return f.tell() + packed_data_hdr += basename + packed_data = packed_data_hdr + zlib.compress(object) + f.write(packed_data) + return (f.tell(), (zlib.crc32(packed_data) & 0xffffffff)) def write_pack(filename, objects, num_objects): f = open(filename + ".pack", 'w') try: entries, data_sum = write_pack_data(f, objects, num_objects) - except: + finally: f.close() entries.sort() write_pack_index_v2(filename + ".idx", entries, data_sum) -def write_pack_data(f, objects, num_objects): +def write_pack_data(f, objects, num_objects, window=10): """Write a new pack file. :param filename: The filename of the new pack file. - :param objects: List of objects to write. + :param objects: List of objects to write (tuples with object and path) :return: List with (name, offset, crc32 checksum) entries, pack checksum """ + recency = list(objects) + # FIXME: Somehow limit delta depth + # FIXME: Make thin-pack optional (its not used when cloning a pack) + # Build a list of objects ordered by the magic Linus heuristic + # This helps us find good objects to diff against us + magic = [] + for obj, path in recency: + magic.append( (obj.type, path, 1, -len(obj.as_raw_string()[1]), obj) ) + magic.sort() + # Build a map of objects and their index in magic - so we can find preceeding objects + # to diff against + offs = {} + for i in range(len(magic)): + offs[magic[i][4]] = i + # Write the pack entries = [] f = SHA1Writer(f) f.write("PACK") # Pack header f.write(struct.pack(">L", 2)) # Pack version f.write(struct.pack(">L", num_objects)) # Number of objects in pack - for o in objects: + for o, path in recency: sha1 = o.sha().digest() - crc32 = o.crc32() - # FIXME: Delta ! 
-def write_pack_data(f, objects, num_objects):
+def write_pack_data(f, objects, num_objects, window=10):
     """Write a new pack file.
 
     :param filename: The filename of the new pack file.
-    :param objects: List of objects to write.
+    :param objects: List of objects to write (tuples with object and path)
     :return: List with (name, offset, crc32 checksum) entries, pack checksum
     """
+    recency = list(objects)
+    # FIXME: Somehow limit delta depth
+    # FIXME: Make thin-pack optional (its not used when cloning a pack)
+    # Build a list of objects ordered by the magic Linus heuristic
+    # This helps us find good objects to diff against us
+    magic = []
+    for obj, path in recency:
+        magic.append( (obj.type, path, 1, -len(obj.as_raw_string()[1]), obj) )
+    magic.sort()
+    # Build a map of objects and their index in magic - so we can find preceeding objects
+    # to diff against
+    offs = {}
+    for i in range(len(magic)):
+        offs[magic[i][4]] = i
+    # Write the pack
     entries = []
     f = SHA1Writer(f)
     f.write("PACK")               # Pack header
     f.write(struct.pack(">L", 2)) # Pack version
     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
-    for o in objects:
+    for o, path in recency:
         sha1 = o.sha().digest()
-        crc32 = o.crc32()
-        # FIXME: Delta !
-        t, o = o.as_raw_string()
-        offset = write_pack_object(f, t, o)
+        orig_t, raw = o.as_raw_string()
+        winner = raw
+        t = orig_t
+        #for i in range(offs[o]-window, window):
+        #    if i < 0 or i >= len(offs): continue
+        #    b = magic[i][4]
+        #    if b.type != orig_t: continue
+        #    _, base = b.as_raw_string()
+        #    delta = create_delta(base, raw)
+        #    if len(delta) < len(winner):
+        #        winner = delta
+        #        t = 6 if magic[i][2] == 1 else 7
+        offset, crc32 = write_pack_object(f, t, winner)
         entries.append((sha1, offset, crc32))
     return entries, f.write_sha()
 
@@ -621,40 +719,102 @@ def write_pack_index_v1(filename, entries, pack_checksum):
     f.close()
 
 
+def create_delta(base_buf, target_buf):
+    """Use python difflib to work out how to transform base_buf to target_buf"""
+    assert isinstance(base_buf, str)
+    assert isinstance(target_buf, str)
+    out_buf = ""
+    # write delta header
+    def encode_size(size):
+        ret = ""
+        c = size & 0x7f
+        size >>= 7
+        while size:
+            ret += chr(c | 0x80)
+            c = size & 0x7f
+            size >>= 7
+        ret += chr(c)
+        return ret
+    out_buf += encode_size(len(base_buf))
+    out_buf += encode_size(len(target_buf))
+    # write out delta opcodes
+    seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
+    for opcode, i1, i2, j1, j2 in seq.get_opcodes():
+        # Git patch opcodes don't care about deletes!
+        #if opcode == "replace" or opcode == "delete":
+        #    pass
+        if opcode == "equal":
+            # If they are equal, unpacker will use data from base_buf
+            # Write out an opcode that says what range to use
+            scratch = ""
+            op = 0x80
+            o = i1
+            for i in range(4):
+                if o & 0xff << i*8:
+                    scratch += chr(o >> i)
+                    op |= 1 << i
+            s = i2 - i1
+            for i in range(2):
+                if s & 0xff << i*8:
+                    scratch += chr(s >> i)
+                    op |= 1 << (4+i)
+            out_buf += chr(op)
+            out_buf += scratch
+        if opcode == "replace" or opcode == "insert":
+            # If we are replacing a range or adding one, then we just
+            # output it to the stream (prefixed by its size)
+            s = j2 - j1
+            o = j1
+            while s > 127:
+                out_buf += chr(127)
+                out_buf += target_buf[o:o+127]
+                s -= 127
+                o += 127
+            out_buf += chr(s)
+            out_buf += target_buf[o:o+s]
+    return out_buf
 
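The "equal" branch of create_delta above emits git's copy opcode: bit 7 set, with bits 0-3 flagging which bytes of the base offset follow and bits 4-5 flagging which bytes of the copy length follow, low byte first in each case. A self-contained encoder for that opcode layout, compatible with the apply_delta decoder below (encode_copy is an invented name; the example values are made up):

    def encode_copy(offset, size):
        op = 0x80
        tail = ""
        for i in range(4):                    # offset bytes, low first
            byte = (offset >> (i * 8)) & 0xff
            if byte:
                tail += chr(byte)
                op |= 1 << i
        for i in range(2):                    # size bytes, low first
            byte = (size >> (i * 8)) & 0xff
            if byte:
                tail += chr(byte)
                op |= 1 << (4 + i)
        return chr(op) + tail

    # copy 0x123 bytes starting at offset 0x4000 of the base buffer
    assert encode_copy(0x4000, 0x123) == "\xb2\x40\x23\x01"
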
 
 def apply_delta(src_buf, delta):
-    """Based on the similar function in git's patch-delta.c."""
+    """Based on the similar function in git's patch-delta.c.
+
+    :param src_buf: Source buffer
+    :param delta: Delta instructions
+    """
     assert isinstance(src_buf, str), "was %r" % (src_buf,)
     assert isinstance(delta, str)
-    out = ""
-    def pop(delta):
-        ret = delta[0]
-        delta = delta[1:]
-        return ord(ret), delta
-    def get_delta_header_size(delta):
+    out = []
+    index = 0
+    delta_length = len(delta)
+    def get_delta_header_size(delta, index):
         size = 0
         i = 0
         while delta:
-            cmd, delta = pop(delta)
+            cmd = ord(delta[index])
+            index += 1
             size |= (cmd & ~0x80) << i
             i += 7
             if not cmd & 0x80:
                 break
-        return size, delta
-    src_size, delta = get_delta_header_size(delta)
-    dest_size, delta = get_delta_header_size(delta)
-    assert src_size == len(src_buf)
-    while delta:
-        cmd, delta = pop(delta)
+        return size, index
+    src_size, index = get_delta_header_size(delta, index)
+    dest_size, index = get_delta_header_size(delta, index)
+    assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
+    while index < delta_length:
+        cmd = ord(delta[index])
+        index += 1
         if cmd & 0x80:
             cp_off = 0
             for i in range(4):
                 if cmd & (1 << i):
-                    x, delta = pop(delta)
+                    x = ord(delta[index])
+                    index += 1
                     cp_off |= x << (i * 8)
             cp_size = 0
             for i in range(3):
                 if cmd & (1 << (4+i)):
-                    x, delta = pop(delta)
+                    x = ord(delta[index])
+                    index += 1
                     cp_size |= x << (i * 8)
             if cp_size == 0:
                 cp_size = 0x10000
@@ -662,16 +822,17 @@
                 cp_off + cp_size > src_size or
                 cp_size > dest_size):
                 break
-            out += src_buf[cp_off:cp_off+cp_size]
+            out.append(src_buf[cp_off:cp_off+cp_size])
         elif cmd != 0:
-            out += delta[:cmd]
-            delta = delta[cmd:]
+            out.append(delta[index:index+cmd])
+            index += cmd
         else:
             raise ApplyDeltaError("Invalid opcode 0")
 
-    if delta != "":
-        raise ApplyDeltaError("delta not empty: %r" % delta)
+    if index != delta_length:
+        raise ApplyDeltaError("delta not empty: %r" % delta[index:])
 
+    out = ''.join(out)
     if dest_size != len(out):
         raise ApplyDeltaError("dest size incorrect")
 
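create_delta and apply_delta are meant to round-trip: a delta computed from base to target, applied back to base, should reproduce target exactly. A usage sketch under that assumption, with small toy buffers (not a test from the patch):

    from dulwich.pack import apply_delta, create_delta

    base = "blob: the quick brown fox jumped over the lazy dog\n"
    target = "blob: the quick brown fox jumps over the lazy dog\n"

    delta = create_delta(base, target)
    # The delta holds copy opcodes for the shared runs and literal
    # inserts for the changed span; applying it rebuilds the target.
    assert apply_delta(base, delta) == target
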
@@ -700,7 +861,7 @@ def write_pack_index_v2(filename, entries, pack_checksum):
     for (name, offset, entry_checksum) in entries:
         f.write(name)
     for (name, offset, entry_checksum) in entries:
-        f.write(struct.pack(">l", entry_checksum))
+        f.write(struct.pack(">L", entry_checksum))
     for (name, offset, entry_checksum) in entries:
         # FIXME: handle if MSBit is set in offset
         f.write(struct.pack(">L", offset))
@@ -719,7 +880,15 @@ class Pack(object):
         self._data = None
         self._idx = None
 
+    @classmethod
+    def from_objects(self, data, idx):
+        ret = Pack("")
+        ret._data = data
+        ret._idx = idx
+        return ret
+
     def name(self):
+        """The SHA over the SHAs of the objects in this pack."""
         return self.idx.objects_sha1()
 
     @property
@@ -727,13 +896,17 @@
         if self._data is None:
             self._data = PackData(self._data_path)
             assert len(self.idx) == len(self._data)
-            assert self.idx.get_stored_checksums()[0] == self._data.get_stored_checksum()
+            idx_stored_checksum = self.idx.get_pack_checksum()
+            data_stored_checksum = self._data.get_stored_checksum()
+            if idx_stored_checksum != data_stored_checksum:
+                raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
+                                       sha_to_hex(data_stored_checksum))
         return self._data
 
     @property
     def idx(self):
         if self._idx is None:
-            self._idx = PackIndex(self._idx_path)
+            self._idx = load_pack_index(self._idx_path)
         return self._idx
 
     def close(self):
@@ -749,45 +922,51 @@
         return len(self.idx)
 
     def __repr__(self):
-        return "Pack(%r)" % self._basename
+        return "%s(%r)" % (self.__class__.__name__, self._basename)
 
     def __iter__(self):
         """Iterate over all the sha1s of the objects in this pack."""
         return iter(self.idx)
 
     def check(self):
-        return self.idx.check() and self.data.check()
+        if not self.idx.check():
+            return False
+        if not self.data.check():
+            return False
+        return True
 
     def get_stored_checksum(self):
         return self.data.get_stored_checksum()
 
     def __contains__(self, sha1):
         """Check whether this pack contains a particular SHA1."""
-        return (self.idx.object_index(sha1) is not None)
+        try:
+            self.idx.object_index(sha1)
+            return True
+        except KeyError:
+            return False
 
     def get_raw(self, sha1, resolve_ref=None):
+        offset = self.idx.object_index(sha1)
+        obj_type, obj = self.data.get_object_at(offset)
+        if type(offset) is long:
+            offset = int(offset)
         if resolve_ref is None:
             resolve_ref = self.get_raw
-        offset = self.idx.object_index(sha1)
-        if offset is None:
-            raise KeyError(sha1)
-
-        type, obj = self.data.get_object_at(offset)
-        assert isinstance(offset, int)
-        return resolve_object(offset, type, obj, resolve_ref,
-            self.data.get_object_at)
+        return self.data.resolve_object(offset, obj_type, obj, resolve_ref)
 
     def __getitem__(self, sha1):
         """Retrieve the specified SHA1."""
         type, uncomp = self.get_raw(sha1)
         return ShaFile.from_raw_string(type, uncomp)
 
-    def iterobjects(self):
-        for offset, type, obj in self.data.iterobjects():
+    def iterobjects(self, get_raw=None):
+        if get_raw is None:
+            get_raw = self.get_raw
+        for offset, type, obj, crc32 in self.data.iterobjects():
             assert isinstance(offset, int)
             yield ShaFile.from_raw_string(
-                *resolve_object(offset, type, obj, self.get_raw,
-                    self.data.get_object_at))
+                *self.data.resolve_object(offset, type, obj, get_raw))
 
 
 def load_packs(path):
@@ -797,3 +976,8 @@
     if name.startswith("pack-") and name.endswith(".pack"):
         yield Pack(os.path.join(path, name[:-len(".pack")]))
 
+
+try:
+    from dulwich._pack import apply_delta, bisect_find_sha
+except ImportError:
+    pass
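A usage sketch tying the pieces together, using only calls visible in this patch (the repository path is an assumption; any on-disk pack/index pair under it will do):

    from dulwich.pack import load_packs

    for pack in load_packs(".git/objects/pack"):
        print pack.name(), len(pack)   # SHA over member SHAs, object count
        for sha in pack:               # hex SHA1s straight from the index
            obj = pack[sha]            # delta-resolved ShaFile via get_raw()
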