Add SEEK_END to misc.py, since it was added in 2.5.
[jelmer/dulwich-libgit2.git] / dulwich / pack.py
index 390eee3c17c5ae40a016df3d46ebfec86aaff695..43bc24de0c0560f9984ef9a97eb74b5f08e0456d 100644 (file)
@@ -65,19 +65,27 @@ from dulwich.file import GitFile
 from dulwich.lru_cache import (
     LRUSizeCache,
     )
+from dulwich.misc import (
+    make_sha,
+    SEEK_END,
+    )
 from dulwich.objects import (
     ShaFile,
     hex_to_sha,
     sha_to_hex,
-    )
-from dulwich.misc import (
-    make_sha,
+    object_header,
     )
 
 supports_mmap_offset = (sys.version_info[0] >= 3 or
         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
 
 
+OFS_DELTA = 6
+REF_DELTA = 7
+
+DELTA_TYPES = (OFS_DELTA, REF_DELTA)
+
+
 def take_msb_bytes(read):
     """Read bytes marked with most significant bit.
 
@@ -122,6 +130,7 @@ def read_zlib_chunks(read_some, dec_size, buffer_size=4096):
     comp_len = fed - len(obj.unused_data)
     return ret, comp_len, obj.unused_data
 
+
 def iter_sha1(iter):
     """Return the hexdigest of the SHA1 over a set of names.
 
@@ -267,7 +276,8 @@ class PackIndex(object):
         """Unpack the i-th entry in the index file.
 
         :return: Tuple with object name (SHA), offset in pack file and CRC32
-            checksum (if known)."""
+            checksum (if known).
+        """
         raise NotImplementedError(self._unpack_entry)
 
     def _unpack_name(self, i):
@@ -300,8 +310,7 @@ class PackIndex(object):
     def iterentries(self):
         """Iterate over the entries in this pack index.
 
-        Will yield tuples with object name, offset in packfile and crc32
-        checksum.
+        :yields: tuples with object name, offset in packfile and crc32 checksum.
         """
         for i in range(len(self)):
             yield self._unpack_entry(i)
@@ -315,8 +324,10 @@ class PackIndex(object):
 
     def check(self):
         """Check that the stored checksum matches the actual checksum."""
-        # TODO: Check pack contents, too
-        return self.calculate_checksum() == self.get_stored_checksum()
+        actual = self.calculate_checksum()
+        stored = self.get_stored_checksum()
+        if actual != stored:
+            raise ChecksumMismatch(stored, actual)
 
     def calculate_checksum(self):
         """Calculate the SHA1 checksum over this pack index.
@@ -460,7 +471,7 @@ def unpack_object(read_all, read_some=None):
     for i, byte in enumerate(bytes[1:]):
         size += (byte & 0x7f) << ((i * 7) + 4)
     raw_base = len(bytes)
-    if type == 6: # offset delta
+    if type == OFS_DELTA:
         bytes = take_msb_bytes(read_all)
         raw_base += len(bytes)
         assert not (bytes[-1] & 0x80)
@@ -472,7 +483,7 @@ def unpack_object(read_all, read_some=None):
         uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
         assert size == chunks_length(uncomp)
         return type, (delta_base_offset, uncomp), comp_len+raw_base, unused
-    elif type == 7: # ref delta
+    elif type == REF_DELTA:
         basename = read_all(20)
         raw_base += 20
         uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
@@ -486,7 +497,7 @@ def unpack_object(read_all, read_some=None):
 
 def _compute_object_size((num, obj)):
     """Compute the size of a unresolved object for use with LRUSizeCache."""
-    if num in (6, 7):
+    if num in DELTA_TYPES:
         return chunks_length(obj[1])
     return chunks_length(obj)
 
@@ -543,7 +554,7 @@ class PackStreamReader(object):
     def _buf_len(self):
         buf = self._rbuf
         start = buf.tell()
-        buf.seek(0, os.SEEK_END)
+        buf.seek(0, SEEK_END)
         end = buf.tell()
         buf.seek(start)
         return end - start
@@ -630,6 +641,14 @@ class PackObjectIterator(object):
         self.i+=1
         return ret
 
+def obj_sha(type, chunks):
+    """Compute the SHA for a numeric type and object chunks."""
+    sha = make_sha()
+    sha.update(object_header(type, chunks_length(chunks)))
+    for chunk in chunks:
+        sha.update(chunk)
+    return sha.digest()
+
 
 class PackData(object):
     """The data contained in a packfile.
@@ -661,8 +680,8 @@ class PackData(object):
     def __init__(self, filename, file=None, size=None):
         """Create a PackData object representing the pack in the given filename.
 
-        The file must exist and stay readable until the object is disposed of.
-        It must also stay the same size. It will be mapped whenever needed.
+        The file must exist and stay readable until the object is disposed of. It
+        must also stay the same size. It will be mapped whenever needed.
 
         Currently there is a restriction on the size of the pack as the python
         mmap implementation is flawed.
@@ -677,6 +696,7 @@ class PackData(object):
         (version, self._num_objects) = read_pack_header(self._file.read)
         self._offset_cache = LRUSizeCache(1024*1024*20,
             compute_size=_compute_object_size)
+        self.pack = None
 
     @classmethod
     def from_file(cls, file, size):
@@ -720,132 +740,107 @@ class PackData(object):
             todo -= len(x)
         return s.digest()
 
-    def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
+    def get_ref(self, sha):
+        """Get the object for a ref SHA, only looking in this pack."""
+        # TODO: cache these results
+        if self.pack is None:
+            raise KeyError(sha)
+        offset = self.pack.index.object_index(sha)
+        if not offset:
+            raise KeyError(sha)
+        type, obj = self.get_object_at(offset)
+        return offset, type, obj
+
+    def resolve_object(self, offset, type, obj, get_ref=None):
         """Resolve an object, possibly resolving deltas when necessary.
 
         :return: Tuple with object type and contents.
         """
-        if type not in (6, 7): # Not a delta
+        if type not in DELTA_TYPES:
             return type, obj
 
-        if get_offset is None:
-            get_offset = self.get_object_at
-
-        if type == 6: # offset delta
+        if get_ref is None:
+            get_ref = self.get_ref
+        if type == OFS_DELTA:
             (delta_offset, delta) = obj
+            # TODO: clean up asserts and replace with nicer error messages
+            assert isinstance(offset, int)
             assert isinstance(delta_offset, int)
             base_offset = offset-delta_offset
-            type, base_obj = get_offset(base_offset)
+            type, base_obj = self.get_object_at(base_offset)
             assert isinstance(type, int)
-        elif type == 7: # ref delta
+        elif type == REF_DELTA:
             (basename, delta) = obj
             assert isinstance(basename, str) and len(basename) == 20
-            type, base_obj = get_ref(basename)
+            base_offset, type, base_obj = get_ref(basename)
             assert isinstance(type, int)
-            # Can't be a ofs delta, as we wouldn't know the base offset
-            assert type != 6
-            base_offset = None
-        type, base_chunks = self.resolve_object(base_offset, type, base_obj,
-                                                get_ref)
-        if base_offset is not None:
-            self._offset_cache[base_offset] = type, base_chunks
-        return (type, apply_delta(base_chunks, delta))
+        type, base_chunks = self.resolve_object(base_offset, type, base_obj)
+        chunks = apply_delta(base_chunks, delta)
+        # TODO(dborowitz): This can result in poor performance if large base
+        # objects are separated from deltas in the pack. We should reorganize
+        # so that we apply deltas to all objects in a chain one after the other
+        # to optimize cache performance.
+        if offset is not None:
+            self._offset_cache[offset] = type, chunks
+        return type, chunks
 
     def iterobjects(self, progress=None):
         return PackObjectIterator(self, progress)
 
-    def iterentries(self, ext_resolve_ref=None, progress=None):
+    def iterentries(self, progress=None):
         """Yield entries summarizing the contents of this pack.
 
-        :param ext_resolve_ref: Optional function to resolve base
-            objects (in case this is a thin pack)
-        :param progress: Progress function, called with current and
-            total object count.
-
-        This will yield tuples with (sha, offset, crc32)
+        :param progress: Progress function, called with current and total object
+            count.
+        :yields: tuples with (sha, offset, crc32)
         """
-        found = {}
-        postponed = defaultdict(list)
-        class Postpone(Exception):
-            """Raised to postpone delta resolving."""
-
-        def get_ref_text(sha):
-            assert len(sha) == 20
-            if sha in found:
-                return self.get_object_at(found[sha])
-            if ext_resolve_ref:
-                try:
-                    return ext_resolve_ref(sha)
-                except KeyError:
-                    pass
-            raise Postpone, (sha, )
-        extra = []
-        todo = chain(self.iterobjects(progress=progress), extra)
-        for (offset, type, obj, crc32) in todo:
+        for offset, type, obj, crc32 in self.iterobjects(progress=progress):
             assert isinstance(offset, int)
             assert isinstance(type, int)
-            try:
-                type, obj = self.resolve_object(offset, type, obj,
-                    get_ref_text)
-            except Postpone, (sha, ):
-                postponed[sha].append((offset, type, obj))
-            else:
-                shafile = ShaFile.from_raw_chunks(type, obj)
-                sha = shafile.sha().digest()
-                found[sha] = offset
-                yield sha, offset, crc32
-                extra.extend(postponed.get(sha, []))
-        if postponed:
-            raise KeyError([sha_to_hex(h) for h in postponed.keys()])
+            assert isinstance(obj, list) or isinstance(obj, tuple)
+            type, obj = self.resolve_object(offset, type, obj)
+            yield obj_sha(type, obj), offset, crc32
 
-    def sorted_entries(self, resolve_ext_ref=None, progress=None):
+    def sorted_entries(self, progress=None):
         """Return entries in this pack, sorted by SHA.
 
-        :param resolve_ext_ref: Optional function to resolve base
-            objects (in case this is a thin pack)
-        :param progress: Progress function, called with current and
-            total object count
+        :param progress: Progress function, called with current and total object
+            count
         :return: List of tuples with (sha, offset, crc32)
         """
-        ret = list(self.iterentries(resolve_ext_ref, progress=progress))
+        ret = list(self.iterentries(progress=progress))
         ret.sort()
         return ret
 
-    def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
+    def create_index_v1(self, filename, progress=None):
         """Create a version 1 file for this data file.
 
         :param filename: Index filename.
-        :param resolve_ext_ref: Function to use for resolving externally
-            referenced SHA1s (for thin packs)
         :param progress: Progress report function
         """
-        entries = self.sorted_entries(resolve_ext_ref, progress=progress)
+        entries = self.sorted_entries(progress=progress)
         write_pack_index_v1(filename, entries, self.calculate_checksum())
 
-    def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
+    def create_index_v2(self, filename, progress=None):
         """Create a version 2 index file for this data file.
 
         :param filename: Index filename.
-        :param resolve_ext_ref: Function to use for resolving externally
-            referenced SHA1s (for thin packs)
         :param progress: Progress report function
         """
-        entries = self.sorted_entries(resolve_ext_ref, progress=progress)
+        entries = self.sorted_entries(progress=progress)
         write_pack_index_v2(filename, entries, self.calculate_checksum())
 
-    def create_index(self, filename, resolve_ext_ref=None, progress=None,
+    def create_index(self, filename, progress=None,
                      version=2):
         """Create an  index file for this data file.
 
         :param filename: Index filename.
-        :param resolve_ext_ref: Function to use for resolving externally
-            referenced SHA1s (for thin packs)
         :param progress: Progress report function
         """
         if version == 1:
-            self.create_index_v1(filename, resolve_ext_ref, progress)
+            self.create_index_v1(filename, progress)
         elif version == 2:
-            self.create_index_v2(filename, resolve_ext_ref, progress)
+            self.create_index_v2(filename, progress)
         else:
             raise ValueError("unknown index format %d" % version)
 
@@ -856,7 +851,10 @@ class PackData(object):
 
     def check(self):
         """Check the consistency of this pack."""
-        return (self.calculate_checksum() == self.get_stored_checksum())
+        actual = self.calculate_checksum()
+        stored = self.get_stored_checksum()
+        if actual != stored:
+            raise ChecksumMismatch(stored, actual)
 
     def get_object_at(self, offset):
         """Given an offset in to the packfile return the object that is there.
@@ -874,6 +872,77 @@ class PackData(object):
         return unpack_object(self._file.read)[:2]
 
 
+class ThinPackData(PackData):
+    """PackData for thin packs, which require an ObjectStore for resolving."""
+
+    def __init__(self, resolve_ext_ref, *args, **kwargs):
+        super(ThinPackData, self).__init__(*args, **kwargs)
+        self.resolve_ext_ref = resolve_ext_ref
+
+    def get_ref(self, sha):
+        """Resolve a reference looking in both this pack and the store."""
+        try:
+            # As part of completing a pack we create a Pack object with a
+            # ThinPackData and a full PackIndex, so check in the index first if
+            # possible.
+            # TODO(dborowitz): reevaluate this when the pack completion code is
+            # rewritten.
+            return super(ThinPackData, self).get_ref(sha)
+        except KeyError:
+            type, obj = self.resolve_ext_ref(sha)
+            return None, type, obj
+
+    def iterentries(self, progress=None):
+        """Yield entries summarizing the contents of this pack.
+
+        :param progress: Progress function, called with current and
+            total object count.
+
+        This will yield tuples with (sha, offset, crc32)
+        """
+        found = {}
+        postponed = defaultdict(list)
+
+        class Postpone(Exception):
+            """Raised to postpone delta resolving."""
+
+            def __init__(self, sha):
+                self.sha = sha
+
+        def get_ref_text(sha):
+            assert len(sha) == 20
+            if sha in found:
+                offset = found[sha]
+                type, obj = self.get_object_at(offset)
+                return offset, type, obj
+            try:
+                return self.get_ref(sha)
+            except KeyError:
+                raise Postpone(sha)
+
+        extra = []
+        todo = chain(self.iterobjects(progress=progress), extra)
+        for (offset, type, obj, crc32) in todo:
+            assert isinstance(offset, int)
+            if obj is None:
+                # Inflate postponed delta
+                type, obj = self.get_object_at(offset)
+            assert isinstance(type, int)
+            assert isinstance(obj, list) or isinstance(obj, tuple)
+            try:
+                type, obj = self.resolve_object(offset, type, obj, get_ref_text)
+            except Postpone, e:
+                # Save memory by not storing the inflated obj in postponed
+                postponed[e.sha].append((offset, type, None, crc32))
+            else:
+                sha = obj_sha(type, obj)
+                found[sha] = offset
+                yield sha, offset, crc32
+                extra.extend(postponed.pop(sha, []))
+        if postponed:
+            raise KeyError([sha_to_hex(h) for h in postponed.keys()])
+
+
 class SHA1Reader(object):
     """Wrapper around a file-like object that remembers the SHA1 of its data."""
 
@@ -934,9 +1003,9 @@ def write_pack_object(f, type, object):
     """
     offset = f.tell()
     packed_data_hdr = ""
-    if type == 6: # offset delta
+    if type == OFS_DELTA:
         (delta_base_offset, object) = object
-    elif type == 7: # ref delta
+    elif type == REF_DELTA:
         (basename, object) = object
     size = len(object)
     c = (type << 4) | (size & 15)
@@ -946,7 +1015,7 @@ def write_pack_object(f, type, object):
         c = size & 0x7f
         size >>= 7
     packed_data_hdr += chr(c)
-    if type == 6: # offset delta
+    if type == OFS_DELTA:
         ret = [delta_base_offset & 0x7f]
         delta_base_offset >>= 7
         while delta_base_offset:
@@ -954,7 +1023,7 @@ def write_pack_object(f, type, object):
             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
             delta_base_offset >>= 7
         packed_data_hdr += "".join([chr(x) for x in ret])
-    elif type == 7: # ref delta
+    elif type == REF_DELTA:
         assert len(basename) == 20
         packed_data_hdr += basename
     packed_data = packed_data_hdr + zlib.compress(object)
@@ -1226,6 +1295,7 @@ class Pack(object):
         ret = Pack("")
         ret._data = data
         ret._idx = idx
+        data.pack = ret
         return ret
 
     def name(self):
@@ -1237,6 +1307,7 @@ class Pack(object):
         """The pack data object being used."""
         if self._data is None:
             self._data = PackData(self._data_path)
+            self._data.pack = self
             assert len(self.index) == len(self._data)
             idx_stored_checksum = self.index.get_pack_checksum()
             data_stored_checksum = self._data.get_stored_checksum()
@@ -1275,12 +1346,15 @@ class Pack(object):
         return iter(self.index)
 
     def check(self):
-        """Check the integrity of this pack."""
-        if not self.index.check():
-            return False
-        if not self.data.check():
-            return False
-        return True
+        """Check the integrity of this pack.
+
+        :raise ChecksumMismatch: if a checksum for the index or data is wrong
+        """
+        self.index.check()
+        self.data.check()
+        for obj in self.iterobjects():
+            obj.check()
+        # TODO: object connectivity checks
 
     def get_stored_checksum(self):
         return self.data.get_stored_checksum()
@@ -1293,30 +1367,25 @@ class Pack(object):
         except KeyError:
             return False
 
-    def get_raw(self, sha1, resolve_ref=None):
+    def get_raw(self, sha1):
         offset = self.index.object_index(sha1)
         obj_type, obj = self.data.get_object_at(offset)
         if type(offset) is long:
           offset = int(offset)
-        if resolve_ref is None:
-            resolve_ref = self.get_raw
-        kind, chunks = self.data.resolve_object(offset, obj_type, obj,
-            resolve_ref)
-        return kind, "".join(chunks)
+        type_num, chunks = self.data.resolve_object(offset, obj_type, obj)
+        return type_num, "".join(chunks)
 
     def __getitem__(self, sha1):
         """Retrieve the specified SHA1."""
         type, uncomp = self.get_raw(sha1)
         return ShaFile.from_raw_string(type, uncomp)
 
-    def iterobjects(self, get_raw=None):
+    def iterobjects(self):
         """Iterate over the objects in this pack."""
-        if get_raw is None:
-            get_raw = self.get_raw
         for offset, type, obj, crc32 in self.data.iterobjects():
             assert isinstance(offset, int)
-            type, obj = self.data.resolve_object(offset, type, obj, get_raw)
-            yield ShaFile.from_raw_chunks(type, obj)
+            yield ShaFile.from_raw_chunks(
+              *self.data.resolve_object(offset, type, obj))
 
 
 try: