Add SEEK_END to misc.py, since it was added in 2.5.
[jelmer/dulwich-libgit2.git] / dulwich / pack.py
index 390eee3c17c5ae40a016df3d46ebfec86aaff695..43bc24de0c0560f9984ef9a97eb74b5f08e0456d 100644 (file)
@@ -65,19 +65,27 @@ from dulwich.file import GitFile
 from dulwich.lru_cache import (
     LRUSizeCache,
     )
+from dulwich.misc import (
+    make_sha,
+    SEEK_END,
+    )
 from dulwich.objects import (
     ShaFile,
     hex_to_sha,
     sha_to_hex,
-    )
-from dulwich.misc import (
-    make_sha,
+    object_header,
     )
 
 supports_mmap_offset = (sys.version_info[0] >= 3 or
         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
 
 
+OFS_DELTA = 6
+REF_DELTA = 7
+
+DELTA_TYPES = (OFS_DELTA, REF_DELTA)
+
+
 def take_msb_bytes(read):
     """Read bytes marked with most significant bit.
 
@@ -122,6 +130,7 @@ def read_zlib_chunks(read_some, dec_size, buffer_size=4096):
     comp_len = fed - len(obj.unused_data)
     return ret, comp_len, obj.unused_data
 
+
 def iter_sha1(iter):
     """Return the hexdigest of the SHA1 over a set of names.
 
@@ -267,7 +276,8 @@ class PackIndex(object):
         """Unpack the i-th entry in the index file.
 
         :return: Tuple with object name (SHA), offset in pack file and CRC32
-            checksum (if known)."""
+            checksum (if known).
+        """
         raise NotImplementedError(self._unpack_entry)
 
     def _unpack_name(self, i):
@@ -300,8 +310,7 @@ class PackIndex(object):
     def iterentries(self):
         """Iterate over the entries in this pack index.
 
-        Will yield tuples with object name, offset in packfile and crc32
-        checksum.
+        :yields: tuples with object name, offset in packfile and crc32 checksum.
         """
         for i in range(len(self)):
             yield self._unpack_entry(i)
@@ -315,8 +324,10 @@ class PackIndex(object):
 
     def check(self):
         """Check that the stored checksum matches the actual checksum."""
-        # TODO: Check pack contents, too
-        return self.calculate_checksum() == self.get_stored_checksum()
+        actual = self.calculate_checksum()
+        stored = self.get_stored_checksum()
+        if actual != stored:
+            raise ChecksumMismatch(stored, actual)
 
     def calculate_checksum(self):
         """Calculate the SHA1 checksum over this pack index.
@@ -460,7 +471,7 @@ def unpack_object(read_all, read_some=None):
     for i, byte in enumerate(bytes[1:]):
         size += (byte & 0x7f) << ((i * 7) + 4)
     raw_base = len(bytes)
-    if type == 6: # offset delta
+    if type == OFS_DELTA:
         bytes = take_msb_bytes(read_all)
         raw_base += len(bytes)
         assert not (bytes[-1] & 0x80)
@@ -472,7 +483,7 @@ def unpack_object(read_all, read_some=None):
         uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
         assert size == chunks_length(uncomp)
         return type, (delta_base_offset, uncomp), comp_len+raw_base, unused
-    elif type == 7: # ref delta
+    elif type == REF_DELTA:
         basename = read_all(20)
         raw_base += 20
         uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
@@ -486,7 +497,7 @@ def unpack_object(read_all, read_some=None):
 
 def _compute_object_size((num, obj)):
     """Compute the size of a unresolved object for use with LRUSizeCache."""
-    if num in (6, 7):
+    if num in DELTA_TYPES:
         return chunks_length(obj[1])
     return chunks_length(obj)
 
@@ -543,7 +554,7 @@ class PackStreamReader(object):
     def _buf_len(self):
         buf = self._rbuf
         start = buf.tell()
-        buf.seek(0, os.SEEK_END)
+        buf.seek(0, SEEK_END)
         end = buf.tell()
         buf.seek(start)
         return end - start
@@ -630,6 +641,14 @@ class PackObjectIterator(object):
         self.i+=1
         return ret
 
+def obj_sha(type, chunks):
+    """Compute the SHA for a numeric type and object chunks."""
+    sha = make_sha()
+    sha.update(object_header(type, chunks_length(chunks)))
+    for chunk in chunks:
+        sha.update(chunk)
+    return sha.digest()
+
 
 class PackData(object):
     """The data contained in a packfile.
@@ -661,8 +680,8 @@ class PackData(object):
     def __init__(self, filename, file=None, size=None):
         """Create a PackData object representing the pack in the given filename.
 
-        The file must exist and stay readable until the object is disposed of.
-        It must also stay the same size. It will be mapped whenever needed.
+        The file must exist and stay readable until the object is disposed of. It
+        must also stay the same size. It will be mapped whenever needed.
 
         Currently there is a restriction on the size of the pack as the python
         mmap implementation is flawed.
@@ -677,6 +696,7 @@ class PackData(object):
         (version, self._num_objects) = read_pack_header(self._file.read)
         self._offset_cache = LRUSizeCache(1024*1024*20,
             compute_size=_compute_object_size)
+        self.pack = None
 
     @classmethod
     def from_file(cls, file, size):
@@ -720,132 +740,107 @@ class PackData(object):
             todo -= len(x)
         return s.digest()
 
-    def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
+    def get_ref(self, sha):
+        """Get the object for a ref SHA, only looking in this pack."""
+        # TODO: cache these results
+        if self.pack is None:
+            raise KeyError(sha)
+        offset = self.pack.index.object_index(sha)
+        if not offset:
+            raise KeyError(sha)
+        type, obj = self.get_object_at(offset)
+        return offset, type, obj
+
+    def resolve_object(self, offset, type, obj, get_ref=None):
         """Resolve an object, possibly resolving deltas when necessary.
 
         :return: Tuple with object type and contents.
         """
-        if type not in (6, 7): # Not a delta
+        if type not in DELTA_TYPES:
             return type, obj
 
-        if get_offset is None:
-            get_offset = self.get_object_at
-
-        if type == 6: # offset delta
+        if get_ref is None:
+            get_ref = self.get_ref
+        if type == OFS_DELTA:
             (delta_offset, delta) = obj
+            # TODO: clean up asserts and replace with nicer error messages
+            assert isinstance(offset, int)
             assert isinstance(delta_offset, int)
             base_offset = offset-delta_offset
-            type, base_obj = get_offset(base_offset)
+            type, base_obj = self.get_object_at(base_offset)
             assert isinstance(type, int)
-        elif type == 7: # ref delta
+        elif type == REF_DELTA:
             (basename, delta) = obj
             assert isinstance(basename, str) and len(basename) == 20
-            type, base_obj = get_ref(basename)
+            base_offset, type, base_obj = get_ref(basename)
             assert isinstance(type, int)
-            # Can't be a ofs delta, as we wouldn't know the base offset
-            assert type != 6
-            base_offset = None
-        type, base_chunks = self.resolve_object(base_offset, type, base_obj,
-                                                get_ref)
-        if base_offset is not None:
-            self._offset_cache[base_offset] = type, base_chunks
-        return (type, apply_delta(base_chunks, delta))
+        type, base_chunks = self.resolve_object(base_offset, type, base_obj)
+        chunks = apply_delta(base_chunks, delta)
+        # TODO(dborowitz): This can result in poor performance if large base
+        # objects are separated from deltas in the pack. We should reorganize
+        # so that we apply deltas to all objects in a chain one after the other
+        # to optimize cache performance.
+        if offset is not None:
+            self._offset_cache[offset] = type, chunks
+        return type, chunks
 
     def iterobjects(self, progress=None):
         return PackObjectIterator(self, progress)
 
-    def iterentries(self, ext_resolve_ref=None, progress=None):
+    def iterentries(self, progress=None):
         """Yield entries summarizing the contents of this pack.
 
-        :param ext_resolve_ref: Optional function to resolve base
-            objects (in case this is a thin pack)
-        :param progress: Progress function, called with current and
-            total object count.
-
-        This will yield tuples with (sha, offset, crc32)
+        :param progress: Progress function, called with current and total object
+            count.
+        :yields: tuples with (sha, offset, crc32)
         """
-        found = {}
-        postponed = defaultdict(list)
-        class Postpone(Exception):
-            """Raised to postpone delta resolving."""
-
-        def get_ref_text(sha):
-            assert len(sha) == 20
-            if sha in found:
-                return self.get_object_at(found[sha])
-            if ext_resolve_ref:
-                try:
-                    return ext_resolve_ref(sha)
-                except KeyError:
-                    pass
-            raise Postpone, (sha, )
-        extra = []
-        todo = chain(self.iterobjects(progress=progress), extra)
-        for (offset, type, obj, crc32) in todo:
+        for offset, type, obj, crc32 in self.iterobjects(progress=progress):
             assert isinstance(offset, int)
             assert isinstance(type, int)
-            try:
-                type, obj = self.resolve_object(offset, type, obj,
-                    get_ref_text)
-            except Postpone, (sha, ):
-                postponed[sha].append((offset, type, obj))
-            else:
-                shafile = ShaFile.from_raw_chunks(type, obj)
-                sha = shafile.sha().digest()
-                found[sha] = offset
-                yield sha, offset, crc32
-                extra.extend(postponed.get(sha, []))
-        if postponed:
-            raise KeyError([sha_to_hex(h) for h in postponed.keys()])
+            assert isinstance(obj, list) or isinstance(obj, tuple)
+            type, obj = self.resolve_object(offset, type, obj)
+            yield obj_sha(type, obj), offset, crc32
 
-    def sorted_entries(self, resolve_ext_ref=None, progress=None):
+    def sorted_entries(self, progress=None):
         """Return entries in this pack, sorted by SHA.
 
-        :param resolve_ext_ref: Optional function to resolve base
-            objects (in case this is a thin pack)
-        :param progress: Progress function, called with current and
-            total object count
+        :param progress: Progress function, called with current and total object
+            count
         :return: List of tuples with (sha, offset, crc32)
         """
-        ret = list(self.iterentries(resolve_ext_ref, progress=progress))
+        ret = list(self.iterentries(progress=progress))
         ret.sort()
         return ret
 
-    def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
+    def create_index_v1(self, filename, progress=None):
         """Create a version 1 file for this data file.
 
         :param filename: Index filename.
-        :param resolve_ext_ref: Function to use for resolving externally
-            referenced SHA1s (for thin packs)
         :param progress: Progress report function
         """
-        entries = self.sorted_entries(resolve_ext_ref, progress=progress)
+        entries = self.sorted_entries(progress=progress)
         write_pack_index_v1(filename, entries, self.calculate_checksum())
 
-    def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
+    def create_index_v2(self, filename, progress=None):
         """Create a version 2 index file for this data file.
 
         :param filename: Index filename.
-        :param resolve_ext_ref: Function to use for resolving externally
-            referenced SHA1s (for thin packs)
         :param progress: Progress report function
         """
-        entries = self.sorted_entries(resolve_ext_ref, progress=progress)
+        entries = self.sorted_entries(progress=progress)
         write_pack_index_v2(filename, entries, self.calculate_checksum())
 
-    def create_index(self, filename, resolve_ext_ref=None, progress=None,
+    def create_index(self, filename, progress=None,
                      version=2):
         """Create an  index file for this data file.
 
         :param filename: Index filename.
-        :param resolve_ext_ref: Function to use for resolving externally
-            referenced SHA1s (for thin packs)
         :param progress: Progress report function
         """
         if version == 1:
-            self.create_index_v1(filename, resolve_ext_ref, progress)
+            self.create_index_v1(filename, progress)
         elif version == 2:
-            self.create_index_v2(filename, resolve_ext_ref, progress)
+            self.create_index_v2(filename, progress)
         else:
             raise ValueError("unknown index format %d" % version)
 
@@ -856,7 +851,10 @@ class PackData(object):
 
     def check(self):
         """Check the consistency of this pack."""
-        return (self.calculate_checksum() == self.get_stored_checksum())
+        actual = self.calculate_checksum()
+        stored = self.get_stored_checksum()
+        if actual != stored:
+            raise ChecksumMismatch(stored, actual)
 
     def get_object_at(self, offset):
         """Given an offset in to the packfile return the object that is there.
@@ -874,6 +872,77 @@ class PackData(object):
         return unpack_object(self._file.read)[:2]
 
 
+class ThinPackData(PackData):
+    """PackData for thin packs, which require an ObjectStore for resolving."""
+
+    def __init__(self, resolve_ext_ref, *args, **kwargs):
+        super(ThinPackData, self).__init__(*args, **kwargs)
+        self.resolve_ext_ref = resolve_ext_ref
+
+    def get_ref(self, sha):
+        """Resolve a reference looking in both this pack and the store."""
+        try:
+            # As part of completing a pack we create a Pack object with a
+            # ThinPackData and a full PackIndex, so check in the index first if
+            # possible.
+            # TODO(dborowitz): reevaluate this when the pack completion code is
+            # rewritten.
+            return super(ThinPackData, self).get_ref(sha)
+        except KeyError:
+            type, obj = self.resolve_ext_ref(sha)
+            return None, type, obj
+
+    def iterentries(self, progress=None):
+        """Yield entries summarizing the contents of this pack.
+
+        :param progress: Progress function, called with current and
+            total object count.
+
+        This will yield tuples with (sha, offset, crc32)
+        """
+        found = {}
+        postponed = defaultdict(list)
+
+        class Postpone(Exception):
+            """Raised to postpone delta resolving."""
+
+            def __init__(self, sha):
+                self.sha = sha
+
+        def get_ref_text(sha):
+            assert len(sha) == 20
+            if sha in found:
+                offset = found[sha]
+                type, obj = self.get_object_at(offset)
+                return offset, type, obj
+            try:
+                return self.get_ref(sha)
+            except KeyError:
+                raise Postpone(sha)
+
+        extra = []
+        todo = chain(self.iterobjects(progress=progress), extra)
+        for (offset, type, obj, crc32) in todo:
+            assert isinstance(offset, int)
+            if obj is None:
+                # Inflate postponed delta
+                type, obj = self.get_object_at(offset)
+            assert isinstance(type, int)
+            assert isinstance(obj, list) or isinstance(obj, tuple)
+            try:
+                type, obj = self.resolve_object(offset, type, obj, get_ref_text)
+            except Postpone, e:
+                # Save memory by not storing the inflated obj in postponed
+                postponed[e.sha].append((offset, type, None, crc32))
+            else:
+                sha = obj_sha(type, obj)
+                found[sha] = offset
+                yield sha, offset, crc32
+                extra.extend(postponed.pop(sha, []))
+        if postponed:
+            raise KeyError([sha_to_hex(h) for h in postponed.keys()])
+
+
 class SHA1Reader(object):
     """Wrapper around a file-like object that remembers the SHA1 of its data."""
 
@@ -934,9 +1003,9 @@ def write_pack_object(f, type, object):
     """
     offset = f.tell()
     packed_data_hdr = ""
-    if type == 6: # offset delta
+    if type == OFS_DELTA:
         (delta_base_offset, object) = object
-    elif type == 7: # ref delta
+    elif type == REF_DELTA:
         (basename, object) = object
     size = len(object)
     c = (type << 4) | (size & 15)
@@ -946,7 +1015,7 @@ def write_pack_object(f, type, object):
         c = size & 0x7f
         size >>= 7
     packed_data_hdr += chr(c)
-    if type == 6: # offset delta
+    if type == OFS_DELTA:
         ret = [delta_base_offset & 0x7f]
         delta_base_offset >>= 7
         while delta_base_offset:
@@ -954,7 +1023,7 @@ def write_pack_object(f, type, object):
             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
             delta_base_offset >>= 7
         packed_data_hdr += "".join([chr(x) for x in ret])
-    elif type == 7: # ref delta
+    elif type == REF_DELTA:
         assert len(basename) == 20
         packed_data_hdr += basename
     packed_data = packed_data_hdr + zlib.compress(object)
@@ -1226,6 +1295,7 @@ class Pack(object):
         ret = Pack("")
         ret._data = data
         ret._idx = idx
+        data.pack = ret
         return ret
 
     def name(self):
@@ -1237,6 +1307,7 @@ class Pack(object):
         """The pack data object being used."""
         if self._data is None:
             self._data = PackData(self._data_path)
+            self._data.pack = self
             assert len(self.index) == len(self._data)
             idx_stored_checksum = self.index.get_pack_checksum()
             data_stored_checksum = self._data.get_stored_checksum()
@@ -1275,12 +1346,15 @@ class Pack(object):
         return iter(self.index)
 
     def check(self):
-        """Check the integrity of this pack."""
-        if not self.index.check():
-            return False
-        if not self.data.check():
-            return False
-        return True
+        """Check the integrity of this pack.
+
+        :raise ChecksumMismatch: if a checksum for the index or data is wrong
+        """
+        self.index.check()
+        self.data.check()
+        for obj in self.iterobjects():
+            obj.check()
+        # TODO: object connectivity checks
 
     def get_stored_checksum(self):
         return self.data.get_stored_checksum()
@@ -1293,30 +1367,25 @@ class Pack(object):
         except KeyError:
             return False
 
-    def get_raw(self, sha1, resolve_ref=None):
+    def get_raw(self, sha1):
         offset = self.index.object_index(sha1)
         obj_type, obj = self.data.get_object_at(offset)
         if type(offset) is long:
           offset = int(offset)
-        if resolve_ref is None:
-            resolve_ref = self.get_raw
-        kind, chunks = self.data.resolve_object(offset, obj_type, obj,
-            resolve_ref)
-        return kind, "".join(chunks)
+        type_num, chunks = self.data.resolve_object(offset, obj_type, obj)
+        return type_num, "".join(chunks)
 
     def __getitem__(self, sha1):
         """Retrieve the specified SHA1."""
         type, uncomp = self.get_raw(sha1)
         return ShaFile.from_raw_string(type, uncomp)
 
-    def iterobjects(self, get_raw=None):
+    def iterobjects(self):
         """Iterate over the objects in this pack."""
-        if get_raw is None:
-            get_raw = self.get_raw
         for offset, type, obj, crc32 in self.data.iterobjects():
             assert isinstance(offset, int)
-            type, obj = self.data.resolve_object(offset, type, obj, get_raw)
-            yield ShaFile.from_raw_chunks(type, obj)
+            yield ShaFile.from_raw_chunks(
+              *self.data.resolve_object(offset, type, obj))
 
 
 try: