+from dulwich.errors import (
+ChecksumMismatch,
+)
from dulwich.lru_cache import (
LRUSizeCache,
)
+from dulwich.misc import (
+ make_sha,
+ SEEK_END,
+ )
from dulwich.objects import (
ShaFile,
hex_to_sha,
sha_to_hex,
- )
-from dulwich.misc import (
- make_sha,
+ object_header,
)
supports_mmap_offset = (sys.version_info[0] >= 3 or
(sys.version_info[0] == 2 and sys.version_info[1] >= 6))
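+# Object type numbers for the two delta representations in the git pack
+# format: OFS_DELTA names its base by a backwards offset within the same
+# pack, REF_DELTA names it by a 20-byte SHA (possibly outside the pack).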
+OFS_DELTA = 6
+REF_DELTA = 7
+
+DELTA_TYPES = (OFS_DELTA, REF_DELTA)
+
+
def take_msb_bytes(read):
"""Read bytes marked with most significant bit.
comp_len = fed - len(obj.unused_data)
return ret, comp_len, obj.unused_data
+
def iter_sha1(iter):
"""Return the hexdigest of the SHA1 over a set of names.
"""Unpack the i-th entry in the index file.
:return: Tuple with object name (SHA), offset in pack file and CRC32
- checksum (if known)."""
+ checksum (if known).
+ """
raise NotImplementedError(self._unpack_entry)
def _unpack_name(self, i):
def iterentries(self):
"""Iterate over the entries in this pack index.
- Will yield tuples with object name, offset in packfile and crc32
- checksum.
+ :yields: tuples with object name, offset in packfile and crc32 checksum.
"""
for i in range(len(self)):
yield self._unpack_entry(i)
def check(self):
"""Check that the stored checksum matches the actual checksum."""
- # TODO: Check pack contents, too
- return self.calculate_checksum() == self.get_stored_checksum()
+ actual = self.calculate_checksum()
+ stored = self.get_stored_checksum()
+ if actual != stored:
+ raise ChecksumMismatch(stored, actual)
def calculate_checksum(self):
"""Calculate the SHA1 checksum over this pack index.
for i, byte in enumerate(bytes[1:]):
size += (byte & 0x7f) << ((i * 7) + 4)
raw_base = len(bytes)
- if type == 6: # offset delta
+ if type == OFS_DELTA:
bytes = take_msb_bytes(read_all)
raw_base += len(bytes)
assert not (bytes[-1] & 0x80)
uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
assert size == chunks_length(uncomp)
return type, (delta_base_offset, uncomp), comp_len + raw_base, unused
- elif type == 7: # ref delta
+ elif type == REF_DELTA:
basename = read_all(20)
raw_base += 20
uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
def _compute_object_size((num, obj)):
"""Compute the size of a unresolved object for use with LRUSizeCache."""
- if num in (6, 7):
+ if num in DELTA_TYPES:
return chunks_length(obj[1])
return chunks_length(obj)
def _buf_len(self):
buf = self._rbuf
start = buf.tell()
- buf.seek(0, os.SEEK_END)
+ buf.seek(0, SEEK_END)
end = buf.tell()
buf.seek(start)
return end - start
self.i += 1
return ret
+def obj_sha(type, chunks):
+ """Compute the SHA for a numeric type and object chunks."""
+ sha = make_sha()
+ sha.update(object_header(type, chunks_length(chunks)))
+ for chunk in chunks:
+ sha.update(chunk)
+ return sha.digest()
+
class PackData(object):
"""The data contained in a packfile.
def __init__(self, filename, file=None, size=None):
"""Create a PackData object representing the pack in the given filename.
- The file must exist and stay readable until the object is disposed of.
- It must also stay the same size. It will be mapped whenever needed.
+ The file must exist and stay readable until the object is disposed of. It
+ must also stay the same size. It will be mapped whenever needed.
Currently there is a restriction on the size of the pack as the python
mmap implementation is flawed.
(version, self._num_objects) = read_pack_header(self._file.read)
self._offset_cache = LRUSizeCache(1024*1024*20,
compute_size=_compute_object_size)
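+ # Back-reference to the containing Pack, if any; set by Pack so that
+ # get_ref below can consult the pack index.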
+ self.pack = None
@classmethod
def from_file(cls, file, size):
todo -= len(x)
return s.digest()
- def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
+ def get_ref(self, sha):
+ """Get the object for a ref SHA, only looking in this pack."""
+ # TODO: cache these results
+ if self.pack is None:
+ raise KeyError(sha)
+ offset = self.pack.index.object_index(sha)
+ if not offset:
+ raise KeyError(sha)
+ type, obj = self.get_object_at(offset)
+ return offset, type, obj
+
+ def resolve_object(self, offset, type, obj, get_ref=None):
"""Resolve an object, possibly resolving deltas when necessary.
-:return: Tuple with object type and contents.
+:param get_ref: Optional function for resolving ref deltas; defaults to
+this pack's get_ref.
+:return: Tuple with object type and contents.
"""
- if type not in (6, 7): # Not a delta
+ if type not in DELTA_TYPES:
return type, obj
- if get_offset is None:
- get_offset = self.get_object_at
-
- if type == 6: # offset delta
+ if get_ref is None:
+ get_ref = self.get_ref
+ if type == OFS_DELTA:
(delta_offset, delta) = obj
+ # TODO: clean up asserts and replace with nicer error messages
+ assert isinstance(offset, int)
assert isinstance(delta_offset, int)
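+ # An offset delta stores the distance back from this object's offset to
+ # its base object.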
base_offset = offset - delta_offset
- type, base_obj = get_offset(base_offset)
+ type, base_obj = self.get_object_at(base_offset)
assert isinstance(type, int)
- elif type == 7: # ref delta
+ elif type == REF_DELTA:
(basename, delta) = obj
assert isinstance(basename, str) and len(basename) == 20
- type, base_obj = get_ref(basename)
+ base_offset, type, base_obj = get_ref(basename)
assert isinstance(type, int)
- # Can't be a ofs delta, as we wouldn't know the base offset
- assert type != 6
- base_offset = None
- type, base_chunks = self.resolve_object(base_offset, type, base_obj,
- get_ref)
- if base_offset is not None:
- self._offset_cache[base_offset] = type, base_chunks
- return (type, apply_delta(base_chunks, delta))
+ type, base_chunks = self.resolve_object(base_offset, type, base_obj)
+ chunks = apply_delta(base_chunks, delta)
+ # TODO(dborowitz): This can result in poor performance if large base
+ # objects are separated from deltas in the pack. We should reorganize
+ # so that we apply deltas to all objects in a chain one after the other
+ # to optimize cache performance.
+ if offset is not None:
+ self._offset_cache[offset] = type, chunks
+ return type, chunks
def iterobjects(self, progress=None):
return PackObjectIterator(self, progress)
- def iterentries(self, ext_resolve_ref=None, progress=None):
+ def iterentries(self, progress=None):
"""Yield entries summarizing the contents of this pack.
- :param ext_resolve_ref: Optional function to resolve base
- objects (in case this is a thin pack)
- :param progress: Progress function, called with current and
- total object count.
-
- This will yield tuples with (sha, offset, crc32)
+ :param progress: Progress function, called with current and total object
+ count.
+ :yields: tuples with (sha, offset, crc32)
"""
- found = {}
- postponed = defaultdict(list)
- class Postpone(Exception):
- """Raised to postpone delta resolving."""
-
- def get_ref_text(sha):
- assert len(sha) == 20
- if sha in found:
- return self.get_object_at(found[sha])
- if ext_resolve_ref:
- try:
- return ext_resolve_ref(sha)
- except KeyError:
- pass
- raise Postpone, (sha, )
- extra = []
- todo = chain(self.iterobjects(progress=progress), extra)
- for (offset, type, obj, crc32) in todo:
+ for offset, type, obj, crc32 in self.iterobjects(progress=progress):
assert isinstance(offset, int)
assert isinstance(type, int)
- try:
- type, obj = self.resolve_object(offset, type, obj,
- get_ref_text)
- except Postpone, (sha, ):
- postponed[sha].append((offset, type, obj))
- else:
- shafile = ShaFile.from_raw_chunks(type, obj)
- sha = shafile.sha().digest()
- found[sha] = offset
- yield sha, offset, crc32
- extra.extend(postponed.get(sha, []))
- if postponed:
- raise KeyError([sha_to_hex(h) for h in postponed.keys()])
+ assert isinstance(obj, (list, tuple))
+ type, obj = self.resolve_object(offset, type, obj)
+ yield obj_sha(type, obj), offset, crc32
- def sorted_entries(self, resolve_ext_ref=None, progress=None):
+ def sorted_entries(self, progress=None):
"""Return entries in this pack, sorted by SHA.
- :param resolve_ext_ref: Optional function to resolve base
- objects (in case this is a thin pack)
- :param progress: Progress function, called with current and
- total object count
+ :param progress: Progress function, called with current and total object
+ count
:return: List of tuples with (sha, offset, crc32)
"""
- ret = list(self.iterentries(resolve_ext_ref, progress=progress))
+ ret = list(self.iterentries(progress=progress))
ret.sort()
return ret
- def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
+ def create_index_v1(self, filename, progress=None):
"""Create a version 1 file for this data file.
:param filename: Index filename.
- :param resolve_ext_ref: Function to use for resolving externally
- referenced SHA1s (for thin packs)
:param progress: Progress report function
"""
- entries = self.sorted_entries(resolve_ext_ref, progress=progress)
+ entries = self.sorted_entries(progress=progress)
write_pack_index_v1(filename, entries, self.calculate_checksum())
- def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
+ def create_index_v2(self, filename, progress=None):
"""Create a version 2 index file for this data file.
:param filename: Index filename.
- :param resolve_ext_ref: Function to use for resolving externally
- referenced SHA1s (for thin packs)
:param progress: Progress report function
"""
- entries = self.sorted_entries(resolve_ext_ref, progress=progress)
+ entries = self.sorted_entries(progress=progress)
write_pack_index_v2(filename, entries, self.calculate_checksum())
- def create_index(self, filename, resolve_ext_ref=None, progress=None,
- version=2):
+ def create_index(self, filename, progress=None, version=2):
"""Create an index file for this data file.
:param filename: Index filename.
- :param resolve_ext_ref: Function to use for resolving externally
- referenced SHA1s (for thin packs)
:param progress: Progress report function
"""
if version == 1:
- self.create_index_v1(filename, resolve_ext_ref, progress)
+ self.create_index_v1(filename, progress)
elif version == 2:
- self.create_index_v2(filename, resolve_ext_ref, progress)
+ self.create_index_v2(filename, progress)
else:
raise ValueError("unknown index format %d" % version)
def check(self):
"""Check the consistency of this pack."""
- return (self.calculate_checksum() == self.get_stored_checksum())
+ actual = self.calculate_checksum()
+ stored = self.get_stored_checksum()
+ if actual != stored:
+ raise ChecksumMismatch(stored, actual)
def get_object_at(self, offset):
"""Given an offset in to the packfile return the object that is there.
return unpack_object(self._file.read)[:2]
+class ThinPackData(PackData):
+ """PackData for thin packs, which require an ObjectStore for resolving."""
+
+ def __init__(self, resolve_ext_ref, *args, **kwargs):
+ super(ThinPackData, self).__init__(*args, **kwargs)
+ self.resolve_ext_ref = resolve_ext_ref
+
+ def get_ref(self, sha):
+ """Resolve a reference looking in both this pack and the store."""
+ try:
+ # As part of completing a pack we create a Pack object with a
+ # ThinPackData and a full PackIndex, so check in the index first if
+ # possible.
+ # TODO(dborowitz): reevaluate this when the pack completion code is
+ # rewritten.
+ return super(ThinPackData, self).get_ref(sha)
+ except KeyError:
+ type, obj = self.resolve_ext_ref(sha)
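+ # The base object lives outside this pack, so there is no local offset.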
+ return None, type, obj
+
+ def iterentries(self, progress=None):
+ """Yield entries summarizing the contents of this pack.
+
+ :param progress: Progress function, called with current and
+ total object count.
+ :yields: tuples with (sha, offset, crc32)
+ """
+ found = {}
+ postponed = defaultdict(list)
+
+ class Postpone(Exception):
+ """Raised to postpone delta resolving."""
+
+ def __init__(self, sha):
+ self.sha = sha
+
+ def get_ref_text(sha):
+ assert len(sha) == 20
+ if sha in found:
+ offset = found[sha]
+ type, obj = self.get_object_at(offset)
+ return offset, type, obj
+ try:
+ return self.get_ref(sha)
+ except KeyError:
+ raise Postpone(sha)
+
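+ # Deltas whose bases have not been seen yet are postponed and appended
+ # to this list, then revisited via the chain below.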
+ extra = []
+ todo = chain(self.iterobjects(progress=progress), extra)
+ for (offset, type, obj, crc32) in todo:
+ assert isinstance(offset, int)
+ if obj is None:
+ # Inflate postponed delta
+ type, obj = self.get_object_at(offset)
+ assert isinstance(type, int)
+ assert isinstance(obj, (list, tuple))
+ try:
+ type, obj = self.resolve_object(offset, type, obj, get_ref_text)
+ except Postpone, e:
+ # Save memory by not storing the inflated obj in postponed
+ postponed[e.sha].append((offset, type, None, crc32))
+ else:
+ sha = obj_sha(type, obj)
+ found[sha] = offset
+ yield sha, offset, crc32
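+ # Any deltas that were waiting on this object as a base can now be
+ # resolved.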
+ extra.extend(postponed.pop(sha, []))
+ if postponed:
+ raise KeyError([sha_to_hex(h) for h in postponed.keys()])
+
+
class SHA1Reader(object):
"""Wrapper around a file-like object that remembers the SHA1 of its data."""
"""
offset = f.tell()
packed_data_hdr = ""
- if type == 6: # offset delta
+ if type == OFS_DELTA:
(delta_base_offset, object) = object
- elif type == 7: # ref delta
+ elif type == REF_DELTA:
(basename, object) = object
size = len(object)
c = (type << 4) | (size & 15)
c = size & 0x7f
size >>= 7
packed_data_hdr += chr(c)
- if type == 6: # offset delta
+ if type == OFS_DELTA:
ret = [delta_base_offset & 0x7f]
delta_base_offset >>= 7
while delta_base_offset:
ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
delta_base_offset >>= 7
packed_data_hdr += "".join([chr(x) for x in ret])
- elif type == 7: # ref delta
+ elif type == REF_DELTA:
assert len(basename) == 20
packed_data_hdr += basename
packed_data = packed_data_hdr + zlib.compress(object)
ret = Pack("")
ret._data = data
ret._idx = idx
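+ # Back-reference lets the data's get_ref resolve ref deltas via the index.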
+ data.pack = ret
return ret
def name(self):
"""The pack data object being used."""
if self._data is None:
self._data = PackData(self._data_path)
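+ # Set the back-reference needed by PackData.get_ref.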
+ self._data.pack = self
assert len(self.index) == len(self._data)
idx_stored_checksum = self.index.get_pack_checksum()
data_stored_checksum = self._data.get_stored_checksum()
return iter(self.index)
def check(self):
- """Check the integrity of this pack."""
- if not self.index.check():
- return False
- if not self.data.check():
- return False
- return True
+ """Check the integrity of this pack.
+
+ :raise ChecksumMismatch: if a checksum for the index or data is wrong
+ """
+ self.index.check()
+ self.data.check()
+ for obj in self.iterobjects():
+ obj.check()
+ # TODO: object connectivity checks
def get_stored_checksum(self):
return self.data.get_stored_checksum()
except KeyError:
return False
- def get_raw(self, sha1, resolve_ref=None):
+ def get_raw(self, sha1):
offset = self.index.object_index(sha1)
obj_type, obj = self.data.get_object_at(offset)
if type(offset) is long:
offset = int(offset)
- if resolve_ref is None:
- resolve_ref = self.get_raw
- kind, chunks = self.data.resolve_object(offset, obj_type, obj,
- resolve_ref)
- return kind, "".join(chunks)
+ type_num, chunks = self.data.resolve_object(offset, obj_type, obj)
+ return type_num, "".join(chunks)
def __getitem__(self, sha1):
"""Retrieve the specified SHA1."""
type, uncomp = self.get_raw(sha1)
return ShaFile.from_raw_string(type, uncomp)
- def iterobjects(self, get_raw=None):
+ def iterobjects(self):
"""Iterate over the objects in this pack."""
- if get_raw is None:
- get_raw = self.get_raw
for offset, type, obj, crc32 in self.data.iterobjects():
assert isinstance(offset, int)
- type, obj = self.data.resolve_object(offset, type, obj, get_raw)
- yield ShaFile.from_raw_chunks(type, obj)
+ type, chunks = self.data.resolve_object(offset, type, obj)
+ yield ShaFile.from_raw_chunks(type, chunks)
try: