# pack.py -- For dealing with packed git objects.
# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
-# Copryight (C) 2008 Jelmer Vernooij <jelmer@samba.org>
+# Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
except ImportError:
from misc import defaultdict
-from itertools import imap, izip
+import difflib
+import errno
+from itertools import (
+ chain,
+ imap,
+ izip,
+ )
import mmap
import os
import struct
from dulwich.misc import unpack_from
import sys
import zlib
-import difflib
from dulwich.errors import (
ApplyDeltaError,
ChecksumMismatch,
)
+from dulwich.file import GitFile
from dulwich.lru_cache import (
LRUSizeCache,
)
hex_to_sha,
sha_to_hex,
)
-from dulwich.misc import make_sha
+from dulwich.misc import (
+ make_sha,
+ )
supports_mmap_offset = (sys.version_info[0] >= 3 or
(sys.version_info[0] == 2 and sys.version_info[1] >= 6))
-def take_msb_bytes(map, offset):
+def take_msb_bytes(read):
+ """Read bytes marked with most significant bit.
+
+ :param read: Read function
+ :return: List of read bytes; all but the last have the high bit set
+ """
ret = []
while len(ret) == 0 or ret[-1] & 0x80:
- ret.append(ord(map[offset]))
- offset += 1
+ ret.append(ord(read(1)))
return ret
-def read_zlib(data, offset, dec_size):
+def read_zlib_chunks(read, buffer_size=4096):
+ """Read chunks of zlib data from a buffer.
+
+ :param read: Read function
+ :return: Tuple with list of chunks, length of
+ compressed data length and unused read data
+ """
obj = zlib.decompressobj()
- x = ""
+ ret = []
fed = 0
while obj.unused_data == "":
- base = offset+fed
- add = data[base:base+1024]
- if len(add) < 1024:
+ add = read(buffer_size)
+ if len(add) < buffer_size:
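+ # A short read means the input is exhausted; feed one junk byte so
+ # that, once the compressed stream has ended, zlib reports it in
+ # unused_data and the loop terminates.  The junk byte is counted in
+ # `fed` but also ends up in unused_data, so comp_len stays correct.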
add += "Z"
fed += len(add)
- x += obj.decompress(add)
- assert len(x) == dec_size
+ ret.append(obj.decompress(add))
comp_len = fed-len(obj.unused_data)
- return x, comp_len
+ return ret, comp_len, obj.unused_data
def iter_sha1(iter):
+ """Return the hexdigest of the SHA1 over a set of names.
+
+ :param iter: Iterator over string objects
+ :return: 40-byte hex sha1 digest
+ """
sha1 = make_sha()
for name in iter:
sha1.update(name)
return sha1.hexdigest()
-MAX_MMAP_SIZE = 1024 * 1024 * 1024
+def load_pack_index(path):
+ """Load an index file by path.
+
+ :param path: Path to the index file
+ """
+ f = GitFile(path, 'rb')
+ return load_pack_index_file(path, f)
+
+
+def _load_file_contents(f, size=None):
+ """Read the contents of a file-like object, using mmap when possible.
+
+ :return: Tuple with the contents (mmap or string) and their size
+ """
+ fileno = getattr(f, 'fileno', None)
+ # Attempt to use mmap if possible
+ if fileno is not None:
+ fd = f.fileno()
+ if size is None:
+ size = os.fstat(fd).st_size
+ try:
+ contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
+ except mmap.error:
+ # Perhaps a socket?
+ pass
+ else:
+ return contents, size
+ contents = f.read()
+ size = len(contents)
+ return contents, size
-def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
- """Simple wrapper for mmap() which always supports the offset parameter.
- :param f: File object.
- :param offset: Offset in the file, from the beginning of the file.
- :param size: Size of the mmap'ed area
- :param access: Access mechanism.
- :return: MMAP'd area.
+def load_pack_index_file(path, f):
+ """Load an index file from a file-like object.
+
+ :param path: Path for the index file
+ :param f: File-like object
"""
- if offset+size > MAX_MMAP_SIZE and not supports_mmap_offset:
- raise AssertionError("%s is larger than 256 meg, and this version "
- "of Python does not support the offset argument to mmap().")
- if supports_mmap_offset:
- return mmap.mmap(f.fileno(), size, access=access, offset=offset), 0
+ contents, size = _load_file_contents(f)
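+ # v2 (and later) index files start with the magic '\377tOc' and a 4-byte
+ # version number; v1 files have no header and begin directly with the
+ # fan-out table.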
+ if contents[:4] == '\377tOc':
+ version = struct.unpack(">L", contents[4:8])[0]
+ if version == 2:
+ return PackIndex2(path, file=f, contents=contents,
+ size=size)
+ else:
+ raise KeyError("Unknown pack index format %d" % version)
else:
- mem = mmap.mmap(f.fileno(), size+offset, access=access)
- return mem, offset
+ return PackIndex1(path, file=f, contents=contents, size=size)
+
+
+def bisect_find_sha(start, end, sha, unpack_name):
+ """Find a SHA in a data blob with sorted SHAs.
+
+ :param start: Start index of range to search
+ :param end: End index of range to search
+ :param sha: Sha to find
+ :param unpack_name: Callback to retrieve SHA by index
+ :return: Index of the SHA, or None if it wasn't found
+ """
+ assert start <= end
+ while start <= end:
+ i = (start + end)/2
+ file_sha = unpack_name(i)
+ x = cmp(file_sha, sha)
+ if x < 0:
+ start = i + 1
+ elif x > 0:
+ end = i - 1
+ else:
+ return i
+ return None
class PackIndex(object):
the start and end offset and then bisect in to find if the value is present.
"""
- def __init__(self, filename):
+ def __init__(self, filename, file=None, contents=None, size=None):
"""Create a pack index object.
Provide it with the name of the index file to consider, and it will map
self._filename = filename
# Take the size now, so it can be checked each time we map the file to
# ensure that it hasn't changed.
- self._size = os.path.getsize(filename)
- self._file = open(filename, 'r')
- self._contents, map_offset = simple_mmap(self._file, 0, self._size)
- assert map_offset == 0
- if self._contents[:4] != '\377tOc':
- self.version = 1
- self._fan_out_table = self._read_fan_out_table(0)
+ if file is None:
+ self._file = GitFile(filename, 'rb')
+ else:
+ self._file = file
+ if contents is None:
+ self._contents, self._size = _load_file_contents(file, size)
else:
- (self.version, ) = unpack_from(">L", self._contents, 4)
- assert self.version in (2,), "Version was %d" % self.version
- self._fan_out_table = self._read_fan_out_table(8)
- self._name_table_offset = 8 + 0x100 * 4
- self._crc32_table_offset = self._name_table_offset + 20 * len(self)
- self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
+ self._contents, self._size = (contents, size)
def __eq__(self, other):
- if type(self) != type(other):
+ if not isinstance(other, PackIndex):
return False
if self._fan_out_table != other._fan_out_table:
return False
- for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
+ for (name1, _, _), (name2, _, _) in izip(self.iterentries(),
+ other.iterentries()):
if name1 != name2:
return False
return True
+
+ def __ne__(self, other):
+ return not self.__eq__(other)
def close(self):
self._file.close()
:return: Tuple with object name (SHA), offset in pack file and
CRC32 checksum (if known)."""
- if self.version == 1:
- (offset, name) = unpack_from(">L20s", self._contents,
- (0x100 * 4) + (i * 24))
- return (name, offset, None)
- else:
- return (self._unpack_name(i), self._unpack_offset(i),
- self._unpack_crc32_checksum(i))
+ raise NotImplementedError(self._unpack_entry)
def _unpack_name(self, i):
- if self.version == 1:
- offset = (0x100 * 4) + (i * 24) + 4
- else:
- offset = self._name_table_offset + i * 20
- return self._contents[offset:offset+20]
+ """Unpack the i-th name from the index file."""
+ raise NotImplementedError(self._unpack_name)
def _unpack_offset(self, i):
- if self.version == 1:
- offset = (0x100 * 4) + (i * 24)
- else:
- offset = self._pack_offset_table_offset + i * 4
- return unpack_from(">L", self._contents, offset)[0]
-
+ """Unpack the i-th object offset from the index file."""
+ raise NotImplementedError(self._unpack_offset)
+
def _unpack_crc32_checksum(self, i):
- if self.version == 1:
- return None
- else:
- return unpack_from(">L", self._contents,
- self._crc32_table_offset + i * 4)[0]
+ """Unpack the crc32 checksum for the i-th object from the index file."""
+ raise NotImplementedError(self._unpack_crc32_checksum)
def __iter__(self):
+ """Iterate over the SHAs in this pack."""
return imap(sha_to_hex, self._itersha())
def _itersha(self):
yield self._unpack_name(i)
def objects_sha1(self):
+ """Return the hex SHA1 over all the shas of all objects in this pack.
+
+ :note: This is used for the filename of the pack.
+ """
return iter_sha1(self._itersha())
def iterentries(self):
"""Iterate over the entries in this pack index.
- Will yield tuples with object name, offset in packfile and crc32 checksum.
+ Will yield tuples with object name, offset in packfile and crc32
+ checksum.
"""
for i in range(len(self)):
yield self._unpack_entry(i)
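+ # The fan-out table is 256 big-endian 32-bit counts; entry i holds the
+ # number of objects whose first SHA1 byte is <= i, which lets
+ # object_index() narrow the range that bisect_find_sha() has to search.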
def _read_fan_out_table(self, start_offset):
ret = []
for i in range(0x100):
- ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
+ ret.append(struct.unpack(">L",
+ self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
return ret
def check(self):
"""Check that the stored checksum matches the actual checksum."""
+ # TODO: Check pack contents, too
return self.calculate_checksum() == self.get_stored_checksum()
def calculate_checksum(self):
- f = open(self._filename, 'r')
- try:
- return make_sha(self._contents[:-20]).digest()
- finally:
- f.close()
+ """Calculate the SHA1 checksum over this pack index.
+
+ :return: 20-byte binary digest
+ """
+ return make_sha(self._contents[:-20]).digest()
def get_pack_checksum(self):
- """Return the SHA1 checksum stored for the corresponding packfile."""
+ """Return the SHA1 checksum stored for the corresponding packfile.
+
+ :return: 20-byte binary digest
+ """
return str(self._contents[-40:-20])
def get_stored_checksum(self):
- """Return the SHA1 checksum stored for this index."""
+ """Return the SHA1 checksum stored for this index.
+
+ :return: 20-byte binary digest
+ """
return str(self._contents[-20:])
def object_index(self, sha):
"""Return the index in to the corresponding packfile for the object.
- Given the name of an object it will return the offset that object lives
- at within the corresponding pack file. If the pack file doesn't have the
- object then None will be returned.
+ Given the name of an object it will return the offset that object
+ lives at within the corresponding pack file. If the pack file doesn't
+ have the object then None will be returned.
"""
if len(sha) == 40:
sha = hex_to_sha(sha)
else:
start = self._fan_out_table[idx-1]
end = self._fan_out_table[idx]
- assert start <= end
- while start <= end:
- i = (start + end)/2
- file_sha = self._unpack_name(i)
- if file_sha < sha:
- start = i + 1
- elif file_sha > sha:
- end = i - 1
- else:
- return self._unpack_offset(i)
- return None
+ i = bisect_find_sha(start, end, sha, self._unpack_name)
+ if i is None:
+ raise KeyError(sha)
+ return self._unpack_offset(i)
+
+
+class PackIndex1(PackIndex):
+ """Version 1 Pack Index."""
+
+ def __init__(self, filename, file=None, contents=None, size=None):
+ PackIndex.__init__(self, filename, file, contents, size)
+ self.version = 1
+ self._fan_out_table = self._read_fan_out_table(0)
+
+ def _unpack_entry(self, i):
+ (offset, name) = unpack_from(">L20s", self._contents,
+ (0x100 * 4) + (i * 24))
+ return (name, offset, None)
+
+ def _unpack_name(self, i):
+ offset = (0x100 * 4) + (i * 24) + 4
+ return self._contents[offset:offset+20]
+
+ def _unpack_offset(self, i):
+ offset = (0x100 * 4) + (i * 24)
+ return unpack_from(">L", self._contents, offset)[0]
+
+ def _unpack_crc32_checksum(self, i):
+ # Not stored in v1 index files
+ return None
+
+
+class PackIndex2(PackIndex):
+ """Version 2 Pack Index."""
+
+ def __init__(self, filename, file=None, contents=None, size=None):
+ PackIndex.__init__(self, filename, file, contents, size)
+ assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
+ (self.version, ) = unpack_from(">L", self._contents, 4)
+ assert self.version == 2, "Version was %d" % self.version
+ self._fan_out_table = self._read_fan_out_table(8)
+ self._name_table_offset = 8 + 0x100 * 4
+ self._crc32_table_offset = self._name_table_offset + 20 * len(self)
+ self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
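+ # v2 layout sketch: 8-byte header, 256-entry fan-out table, then for the
+ # whole pack a table of 20-byte SHA1s, a table of 4-byte CRC32s and a
+ # table of 4-byte pack offsets.  (The optional table of 8-byte offsets
+ # used by very large packs is not consulted here.)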
+
+ def _unpack_entry(self, i):
+ return (self._unpack_name(i), self._unpack_offset(i),
+ self._unpack_crc32_checksum(i))
+
+ def _unpack_name(self, i):
+ offset = self._name_table_offset + i * 20
+ return self._contents[offset:offset+20]
+
+ def _unpack_offset(self, i):
+ offset = self._pack_offset_table_offset + i * 4
+ return unpack_from(">L", self._contents, offset)[0]
+
+ def _unpack_crc32_checksum(self, i):
+ return unpack_from(">L", self._contents,
+ self._crc32_table_offset + i * 4)[0]
+
def read_pack_header(f):
+ """Read the header of a pack file.
+
+ :param f: File-like object to read from
+ :return: Tuple with pack version and number of objects
+ """
header = f.read(12)
assert header[:4] == "PACK"
(version,) = unpack_from(">L", header, 4)
return (version, num_objects)
-def read_pack_tail(f):
- return (f.read(20),)
+def chunks_length(chunks):
+ return sum(imap(len, chunks))
-def unpack_object(map, offset=0):
- bytes = take_msb_bytes(map, offset)
+def unpack_object(read):
+ """Unpack a Git object.
+
+ :return: tuple with type, uncompressed data as chunks, compressed size and
+ tail data
+ """
+ bytes = take_msb_bytes(read)
type = (bytes[0] >> 4) & 0x07
size = bytes[0] & 0x0f
for i, byte in enumerate(bytes[1:]):
size += (byte & 0x7f) << ((i * 7) + 4)
raw_base = len(bytes)
if type == 6: # offset delta
- bytes = take_msb_bytes(map, raw_base + offset)
+ bytes = take_msb_bytes(read)
+ raw_base += len(bytes)
assert not (bytes[-1] & 0x80)
delta_base_offset = bytes[0] & 0x7f
for byte in bytes[1:]:
delta_base_offset += 1
delta_base_offset <<= 7
delta_base_offset += (byte & 0x7f)
- raw_base+=len(bytes)
- uncomp, comp_len = read_zlib(map, offset + raw_base, size)
- assert size == len(uncomp)
- return type, (delta_base_offset, uncomp), comp_len+raw_base
+ uncomp, comp_len, unused = read_zlib_chunks(read)
+ assert size == chunks_length(uncomp)
+ return type, (delta_base_offset, uncomp), comp_len+raw_base, unused
elif type == 7: # ref delta
- basename = map[offset+raw_base:offset+raw_base+20]
- uncomp, comp_len = read_zlib(map, offset+raw_base+20, size)
- assert size == len(uncomp)
- return type, (basename, uncomp), comp_len+raw_base+20
+ basename = read(20)
+ raw_base += 20
+ uncomp, comp_len, unused = read_zlib_chunks(read)
+ assert size == chunks_length(uncomp)
+ return type, (basename, uncomp), comp_len+raw_base, unused
else:
- uncomp, comp_len = read_zlib(map, offset+raw_base, size)
- assert len(uncomp) == size
- return type, uncomp, comp_len+raw_base
+ uncomp, comp_len, unused = read_zlib_chunks(read)
+ assert chunks_length(uncomp) == size
+ return type, uncomp, comp_len+raw_base, unused
-def compute_object_size((num, obj)):
+def _compute_object_size((num, obj)):
+ """Compute the size of a unresolved object for use with LRUSizeCache.
+ """
if num in (6, 7):
- return len(obj[1])
- assert isinstance(obj, str)
- return len(obj)
+ return chunks_length(obj[1])
+ return chunks_length(obj)
class PackData(object):
buffer from the start of the deflated object on. This is bad, but until I
get mmap sorted out it will have to do.
- Currently there are no integrity checks done. Also no attempt is made to try
- and detect the delta case, or a request for an object at the wrong position.
- It will all just throw a zlib or KeyError.
+ Currently there are no integrity checks done. Also no attempt is made to
+ try and detect the delta case, or a request for an object at the wrong
+ position. It will all just throw a zlib or KeyError.
"""
- def __init__(self, filename):
- """Create a PackData object that represents the pack in the given filename.
+ def __init__(self, filename, file=None, size=None):
+ """Create a PackData object that represents the pack in the given
+ filename.
- The file must exist and stay readable until the object is disposed of. It
- must also stay the same size. It will be mapped whenever needed.
+ The file must exist and stay readable until the object is disposed of.
+ It must also stay the same size. It will be mapped whenever needed.
Currently there is a restriction on the size of the pack as the python
mmap implementation is flawed.
"""
self._filename = filename
- assert os.path.exists(filename), "%s is not a packfile" % filename
- self._size = os.path.getsize(filename)
+ self._size = size
self._header_size = 12
- assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (filename, self._size, self._header_size)
- self._read_header()
- self._offset_cache = LRUSizeCache(1024*1024*100,
- compute_size=compute_object_size)
-
- def _read_header(self):
- f = open(self._filename, 'rb')
- try:
- (version, self._num_objects) = \
- read_pack_header(f)
- f.seek(self._size-20)
- (self._stored_checksum,) = read_pack_tail(f)
- finally:
- f.close()
+ if file is None:
+ self._file = GitFile(self._filename, 'rb')
+ else:
+ self._file = file
+ (version, self._num_objects) = read_pack_header(self._file)
+ self._offset_cache = LRUSizeCache(1024*1024*20,
+ compute_size=_compute_object_size)
+
+ @classmethod
+ def from_file(cls, file, size):
+ return cls(str(file), file=file, size=size)
+
+ @classmethod
+ def from_path(cls, path):
+ return cls(filename=path)
+
+ def close(self):
+ self._file.close()
+
+ def _get_size(self):
+ if self._size is not None:
+ return self._size
+ self._size = os.path.getsize(self._filename)
+ assert self._size >= self._header_size, (
+ "%s is too small for a packfile (%d < %d)" %
+ (self._filename, self._size, self._header_size))
+ return self._size
def __len__(self):
"""Returns the number of objects in this pack."""
return self._num_objects
def calculate_checksum(self):
- """Calculate the checksum for this pack."""
- f = open(self._filename, 'rb')
- try:
- map, map_offset = simple_mmap(f, 0, self._size - 20)
- return make_sha(map[map_offset:self._size-20]).digest()
- finally:
- f.close()
+ """Calculate the checksum for this pack.
+
+ :return: 20-byte binary SHA1 digest
+ """
+ s = make_sha()
+ self._file.seek(0)
+ todo = self._get_size() - 20
+ while todo > 0:
+ x = self._file.read(min(todo, 1<<16))
+ s.update(x)
+ todo -= len(x)
+ return s.digest()
def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
"""Resolve an object, possibly resolving deltas when necessary.
:return: Tuple with object type and contents.
"""
- if not type in (6, 7): # Not a delta
+ if type not in (6, 7): # Not a delta
return type, obj
if get_offset is None:
if type == 6: # offset delta
(delta_offset, delta) = obj
assert isinstance(delta_offset, int)
- assert isinstance(delta, str)
base_offset = offset-delta_offset
type, base_obj = get_offset(base_offset)
assert isinstance(type, int)
elif type == 7: # ref delta
(basename, delta) = obj
assert isinstance(basename, str) and len(basename) == 20
- assert isinstance(delta, str)
type, base_obj = get_ref(basename)
assert isinstance(type, int)
# Can't be a ofs delta, as we wouldn't know the base offset
assert type != 6
base_offset = None
- type, base_text = self.resolve_object(base_offset, type, base_obj, get_ref)
+ type, base_chunks = self.resolve_object(base_offset, type, base_obj,
+ get_ref)
if base_offset is not None:
- self._offset_cache[base_offset] = type, base_text
- ret = (type, apply_delta(base_text, delta))
- return ret
+ self._offset_cache[base_offset] = type, base_chunks
+ return (type, apply_delta(base_chunks, delta))
- def iterobjects(self):
- offset = self._header_size
- f = open(self._filename, 'rb')
- num = len(self)
- map, _ = simple_mmap(f, 0, self._size)
- for i in range(num):
- (type, obj, total_size) = unpack_object(map, offset)
- crc32 = zlib.crc32(map[offset:offset+total_size]) & 0xffffffff
- yield offset, type, obj, crc32
- offset += total_size
- f.close()
+ def iterobjects(self, progress=None):
+ """Iterate over the objects in this pack, in pack order.
+
+ :param progress: Progress function, called with (current, total) counts
+ :return: Iterator over (offset, type, obj, crc32) tuples
+ """
+
+ class ObjectIterator(object):
+
+ def __init__(self, pack):
+ self.i = 0
+ self.offset = pack._header_size
+ self.num = len(pack)
+ self.map = pack._file
+
+ def __iter__(self):
+ return self
+
+ def __len__(self):
+ return self.num
+
+ def next(self):
+ if self.i == self.num:
+ raise StopIteration
+ self.map.seek(self.offset)
+ (type, obj, total_size, unused) = unpack_object(self.map.read)
+ self.map.seek(self.offset)
+ crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff
+ ret = (self.offset, type, obj, crc32)
+ self.offset += total_size
+ if progress:
+ progress(self.i, self.num)
+ self.i += 1
+ return ret
+ return ObjectIterator(self)
- def iterentries(self, ext_resolve_ref=None):
+ def iterentries(self, ext_resolve_ref=None, progress=None):
+ """Yield entries summarizing the contents of this pack.
+
+ :param ext_resolve_ref: Optional function to resolve base
+ objects (in case this is a thin pack)
+ :param progress: Progress function, called with current and
+ total object count.
+
+ This will yield tuples with (sha, offset, crc32)
+ """
found = {}
postponed = defaultdict(list)
class Postpone(Exception):
"""Raised to postpone delta resolving."""
def get_ref_text(sha):
+ assert len(sha) == 20
if sha in found:
- return found[sha]
+ return self.get_object_at(found[sha])
if ext_resolve_ref:
try:
return ext_resolve_ref(sha)
except KeyError:
pass
raise Postpone, (sha, )
- todo = list(self.iterobjects())
- while todo:
- (offset, type, obj, crc32) = todo.pop(0)
+ extra = []
+ todo = chain(self.iterobjects(progress=progress), extra)
+ for (offset, type, obj, crc32) in todo:
assert isinstance(offset, int)
assert isinstance(type, int)
- assert isinstance(obj, tuple) or isinstance(obj, str)
try:
- type, obj = self.resolve_object(offset, type, obj, get_ref_text)
+ type, obj = self.resolve_object(offset, type, obj,
+ get_ref_text)
except Postpone, (sha, ):
postponed[sha].append((offset, type, obj))
else:
- shafile = ShaFile.from_raw_string(type, obj)
+ shafile = ShaFile.from_raw_chunks(type, obj)
sha = shafile.sha().digest()
- found[sha] = (type, obj)
+ found[sha] = offset
yield sha, offset, crc32
- todo += postponed.get(sha, [])
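+ # Any deltas that were postponed while waiting for this object can
+ # be resolved now; appending them to `extra` makes the chain() above
+ # yield them after the pack's own objects.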
+ extra.extend(postponed.get(sha, []))
if postponed:
raise KeyError([sha_to_hex(h) for h in postponed.keys()])
- def sorted_entries(self, resolve_ext_ref=None):
- ret = list(self.iterentries(resolve_ext_ref))
+ def sorted_entries(self, resolve_ext_ref=None, progress=None):
+ """Return entries in this pack, sorted by SHA.
+
+ :param resolve_ext_ref: Optional function to resolve base
+ objects (in case this is a thin pack)
+ :param progress: Progress function, called with current and
+ total object count.
+ :return: List of tuples with (sha, offset, crc32)
+ """
+ ret = list(self.iterentries(resolve_ext_ref, progress=progress))
ret.sort()
return ret
- def create_index_v1(self, filename, resolve_ext_ref=None):
- entries = self.sorted_entries(resolve_ext_ref)
+ def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
+ """Create a version 1 file for this data file.
+
+ :param filename: Index filename.
+ :param resolve_ext_ref: Function to use for resolving externally
+ referenced SHA1s (for thin packs)
+ :param progress: Progress report function
+ """
+ entries = self.sorted_entries(resolve_ext_ref, progress=progress)
write_pack_index_v1(filename, entries, self.calculate_checksum())
- def create_index_v2(self, filename, resolve_ext_ref=None):
- entries = self.sorted_entries(resolve_ext_ref)
+ def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
+ """Create a version 2 index file for this data file.
+
+ :param filename: Index filename.
+ :param resolve_ext_ref: Function to use for resolving externally
+ referenced SHA1s (for thin packs)
+ :param progress: Progress report function
+ """
+ entries = self.sorted_entries(resolve_ext_ref, progress=progress)
write_pack_index_v2(filename, entries, self.calculate_checksum())
+
+ def create_index(self, filename, resolve_ext_ref=None, progress=None,
+ version=2):
+ """Create an index file for this data file.
+
+ :param filename: Index filename.
+ :param resolve_ext_ref: Function to use for resolving externally
+ referenced SHA1s (for thin packs)
+ :param progress: Progress report function
+ :param version: Index format version to write (1 or 2)
+ """
+ if version == 1:
+ self.create_index_v1(filename, resolve_ext_ref, progress)
+ elif version == 2:
+ self.create_index_v2(filename, resolve_ext_ref, progress)
+ else:
+ raise ValueError("unknown index format %d" % version)
def get_stored_checksum(self):
- return self._stored_checksum
+ """Return the expected checksum stored in this pack."""
+ self._file.seek(self._get_size()-20)
+ return self._file.read(20)
def check(self):
+ """Check the consistency of this pack."""
return (self.calculate_checksum() == self.get_stored_checksum())
def get_object_at(self, offset):
"""Given an offset in to the packfile return the object that is there.
- Using the associated index the location of an object can be looked up, and
- then the packfile can be asked directly for that object using this
+ Using the associated index the location of an object can be looked up,
+ and then the packfile can be asked directly for that object using this
function.
"""
if offset in self._offset_cache:
assert isinstance(offset, long) or isinstance(offset, int),\
"offset was %r" % offset
assert offset >= self._header_size
- f = open(self._filename, 'rb')
- try:
- map, map_offset = simple_mmap(f, offset, self._size-offset)
- ret = unpack_object(map, map_offset)[:2]
- return ret
- finally:
- f.close()
+ self._file.seek(offset)
+ return unpack_object(self._file.read)[:2]
+
+
+class SHA1Reader(object):
+ """Wrapper around a file-like object that remembers the SHA1 of
+ the data read from it."""
+
+ def __init__(self, f):
+ self.f = f
+ self.sha1 = make_sha("")
+
+ def read(self, num=None):
+ data = self.f.read(num)
+ self.sha1.update(data)
+ return data
+
+ def check_sha(self):
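+ # Compare the 20-byte checksum that trails the stream against the
+ # SHA1 of everything read so far; callers typically invoke this once
+ # the payload has been consumed.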
+ stored = self.f.read(20)
+ if stored != self.sha1.digest():
+ raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
+
+ def close(self):
+ return self.f.close()
+
+ def tell(self):
+ return self.f.tell()
class SHA1Writer(object):
+ """Wrapper around a file-like object that remembers the SHA1 of
+ the data written to it."""
def __init__(self, f):
self.f = f
:param o: Object to write
:return: Tuple with offset at which the object was written, and crc32
"""
- ret = f.tell()
+ offset = f.tell()
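+ # Remember where this object starts, so the caller can record the
+ # offset (and the crc32 computed below) in the pack index.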
packed_data_hdr = ""
- if type == 6: # ref delta
+ if type == 6: # offset delta
(delta_base_offset, object) = object
- elif type == 7: # offset delta
+ elif type == 7: # ref delta
(basename, object) = object
size = len(object)
c = (type << 4) | (size & 15)
packed_data_hdr += basename
packed_data = packed_data_hdr + zlib.compress(object)
f.write(packed_data)
- return (f.tell(), (zlib.crc32(packed_data) & 0xffffffff))
+ return (offset, (zlib.crc32(packed_data) & 0xffffffff))
def write_pack(filename, objects, num_objects):
- f = open(filename + ".pack", 'w')
+ """Write a new pack data file.
+
+ :param filename: Path to the new pack file (without .pack extension)
+ :param objects: Iterable over (object, path) tuples to write
+ :param num_objects: Number of objects to write
+ """
+ f = GitFile(filename + ".pack", 'wb')
try:
entries, data_sum = write_pack_data(f, objects, num_objects)
finally:
# This helps us find good objects to diff against us
magic = []
for obj, path in recency:
- magic.append( (obj.type, path, 1, -len(obj.as_raw_string()[1]), obj) )
+ magic.append( (obj.type_num, path, 1, -obj.raw_length(), obj) )
magic.sort()
# Build a map of objects and their index in magic - so we can find preceding objects
# to diff against
f.write(struct.pack(">L", num_objects)) # Number of objects in pack
for o, path in recency:
sha1 = o.sha().digest()
- orig_t, raw = o.as_raw_string()
+ orig_t = o.type_num
+ raw = o.as_raw_string()
winner = raw
t = orig_t
#for i in range(offs[o]-window, window):
# if i < 0 or i >= len(offs): continue
# b = magic[i][4]
- # if b.type != orig_t: continue
- # _, base = b.as_raw_string()
+ # if b.type_num != orig_t: continue
+ # base = b.as_raw_string()
# delta = create_delta(base, raw)
# if len(delta) < len(winner):
# winner = delta
"""Write a new pack index file.
:param filename: The filename of the new pack index file.
- :param entries: List of tuples with object name (sha), offset_in_pack, and
- crc32_checksum.
+ :param entries: List of tuples with object name (sha), offset_in_pack,
+ and crc32_checksum.
:param pack_checksum: Checksum of the pack file.
"""
- f = open(filename, 'w')
+ f = GitFile(filename, 'wb')
f = SHA1Writer(f)
fan_out_table = defaultdict(lambda: 0)
for (name, offset, entry_checksum) in entries:
def create_delta(base_buf, target_buf):
- """Use python difflib to work out how to transform base_buf to target_buf"""
+ """Use python difflib to work out how to transform base_buf to target_buf.
+
+ :param base_buf: Base buffer
+ :param target_buf: Target buffer
+ """
assert isinstance(base_buf, str)
assert isinstance(target_buf, str)
out_buf = ""
o = i1
for i in range(4):
if o & 0xff << i*8:
- scratch += chr(o >> i)
+ scratch += chr((o >> i*8) & 0xff)
op |= 1 << i
s = i2 - i1
for i in range(2):
if s & 0xff << i*8:
- scratch += chr(s >> i)
+ scratch += chr((s >> i*8) & 0xff)
op |= 1 << (4+i)
out_buf += chr(op)
out_buf += scratch
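+ # The copy instruction emitted above only writes the non-zero bytes of
+ # the source offset and length; the bits set in `op` tell apply_delta
+ # which bytes are present.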
:param src_buf: Source buffer
:param delta: Delta instructions
"""
- assert isinstance(src_buf, str), "was %r" % (src_buf,)
- assert isinstance(delta, str)
+ if type(src_buf) != str:
+ src_buf = "".join(src_buf)
+ if type(delta) != str:
+ delta = "".join(delta)
out = []
index = 0
delta_length = len(delta)
if index != delta_length:
raise ApplyDeltaError("delta not empty: %r" % delta[index:])
- out = ''.join(out)
- if dest_size != len(out):
+ if dest_size != chunks_length(out):
raise ApplyDeltaError("dest size incorrect")
return out
"""Write a new pack index file.
:param filename: The filename of the new pack index file.
- :param entries: List of tuples with object name (sha), offset_in_pack, and
- crc32_checksum.
+ :param entries: List of tuples with object name (sha), offset_in_pack,
+ and crc32_checksum.
:param pack_checksum: Checksum of the pack file.
"""
- f = open(filename, 'w')
+ f = GitFile(filename, 'wb')
f = SHA1Writer(f)
f.write('\377tOc') # Magic!
f.write(struct.pack(">L", 2))
class Pack(object):
+ """A Git pack object."""
def __init__(self, basename):
self._basename = basename
self._data = None
self._idx = None
+ @classmethod
+ def from_objects(cls, data, idx):
+ """Create a new pack object from pack data and index objects."""
+ ret = Pack("")
+ ret._data = data
+ ret._idx = idx
+ return ret
+
def name(self):
"""The SHA over the SHAs of the objects in this pack."""
- return self.idx.objects_sha1()
+ return self.index.objects_sha1()
@property
def data(self):
+ """The pack data object being used."""
if self._data is None:
self._data = PackData(self._data_path)
- assert len(self.idx) == len(self._data)
- idx_stored_checksum = self.idx.get_pack_checksum()
+ assert len(self.index) == len(self._data)
+ idx_stored_checksum = self.index.get_pack_checksum()
data_stored_checksum = self._data.get_stored_checksum()
if idx_stored_checksum != data_stored_checksum:
raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
return self._data
@property
- def idx(self):
+ def index(self):
+ """The index being used.
+
+ :note: This may be an in-memory index
+ """
if self._idx is None:
- self._idx = PackIndex(self._idx_path)
+ self._idx = load_pack_index(self._idx_path)
return self._idx
def close(self):
if self._data is not None:
self._data.close()
- self.idx.close()
+ self.index.close()
def __eq__(self, other):
- return type(self) == type(other) and self.idx == other.idx
+ return type(self) == type(other) and self.index == other.index
def __len__(self):
"""Number of entries in this pack."""
- return len(self.idx)
+ return len(self.index)
def __repr__(self):
- return "Pack(%r)" % self._basename
+ return "%s(%r)" % (self.__class__.__name__, self._basename)
def __iter__(self):
"""Iterate over all the sha1s of the objects in this pack."""
- return iter(self.idx)
+ return iter(self.index)
def check(self):
- if not self.idx.check():
+ """Check the integrity of this pack."""
+ if not self.index.check():
return False
if not self.data.check():
return False
def __contains__(self, sha1):
"""Check whether this pack contains a particular SHA1."""
- return (self.idx.object_index(sha1) is not None)
+ try:
+ self.index.object_index(sha1)
+ return True
+ except KeyError:
+ return False
def get_raw(self, sha1, resolve_ref=None):
- offset = self.idx.object_index(sha1)
- if offset is None:
- raise KeyError(sha1)
-
- type, obj = self.data.get_object_at(offset)
- if isinstance(offset, long):
+ offset = self.index.object_index(sha1)
+ obj_type, obj = self.data.get_object_at(offset)
+ if type(offset) is long:
offset = int(offset)
- assert isinstance(offset, int)
- return self.data.resolve_object(offset, type, obj, resolve_ref)
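+ # By default, resolve ref deltas against other objects in this same
+ # pack; get_raw calls itself recursively for the base object.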
+ if resolve_ref is None:
+ resolve_ref = self.get_raw
+ kind, chunks = self.data.resolve_object(offset, obj_type, obj,
+ resolve_ref)
+ return kind, "".join(chunks)
def __getitem__(self, sha1):
"""Retrieve the specified SHA1."""
return ShaFile.from_raw_string(type, uncomp)
def iterobjects(self, get_raw=None):
+ """Iterate over the objects in this pack."""
if get_raw is None:
- def get_raw(x):
- raise KeyError(x)
+ get_raw = self.get_raw
for offset, type, obj, crc32 in self.data.iterobjects():
assert isinstance(offset, int)
- yield ShaFile.from_raw_string(
- *self.data.resolve_object(offset, type, obj, get_raw))
-
+ type, obj = self.data.resolve_object(offset, type, obj, get_raw)
+ yield ShaFile.from_raw_chunks(type, obj)
-def load_packs(path):
- if not os.path.exists(path):
- return
- for name in os.listdir(path):
- if name.startswith("pack-") and name.endswith(".pack"):
- yield Pack(os.path.join(path, name[:-len(".pack")]))
+try:
+ from dulwich._pack import apply_delta, bisect_find_sha
+except ImportError:
+ pass