X-Git-Url: http://git.samba.org/samba.git/?a=blobdiff_plain;f=dulwich%2Fpack.py;h=323dc9a00d9fa2fb219d4c68c7b0a0942eb3c9bc;hb=2dbedff35566546f6ce5ae1d3e5ac836d663ac46;hp=e165b5d255425cc8aa1c470d1faaf1bc4cc306b9;hpb=fb1513b2f773f89bd11ed2baa3aff38248e22d51;p=jelmer%2Fdulwich-libgit2.git diff --git a/dulwich/pack.py b/dulwich/pack.py index e165b5d..323dc9a 100644 --- a/dulwich/pack.py +++ b/dulwich/pack.py @@ -72,53 +72,50 @@ supports_mmap_offset = (sys.version_info[0] >= 3 or (sys.version_info[0] == 2 and sys.version_info[1] >= 6)) -def take_msb_bytes(map, offset): +def take_msb_bytes(read): """Read bytes marked with most significant bit. - :param map: The buffer. - :param offset: Offset in the buffer at which to start reading. + :param read: Read function """ ret = [] while len(ret) == 0 or ret[-1] & 0x80: - ret.append(ord(map[offset])) - offset += 1 + ret.append(ord(read(1))) return ret -def read_zlib_chunks(data, offset): +def read_zlib_chunks(read, buffer_size=4096): """Read chunks of zlib data from a buffer. - :param data: Buffer to read from - :param offset: Offset at which to start reading - :return: Tuple with list of chunks and length of - compressed data length + :param read: Read function + :return: Tuple with list of chunks, length of + compressed data length and unused read data """ obj = zlib.decompressobj() ret = [] fed = 0 while obj.unused_data == "": - base = offset+fed - add = data[base:base+1024] - if len(add) < 1024: + add = read(buffer_size) + if len(add) < buffer_size: add += "Z" fed += len(add) ret.append(obj.decompress(add)) comp_len = fed-len(obj.unused_data) - return ret, comp_len + return ret, comp_len, obj.unused_data -def read_zlib(data, offset, dec_size): +def read_zlib(read, dec_size): """Read zlib-compressed data from a buffer. - :param data: Buffer - :param offset: Offset in the buffer at which to read + :param read: Read function :param dec_size: Size of the decompressed buffer - :return: Uncompressed buffer and compressed buffer length. + :return: Uncompressed buffer, compressed buffer length and unused read + data. """ - ret, comp_len = read_zlib_chunks(data, offset) + ret, comp_len, unused = read_zlib_chunks(read) x = "".join(ret) assert len(x) == dec_size - return x, comp_len + return x, comp_len, unused + def iter_sha1(iter): @@ -133,35 +130,31 @@ def iter_sha1(iter): return sha1.hexdigest() -def simple_mmap(f, offset, size): - """Simple wrapper for mmap() which always supports the offset parameter. +def load_pack_index(path): + """Load an index file by path. - :param f: File object. - :param offset: Offset in the file, from the beginning of the file. - :param size: Size of the mmap'ed area - :param access: Access mechanism. - :return: MMAP'd area. + :param filename: Path to the index file """ - mem = mmap.mmap(f.fileno(), size+offset, access=mmap.ACCESS_READ) - return mem, offset + f = GitFile(path, 'rb') + return load_pack_index_file(path, f) -def load_pack_index(filename): - """Load an index file by path. +def load_pack_index_file(path, f): + """Load an index file from a file-like object. - :param filename: Path to the index file + :param path: Path for the index file + :param f: File-like object """ - f = GitFile(filename, 'rb') if f.read(4) == '\377tOc': version = struct.unpack(">L", f.read(4))[0] if version == 2: f.seek(0) - return PackIndex2(filename, file=f) + return PackIndex2(path, file=f) else: raise KeyError("Unknown pack index format %d" % version) else: f.seek(0) - return PackIndex1(filename, file=f) + return PackIndex1(path, file=f) def bisect_find_sha(start, end, sha, unpack_name): @@ -201,7 +194,7 @@ class PackIndex(object): the start and end offset and then bisect in to find if the value is present. """ - def __init__(self, filename, file=None): + def __init__(self, filename, file=None, size=None): """Create a pack index object. Provide it with the name of the index file to consider, and it will map @@ -210,13 +203,23 @@ class PackIndex(object): self._filename = filename # Take the size now, so it can be checked each time we map the file to # ensure that it hasn't changed. - self._size = os.path.getsize(filename) if file is None: self._file = GitFile(filename, 'rb') else: self._file = file - self._contents, map_offset = simple_mmap(self._file, 0, self._size) - assert map_offset == 0 + fileno = getattr(self._file, 'fileno', None) + if fileno is not None: + fd = self._file.fileno() + if size is None: + self._size = os.fstat(fd).st_size + else: + self._size = size + self._contents = mmap.mmap(fd, self._size, + access=mmap.ACCESS_READ) + else: + self._file.seek(0) + self._contents = self._file.read() + self._size = len(self._contents) def __eq__(self, other): if not isinstance(other, PackIndex): @@ -347,8 +350,8 @@ class PackIndex(object): class PackIndex1(PackIndex): """Version 1 Pack Index.""" - def __init__(self, filename, file=None): - PackIndex.__init__(self, filename, file) + def __init__(self, filename, file=None, size=None): + PackIndex.__init__(self, filename, file, size) self.version = 1 self._fan_out_table = self._read_fan_out_table(0) @@ -373,8 +376,8 @@ class PackIndex1(PackIndex): class PackIndex2(PackIndex): """Version 2 Pack Index.""" - def __init__(self, filename, file=None): - PackIndex.__init__(self, filename, file) + def __init__(self, filename, file=None, size=None): + PackIndex.__init__(self, filename, file, size) assert self._contents[:4] == '\377tOc', "Not a v2 pack index file" (self.version, ) = unpack_from(">L", self._contents, 4) assert self.version == 2, "Version was %d" % self.version @@ -414,38 +417,40 @@ def read_pack_header(f): return (version, num_objects) -def unpack_object(map, offset=0): +def unpack_object(read): """Unpack a Git object. - :return: tuple with type, uncompressed data and compressed size + :return: tuple with type, uncompressed data, compressed size and + tail data """ - bytes = take_msb_bytes(map, offset) + bytes = take_msb_bytes(read) type = (bytes[0] >> 4) & 0x07 size = bytes[0] & 0x0f for i, byte in enumerate(bytes[1:]): size += (byte & 0x7f) << ((i * 7) + 4) raw_base = len(bytes) if type == 6: # offset delta - bytes = take_msb_bytes(map, raw_base + offset) + bytes = take_msb_bytes(read) + raw_base += len(bytes) assert not (bytes[-1] & 0x80) delta_base_offset = bytes[0] & 0x7f for byte in bytes[1:]: delta_base_offset += 1 delta_base_offset <<= 7 delta_base_offset += (byte & 0x7f) - raw_base+=len(bytes) - uncomp, comp_len = read_zlib(map, offset + raw_base, size) + uncomp, comp_len, unused = read_zlib(read, size) assert size == len(uncomp) - return type, (delta_base_offset, uncomp), comp_len+raw_base + return type, (delta_base_offset, uncomp), comp_len+raw_base, unused elif type == 7: # ref delta - basename = map[offset+raw_base:offset+raw_base+20] - uncomp, comp_len = read_zlib(map, offset+raw_base+20, size) + basename = read(20) + raw_base += 20 + uncomp, comp_len, unused = read_zlib(read, size) assert size == len(uncomp) - return type, (basename, uncomp), comp_len+raw_base+20 + return type, (basename, uncomp), comp_len+raw_base, unused else: - uncomp, comp_len = read_zlib(map, offset+raw_base, size) + uncomp, comp_len, unused = read_zlib(read, size) assert len(uncomp) == size - return type, uncomp, comp_len+raw_base + return type, uncomp, comp_len+raw_base, unused def _compute_object_size((num, obj)): @@ -484,7 +489,7 @@ class PackData(object): It will all just throw a zlib or KeyError. """ - def __init__(self, filename): + def __init__(self, filename, file=None, size=None): """Create a PackData object that represents the pack in the given filename. The file must exist and stay readable until the object is disposed of. It @@ -494,21 +499,33 @@ class PackData(object): mmap implementation is flawed. """ self._filename = filename - self._size = os.path.getsize(filename) + self._size = size self._header_size = 12 - assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (filename, self._size, self._header_size) - self._file = GitFile(self._filename, 'rb') - self._read_header() + if file is None: + self._file = GitFile(self._filename, 'rb') + else: + self._file = file + (version, self._num_objects) = read_pack_header(self._file) self._offset_cache = LRUSizeCache(1024*1024*20, compute_size=_compute_object_size) + @classmethod + def from_file(cls, file, size): + return cls(str(file), file=file, size=size) + + @classmethod + def from_path(cls, path): + return cls(filename=path) + def close(self): self._file.close() - - def _read_header(self): - (version, self._num_objects) = read_pack_header(self._file) - self._file.seek(self._size-20) - self._stored_checksum = self._file.read(20) + + def _get_size(self): + if self._size is not None: + return self._size + self._size = os.path.getsize(self._filename) + assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (self._filename, self._size, self._header_size) + return self._size def __len__(self): """Returns the number of objects in this pack.""" @@ -519,11 +536,14 @@ class PackData(object): :return: 20-byte binary SHA1 digest """ - map, map_offset = simple_mmap(self._file, 0, self._size - 20) - try: - return make_sha(map[map_offset:self._size-20]).digest() - finally: - map.close() + s = make_sha() + self._file.seek(0) + todo = self._get_size() - 20 + while todo > 0: + x = self._file.read(min(todo, 1<<16)) + s.update(x) + todo -= len(x) + return s.digest() def resolve_object(self, offset, type, obj, get_ref, get_offset=None): """Resolve an object, possibly resolving deltas when necessary. @@ -566,10 +586,7 @@ class PackData(object): self.i = 0 self.offset = pack._header_size self.num = len(pack) - self.map, _ = simple_mmap(pack._file, 0, pack._size) - - def __del__(self): - self.map.close() + self.map = pack._file def __iter__(self): return self @@ -580,8 +597,10 @@ class PackData(object): def next(self): if self.i == self.num: raise StopIteration - (type, obj, total_size) = unpack_object(self.map, self.offset) - crc32 = zlib.crc32(self.map[self.offset:self.offset+total_size]) & 0xffffffff + self.map.seek(self.offset) + (type, obj, total_size, unused) = unpack_object(self.map.read) + self.map.seek(self.offset) + crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff ret = (self.offset, type, obj, crc32) self.offset += total_size if progress: @@ -687,7 +706,8 @@ class PackData(object): def get_stored_checksum(self): """Return the expected checksum stored in this pack.""" - return self._stored_checksum + self._file.seek(self._get_size()-20) + return self._file.read(20) def check(self): """Check the consistency of this pack.""" @@ -705,12 +725,8 @@ class PackData(object): assert isinstance(offset, long) or isinstance(offset, int),\ "offset was %r" % offset assert offset >= self._header_size - map, map_offset = simple_mmap(self._file, offset, self._size-offset) - try: - ret = unpack_object(map, map_offset)[:2] - return ret - finally: - map.close() + self._file.seek(offset) + return unpack_object(self._file.read)[:2] class SHA1Reader(object): @@ -832,7 +848,7 @@ def write_pack_data(f, objects, num_objects, window=10): # This helps us find good objects to diff against us magic = [] for obj, path in recency: - magic.append( (obj.type, path, 1, -len(obj.as_raw_string()), obj) ) + magic.append( (obj.type, path, 1, -obj.raw_length(), obj) ) magic.sort() # Build a map of objects and their index in magic - so we can find preceeding objects # to diff against