X-Git-Url: http://git.samba.org/samba.git/?a=blobdiff_plain;f=dulwich%2Fpack.py;h=9fb54972b4acd0e2d3fb0d8ee43bb8d4971d7db6;hb=f0bb0dc08bb08ab6c305f5d86d39e6b385efd0b5;hp=0b4f43f2114f82ea81c0f4e656b9c4b4f2d6a23d;hpb=838bb0440fa715ec6588a90590c2553be027efd4;p=jelmer%2Fdulwich-libgit2.git diff --git a/dulwich/pack.py b/dulwich/pack.py index 0b4f43f..9fb5497 100644 --- a/dulwich/pack.py +++ b/dulwich/pack.py @@ -1,14 +1,11 @@ # pack.py -- For dealing wih packed git objects. # Copyright (C) 2007 James Westby # Copryight (C) 2008 Jelmer Vernooij -# The code is loosely based on that in the sha1_file.c file from git itself, -# which is Copyright (C) Linus Torvalds, 2005 and distributed under the -# GPL version 2. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; version 2 -# of the License. +# of the License or (at your option) a later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -35,14 +32,19 @@ a pointer in to the corresponding packfile. from collections import defaultdict import hashlib +from itertools import imap, izip import mmap import os +import sha import struct import sys import zlib +import difflib from objects import ( ShaFile, + hex_to_sha, + sha_to_hex, ) from errors import ApplyDeltaError @@ -65,6 +67,8 @@ def read_zlib(data, offset, dec_size): while obj.unused_data == "": base = offset+fed add = data[base:base+1024] + if len(add) < 1024: + add += "Z" fed += len(add) x += obj.decompress(add) assert len(x) == dec_size @@ -72,19 +76,12 @@ def read_zlib(data, offset, dec_size): return x, comp_len -def hex_to_sha(hex): - """Convert a hex string to a binary sha string.""" - ret = "" - for i in range(0, len(hex), 2): - ret += chr(int(hex[i:i+2], 16)) - return ret +def iter_sha1(iter): + sha = hashlib.sha1() + for name in iter: + sha.update(name) + return sha.hexdigest() -def sha_to_hex(sha): - """Convert a binary sha string to a hex sha string.""" - ret = "" - for i in sha: - ret += "%02x" % ord(i) - return ret MAX_MMAP_SIZE = 256 * 1024 * 1024 @@ -143,11 +140,12 @@ def resolve_object(offset, type, obj, get_ref, get_offset): (basename, delta) = obj assert isinstance(basename, str) and len(basename) == 20 assert isinstance(delta, str) - type, base_obj= get_ref(basename) + type, base_obj = get_ref(basename) assert isinstance(type, int) type, base_text = resolve_object(offset, type, base_obj, get_ref, get_offset) return type, apply_delta(base_text, delta) + class PackIndex(object): """An index in to a packfile. @@ -169,7 +167,6 @@ class PackIndex(object): it whenever required. """ self._filename = filename - assert os.path.exists(filename), "%s is not a pack index" % filename # Take the size now, so it can be checked each time we map the file to # ensure that it hasn't changed. self._size = os.path.getsize(filename) @@ -187,9 +184,16 @@ class PackIndex(object): self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self) def __eq__(self, other): - return (type(self) == type(other) and - self._fan_out_table == other._fan_out_table and - list(self.iterentries()) == list(other.iterentries())) + if type(self) != type(other): + return False + + if self._fan_out_table != other._fan_out_table: + return False + + for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()): + if name1 != name2: + return False + return True def close(self): self._file.close() @@ -233,8 +237,14 @@ class PackIndex(object): self._crc32_table_offset + i * 4)[0] def __iter__(self): + return imap(sha_to_hex, self._itersha()) + + def _itersha(self): for i in range(len(self)): - yield sha_to_hex(self._unpack_name(i)) + yield self._unpack_name(i) + + def objects_sha1(self): + return iter_sha1(self._itersha()) def iterentries(self): """Iterate over the entries in this pack index. @@ -301,6 +311,49 @@ class PackIndex(object): return None +def read_pack_header(f): + header = f.read(12) + assert header[:4] == "PACK" + (version,) = struct.unpack_from(">L", header, 4) + assert version in (2, 3), "Version was %d" % version + (num_objects,) = struct.unpack_from(">L", header, 8) + return (version, num_objects) + + +def read_pack_tail(f): + return (f.read(20),) + + +def unpack_object(map): + bytes = take_msb_bytes(map, 0) + type = (bytes[0] >> 4) & 0x07 + size = bytes[0] & 0x0f + for i, byte in enumerate(bytes[1:]): + size += (byte & 0x7f) << ((i * 7) + 4) + raw_base = len(bytes) + if type == 6: # offset delta + bytes = take_msb_bytes(map, raw_base) + assert not (bytes[-1] & 0x80) + delta_base_offset = bytes[0] & 0x7f + for byte in bytes[1:]: + delta_base_offset += 1 + delta_base_offset <<= 7 + delta_base_offset += (byte & 0x7f) + raw_base+=len(bytes) + uncomp, comp_len = read_zlib(map, raw_base, size) + assert size == len(uncomp) + return type, (delta_base_offset, uncomp), comp_len+raw_base + elif type == 7: # ref delta + basename = map[raw_base:raw_base+20] + uncomp, comp_len = read_zlib(map, raw_base+20, size) + assert size == len(uncomp) + return type, (basename, uncomp), comp_len+raw_base+20 + else: + uncomp, comp_len = read_zlib(map, raw_base, size) + assert len(uncomp) == size + return type, uncomp, comp_len+raw_base + + class PackData(object): """The data contained in a packfile. @@ -340,21 +393,19 @@ class PackData(object): self._filename = filename assert os.path.exists(filename), "%s is not a packfile" % filename self._size = os.path.getsize(filename) - self._header_size = self._read_header() + self._header_size = 12 + assert self._size >= self._header_size, "%s is too small for a packfile" % filename + self._read_header() def _read_header(self): f = open(self._filename, 'rb') try: - header = f.read(12) + (version, self._num_objects) = \ + read_pack_header(f) f.seek(self._size-20) - self._stored_checksum = f.read(20) + (self._stored_checksum,) = read_pack_tail(f) finally: f.close() - assert header[:4] == "PACK" - (version,) = struct.unpack_from(">L", header, 4) - assert version in (2, 3), "Version was %d" % version - (self._num_objects,) = struct.unpack_from(">L", header, 8) - return 12 # Header size def __len__(self): """Returns the number of objects in this pack.""" @@ -369,45 +420,70 @@ class PackData(object): f.close() def iterobjects(self): - """Yields (name, offset, crc32 checksum).""" offset = self._header_size f = open(self._filename, 'rb') for i in range(len(self)): map = simple_mmap(f, offset, self._size-offset) - (type, obj, total_size) = self._unpack_object(map) + (type, obj, total_size) = unpack_object(map) yield offset, type, obj offset += total_size f.close() - def iterentries(self): + def iterentries(self, ext_resolve_ref=None): found = {} - postponed = list(self.iterobjects()) - while postponed: - (offset, type, obj) = postponed.pop() + at = {} + postponed = defaultdict(list) + class Postpone(Exception): + """Raised to postpone delta resolving.""" + + def get_ref_text(sha): + if sha in found: + return found[sha] + if ext_resolve_ref: + try: + return ext_resolve_ref(sha) + except KeyError: + pass + raise Postpone, (sha, ) + todo = list(self.iterobjects()) + while todo: + (offset, type, obj) = todo.pop(0) + at[offset] = (type, obj) assert isinstance(offset, int) assert isinstance(type, int) assert isinstance(obj, tuple) or isinstance(obj, str) try: - type, obj = resolve_object(offset, type, obj, found.__getitem__, - self.get_object_at) - except KeyError: - postponed.append((offset, type, obj)) + type, obj = resolve_object(offset, type, obj, get_ref_text, + at.__getitem__) + except Postpone, (sha, ): + postponed[sha].append((offset, type, obj)) else: shafile = ShaFile.from_raw_string(type, obj) sha = shafile.sha().digest() found[sha] = (type, obj) yield sha, offset, shafile.crc32() + todo += postponed.get(sha, []) + if postponed: + raise KeyError([sha_to_hex(h) for h in postponed.keys()]) + + def sorted_entries(self, resolve_ext_ref=None): + ret = list(self.iterentries(resolve_ext_ref)) + ret.sort() + return ret def create_index_v1(self, filename): - entries = list(self.iterentries()) + entries = self.sorted_entries() write_pack_index_v1(filename, entries, self.calculate_checksum()) def create_index_v2(self, filename): - entries = list(self.iterentries()) - write_pack_index_v1(filename, entries, self.calculate_checksum()) + entries = self.sorted_entries() + write_pack_index_v2(filename, entries, self.calculate_checksum()) + + def get_stored_checksum(self): + return self._stored_checksum def check(self): - return (self.calculate_checksum() == self._stored_checksum) + return (self.calculate_checksum() == self.get_stored_checksum()) def get_object_at(self, offset): """Given an offset in to the packfile return the object that is there. @@ -425,39 +501,10 @@ class PackData(object): f = open(self._filename, 'rb') try: map = simple_mmap(f, offset, size-offset) - return self._unpack_object(map)[:2] + return unpack_object(map)[:2] finally: f.close() - def _unpack_object(self, map): - bytes = take_msb_bytes(map, 0) - type = (bytes[0] >> 4) & 0x07 - size = bytes[0] & 0x0f - for i, byte in enumerate(bytes[1:]): - size += (byte & 0x7f) << ((i * 7) + 4) - raw_base = len(bytes) - if type == 6: # offset delta - bytes = take_msb_bytes(map, raw_base) - assert not (bytes[-1] & 0x80) - delta_base_offset = bytes[0] & 0x7f - for byte in bytes[1:]: - delta_base_offset += 1 - delta_base_offset <<= 7 - delta_base_offset += (byte & 0x7f) - raw_base+=len(bytes) - uncomp, comp_len = read_zlib(map, raw_base, size) - assert size == len(uncomp) - return type, (delta_base_offset, uncomp), comp_len+raw_base - elif type == 7: # ref delta - basename = map[raw_base:raw_base+20] - uncomp, comp_len = read_zlib(map, raw_base+20, size) - assert size == len(uncomp) - return type, (basename, uncomp), comp_len+raw_base - else: - uncomp, comp_len = read_zlib(map, raw_base, size) - assert len(uncomp) == size - return type, uncomp, comp_len+raw_base - class SHA1Writer(object): @@ -469,30 +516,110 @@ class SHA1Writer(object): self.sha1.update(data) self.f.write(data) - def close(self): + def write_sha(self): sha = self.sha1.digest() assert len(sha) == 20 self.f.write(sha) + return sha + + def close(self): + sha = self.write_sha() self.f.close() return sha + def tell(self): + return self.f.tell() + + +def write_pack_object(f, type, object): + """Write pack object to a file. + + :param f: File to write to + :param o: Object to write + """ + ret = f.tell() + if type == 6: # ref delta + (delta_base_offset, object) = object + elif type == 7: # offset delta + (basename, object) = object + size = len(object) + c = (type << 4) | (size & 15) + size >>= 4 + while size: + f.write(chr(c | 0x80)) + c = size & 0x7f + size >>= 7 + f.write(chr(c)) + if type == 6: # offset delta + ret = [delta_base_offset & 0x7f] + delta_base_offset >>= 7 + while delta_base_offset: + delta_base_offset -= 1 + ret.insert(0, 0x80 | (delta_base_offset & 0x7f)) + delta_base_offset >>= 7 + f.write("".join([chr(x) for x in ret])) + elif type == 7: # ref delta + assert len(basename) == 20 + f.write(basename) + f.write(zlib.compress(object)) + return f.tell() + + +def write_pack(filename, objects, num_objects): + f = open(filename + ".pack", 'w') + try: + entries, data_sum = write_pack_data(f, objects, num_objects) + finally: + f.close() + entries.sort() + write_pack_index_v2(filename + ".idx", entries, data_sum) + -def write_pack(filename, objects): +def write_pack_data(f, objects, num_objects, window=10): """Write a new pack file. :param filename: The filename of the new pack file. - :param objects: List of objects to write. + :param objects: List of objects to write (tuples with object and path) :return: List with (name, offset, crc32 checksum) entries, pack checksum """ - f = open(filename, 'w') + recency = list(objects) + # FIXME: Somehow limit delta depth + # FIXME: Make thin-pack optional (its not used when cloning a pack) + # Build a list of objects ordered by the magic Linus heuristic + # This helps us find good objects to diff against us + magic = [] + for obj, path in recency: + magic.append( (obj.type, path, 1, -len(obj.as_raw_string()[1]), obj) ) + magic.sort() + # Build a map of objects and their index in magic - so we can find preceeding objects + # to diff against + offs = {} + for i in range(len(magic)): + offs[magic[i][4]] = i + # Write the pack entries = [] f = SHA1Writer(f) f.write("PACK") # Pack header f.write(struct.pack(">L", 2)) # Pack version - f.write(struct.pack(">L", len(objects))) # Number of objects in pack - for o in objects: - pass # FIXME: Write object - return entries, f.close() + f.write(struct.pack(">L", num_objects)) # Number of objects in pack + for o, path in recency: + sha1 = o.sha().digest() + crc32 = o.crc32() + orig_t, raw = o.as_raw_string() + winner = raw + t = orig_t + #for i in range(offs[o]-window, window): + # if i < 0 or i >= len(offs): continue + # b = magic[i][4] + # if b.type != orig_t: continue + # _, base = b.as_raw_string() + # delta = create_delta(base, raw) + # if len(delta) < len(winner): + # winner = delta + # t = 6 if magic[i][2] == 1 else 7 + offset = write_pack_object(f, t, winner) + entries.append((sha1, offset, crc32)) + return entries, f.write_sha() def write_pack_index_v1(filename, entries, pack_checksum): @@ -503,9 +630,6 @@ def write_pack_index_v1(filename, entries, pack_checksum): crc32_checksum. :param pack_checksum: Checksum of the pack file. """ - # Sort entries first - - entries = sorted(entries) f = open(filename, 'w') f = SHA1Writer(f) fan_out_table = defaultdict(lambda: 0) @@ -522,11 +646,66 @@ def write_pack_index_v1(filename, entries, pack_checksum): f.close() +def create_delta(base_buf, target_buf): + """Use python difflib to work out how to transform base_buf to target_buf""" + assert isinstance(base_buf, str) + assert isinstance(target_buf, str) + out_buf = "" + # write delta header + def encode_size(size): + ret = "" + c = size & 0x7f + size >>= 7 + while size: + ret += chr(c | 0x80) + c = size & 0x7f + size >>= 7 + ret += chr(c) + return ret + out_buf += encode_size(len(base_buf)) + out_buf += encode_size(len(target_buf)) + # write out delta opcodes + seq = difflib.SequenceMatcher(a=base_buf, b=target_buf) + for opcode, i1, i2, j1, j2 in seq.get_opcodes(): + # Git patch opcodes don't care about deletes! + #if opcode == "replace" or opcode == "delete": + # pass + if opcode == "equal": + # If they are equal, unpacker will use data from base_buf + # Write out an opcode that says what range to use + scratch = "" + op = 0x80 + o = i1 + for i in range(4): + if o & 0xff << i*8: + scratch += chr(o >> i) + op |= 1 << i + s = i2 - i1 + for i in range(2): + if s & 0xff << i*8: + scratch += chr(s >> i) + op |= 1 << (4+i) + out_buf += chr(op) + out_buf += scratch + if opcode == "replace" or opcode == "insert": + # If we are replacing a range or adding one, then we just + # output it to the stream (prefixed by its size) + s = j2 - j1 + o = j1 + while s > 127: + out_buf += chr(127) + out_buf += target_buf[o:o+127] + s -= 127 + o += 127 + out_buf += chr(s) + out_buf += target_buf[o:o+s] + return out_buf + + def apply_delta(src_buf, delta): """Based on the similar function in git's patch-delta.c.""" assert isinstance(src_buf, str), "was %r" % (src_buf,) assert isinstance(delta, str) - data = str(delta) out = "" def pop(delta): ret = delta[0] @@ -544,6 +723,7 @@ def apply_delta(src_buf, delta): return size, delta src_size, delta = get_delta_header_size(delta) dest_size, delta = get_delta_header_size(delta) + assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf)) while delta: cmd, delta = pop(delta) if cmd & 0x80: @@ -551,33 +731,30 @@ def apply_delta(src_buf, delta): for i in range(4): if cmd & (1 << i): x, delta = pop(delta) - cp_off |= x << (x << (i * 8)) + cp_off |= x << (i * 8) cp_size = 0 for i in range(3): - if cmd & (1 << (2 << 3+i)): + if cmd & (1 << (4+i)): x, delta = pop(delta) - cp_size |= x << (x << (i * 8)) - if cp_size == 0: cp_size = 0x10000 + cp_size |= x << (i * 8) + if cp_size == 0: + cp_size = 0x10000 if (cp_off + cp_size < cp_size or cp_off + cp_size > src_size or cp_size > dest_size): break out += src_buf[cp_off:cp_off+cp_size] - dest_size -= cp_size elif cmd != 0: - if cmd > dest_size: - break - out += data[:cmd] - data = data[cmd:] - dest_size -= cmd + out += delta[:cmd] + delta = delta[cmd:] else: raise ApplyDeltaError("Invalid opcode 0") - if data != []: - raise ApplyDeltaError("data not empty: %r" % data) + if delta != "": + raise ApplyDeltaError("delta not empty: %r" % delta) - if dest_size != 0: - raise ApplyDeltaError("dest size not empty") + if dest_size != len(out): + raise ApplyDeltaError("dest size incorrect") return out @@ -590,11 +767,9 @@ def write_pack_index_v2(filename, entries, pack_checksum): crc32_checksum. :param pack_checksum: Checksum of the pack file. """ - # Sort entries first - entries = sorted(entries) f = open(filename, 'w') f = SHA1Writer(f) - f.write('\377tOc') + f.write('\377tOc') # Magic! f.write(struct.pack(">L", 2)) fan_out_table = defaultdict(lambda: 0) for (name, offset, entry_checksum) in entries: @@ -606,7 +781,7 @@ def write_pack_index_v2(filename, entries, pack_checksum): for (name, offset, entry_checksum) in entries: f.write(name) for (name, offset, entry_checksum) in entries: - f.write(struct.pack(">L", entry_checksum)) + f.write(struct.pack(">l", entry_checksum)) for (name, offset, entry_checksum) in entries: # FIXME: handle if MSBit is set in offset f.write(struct.pack(">L", offset)) @@ -620,39 +795,88 @@ class Pack(object): def __init__(self, basename): self._basename = basename - self._idx = PackIndex(basename + ".idx") - self._pack = PackData(basename + ".pack") - assert len(self._idx) == len(self._pack) + self._data_path = self._basename + ".pack" + self._idx_path = self._basename + ".idx" + self._data = None + self._idx = None + + def name(self): + return self.idx.objects_sha1() + + @property + def data(self): + if self._data is None: + self._data = PackData(self._data_path) + assert len(self.idx) == len(self._data) + assert self.idx.get_stored_checksums()[0] == self._data.get_stored_checksum() + return self._data + + @property + def idx(self): + if self._idx is None: + self._idx = PackIndex(self._idx_path) + return self._idx + + def close(self): + if self._data is not None: + self._data.close() + self.idx.close() + + def __eq__(self, other): + return type(self) == type(other) and self.idx == other.idx def __len__(self): """Number of entries in this pack.""" - return len(self._idx) + return len(self.idx) def __repr__(self): return "Pack(%r)" % self._basename def __iter__(self): """Iterate over all the sha1s of the objects in this pack.""" - return iter(self._idx) + return iter(self.idx) def check(self): - return self._idx.check() and self._pack.check() + return self.idx.check() and self.data.check() + + def get_stored_checksum(self): + return self.data.get_stored_checksum() def __contains__(self, sha1): """Check whether this pack contains a particular SHA1.""" - return (self._idx.object_index(sha1) is not None) + return (self.idx.object_index(sha1) is not None) - def _get_text(self, sha1): - offset = self._idx.object_index(sha1) + def get_raw(self, sha1, resolve_ref=None): + offset = self.idx.object_index(sha1) if offset is None: raise KeyError(sha1) - type, obj = self._pack.get_object_at(offset) + type, obj = self.data.get_object_at(offset) assert isinstance(offset, int) - return resolve_object(offset, type, obj, self._get_text, - self._pack.get_object_at) + return resolve_object(offset, type, obj, resolve_ref, + self.data.get_object_at) def __getitem__(self, sha1): """Retrieve the specified SHA1.""" - type, uncomp = self._get_text(sha1) + type, uncomp = self.get_raw(sha1) return ShaFile.from_raw_string(type, uncomp) + + def iterobjects(self, get_raw=None): + if get_raw is None: + def get_raw(x): + raise KeyError(x) + for offset, type, obj in self.data.iterobjects(): + assert isinstance(offset, int) + yield ShaFile.from_raw_string( + *resolve_object(offset, type, obj, + get_raw, + self.data.get_object_at)) + + +def load_packs(path): + if not os.path.exists(path): + return + for name in os.listdir(path): + if name.startswith("pack-") and name.endswith(".pack"): + yield Pack(os.path.join(path, name[:-len(".pack")])) +