# pack.py -- For dealing with packed git objects.
# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
# Copyright (C) 2008 Jelmer Vernooij <jelmer@samba.org>
-# The code is loosely based on that in the sha1_file.c file from git itself,
-# which is Copyright (C) Linus Torvalds, 2005 and distributed under the
-# GPL version 2.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; version 2
-# of the License.
+# of the License or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
from collections import defaultdict
import hashlib
+from itertools import imap, izip
import mmap
import os
+import sha
import struct
import sys
import zlib
+import difflib
from objects import (
ShaFile,
+ hex_to_sha,
+ sha_to_hex,
)
from errors import ApplyDeltaError
while obj.unused_data == "":
base = offset+fed
add = data[base:base+1024]
+ if len(add) < 1024:
+ add += "Z"
fed += len(add)
x += obj.decompress(add)
assert len(x) == dec_size
return x, comp_len
-def hex_to_sha(hex):
- """Convert a hex string to a binary sha string."""
- ret = ""
- for i in range(0, len(hex), 2):
- ret += chr(int(hex[i:i+2], 16))
- return ret
def iter_sha1(iter):
    """Return the hex SHA1 of the concatenation of all strings in iter.

    :param iter: Iterable of strings to feed into the digest.
    :return: 40-character hexadecimal digest string.
    """
    digest = hashlib.sha1()
    for chunk in iter:
        digest.update(chunk)
    return digest.hexdigest()
-def sha_to_hex(sha):
- """Convert a binary sha string to a hex sha string."""
- ret = ""
- for i in sha:
- ret += "%02x" % ord(i)
- return ret
MAX_MMAP_SIZE = 256 * 1024 * 1024
(basename, delta) = obj
assert isinstance(basename, str) and len(basename) == 20
assert isinstance(delta, str)
- type, base_obj= get_ref(basename)
+ type, base_obj = get_ref(basename)
assert isinstance(type, int)
type, base_text = resolve_object(offset, type, base_obj, get_ref, get_offset)
return type, apply_delta(base_text, delta)
+
class PackIndex(object):
"""An index in to a packfile.
it whenever required.
"""
self._filename = filename
- assert os.path.exists(filename), "%s is not a pack index" % filename
# Take the size now, so it can be checked each time we map the file to
# ensure that it hasn't changed.
self._size = os.path.getsize(filename)
self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
def __eq__(self, other):
- return (type(self) == type(other) and
- self._fan_out_table == other._fan_out_table and
- list(self.iterentries()) == list(other.iterentries()))
+ if type(self) != type(other):
+ return False
+
+ if self._fan_out_table != other._fan_out_table:
+ return False
+
+ for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
+ if name1 != name2:
+ return False
+ return True
def close(self):
self._file.close()
self._crc32_table_offset + i * 4)[0]
def __iter__(self):
+ return imap(sha_to_hex, self._itersha())
+
+ def _itersha(self):
for i in range(len(self)):
- yield sha_to_hex(self._unpack_name(i))
+ yield self._unpack_name(i)
+
+ def objects_sha1(self):
+ return iter_sha1(self._itersha())
def iterentries(self):
"""Iterate over the entries in this pack index.
return None
def read_pack_header(f):
    """Read the 12-byte header of a pack file.

    :param f: File-like object positioned at the start of the pack.
    :return: Tuple of (pack version, number of objects in the pack).
    """
    header = f.read(12)
    # Compare against a bytes literal: the stream yields raw bytes, and a
    # text literal here breaks when the module runs under Python 3.
    assert header[:4] == b"PACK"
    (version,) = struct.unpack_from(">L", header, 4)
    assert version in (2, 3), "Version was %d" % version
    (num_objects,) = struct.unpack_from(">L", header, 8)
    return (version, num_objects)
+
+
def read_pack_tail(f):
    """Read the pack trailer: the 20-byte checksum stored at the end.

    :param f: File-like object positioned 20 bytes before the pack's end.
    :return: 1-tuple containing the stored checksum.
    """
    checksum = f.read(20)
    return (checksum,)
+
+
def unpack_object(map):
    """Unpack a single object record from a buffer of pack data.

    :param map: Buffer (e.g. an mmap) whose offset 0 is the start of the
        object record.
    :return: Tuple of (type number, object data, total bytes consumed).
        For type 6 (offset delta) the data is (base offset, delta text);
        for type 7 (ref delta) it is (base sha, delta text); otherwise it
        is the uncompressed object text itself.
    """
    # First byte is 1tttssss: 3 bits of type, low 4 bits of size; further
    # size bits follow 7 per byte (little-endian) while the MSB is set.
    # take_msb_bytes presumably returns the run of bytes up to and
    # including the first one without its MSB set -- defined elsewhere.
    bytes = take_msb_bytes(map, 0)
    type = (bytes[0] >> 4) & 0x07
    size = bytes[0] & 0x0f
    for i, byte in enumerate(bytes[1:]):
        size += (byte & 0x7f) << ((i * 7) + 4)
    raw_base = len(bytes)
    if type == 6: # offset delta
        # The delta base offset is stored big-endian, 7 bits per byte;
        # each continuation byte adds 1 first so multi-byte encodings do
        # not overlap shorter ones (git's ofs-delta encoding).
        bytes = take_msb_bytes(map, raw_base)
        assert not (bytes[-1] & 0x80)
        delta_base_offset = bytes[0] & 0x7f
        for byte in bytes[1:]:
            delta_base_offset += 1
            delta_base_offset <<= 7
            delta_base_offset += (byte & 0x7f)
        raw_base+=len(bytes)
        uncomp, comp_len = read_zlib(map, raw_base, size)
        assert size == len(uncomp)
        return type, (delta_base_offset, uncomp), comp_len+raw_base
    elif type == 7: # ref delta
        # A 20-byte binary sha naming the base object precedes the delta.
        basename = map[raw_base:raw_base+20]
        uncomp, comp_len = read_zlib(map, raw_base+20, size)
        assert size == len(uncomp)
        return type, (basename, uncomp), comp_len+raw_base+20
    else:
        # Plain (non-delta) object: just zlib-compressed text.
        uncomp, comp_len = read_zlib(map, raw_base, size)
        assert len(uncomp) == size
        return type, uncomp, comp_len+raw_base
+
+
class PackData(object):
"""The data contained in a packfile.
self._filename = filename
assert os.path.exists(filename), "%s is not a packfile" % filename
self._size = os.path.getsize(filename)
- self._header_size = self._read_header()
+ self._header_size = 12
+ assert self._size >= self._header_size, "%s is too small for a packfile" % filename
+ self._read_header()
def _read_header(self):
f = open(self._filename, 'rb')
try:
- header = f.read(12)
+ (version, self._num_objects) = \
+ read_pack_header(f)
f.seek(self._size-20)
- self._stored_checksum = f.read(20)
+ (self._stored_checksum,) = read_pack_tail(f)
finally:
f.close()
- assert header[:4] == "PACK"
- (version,) = struct.unpack_from(">L", header, 4)
- assert version in (2, 3), "Version was %d" % version
- (self._num_objects,) = struct.unpack_from(">L", header, 8)
- return 12 # Header size
def __len__(self):
"""Returns the number of objects in this pack."""
f.close()
def iterobjects(self):
- """Yields (name, offset, crc32 checksum)."""
offset = self._header_size
f = open(self._filename, 'rb')
for i in range(len(self)):
map = simple_mmap(f, offset, self._size-offset)
- (type, obj, total_size) = self._unpack_object(map)
+ (type, obj, total_size) = unpack_object(map)
yield offset, type, obj
offset += total_size
f.close()
- def iterentries(self):
+ def iterentries(self, ext_resolve_ref=None):
found = {}
- postponed = list(self.iterobjects())
- while postponed:
- (offset, type, obj) = postponed.pop()
+ at = {}
+ postponed = defaultdict(list)
+ class Postpone(Exception):
+ """Raised to postpone delta resolving."""
+
+ def get_ref_text(sha):
+ if sha in found:
+ return found[sha]
+ if ext_resolve_ref:
+ try:
+ return ext_resolve_ref(sha)
+ except KeyError:
+ pass
+ raise Postpone, (sha, )
+ todo = list(self.iterobjects())
+ while todo:
+ (offset, type, obj) = todo.pop(0)
+ at[offset] = (type, obj)
assert isinstance(offset, int)
assert isinstance(type, int)
assert isinstance(obj, tuple) or isinstance(obj, str)
try:
- type, obj = resolve_object(offset, type, obj, found.__getitem__,
- self.get_object_at)
- except KeyError:
- postponed.append((offset, type, obj))
+ type, obj = resolve_object(offset, type, obj, get_ref_text,
+ at.__getitem__)
+ except Postpone, (sha, ):
+ postponed[sha].append((offset, type, obj))
else:
shafile = ShaFile.from_raw_string(type, obj)
sha = shafile.sha().digest()
found[sha] = (type, obj)
yield sha, offset, shafile.crc32()
+ todo += postponed.get(sha, [])
+ if postponed:
+ raise KeyError([sha_to_hex(h) for h in postponed.keys()])
+
+ def sorted_entries(self, resolve_ext_ref=None):
+ ret = list(self.iterentries(resolve_ext_ref))
+ ret.sort()
+ return ret
def create_index_v1(self, filename):
- entries = list(self.iterentries())
+ entries = self.sorted_entries()
write_pack_index_v1(filename, entries, self.calculate_checksum())
def create_index_v2(self, filename):
- entries = list(self.iterentries())
- write_pack_index_v1(filename, entries, self.calculate_checksum())
+ entries = self.sorted_entries()
+ write_pack_index_v2(filename, entries, self.calculate_checksum())
+
    def get_stored_checksum(self):
        """Return the checksum stored in this pack file's trailer."""
        return self._stored_checksum
def check(self):
- return (self.calculate_checksum() == self._stored_checksum)
+ return (self.calculate_checksum() == self.get_stored_checksum())
def get_object_at(self, offset):
"""Given an offset in to the packfile return the object that is there.
f = open(self._filename, 'rb')
try:
map = simple_mmap(f, offset, size-offset)
- return self._unpack_object(map)[:2]
+ return unpack_object(map)[:2]
finally:
f.close()
- def _unpack_object(self, map):
- bytes = take_msb_bytes(map, 0)
- type = (bytes[0] >> 4) & 0x07
- size = bytes[0] & 0x0f
- for i, byte in enumerate(bytes[1:]):
- size += (byte & 0x7f) << ((i * 7) + 4)
- raw_base = len(bytes)
- if type == 6: # offset delta
- bytes = take_msb_bytes(map, raw_base)
- assert not (bytes[-1] & 0x80)
- delta_base_offset = bytes[0] & 0x7f
- for byte in bytes[1:]:
- delta_base_offset += 1
- delta_base_offset <<= 7
- delta_base_offset += (byte & 0x7f)
- raw_base+=len(bytes)
- uncomp, comp_len = read_zlib(map, raw_base, size)
- assert size == len(uncomp)
- return type, (delta_base_offset, uncomp), comp_len+raw_base
- elif type == 7: # ref delta
- basename = map[raw_base:raw_base+20]
- uncomp, comp_len = read_zlib(map, raw_base+20, size)
- assert size == len(uncomp)
- return type, (basename, uncomp), comp_len+raw_base
- else:
- uncomp, comp_len = read_zlib(map, raw_base, size)
- assert len(uncomp) == size
- return type, uncomp, comp_len+raw_base
-
class SHA1Writer(object):
self.sha1.update(data)
self.f.write(data)
- def close(self):
+ def write_sha(self):
sha = self.sha1.digest()
assert len(sha) == 20
self.f.write(sha)
+ return sha
+
    def close(self):
        """Append the accumulated SHA1 trailer, close the file, return the sha."""
        sha = self.write_sha()
        self.f.close()
        return sha
    def tell(self):
        """Return the current write position in the underlying file."""
        return self.f.tell()
+
+
def write_pack_object(f, type, object):
    """Write pack object to a file.

    :param f: File to write to (must support tell())
    :param type: Numeric pack type (6 = offset delta, 7 = ref delta,
        otherwise a plain object)
    :param object: Object text to write; for delta types, a tuple of
        (base, delta text)
    :return: Offset in the file after writing (f.tell())
    """
    # NOTE(review): this initial tell() is never returned (and the name is
    # reused as a byte list in the offset-delta branch below); the caller
    # receives the END offset of the object, not its start -- confirm the
    # index-writing code expects that.
    ret = f.tell()
    if type == 6: # offset delta
        (delta_base_offset, object) = object
    elif type == 7: # ref delta
        (basename, object) = object
    # Size header: type and low 4 size bits in the first byte, then 7 size
    # bits per continuation byte while more remain (MSB marks continuation).
    size = len(object)
    c = (type << 4) | (size & 15)
    size >>= 4
    while size:
        f.write(chr(c | 0x80))
        c = size & 0x7f
        size >>= 7
    f.write(chr(c))
    if type == 6: # offset delta
        # Base offset is emitted big-endian, 7 bits per byte, with the
        # "subtract 1 per continuation" bias of git's ofs-delta encoding.
        ret = [delta_base_offset & 0x7f]
        delta_base_offset >>= 7
        while delta_base_offset:
            delta_base_offset -= 1
            ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
            delta_base_offset >>= 7
        f.write("".join([chr(x) for x in ret]))
    elif type == 7: # ref delta
        assert len(basename) == 20
        f.write(basename)
    f.write(zlib.compress(object))
    return f.tell()
+
+
def write_pack(filename, objects, num_objects):
    """Write a new pack file and its v2 index.

    :param filename: Path of the pack, without the .pack/.idx extension.
    :param objects: Iterable of (object, path) tuples to write.
    :param num_objects: Number of objects that will be written.
    """
    # Open in binary mode: pack files are binary, and text mode corrupts
    # the output on platforms that translate line endings.
    f = open(filename + ".pack", 'wb')
    try:
        entries, data_sum = write_pack_data(f, objects, num_objects)
    finally:
        f.close()
    entries.sort()
    write_pack_index_v2(filename + ".idx", entries, data_sum)
+
-def write_pack(filename, objects):
def write_pack_data(f, objects, num_objects, window=10):
    """Write pack data to an open file.

    :param f: File to write to.
    :param objects: List of (object, path) tuples to write.
    :param num_objects: Number of objects in the pack.
    :param window: Number of nearby objects to consider as delta bases
        (currently unused while delta compression is disabled below).
    :return: List with (name, offset, crc32 checksum) entries, pack checksum
    """
    recency = list(objects)
    # FIXME: Somehow limit delta depth
    # FIXME: Make thin-pack optional (its not used when cloning a pack)
    # Build a list of objects ordered by the magic Linus heuristic
    # This helps us find good objects to diff against us
    magic = []
    for obj, path in recency:
        magic.append( (obj.type, path, 1, -len(obj.as_raw_string()[1]), obj) )
    magic.sort()
    # Build a map of objects and their index in magic - so we can find
    # preceding objects to diff against
    offs = {}
    for i in range(len(magic)):
        offs[magic[i][4]] = i
    # Write the pack
    entries = []
    f = SHA1Writer(f)
    f.write("PACK")               # Pack header
    f.write(struct.pack(">L", 2)) # Pack version
    f.write(struct.pack(">L", num_objects)) # Number of objects in pack
    for o, path in recency:
        sha1 = o.sha().digest()
        crc32 = o.crc32()
        orig_t, raw = o.as_raw_string()
        winner = raw
        t = orig_t
        # Delta compression is disabled for now: every object is written
        # in full.  The commented-out search picks the smallest delta
        # against a nearby object in "magic" order.
        #for i in range(offs[o]-window, window):
        #    if i < 0 or i >= len(offs): continue
        #    b = magic[i][4]
        #    if b.type != orig_t: continue
        #    _, base = b.as_raw_string()
        #    delta = create_delta(base, raw)
        #    if len(delta) < len(winner):
        #        winner = delta
        #        t = 6 if magic[i][2] == 1 else 7
        offset = write_pack_object(f, t, winner)
        # NOTE(review): write_pack_object returns f.tell() *after* writing,
        # so this records the end offset of the object -- verify the index
        # writers expect that rather than the start offset.
        entries.append((sha1, offset, crc32))
    return entries, f.write_sha()
def write_pack_index_v1(filename, entries, pack_checksum):
crc32_checksum.
:param pack_checksum: Checksum of the pack file.
"""
- # Sort entries first
-
- entries = sorted(entries)
f = open(filename, 'w')
f = SHA1Writer(f)
fan_out_table = defaultdict(lambda: 0)
f.close()
def create_delta(base_buf, target_buf):
    """Use python difflib to work out how to transform base_buf to target_buf.

    :param base_buf: Text of the delta base object.
    :param target_buf: Text the delta should reconstruct.
    :return: Delta string in git's pack delta format: the two encoded
        sizes followed by copy and insert opcodes.
    """
    assert isinstance(base_buf, str)
    assert isinstance(target_buf, str)
    out_buf = ""
    # write delta header
    def encode_size(size):
        # Sizes are stored 7 bits per byte, little-endian, with the MSB
        # set on every byte except the last.
        ret = ""
        c = size & 0x7f
        size >>= 7
        while size:
            ret += chr(c | 0x80)
            c = size & 0x7f
            size >>= 7
        ret += chr(c)
        return ret
    out_buf += encode_size(len(base_buf))
    out_buf += encode_size(len(target_buf))
    # write out delta opcodes
    seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
    for opcode, i1, i2, j1, j2 in seq.get_opcodes():
        # Git patch opcodes don't care about deletes!
        #if opcode == "replace" or opcode == "delete":
        #    pass
        if opcode == "equal":
            # If they are equal, unpacker will use data from base_buf
            # Write out an opcode that says what range to use
            scratch = ""
            op = 0x80
            o = i1
            for i in range(4):
                if o & 0xff << i*8:
                    # Emit only the non-zero bytes of the copy offset,
                    # least significant first, flagging each in the opcode.
                    # (Shift by i*8 bits and mask to one byte; the previous
                    # chr(o >> i) emitted garbage for offsets >= 256.)
                    scratch += chr((o >> i*8) & 0xff)
                    op |= 1 << i
            s = i2 - i1
            for i in range(2):
                if s & 0xff << i*8:
                    # Same byte extraction fix for the copy size.
                    scratch += chr((s >> i*8) & 0xff)
                    op |= 1 << (4+i)
            out_buf += chr(op)
            out_buf += scratch
        if opcode == "replace" or opcode == "insert":
            # If we are replacing a range or adding one, then we just
            # output it to the stream (prefixed by its size)
            s = j2 - j1
            o = j1
            while s > 127:
                out_buf += chr(127)
                out_buf += target_buf[o:o+127]
                s -= 127
                o += 127
            out_buf += chr(s)
            out_buf += target_buf[o:o+s]
    return out_buf
+
+
def apply_delta(src_buf, delta):
"""Based on the similar function in git's patch-delta.c."""
assert isinstance(src_buf, str), "was %r" % (src_buf,)
assert isinstance(delta, str)
- data = str(delta)
out = ""
def pop(delta):
ret = delta[0]
return size, delta
src_size, delta = get_delta_header_size(delta)
dest_size, delta = get_delta_header_size(delta)
+ assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
while delta:
cmd, delta = pop(delta)
if cmd & 0x80:
for i in range(4):
if cmd & (1 << i):
x, delta = pop(delta)
- cp_off |= x << (x << (i * 8))
+ cp_off |= x << (i * 8)
cp_size = 0
for i in range(3):
- if cmd & (1 << (2 << 3+i)):
+ if cmd & (1 << (4+i)):
x, delta = pop(delta)
- cp_size |= x << (x << (i * 8))
- if cp_size == 0: cp_size = 0x10000
+ cp_size |= x << (i * 8)
+ if cp_size == 0:
+ cp_size = 0x10000
if (cp_off + cp_size < cp_size or
cp_off + cp_size > src_size or
cp_size > dest_size):
break
out += src_buf[cp_off:cp_off+cp_size]
- dest_size -= cp_size
elif cmd != 0:
- if cmd > dest_size:
- break
- out += data[:cmd]
- data = data[cmd:]
- dest_size -= cmd
+ out += delta[:cmd]
+ delta = delta[cmd:]
else:
raise ApplyDeltaError("Invalid opcode 0")
- if data != []:
- raise ApplyDeltaError("data not empty: %r" % data)
+ if delta != "":
+ raise ApplyDeltaError("delta not empty: %r" % delta)
- if dest_size != 0:
- raise ApplyDeltaError("dest size not empty")
+ if dest_size != len(out):
+ raise ApplyDeltaError("dest size incorrect")
return out
crc32_checksum.
:param pack_checksum: Checksum of the pack file.
"""
- # Sort entries first
- entries = sorted(entries)
f = open(filename, 'w')
f = SHA1Writer(f)
- f.write('\377tOc')
+ f.write('\377tOc') # Magic!
f.write(struct.pack(">L", 2))
fan_out_table = defaultdict(lambda: 0)
for (name, offset, entry_checksum) in entries:
for (name, offset, entry_checksum) in entries:
f.write(name)
for (name, offset, entry_checksum) in entries:
- f.write(struct.pack(">L", entry_checksum))
+ f.write(struct.pack(">l", entry_checksum))
for (name, offset, entry_checksum) in entries:
# FIXME: handle if MSBit is set in offset
f.write(struct.pack(">L", offset))
def __init__(self, basename):
self._basename = basename
- self._idx = PackIndex(basename + ".idx")
- self._pack = PackData(basename + ".pack")
- assert len(self._idx) == len(self._pack)
+ self._data_path = self._basename + ".pack"
+ self._idx_path = self._basename + ".idx"
+ self._data = None
+ self._idx = None
+
    def name(self):
        """Return the SHA1 over the index's object names (the pack's name)."""
        return self.idx.objects_sha1()
+
    @property
    def data(self):
        """The PackData for this pack, opened lazily on first access.

        On first load, verifies the data file against the index: same
        object count and matching stored pack checksum.
        """
        if self._data is None:
            self._data = PackData(self._data_path)
            assert len(self.idx) == len(self._data)
            # NOTE(review): the index side is spelled get_stored_checksums()
            # (plural) -- confirm PackIndex actually defines that method and
            # that element [0] is the pack checksum.
            assert self.idx.get_stored_checksums()[0] == self._data.get_stored_checksum()
        return self._data
+
    @property
    def idx(self):
        """The PackIndex for this pack, opened lazily on first access."""
        if self._idx is None:
            self._idx = PackIndex(self._idx_path)
        return self._idx
+
    def close(self):
        """Close the underlying data and index files."""
        if self._data is not None:
            self._data.close()
        # NOTE(review): touching the self.idx property here will *open* the
        # index just to close it when it was never loaded -- intended?
        self.idx.close()
+
    def __eq__(self, other):
        """Two packs are equal when their indices compare equal."""
        return type(self) == type(other) and self.idx == other.idx
def __len__(self):
"""Number of entries in this pack."""
- return len(self._idx)
+ return len(self.idx)
    def __repr__(self):
        """Return a debug representation naming the pack's base path."""
        return "Pack(%r)" % self._basename
def __iter__(self):
"""Iterate over all the sha1s of the objects in this pack."""
- return iter(self._idx)
+ return iter(self.idx)
def check(self):
- return self._idx.check() and self._pack.check()
+ return self.idx.check() and self.data.check()
+
    def get_stored_checksum(self):
        """Return the checksum stored in this pack's data file trailer."""
        return self.data.get_stored_checksum()
def __contains__(self, sha1):
"""Check whether this pack contains a particular SHA1."""
- return (self._idx.object_index(sha1) is not None)
+ return (self.idx.object_index(sha1) is not None)
- def _get_text(self, sha1):
- offset = self._idx.object_index(sha1)
+ def get_raw(self, sha1, resolve_ref=None):
+ offset = self.idx.object_index(sha1)
if offset is None:
raise KeyError(sha1)
- type, obj = self._pack.get_object_at(offset)
+ type, obj = self.data.get_object_at(offset)
assert isinstance(offset, int)
- return resolve_object(offset, type, obj, self._get_text,
- self._pack.get_object_at)
+ return resolve_object(offset, type, obj, resolve_ref,
+ self.data.get_object_at)
def __getitem__(self, sha1):
"""Retrieve the specified SHA1."""
- type, uncomp = self._get_text(sha1)
+ type, uncomp = self.get_raw(sha1)
return ShaFile.from_raw_string(type, uncomp)
+
    def iterobjects(self, get_raw=None):
        """Yield every object in this pack, with deltas fully resolved.

        :param get_raw: Optional callback used to fetch the text of delta
            bases outside this pack; by default such lookups raise KeyError.
        """
        if get_raw is None:
            def get_raw(x):
                # No external resolver supplied: unknown bases are errors.
                raise KeyError(x)
        for offset, type, obj in self.data.iterobjects():
            assert isinstance(offset, int)
            yield ShaFile.from_raw_string(
                *resolve_object(offset, type, obj,
                    get_raw,
                    self.data.get_object_at))
+
+
def load_packs(path):
    """Scan a directory and yield a Pack for every pack file found.

    :param path: Directory to scan; silently yields nothing if it does
        not exist.
    """
    if not os.path.exists(path):
        return
    suffix = ".pack"
    for entry in os.listdir(path):
        if not entry.startswith("pack-"):
            continue
        if not entry.endswith(suffix):
            continue
        basename = entry[:-len(suffix)]
        yield Pack(os.path.join(path, basename))
+