# pack.py -- For dealing with packed git objects.
# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
# Copyright (C) 2008 Jelmer Vernooij <jelmer@samba.org>
-# The code is loosely based on that in the sha1_file.c file from git itself,
-# which is Copyright (C) Linus Torvalds, 2005 and distributed under the
-# GPL version 2.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; version 2
-# of the License.
+# of the License or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
import struct
import sys
import zlib
+import difflib
from objects import (
ShaFile,
+ hex_to_sha,
+ sha_to_hex,
)
from errors import ApplyDeltaError
while obj.unused_data == "":
base = offset+fed
add = data[base:base+1024]
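+        # Pad a short (final) read so zlib is fed data past the end of the
+        # compressed stream; that populates unused_data and ends the loop.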
+ if len(add) < 1024:
+ add += "Z"
fed += len(add)
x += obj.decompress(add)
assert len(x) == dec_size
return sha.hexdigest()
-def hex_to_sha(hex):
- """Convert a hex string to a binary sha string."""
- ret = ""
- for i in range(0, len(hex), 2):
- ret += chr(int(hex[i:i+2], 16))
- return ret
-
-def sha_to_hex(sha):
- """Convert a binary sha string to a hex sha string."""
- ret = ""
- for i in sha:
- ret += "%02x" % ord(i)
- return ret
-
MAX_MMAP_SIZE = 256 * 1024 * 1024
def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
return None
+def read_pack_header(f):
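+    """Read the 12-byte pack header: the "PACK" magic followed by the
+    version and the number of objects as big-endian 32-bit integers."""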
+ header = f.read(12)
+ assert header[:4] == "PACK"
+ (version,) = struct.unpack_from(">L", header, 4)
+ assert version in (2, 3), "Version was %d" % version
+ (num_objects,) = struct.unpack_from(">L", header, 8)
+ return (version, num_objects)
+
+
+def read_pack_tail(f):
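+    """Read the pack trailer: a 20-byte SHA-1 checksum of the preceding data."""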
+ return (f.read(20),)
+
+
+def unpack_object(map):
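+    """Unpack one object from a buffer of pack data.
+
+    Bits 4-6 of the first byte carry the type; its low 4 bits, plus 7 bits
+    per MSB continuation byte, carry the size. Returns a tuple of the type,
+    the unpacked data and the total number of bytes consumed.
+    """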
+ bytes = take_msb_bytes(map, 0)
+ type = (bytes[0] >> 4) & 0x07
+ size = bytes[0] & 0x0f
+ for i, byte in enumerate(bytes[1:]):
+ size += (byte & 0x7f) << ((i * 7) + 4)
+ raw_base = len(bytes)
+ if type == 6: # offset delta
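+        # The base offset is an MSB varint; each continuation byte adds one
+        # so that no offset has two encodings (as in git's pack format).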
+ bytes = take_msb_bytes(map, raw_base)
+ assert not (bytes[-1] & 0x80)
+ delta_base_offset = bytes[0] & 0x7f
+ for byte in bytes[1:]:
+ delta_base_offset += 1
+ delta_base_offset <<= 7
+ delta_base_offset += (byte & 0x7f)
+ raw_base+=len(bytes)
+ uncomp, comp_len = read_zlib(map, raw_base, size)
+ assert size == len(uncomp)
+ return type, (delta_base_offset, uncomp), comp_len+raw_base
+ elif type == 7: # ref delta
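+        # A ref delta names its base object by its 20-byte binary SHA-1.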
+ basename = map[raw_base:raw_base+20]
+ uncomp, comp_len = read_zlib(map, raw_base+20, size)
+ assert size == len(uncomp)
+ return type, (basename, uncomp), comp_len+raw_base+20
+ else:
+ uncomp, comp_len = read_zlib(map, raw_base, size)
+ assert len(uncomp) == size
+ return type, uncomp, comp_len+raw_base
+
+
class PackData(object):
"""The data contained in a packfile.
self._filename = filename
assert os.path.exists(filename), "%s is not a packfile" % filename
self._size = os.path.getsize(filename)
- assert self._size >= 12, "%s is too small for a packfile" % filename
- self._header_size = self._read_header()
+ self._header_size = 12
+ assert self._size >= self._header_size, "%s is too small for a packfile" % filename
+ self._read_header()
def _read_header(self):
f = open(self._filename, 'rb')
try:
- header = f.read(12)
+ (version, self._num_objects) = \
+ read_pack_header(f)
f.seek(self._size-20)
- self._stored_checksum = f.read(20)
+ (self._stored_checksum,) = read_pack_tail(f)
finally:
f.close()
- assert header[:4] == "PACK"
- (version,) = struct.unpack_from(">L", header, 4)
- assert version in (2, 3), "Version was %d" % version
- (self._num_objects,) = struct.unpack_from(">L", header, 8)
- return 12 # Header size
def __len__(self):
"""Returns the number of objects in this pack."""
f = open(self._filename, 'rb')
for i in range(len(self)):
map = simple_mmap(f, offset, self._size-offset)
- (type, obj, total_size) = self._unpack_object(map)
+ (type, obj, total_size) = unpack_object(map)
yield offset, type, obj
offset += total_size
f.close()
- def iterentries(self):
+    def iterentries(self, resolve_ext_ref=None):
found = {}
at = {}
postponed = defaultdict(list)
def get_ref_text(sha):
if sha in found:
return found[sha]
+            if resolve_ext_ref:
+                try:
+                    return resolve_ext_ref(sha)
+ except KeyError:
+ pass
raise Postpone, (sha, )
todo = list(self.iterobjects())
while todo:
if postponed:
raise KeyError([sha_to_hex(h) for h in postponed.keys()])
- def sorted_entries(self):
- ret = list(self.iterentries())
+ def sorted_entries(self, resolve_ext_ref=None):
+ ret = list(self.iterentries(resolve_ext_ref))
ret.sort()
return ret
f = open(self._filename, 'rb')
try:
map = simple_mmap(f, offset, size-offset)
- return self._unpack_object(map)[:2]
+ return unpack_object(map)[:2]
finally:
f.close()
- def _unpack_object(self, map):
- bytes = take_msb_bytes(map, 0)
- type = (bytes[0] >> 4) & 0x07
- size = bytes[0] & 0x0f
- for i, byte in enumerate(bytes[1:]):
- size += (byte & 0x7f) << ((i * 7) + 4)
- raw_base = len(bytes)
- if type == 6: # offset delta
- bytes = take_msb_bytes(map, raw_base)
- assert not (bytes[-1] & 0x80)
- delta_base_offset = bytes[0] & 0x7f
- for byte in bytes[1:]:
- delta_base_offset += 1
- delta_base_offset <<= 7
- delta_base_offset += (byte & 0x7f)
- raw_base+=len(bytes)
- uncomp, comp_len = read_zlib(map, raw_base, size)
- assert size == len(uncomp)
- return type, (delta_base_offset, uncomp), comp_len+raw_base
- elif type == 7: # ref delta
- basename = map[raw_base:raw_base+20]
- uncomp, comp_len = read_zlib(map, raw_base+20, size)
- assert size == len(uncomp)
- return type, (basename, uncomp), comp_len+raw_base+20
- else:
- uncomp, comp_len = read_zlib(map, raw_base, size)
- assert len(uncomp) == size
- return type, uncomp, comp_len+raw_base
-
class SHA1Writer(object):
-    f = open(filename + ".pack", 'w')
+    f = open(filename + ".pack", 'wb')
try:
entries, data_sum = write_pack_data(f, objects, num_objects)
- except:
+ finally:
f.close()
entries.sort()
write_pack_index_v2(filename + ".idx", entries, data_sum)
-def write_pack_data(f, objects, num_objects):
+def write_pack_data(f, objects, num_objects, window=10):
"""Write a new pack file.
-    :param filename: The filename of the new pack file.
+    :param f: Open file to write the pack data to.
- :param objects: List of objects to write.
+ :param objects: List of objects to write (tuples with object and path)
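+    :param num_objects: Number of objects to write
+    :param window: Number of preceding objects considered as delta bases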
:return: List with (name, offset, crc32 checksum) entries, pack checksum
"""
+ recency = list(objects)
+ # FIXME: Somehow limit delta depth
+    # FIXME: Make thin-pack optional (it's not used when cloning a pack)
+    # Build a list of objects ordered by the magic Linus heuristic:
+    # type, then path, then descending size. Similar objects sort next to
+    # each other, which makes the preceding entries good delta candidates.
+    magic = []
+    for obj, path in recency:
+        magic.append((obj.type, path, 1, -len(obj.as_raw_string()[1]), obj))
+    magic.sort()
+    # Map each object to its index in magic so we can find the preceding
+    # objects to diff against.
+    offs = {}
+    for i, entry in enumerate(magic):
+        offs[entry[4]] = i
+ # Write the pack
entries = []
f = SHA1Writer(f)
f.write("PACK") # Pack header
f.write(struct.pack(">L", 2)) # Pack version
f.write(struct.pack(">L", num_objects)) # Number of objects in pack
- for o in objects:
+ for o, path in recency:
sha1 = o.sha().digest()
crc32 = o.crc32()
- # FIXME: Delta !
- t, o = o.as_raw_string()
- offset = write_pack_object(f, t, o)
+ orig_t, raw = o.as_raw_string()
+ winner = raw
+ t = orig_t
+        # The delta search below is left disabled: besides the FIXMEs above,
+        # a type 6/7 object also needs its delta base (offset or SHA-1)
+        # emitted with it, which write_pack_object does not appear to do yet.
+        #for i in range(offs[o]-window, offs[o]):
+        #    if i < 0 or i >= len(offs): continue
+        #    b = magic[i][4]
+        #    if b.type != orig_t: continue
+        #    _, base = b.as_raw_string()
+        #    delta = create_delta(base, raw)
+        #    if len(delta) < len(winner):
+        #        winner = delta
+        #        t = 6 if magic[i][2] == 1 else 7
+ offset = write_pack_object(f, t, winner)
entries.append((sha1, offset, crc32))
return entries, f.write_sha()
f.close()
+def create_delta(base_buf, target_buf):
+ """Use python difflib to work out how to transform base_buf to target_buf"""
+ assert isinstance(base_buf, str)
+ assert isinstance(target_buf, str)
+ out_buf = ""
+ # write delta header
+ def encode_size(size):
+ ret = ""
+ c = size & 0x7f
+ size >>= 7
+ while size:
+ ret += chr(c | 0x80)
+ c = size & 0x7f
+ size >>= 7
+ ret += chr(c)
+ return ret
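+    # encode_size writes little-endian base-128 varints: e.g.
+    # encode_size(300) == "\xac\x02" (300 & 0x7f | 0x80, then 300 >> 7).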
+ out_buf += encode_size(len(base_buf))
+ out_buf += encode_size(len(target_buf))
+ # write out delta opcodes
+ seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
+ for opcode, i1, i2, j1, j2 in seq.get_opcodes():
+ # Git patch opcodes don't care about deletes!
+ #if opcode == "replace" or opcode == "delete":
+ # pass
+ if opcode == "equal":
+ # If they are equal, unpacker will use data from base_buf
+ # Write out an opcode that says what range to use
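+            # Copy opcode layout: bit 7 set; bits 0-3 flag which offset
+            # bytes follow, bits 4-6 which size bytes follow (little-endian,
+            # zero bytes omitted).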
+ scratch = ""
+ op = 0x80
+ o = i1
+ for i in range(4):
+ if o & 0xff << i*8:
+                    scratch += chr((o >> (i*8)) & 0xff)
+ op |= 1 << i
+ s = i2 - i1
+            # The copy opcode has room for three size bytes (bits 4-6).
+            for i in range(3):
+                if s & 0xff << i*8:
+                    scratch += chr((s >> (i*8)) & 0xff)
+ op |= 1 << (4+i)
+ out_buf += chr(op)
+ out_buf += scratch
+ if opcode == "replace" or opcode == "insert":
+ # If we are replacing a range or adding one, then we just
+ # output it to the stream (prefixed by its size)
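+            # An insert opcode keeps its length in the low 7 bits (a set
+            # high bit would mean copy), so at most 127 bytes per opcode.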
+ s = j2 - j1
+ o = j1
+ while s > 127:
+ out_buf += chr(127)
+ out_buf += target_buf[o:o+127]
+ s -= 127
+ o += 127
+ out_buf += chr(s)
+ out_buf += target_buf[o:o+s]
+ return out_buf
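+
+# A quick sanity check (hypothetical usage, not part of the module): a delta
+# produced here and applied with apply_delta below should round-trip:
+#   delta = create_delta("hello world", "hello there world")
+#   assert apply_delta("hello world", delta) == "hello there world"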
+
+
def apply_delta(src_buf, delta):
"""Based on the similar function in git's patch-delta.c."""
assert isinstance(src_buf, str), "was %r" % (src_buf,)
return size, delta
src_size, delta = get_delta_header_size(delta)
dest_size, delta = get_delta_header_size(delta)
- assert src_size == len(src_buf)
+ assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
while delta:
cmd, delta = pop(delta)
if cmd & 0x80:
"""
-    f = open(filename, 'w')
+    f = open(filename, 'wb')
f = SHA1Writer(f)
- f.write('\377tOc')
+    f.write('\377tOc') # Magic: identifies pack index format v2
f.write(struct.pack(">L", 2))
fan_out_table = defaultdict(lambda: 0)
for (name, offset, entry_checksum) in entries:
"""Check whether this pack contains a particular SHA1."""
return (self.idx.object_index(sha1) is not None)
- def _get_text(self, sha1):
+ def get_raw(self, sha1, resolve_ref=None):
offset = self.idx.object_index(sha1)
if offset is None:
raise KeyError(sha1)
type, obj = self.data.get_object_at(offset)
assert isinstance(offset, int)
- return resolve_object(offset, type, obj, self._get_text,
+        if resolve_ref is None:
+            # Default to resolving ref-delta bases from this same pack,
+            # matching what the old _get_text callback did.
+            resolve_ref = self.get_raw
+        return resolve_object(offset, type, obj, resolve_ref,
self.data.get_object_at)
def __getitem__(self, sha1):
"""Retrieve the specified SHA1."""
- type, uncomp = self._get_text(sha1)
+ type, uncomp = self.get_raw(sha1)
return ShaFile.from_raw_string(type, uncomp)
- def iterobjects(self):
+ def iterobjects(self, get_raw=None):
+ if get_raw is None:
+ def get_raw(x):
+ raise KeyError(x)
for offset, type, obj in self.data.iterobjects():
assert isinstance(offset, int)
yield ShaFile.from_raw_string(
- *resolve_object(offset, type, obj, self._get_text,
- self.data.get_object_at))
+ *resolve_object(offset, type, obj,
+ get_raw,
+ self.data.get_object_at))
def load_packs(path):
if name.startswith("pack-") and name.endswith(".pack"):
yield Pack(os.path.join(path, name[:-len(".pack")]))
-
-def generate_pack_contents(want, have, get_object):
- """
- Given a list of desired commits and a list of restraining commits,
- work out what objects to send
-
- :param want: list of sha's we want to send
- :param have: list of sha's recepient has (or when to stop for a shallow pack)
- :param get_object: callback to get a commit or tree object
- :return: a list of commits
- """
- sha_queue = []
- commits_to_send = want[:]
- for sha in commits_to_send:
- if sha in sha_queue:
- continue
-
- sha_queue.append(sha)
-
- c = get_object(sha)
- for p in c.parents:
- if not p in commits_to_send:
- commits_to_send.append(p)
-
- def parse_tree(tree, sha_queue):
- for mode, name, x in tree.entries():
- if not x in sha_queue:
- sha_queue.append(x)
- t = get_object(x)
- if t._type == 'tree':
- parse_tree(t, sha_queue)
-
- treesha = c.tree
- if treesha not in sha_queue:
- sha_queue.append(treesha)
- t = get_object(treesha)
- parse_tree(t, sha_queue)
-
- #progress("counting objects: %d\r" % len(sha_queue))
-
- #progress("counting objects: %d, done.\n" % len(sha_queue))
-
- return sha_queue