# pack.py -- For dealing with packed git objects.
# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
# Copyright (C) 2008 Jelmer Vernooij <jelmer@samba.org>
-# The code is loosely based on that in the sha1_file.c file from git itself,
-# which is Copyright (C) Linus Torvalds, 2005 and distributed under the
-# GPL version 2.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; version 2
-# of the License.
+# of the License or (at your option) a later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
a pointer in to the corresponding packfile.
"""
-from collections import defaultdict
-import hashlib
-from itertools import imap, izip
+try:
+ from collections import defaultdict
+except ImportError:
+    from dulwich.misc import defaultdict
+
+from itertools import chain, imap, izip
import mmap
import os
-import sha
import struct
+try:
+ from struct import unpack_from
+except ImportError:
+ from dulwich.misc import unpack_from
import sys
import zlib
import difflib
-from objects import (
- ShaFile,
- hex_to_sha,
- sha_to_hex,
- )
-from errors import ApplyDeltaError
-
-supports_mmap_offset = (sys.version_info[0] >= 3 or
+from dulwich.errors import (
+ ApplyDeltaError,
+ ChecksumMismatch,
+ )
+from dulwich.lru_cache import (
+ LRUSizeCache,
+ )
+from dulwich.objects import (
+ ShaFile,
+ hex_to_sha,
+ sha_to_hex,
+ )
+from dulwich.misc import make_sha
+
+supports_mmap_offset = (sys.version_info[0] >= 3 or
(sys.version_info[0] == 2 and sys.version_info[1] >= 6))
def read_zlib(data, offset, dec_size):
obj = zlib.decompressobj()
- x = ""
+ ret = []
fed = 0
while obj.unused_data == "":
base = offset+fed
add = data[base:base+1024]
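+        # The dummy "Z" byte below guarantees the loop terminates: zlib only
+        # sets unused_data once it sees bytes past the end of the compressed
+        # stream, so a stream that ends exactly at the end of the buffer would
+        # otherwise leave unused_data empty forever.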
+ if len(add) < 1024:
+ add += "Z"
fed += len(add)
- x += obj.decompress(add)
+ ret.append(obj.decompress(add))
+ x = "".join(ret)
assert len(x) == dec_size
comp_len = fed-len(obj.unused_data)
return x, comp_len
def iter_sha1(iter):
- sha = hashlib.sha1()
+ """Return the hexdigest of the SHA1 over a set of names."""
+ sha1 = make_sha()
for name in iter:
- sha.update(name)
- return sha.hexdigest()
-
+ sha1.update(name)
+ return sha1.hexdigest()
-MAX_MMAP_SIZE = 256 * 1024 * 1024
def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
"""Simple wrapper for mmap() which always supports the offset parameter.
:param access: Access mechanism.
    :return: Tuple of the MMAP'd area and the offset into it at which the
        requested data starts.
"""
- if offset+size > MAX_MMAP_SIZE and not supports_mmap_offset:
- raise AssertionError("%s is larger than 256 meg, and this version "
- "of Python does not support the offset argument to mmap().")
- if supports_mmap_offset:
- return mmap.mmap(f.fileno(), size, access=access, offset=offset)
- else:
- class ArraySkipper(object):
-
- def __init__(self, array, offset):
- self.array = array
- self.offset = offset
-
- def __getslice__(self, i, j):
- return self.array[i+self.offset:j+self.offset]
-
- def __getitem__(self, i):
- return self.array[i+self.offset]
+ mem = mmap.mmap(f.fileno(), size+offset, access=access)
+ return mem, offset
- def __len__(self):
- return len(self.array) - self.offset
-
- def __str__(self):
- return str(self.array[self.offset:])
-
- mem = mmap.mmap(f.fileno(), size+offset, access=access)
- if offset == 0:
- return mem
- return ArraySkipper(mem, offset)
+def load_pack_index(filename):
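+    """Load an index file by path.
+
+    :param filename: Path to the index file.
+    :return: A PackIndex1 or PackIndex2 instance, depending on the file's version.
+    """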
+ f = open(filename, 'r')
+ if f.read(4) == '\377tOc':
+ version = struct.unpack(">L", f.read(4))[0]
+ if version == 2:
+ f.seek(0)
+ return PackIndex2(filename, file=f)
+ else:
+ raise KeyError("Unknown pack index format %d" % version)
+ else:
+ f.seek(0)
+ return PackIndex1(filename, file=f)
-def resolve_object(offset, type, obj, get_ref, get_offset):
- """Resolve an object, possibly resolving deltas when necessary."""
- if not type in (6, 7): # Not a delta
- return type, obj
- if type == 6: # offset delta
- (delta_offset, delta) = obj
- assert isinstance(delta_offset, int)
- assert isinstance(delta, str)
- offset = offset-delta_offset
- type, base_obj = get_offset(offset)
- assert isinstance(type, int)
- elif type == 7: # ref delta
- (basename, delta) = obj
- assert isinstance(basename, str) and len(basename) == 20
- assert isinstance(delta, str)
- type, base_obj = get_ref(basename)
- assert isinstance(type, int)
- type, base_text = resolve_object(offset, type, base_obj, get_ref, get_offset)
- return type, apply_delta(base_text, delta)
+def bisect_find_sha(start, end, sha, unpack_name):
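+    """Find a SHA in a sorted sequence of names using binary search.
+
+    :param start: Start index of the range to search.
+    :param end: End index of the range to search.
+    :param sha: Binary SHA to find.
+    :param unpack_name: Callback returning the i-th binary SHA.
+    :return: Index of the SHA, or None if it is not present.
+    """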
+ assert start <= end
+ while start <= end:
+ i = (start + end)/2
+ file_sha = unpack_name(i)
+ x = cmp(file_sha, sha)
+ if x < 0:
+ start = i + 1
+ elif x > 0:
+ end = i - 1
+ else:
+ return i
+ return None
class PackIndex(object):
- """An index in to a packfile.
-
- Given a sha id of an object a pack index can tell you the location in the
- packfile of that object if it has it.
+ """An index in to a packfile.
+
+ Given a sha id of an object a pack index can tell you the location in the
+ packfile of that object if it has it.
+
+    To do the lookup it opens the file, and indexes the first 256 4-byte groups
+    by the first byte of the sha id. The value in the indexed four-byte group
+    is the end of the group that shares the same starting byte. Subtract one
+    from the starting byte and index again to find the start of the group.
+    The values are sorted by sha id within the group, so compute the start and
+    end offsets and then bisect within them to see whether the value is present.
+ """
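+    # Lookup sketch matching the docstring above: for a sha whose first byte
+    # is 0x42, fan_out_table[0x41] is the start of the group of names that
+    # begin with 0x42 and fan_out_table[0x42] is its end, so the name is
+    # found by bisecting between those two positions (see _object_index).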
+
+ def __init__(self, filename, file=None):
+ """Create a pack index object.
+
+ Provide it with the name of the index file to consider, and it will map
+ it whenever required.
+ """
+ self._filename = filename
+ # Take the size now, so it can be checked each time we map the file to
+ # ensure that it hasn't changed.
+ self._size = os.path.getsize(filename)
+ if file is None:
+ self._file = open(filename, 'r')
+ else:
+ self._file = file
+ self._contents, map_offset = simple_mmap(self._file, 0, self._size)
+ assert map_offset == 0
+
+ def __eq__(self, other):
+ if not isinstance(other, PackIndex):
+ return False
+
+ if self._fan_out_table != other._fan_out_table:
+ return False
+
+ for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
+ if name1 != name2:
+ return False
+ return True
+
+ def close(self):
+ self._file.close()
+
+ def __len__(self):
+ """Return the number of entries in this pack index."""
+ return self._fan_out_table[-1]
+
+ def _unpack_entry(self, i):
+ """Unpack the i-th entry in the index file.
+
+ :return: Tuple with object name (SHA), offset in pack file and
+ CRC32 checksum (if known)."""
+ raise NotImplementedError(self._unpack_entry)
+
+ def _unpack_name(self, i):
+ """Unpack the i-th name from the index file."""
+ raise NotImplementedError(self._unpack_name)
+
+ def _unpack_offset(self, i):
+ """Unpack the i-th object offset from the index file."""
+ raise NotImplementedError(self._unpack_offset)
+
+ def _unpack_crc32_checksum(self, i):
+ """Unpack the crc32 checksum for the i-th object from the index file."""
+ raise NotImplementedError(self._unpack_crc32_checksum)
+
+ def __iter__(self):
+ return imap(sha_to_hex, self._itersha())
+
+ def _itersha(self):
+ for i in range(len(self)):
+ yield self._unpack_name(i)
+
+ def objects_sha1(self):
+ """Return the hex SHA1 over all the shas of all objects in this pack.
+
+ :note: This is used for the filename of the pack.
+ """
+ return iter_sha1(self._itersha())
+
+ def iterentries(self):
+ """Iterate over the entries in this pack index.
+
+ Will yield tuples with object name, offset in packfile and crc32 checksum.
+ """
+ for i in range(len(self)):
+ yield self._unpack_entry(i)
+
+ def _read_fan_out_table(self, start_offset):
+ ret = []
+ for i in range(0x100):
+ ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
+ return ret
+
+ def check(self):
+ """Check that the stored checksum matches the actual checksum."""
+ return self.calculate_checksum() == self.get_stored_checksum()
+
+ def calculate_checksum(self):
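+        """Calculate the SHA1 checksum over the contents of this pack index.
+
+        :return: 20-byte binary SHA1 digest.
+        """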
+ return make_sha(self._contents[:-20]).digest()
+
+ def get_pack_checksum(self):
+ """Return the SHA1 checksum stored for the corresponding packfile."""
+ return str(self._contents[-40:-20])
+
+ def get_stored_checksum(self):
+ """Return the SHA1 checksum stored for this index."""
+ return str(self._contents[-20:])
+
+ def object_index(self, sha):
+ """Return the index in to the corresponding packfile for the object.
+
+ Given the name of an object it will return the offset that object lives
+        at within the corresponding pack file. If the pack file doesn't have the
+        object then a KeyError will be raised.
+ """
+ if len(sha) == 40:
+ sha = hex_to_sha(sha)
+ return self._object_index(sha)
+
+ def _object_index(self, sha):
+ """See object_index.
+
+        :param sha: A *binary* SHA string (20 characters long).
+ """
+ assert len(sha) == 20
+ idx = ord(sha[0])
+ if idx == 0:
+ start = 0
+ else:
+ start = self._fan_out_table[idx-1]
+ end = self._fan_out_table[idx]
+ i = bisect_find_sha(start, end, sha, self._unpack_name)
+ if i is None:
+ raise KeyError(sha)
+ return self._unpack_offset(i)
+
- To do the loop it opens the file, and indexes first 256 4 byte groups
- with the first byte of the sha id. The value in the four byte group indexed
- is the end of the group that shares the same starting byte. Subtract one
- from the starting byte and index again to find the start of the group.
- The values are sorted by sha id within the group, so do the math to find
- the start and end offset and then bisect in to find if the value is present.
- """
- def __init__(self, filename):
- """Create a pack index object.
+class PackIndex1(PackIndex):
+ """Version 1 Pack Index."""
- Provide it with the name of the index file to consider, and it will map
- it whenever required.
- """
- self._filename = filename
- # Take the size now, so it can be checked each time we map the file to
- # ensure that it hasn't changed.
- self._size = os.path.getsize(filename)
- self._file = open(filename, 'r')
- self._contents = simple_mmap(self._file, 0, self._size)
- if self._contents[:4] != '\377tOc':
+ def __init__(self, filename, file=None):
+ PackIndex.__init__(self, filename, file)
self.version = 1
self._fan_out_table = self._read_fan_out_table(0)
- else:
- (self.version, ) = struct.unpack_from(">L", self._contents, 4)
- assert self.version in (2,), "Version was %d" % self.version
+
+ def _unpack_entry(self, i):
+ (offset, name) = unpack_from(">L20s", self._contents,
+ (0x100 * 4) + (i * 24))
+ return (name, offset, None)
+
+ def _unpack_name(self, i):
+ offset = (0x100 * 4) + (i * 24) + 4
+ return self._contents[offset:offset+20]
+
+ def _unpack_offset(self, i):
+ offset = (0x100 * 4) + (i * 24)
+ return unpack_from(">L", self._contents, offset)[0]
+
+ def _unpack_crc32_checksum(self, i):
+ # Not stored in v1 index files
+ return None
+
+
+class PackIndex2(PackIndex):
+ """Version 2 Pack Index."""
+
+ def __init__(self, filename, file=None):
+ PackIndex.__init__(self, filename, file)
+ assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
+ (self.version, ) = unpack_from(">L", self._contents, 4)
+ assert self.version == 2, "Version was %d" % self.version
self._fan_out_table = self._read_fan_out_table(8)
self._name_table_offset = 8 + 0x100 * 4
self._crc32_table_offset = self._name_table_offset + 20 * len(self)
self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
- def __eq__(self, other):
- if type(self) != type(other):
- return False
-
- if self._fan_out_table != other._fan_out_table:
- return False
-
- for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
- if name1 != name2:
- return False
- return True
-
- def close(self):
- self._file.close()
-
- def __len__(self):
- """Return the number of entries in this pack index."""
- return self._fan_out_table[-1]
-
- def _unpack_entry(self, i):
- """Unpack the i-th entry in the index file.
-
- :return: Tuple with object name (SHA), offset in pack file and
- CRC32 checksum (if known)."""
- if self.version == 1:
- (offset, name) = struct.unpack_from(">L20s", self._contents,
- (0x100 * 4) + (i * 24))
- return (name, offset, None)
- else:
+ def _unpack_entry(self, i):
return (self._unpack_name(i), self._unpack_offset(i),
self._unpack_crc32_checksum(i))
-
- def _unpack_name(self, i):
- if self.version == 1:
- return self._unpack_entry(i)[0]
- else:
- return struct.unpack_from("20s", self._contents,
- self._name_table_offset + i * 20)[0]
-
- def _unpack_offset(self, i):
- if self.version == 1:
- return self._unpack_entry(i)[1]
- else:
- return struct.unpack_from(">L", self._contents,
- self._pack_offset_table_offset + i * 4)[0]
-
- def _unpack_crc32_checksum(self, i):
- if self.version == 1:
- return None
- else:
- return struct.unpack_from(">L", self._contents,
- self._crc32_table_offset + i * 4)[0]
-
- def __iter__(self):
- return imap(sha_to_hex, self._itersha())
-
- def _itersha(self):
- for i in range(len(self)):
- yield self._unpack_name(i)
-
- def objects_sha1(self):
- return iter_sha1(self._itersha())
-
- def iterentries(self):
- """Iterate over the entries in this pack index.
-
- Will yield tuples with object name, offset in packfile and crc32 checksum.
- """
- for i in range(len(self)):
- yield self._unpack_entry(i)
-
- def _read_fan_out_table(self, start_offset):
- ret = []
- for i in range(0x100):
- ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
- return ret
-
- def check(self):
- """Check that the stored checksum matches the actual checksum."""
- return self.calculate_checksum() == self.get_stored_checksums()[1]
-
- def calculate_checksum(self):
- f = open(self._filename, 'r')
- try:
- return hashlib.sha1(self._contents[:-20]).digest()
- finally:
- f.close()
-
- def get_stored_checksums(self):
- """Return the SHA1 checksums stored for the corresponding packfile and
- this header file itself."""
- return str(self._contents[-40:-20]), str(self._contents[-20:])
-
- def object_index(self, sha):
- """Return the index in to the corresponding packfile for the object.
-
- Given the name of an object it will return the offset that object lives
- at within the corresponding pack file. If the pack file doesn't have the
- object then None will be returned.
- """
- size = os.path.getsize(self._filename)
- assert size == self._size, "Pack index %s has changed size, I don't " \
- "like that" % self._filename
- if len(sha) == 40:
- sha = hex_to_sha(sha)
- return self._object_index(sha)
-
- def _object_index(self, sha):
- """See object_index"""
- idx = ord(sha[0])
- if idx == 0:
- start = 0
- else:
- start = self._fan_out_table[idx-1]
- end = self._fan_out_table[idx]
- assert start <= end
- while start <= end:
- i = (start + end)/2
- file_sha = self._unpack_name(i)
- if file_sha < sha:
- start = i + 1
- elif file_sha > sha:
- end = i - 1
- else:
- return self._unpack_offset(i)
- return None
+
+ def _unpack_name(self, i):
+ offset = self._name_table_offset + i * 20
+ return self._contents[offset:offset+20]
+
+ def _unpack_offset(self, i):
+ offset = self._pack_offset_table_offset + i * 4
+ return unpack_from(">L", self._contents, offset)[0]
+
+ def _unpack_crc32_checksum(self, i):
+ return unpack_from(">L", self._contents,
+ self._crc32_table_offset + i * 4)[0]
+
def read_pack_header(f):
header = f.read(12)
assert header[:4] == "PACK"
- (version,) = struct.unpack_from(">L", header, 4)
+ (version,) = unpack_from(">L", header, 4)
assert version in (2, 3), "Version was %d" % version
- (num_objects,) = struct.unpack_from(">L", header, 8)
+ (num_objects,) = unpack_from(">L", header, 8)
return (version, num_objects)
return (f.read(20),)
-def unpack_object(map):
- bytes = take_msb_bytes(map, 0)
+def unpack_object(map, offset=0):
+ bytes = take_msb_bytes(map, offset)
type = (bytes[0] >> 4) & 0x07
size = bytes[0] & 0x0f
for i, byte in enumerate(bytes[1:]):
- size += (byte & 0x7f) << ((i * 7) + 4)
+ size += (byte & 0x7f) << ((i * 7) + 4)
raw_base = len(bytes)
if type == 6: # offset delta
- bytes = take_msb_bytes(map, raw_base)
+ bytes = take_msb_bytes(map, raw_base + offset)
assert not (bytes[-1] & 0x80)
delta_base_offset = bytes[0] & 0x7f
for byte in bytes[1:]:
delta_base_offset <<= 7
delta_base_offset += (byte & 0x7f)
raw_base+=len(bytes)
- uncomp, comp_len = read_zlib(map, raw_base, size)
+ uncomp, comp_len = read_zlib(map, offset + raw_base, size)
assert size == len(uncomp)
return type, (delta_base_offset, uncomp), comp_len+raw_base
elif type == 7: # ref delta
- basename = map[raw_base:raw_base+20]
- uncomp, comp_len = read_zlib(map, raw_base+20, size)
+ basename = map[offset+raw_base:offset+raw_base+20]
+ uncomp, comp_len = read_zlib(map, offset+raw_base+20, size)
assert size == len(uncomp)
return type, (basename, uncomp), comp_len+raw_base+20
else:
- uncomp, comp_len = read_zlib(map, raw_base, size)
+ uncomp, comp_len = read_zlib(map, offset+raw_base, size)
assert len(uncomp) == size
return type, uncomp, comp_len+raw_base
-class PackData(object):
- """The data contained in a packfile.
-
- Pack files can be accessed both sequentially for exploding a pack, and
- directly with the help of an index to retrieve a specific object.
+def compute_object_size((num, obj)):
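+    """Compute the in-memory size of an unpacked object, for cache accounting."""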
+ if num in (6, 7):
+ return len(obj[1])
+ assert isinstance(obj, str)
+ return len(obj)
- The objects within are either complete or a delta aginst another.
- The header is variable length. If the MSB of each byte is set then it
- indicates that the subsequent byte is still part of the header.
- For the first byte the next MS bits are the type, which tells you the type
- of object, and whether it is a delta. The LS byte is the lowest bits of the
- size. For each subsequent byte the LS 7 bits are the next MS bits of the
- size, i.e. the last byte of the header contains the MS bits of the size.
-
- For the complete objects the data is stored as zlib deflated data.
- The size in the header is the uncompressed object size, so to uncompress
- you need to just keep feeding data to zlib until you get an object back,
- or it errors on bad data. This is done here by just giving the complete
- buffer from the start of the deflated object on. This is bad, but until I
- get mmap sorted out it will have to do.
-
- Currently there are no integrity checks done. Also no attempt is made to try
- and detect the delta case, or a request for an object at the wrong position.
- It will all just throw a zlib or KeyError.
- """
-
- def __init__(self, filename):
- """Create a PackData object that represents the pack in the given filename.
-
- The file must exist and stay readable until the object is disposed of. It
- must also stay the same size. It will be mapped whenever needed.
-
- Currently there is a restriction on the size of the pack as the python
- mmap implementation is flawed.
+class PackData(object):
+ """The data contained in a packfile.
+
+ Pack files can be accessed both sequentially for exploding a pack, and
+ directly with the help of an index to retrieve a specific object.
+
+    The objects within are either complete or a delta against another.
+
+ The header is variable length. If the MSB of each byte is set then it
+ indicates that the subsequent byte is still part of the header.
+ For the first byte the next MS bits are the type, which tells you the type
+ of object, and whether it is a delta. The LS byte is the lowest bits of the
+ size. For each subsequent byte the LS 7 bits are the next MS bits of the
+ size, i.e. the last byte of the header contains the MS bits of the size.
+
+ For the complete objects the data is stored as zlib deflated data.
+ The size in the header is the uncompressed object size, so to uncompress
+ you need to just keep feeding data to zlib until you get an object back,
+ or it errors on bad data. This is done here by just giving the complete
+ buffer from the start of the deflated object on. This is bad, but until I
+ get mmap sorted out it will have to do.
+
+ Currently there are no integrity checks done. Also no attempt is made to try
+ and detect the delta case, or a request for an object at the wrong position.
+ It will all just throw a zlib or KeyError.
"""
- self._filename = filename
- assert os.path.exists(filename), "%s is not a packfile" % filename
- self._size = os.path.getsize(filename)
- self._header_size = 12
- assert self._size >= self._header_size, "%s is too small for a packfile" % filename
- self._read_header()
-
- def _read_header(self):
- f = open(self._filename, 'rb')
- try:
- (version, self._num_objects) = \
- read_pack_header(f)
- f.seek(self._size-20)
- (self._stored_checksum,) = read_pack_tail(f)
- finally:
- f.close()
-
- def __len__(self):
- """Returns the number of objects in this pack."""
- return self._num_objects
-
- def calculate_checksum(self):
- f = open(self._filename, 'rb')
- try:
- map = simple_mmap(f, 0, self._size)
- return hashlib.sha1(map[:-20]).digest()
- finally:
- f.close()
-
- def iterobjects(self):
- offset = self._header_size
- f = open(self._filename, 'rb')
- for i in range(len(self)):
- map = simple_mmap(f, offset, self._size-offset)
- (type, obj, total_size) = unpack_object(map)
- yield offset, type, obj
- offset += total_size
- f.close()
+
+ def __init__(self, filename):
+ """Create a PackData object that represents the pack in the given filename.
+
+ The file must exist and stay readable until the object is disposed of. It
+ must also stay the same size. It will be mapped whenever needed.
+
+ Currently there is a restriction on the size of the pack as the python
+ mmap implementation is flawed.
+ """
+ self._filename = filename
+ assert os.path.exists(filename), "%s is not a packfile" % filename
+ self._size = os.path.getsize(filename)
+ self._header_size = 12
+ assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (filename, self._size, self._header_size)
+ self._file = open(self._filename, 'rb')
+ self._read_header()
+ self._offset_cache = LRUSizeCache(1024*1024*20,
+ compute_size=compute_object_size)
- def iterentries(self, ext_resolve_ref=None):
- found = {}
- at = {}
- postponed = defaultdict(list)
- class Postpone(Exception):
- """Raised to postpone delta resolving."""
+ def close(self):
+ self._file.close()
+
+ def _read_header(self):
+ (version, self._num_objects) = read_pack_header(self._file)
+ self._file.seek(self._size-20)
+ (self._stored_checksum,) = read_pack_tail(self._file)
+
+ def __len__(self):
+ """Returns the number of objects in this pack."""
+ return self._num_objects
+
+ def calculate_checksum(self):
+ """Calculate the checksum for this pack."""
+ map, map_offset = simple_mmap(self._file, 0, self._size - 20)
+ try:
+ return make_sha(map[map_offset:self._size-20]).digest()
+ finally:
+ map.close()
+
+ def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
+ """Resolve an object, possibly resolving deltas when necessary.
- def get_ref_text(sha):
- if sha in found:
- return found[sha]
- if ext_resolve_ref:
+ :return: Tuple with object type and contents.
+ """
+ if type not in (6, 7): # Not a delta
+ return type, obj
+
+ if get_offset is None:
+ get_offset = self.get_object_at
+
+ if type == 6: # offset delta
+ (delta_offset, delta) = obj
+ assert isinstance(delta_offset, int)
+ assert isinstance(delta, str)
+ base_offset = offset-delta_offset
+ type, base_obj = get_offset(base_offset)
+ assert isinstance(type, int)
+ elif type == 7: # ref delta
+ (basename, delta) = obj
+ assert isinstance(basename, str) and len(basename) == 20
+ assert isinstance(delta, str)
+ type, base_obj = get_ref(basename)
+ assert isinstance(type, int)
+            # Can't be an ofs delta, as we wouldn't know the base offset
+ assert type != 6
+ base_offset = None
+ type, base_text = self.resolve_object(base_offset, type, base_obj, get_ref)
+ if base_offset is not None:
+ self._offset_cache[base_offset] = type, base_text
+ ret = (type, apply_delta(base_text, delta))
+ return ret
+
+ def iterobjects(self, progress=None):
+ offset = self._header_size
+ num = len(self)
+ map, _ = simple_mmap(self._file, 0, self._size)
+ try:
+ for i in range(num):
+ (type, obj, total_size) = unpack_object(map, offset)
+ crc32 = zlib.crc32(map[offset:offset+total_size]) & 0xffffffff
+ yield offset, type, obj, crc32
+ offset += total_size
+ if progress:
+ progress(i, num)
+ finally:
+ map.close()
+
+ def iterentries(self, ext_resolve_ref=None, progress=None):
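+        """Yield entries summarizing the contents of this pack.
+
+        :param ext_resolve_ref: Optional function to resolve base objects
+            (in case this is a thin pack).
+        :param progress: Progress function, called with current and total object count.
+        :return: Iterator of tuples with (sha, offset, crc32).
+        """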
+ found = {}
+ postponed = defaultdict(list)
+ class Postpone(Exception):
+ """Raised to postpone delta resolving."""
+
+ def get_ref_text(sha):
+ assert len(sha) == 20
+ if sha in found:
+ return found[sha]
+ if ext_resolve_ref:
+ try:
+ return ext_resolve_ref(sha)
+ except KeyError:
+ pass
+ raise Postpone, (sha, )
+ extra = []
+ todo = chain(self.iterobjects(progress=progress), extra)
+ for (offset, type, obj, crc32) in todo:
+ assert isinstance(offset, int)
+ assert isinstance(type, int)
+ assert isinstance(obj, tuple) or isinstance(obj, str)
try:
- return ext_resolve_ref(sha)
- except KeyError:
- pass
- raise Postpone, (sha, )
- todo = list(self.iterobjects())
- while todo:
- (offset, type, obj) = todo.pop(0)
- at[offset] = (type, obj)
- assert isinstance(offset, int)
- assert isinstance(type, int)
- assert isinstance(obj, tuple) or isinstance(obj, str)
- try:
- type, obj = resolve_object(offset, type, obj, get_ref_text,
- at.__getitem__)
- except Postpone, (sha, ):
- postponed[sha].append((offset, type, obj))
- else:
- shafile = ShaFile.from_raw_string(type, obj)
- sha = shafile.sha().digest()
- found[sha] = (type, obj)
- yield sha, offset, shafile.crc32()
- todo += postponed.get(sha, [])
- if postponed:
- raise KeyError([sha_to_hex(h) for h in postponed.keys()])
-
- def sorted_entries(self, resolve_ext_ref=None):
- ret = list(self.iterentries(resolve_ext_ref))
- ret.sort()
- return ret
-
- def create_index_v1(self, filename):
- entries = self.sorted_entries()
- write_pack_index_v1(filename, entries, self.calculate_checksum())
-
- def create_index_v2(self, filename):
- entries = self.sorted_entries()
- write_pack_index_v2(filename, entries, self.calculate_checksum())
-
- def get_stored_checksum(self):
- return self._stored_checksum
-
- def check(self):
- return (self.calculate_checksum() == self.get_stored_checksum())
-
- def get_object_at(self, offset):
- """Given an offset in to the packfile return the object that is there.
-
- Using the associated index the location of an object can be looked up, and
- then the packfile can be asked directly for that object using this
- function.
- """
- assert isinstance(offset, long) or isinstance(offset, int),\
- "offset was %r" % offset
- assert offset >= self._header_size
- size = os.path.getsize(self._filename)
- assert size == self._size, "Pack data %s has changed size, I don't " \
- "like that" % self._filename
- f = open(self._filename, 'rb')
- try:
- map = simple_mmap(f, offset, size-offset)
- return unpack_object(map)[:2]
- finally:
- f.close()
+ type, obj = self.resolve_object(offset, type, obj, get_ref_text)
+ except Postpone, (sha, ):
+ postponed[sha].append((offset, type, obj))
+ else:
+ shafile = ShaFile.from_raw_string(type, obj)
+ sha = shafile.sha().digest()
+ found[sha] = (type, obj)
+ yield sha, offset, crc32
+ extra.extend(postponed.get(sha, []))
+ if postponed:
+ raise KeyError([sha_to_hex(h) for h in postponed.keys()])
+
+ def sorted_entries(self, resolve_ext_ref=None, progress=None):
+ ret = list(self.iterentries(resolve_ext_ref, progress=progress))
+ ret.sort()
+ return ret
+
+ def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
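+        """Create a version 1 index file for this data file.
+
+        :param filename: Index filename.
+        :param resolve_ext_ref: Function to use for resolving externally referenced SHA1s.
+        :param progress: Progress report function.
+        """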
+ entries = self.sorted_entries(resolve_ext_ref, progress=progress)
+ write_pack_index_v1(filename, entries, self.calculate_checksum())
+
+ def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
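+        """Create a version 2 index file for this data file.
+
+        :param filename: Index filename.
+        :param resolve_ext_ref: Function to use for resolving externally referenced SHA1s.
+        :param progress: Progress report function.
+        """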
+ entries = self.sorted_entries(resolve_ext_ref, progress=progress)
+ write_pack_index_v2(filename, entries, self.calculate_checksum())
+
+ def get_stored_checksum(self):
+ return self._stored_checksum
+
+ def check(self):
+ return (self.calculate_checksum() == self.get_stored_checksum())
+
+ def get_object_at(self, offset):
+ """Given an offset in to the packfile return the object that is there.
+
+ Using the associated index the location of an object can be looked up, and
+ then the packfile can be asked directly for that object using this
+ function.
+ """
+ if offset in self._offset_cache:
+ return self._offset_cache[offset]
+ assert isinstance(offset, long) or isinstance(offset, int),\
+ "offset was %r" % offset
+ assert offset >= self._header_size
+ map, map_offset = simple_mmap(self._file, offset, self._size-offset)
+ try:
+ ret = unpack_object(map, map_offset)[:2]
+ return ret
+ finally:
+ map.close()
class SHA1Writer(object):
def __init__(self, f):
self.f = f
- self.sha1 = hashlib.sha1("")
+ self.sha1 = make_sha("")
def write(self, data):
self.sha1.update(data)
:param f: File to write to
:param o: Object to write
+ :return: Tuple with offset at which the object was written, and crc32
"""
ret = f.tell()
+ packed_data_hdr = ""
    if type == 6: # offset delta
        (delta_base_offset, object) = object
    elif type == 7: # ref delta
c = (type << 4) | (size & 15)
size >>= 4
while size:
- f.write(chr(c | 0x80))
+ packed_data_hdr += (chr(c | 0x80))
c = size & 0x7f
size >>= 7
- f.write(chr(c))
+ packed_data_hdr += chr(c)
if type == 6: # offset delta
ret = [delta_base_offset & 0x7f]
delta_base_offset >>= 7
delta_base_offset -= 1
ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
delta_base_offset >>= 7
- f.write("".join([chr(x) for x in ret]))
+ packed_data_hdr += "".join([chr(x) for x in ret])
elif type == 7: # ref delta
assert len(basename) == 20
- f.write(basename)
- f.write(zlib.compress(object))
- return f.tell()
+ packed_data_hdr += basename
+ packed_data = packed_data_hdr + zlib.compress(object)
+ f.write(packed_data)
+ return (f.tell(), (zlib.crc32(packed_data) & 0xffffffff))
def write_pack(filename, objects, num_objects):
"""Write a new pack file.
:param filename: The filename of the new pack file.
- :param objects: List of objects to write.
+ :param objects: List of objects to write (tuples with object and path)
:return: List with (name, offset, crc32 checksum) entries, pack checksum
"""
recency = list(objects)
f.write("PACK") # Pack header
f.write(struct.pack(">L", 2)) # Pack version
f.write(struct.pack(">L", num_objects)) # Number of objects in pack
- for o in recency:
+ for o, path in recency:
sha1 = o.sha().digest()
- crc32 = o.crc32()
orig_t, raw = o.as_raw_string()
winner = raw
t = orig_t
# if len(delta) < len(winner):
# winner = delta
# t = 6 if magic[i][2] == 1 else 7
- offset = write_pack_object(f, t, winner)
+ offset, crc32 = write_pack_object(f, t, winner)
entries.append((sha1, offset, crc32))
return entries, f.write_sha()
def apply_delta(src_buf, delta):
- """Based on the similar function in git's patch-delta.c."""
+ """Based on the similar function in git's patch-delta.c.
+
+ :param src_buf: Source buffer
+ :param delta: Delta instructions
+ """
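+    # Delta format sketch (matching the decoder below): two varint headers give
+    # the source and target sizes, then a stream of opcodes follows.  An opcode
+    # with its high bit set copies a range from the source buffer (the low bits
+    # select which offset/size bytes follow); a non-zero opcode without the
+    # high bit inserts that many literal bytes taken from the delta itself.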
assert isinstance(src_buf, str), "was %r" % (src_buf,)
assert isinstance(delta, str)
- out = ""
- def pop(delta):
- ret = delta[0]
- delta = delta[1:]
- return ord(ret), delta
- def get_delta_header_size(delta):
+ out = []
+ index = 0
+ delta_length = len(delta)
+ def get_delta_header_size(delta, index):
size = 0
i = 0
while delta:
- cmd, delta = pop(delta)
+ cmd = ord(delta[index])
+ index += 1
size |= (cmd & ~0x80) << i
i += 7
if not cmd & 0x80:
break
- return size, delta
- src_size, delta = get_delta_header_size(delta)
- dest_size, delta = get_delta_header_size(delta)
+ return size, index
+ src_size, index = get_delta_header_size(delta, index)
+ dest_size, index = get_delta_header_size(delta, index)
assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
- while delta:
- cmd, delta = pop(delta)
+ while index < delta_length:
+ cmd = ord(delta[index])
+ index += 1
if cmd & 0x80:
cp_off = 0
for i in range(4):
if cmd & (1 << i):
- x, delta = pop(delta)
+ x = ord(delta[index])
+ index += 1
cp_off |= x << (i * 8)
cp_size = 0
for i in range(3):
if cmd & (1 << (4+i)):
- x, delta = pop(delta)
+ x = ord(delta[index])
+ index += 1
cp_size |= x << (i * 8)
if cp_size == 0:
cp_size = 0x10000
cp_off + cp_size > src_size or
cp_size > dest_size):
break
- out += src_buf[cp_off:cp_off+cp_size]
+ out.append(src_buf[cp_off:cp_off+cp_size])
elif cmd != 0:
- out += delta[:cmd]
- delta = delta[cmd:]
+ out.append(delta[index:index+cmd])
+ index += cmd
else:
raise ApplyDeltaError("Invalid opcode 0")
- if delta != "":
- raise ApplyDeltaError("delta not empty: %r" % delta)
+ if index != delta_length:
+ raise ApplyDeltaError("delta not empty: %r" % delta[index:])
+ out = ''.join(out)
if dest_size != len(out):
raise ApplyDeltaError("dest size incorrect")
for (name, offset, entry_checksum) in entries:
f.write(name)
for (name, offset, entry_checksum) in entries:
- f.write(struct.pack(">l", entry_checksum))
+ f.write(struct.pack(">L", entry_checksum))
for (name, offset, entry_checksum) in entries:
# FIXME: handle if MSBit is set in offset
f.write(struct.pack(">L", offset))
self._data = None
self._idx = None
+ @classmethod
+    def from_objects(cls, data, idx):
+        """Create a new pack object from callers providing the data and index objects."""
+ ret = Pack("")
+ ret._data = data
+ ret._idx = idx
+ return ret
+
def name(self):
+ """The SHA over the SHAs of the objects in this pack."""
return self.idx.objects_sha1()
@property
if self._data is None:
self._data = PackData(self._data_path)
assert len(self.idx) == len(self._data)
- assert self.idx.get_stored_checksums()[0] == self._data.get_stored_checksum()
+ idx_stored_checksum = self.idx.get_pack_checksum()
+ data_stored_checksum = self._data.get_stored_checksum()
+ if idx_stored_checksum != data_stored_checksum:
+ raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
+ sha_to_hex(data_stored_checksum))
return self._data
@property
def idx(self):
if self._idx is None:
- self._idx = PackIndex(self._idx_path)
+ self._idx = load_pack_index(self._idx_path)
return self._idx
def close(self):
return len(self.idx)
def __repr__(self):
- return "Pack(%r)" % self._basename
+ return "%s(%r)" % (self.__class__.__name__, self._basename)
def __iter__(self):
"""Iterate over all the sha1s of the objects in this pack."""
return iter(self.idx)
def check(self):
- return self.idx.check() and self.data.check()
+ if not self.idx.check():
+ return False
+ if not self.data.check():
+ return False
+ return True
def get_stored_checksum(self):
return self.data.get_stored_checksum()
def __contains__(self, sha1):
"""Check whether this pack contains a particular SHA1."""
- return (self.idx.object_index(sha1) is not None)
+ try:
+ self.idx.object_index(sha1)
+ return True
+ except KeyError:
+ return False
def get_raw(self, sha1, resolve_ref=None):
offset = self.idx.object_index(sha1)
- if offset is None:
- raise KeyError(sha1)
-
- type, obj = self.data.get_object_at(offset)
- assert isinstance(offset, int)
- return resolve_object(offset, type, obj, resolve_ref,
- self.data.get_object_at)
+ obj_type, obj = self.data.get_object_at(offset)
+ if type(offset) is long:
+ offset = int(offset)
+ if resolve_ref is None:
+ resolve_ref = self.get_raw
+ return self.data.resolve_object(offset, obj_type, obj, resolve_ref)
def __getitem__(self, sha1):
"""Retrieve the specified SHA1."""
def iterobjects(self, get_raw=None):
if get_raw is None:
- def get_raw(x):
- raise KeyError(x)
- for offset, type, obj in self.data.iterobjects():
+ get_raw = self.get_raw
+ for offset, type, obj, crc32 in self.data.iterobjects():
assert isinstance(offset, int)
yield ShaFile.from_raw_string(
- *resolve_object(offset, type, obj,
- get_raw,
- self.data.get_object_at))
+ *self.data.resolve_object(offset, type, obj, get_raw))
def load_packs(path):
if name.startswith("pack-") and name.endswith(".pack"):
yield Pack(os.path.join(path, name[:-len(".pack")]))
+
+try:
+ from dulwich._pack import apply_delta, bisect_find_sha
+except ImportError:
+ pass