1 # pack.py -- For dealing with packed git objects.
2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3 # Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; version 2
8 # of the License or (at your option) a later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
20 """Classes for dealing with packed git objects.
22 A pack is a compact representation of a bunch of objects, stored
23 using deltas where possible.
25 They have two parts, the pack file, which stores the data, and an index
26 that tells you where the data is.
28 To find an object you look in all of the index files 'til you find a
29 match for the object name. You then use the pointer got from this as
30 a pointer in to the corresponding packfile.
34 from collections import defaultdict
36 from misc import defaultdict
38 from cStringIO import (
41 from collections import (
45 from itertools import (
54 from struct import unpack_from
56 from dulwich.misc import unpack_from
60 from dulwich.errors import (
64 from dulwich.file import GitFile
65 from dulwich.lru_cache import (
68 from dulwich.misc import (
72 from dulwich.objects import (
# Feature probe: passing an offset to mmap() requires Python >= 2.6.
79 supports_mmap_offset = (sys.version_info[0] >= 3 or
80 (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
# NOTE(review): the OFS_DELTA / REF_DELTA constant definitions (original
# lines ~84-85) are missing from this listing; DELTA_TYPES groups the two
# delta object type numbers used throughout this module.
86 DELTA_TYPES = (OFS_DELTA, REF_DELTA)
def take_msb_bytes(read):
    """Read bytes marked with most significant bit.

    :param read: Read function
    :return: List of integer byte values; all but the last have the MSB set.
    """
    ret = []
    # Keep reading while the previous byte flagged a continuation (MSB set).
    while len(ret) == 0 or ret[-1] & 0x80:
        ret.append(ord(read(1)))
    return ret
def read_zlib_chunks(read_some, dec_size, buffer_size=4096):
    """Read zlib data from a buffer.

    This function requires that the buffer have additional data following the
    compressed data, which is guaranteed to be the case for git pack files.

    :param read_some: Read function that returns at least one byte, but may
        return less than the requested size
    :param dec_size: Expected size of the decompressed buffer
    :param buffer_size: Size of the read buffer
    :return: Tuple with list of chunks, length of compressed data length and
        and unused read data.
    :raise zlib.error: if a decompression error occurred.
    """
    if dec_size <= -1:
        raise ValueError("non-negative zlib data stream size expected")
    obj = zlib.decompressobj()
    ret = []
    fed = 0
    size = 0
    # unused_data becomes non-empty once the end of the zlib stream is seen;
    # a truthiness test (rather than == "") works for both str and bytes.
    while not obj.unused_data:
        add = read_some(buffer_size)
        if not add:
            raise zlib.error("EOF before end of zlib stream")
        fed += len(add)
        decomp = obj.decompress(add)
        size += len(decomp)
        ret.append(decomp)
    if size != dec_size:
        raise zlib.error("decompressed data does not match expected size")
    comp_len = fed - len(obj.unused_data)
    return ret, comp_len, obj.unused_data
def iter_sha1(iter):
    """Return the hexdigest of the SHA1 over a set of names.

    :param iter: Iterator over string objects
    :return: 40-byte hex sha1 digest
    """
    # Fold every name into a single running SHA1.
    sha1 = make_sha()
    for name in iter:
        sha1.update(name)
    return sha1.hexdigest()
def load_pack_index(path):
    """Load an index file by path.

    :param path: Path to the index file
    :return: A PackIndex loaded from the given path
    """
    f = GitFile(path, 'rb')
    try:
        return load_pack_index_file(path, f)
    finally:
        # The index contents are fully loaded by load_pack_index_file, so the
        # handle can (and must) be closed here to avoid leaking it.
        f.close()
159 def _load_file_contents(f, size=None):
160 fileno = getattr(f, 'fileno', None)
161 # Attempt to use mmap if possible
162 if fileno is not None:
165 size = os.fstat(fd).st_size
167 contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
172 return contents, size
175 return contents, size
def load_pack_index_file(path, f):
    """Load an index file from a file-like object.

    :param path: Path for the index file
    :param f: File-like object
    :return: A PackIndex loaded from the given file
    """
    contents, size = _load_file_contents(f)
    # v2+ index files start with the magic '\377tOc'; v1 files have no header.
    if contents[:4] == '\377tOc':
        version = struct.unpack(">L", contents[4:8])[0]
        if version == 2:
            return PackIndex2(path, file=f, contents=contents,
                size=size)
        else:
            raise KeyError("Unknown pack index format %d" % version)
    else:
        return PackIndex1(path, file=f, contents=contents, size=size)
def bisect_find_sha(start, end, sha, unpack_name):
    """Find a SHA in a data blob with sorted SHAs.

    :param start: Start index of range to search
    :param end: End index of range to search
    :param sha: Sha to find
    :param unpack_name: Callback to retrieve SHA by index
    :return: Index of the SHA, or None if it wasn't found
    """
    assert start <= end
    while start < end:
        i = (start + end) // 2
        file_sha = unpack_name(i)
        # Direct comparisons instead of cmp(), which no longer exists in
        # Python 3; behavior is identical for string SHAs.
        if file_sha < sha:
            start = i + 1
        elif file_sha > sha:
            end = i
        else:
            return i
    return None
# NOTE(review): this listing is missing many original lines (the embedded
# line numbers skip); several method bodies below are visibly incomplete.
220 class PackIndex(object):
221 """An index in to a packfile.
223 Given a sha id of an object a pack index can tell you the location in the
224 packfile of that object if it has it.
226 To do the loop it opens the file, and indexes first 256 4 byte groups
227 with the first byte of the sha id. The value in the four byte group indexed
228 is the end of the group that shares the same starting byte. Subtract one
229 from the starting byte and index again to find the start of the group.
230 The values are sorted by sha id within the group, so do the math to find
231 the start and end offset and then bisect in to find if the value is present.
234 def __init__(self, filename, file=None, contents=None, size=None):
235 """Create a pack index object.
237 Provide it with the name of the index file to consider, and it will map
238 it whenever required.
240 self._filename = filename
241 # Take the size now, so it can be checked each time we map the file to
242 # ensure that it hasn't changed.
244 self._file = GitFile(filename, 'rb')
248 self._contents, self._size = _load_file_contents(file, size)
250 self._contents, self._size = (contents, size)
# Two indexes are equal when their fan-out tables and all entry names match.
252 def __eq__(self, other):
253 if not isinstance(other, PackIndex):
256 if self._fan_out_table != other._fan_out_table:
259 for (name1, _, _), (name2, _, _) in izip(self.iterentries(),
260 other.iterentries()):
265 def __ne__(self, other):
266 return not self.__eq__(other)
# NOTE(review): the 'def __len__(self):' line is elided from this listing.
272 """Return the number of entries in this pack index."""
273 return self._fan_out_table[-1]
275 def _unpack_entry(self, i):
276 """Unpack the i-th entry in the index file.
278 :return: Tuple with object name (SHA), offset in pack file and CRC32
281 raise NotImplementedError(self._unpack_entry)
283 def _unpack_name(self, i):
284 """Unpack the i-th name from the index file."""
285 raise NotImplementedError(self._unpack_name)
287 def _unpack_offset(self, i):
288 """Unpack the i-th object offset from the index file."""
289 raise NotImplementedError(self._unpack_offset)
291 def _unpack_crc32_checksum(self, i):
292 """Unpack the crc32 checksum for the i-th object from the index file."""
293 raise NotImplementedError(self._unpack_crc32_checksum)
# NOTE(review): the 'def __iter__(self):' line is elided from this listing.
296 """Iterate over the SHAs in this pack."""
297 return imap(sha_to_hex, self._itersha())
# NOTE(review): the 'def _itersha(self):' line is elided from this listing.
300 for i in range(len(self)):
301 yield self._unpack_name(i)
303 def objects_sha1(self):
304 """Return the hex SHA1 over all the shas of all objects in this pack.
306 :note: This is used for the filename of the pack.
308 return iter_sha1(self._itersha())
310 def iterentries(self):
311 """Iterate over the entries in this pack index.
313 :yields: tuples with object name, offset in packfile and crc32 checksum.
315 for i in range(len(self)):
316 yield self._unpack_entry(i)
# Reads the 256-entry fan-out table of cumulative counts per first byte.
318 def _read_fan_out_table(self, start_offset):
320 for i in range(0x100):
321 fanout_entry = self._contents[start_offset+i*4:start_offset+(i+1)*4]
322 ret.append(struct.unpack(">L", fanout_entry)[0])
# NOTE(review): the 'def check(self):' line is elided from this listing.
326 """Check that the stored checksum matches the actual checksum."""
327 actual = self.calculate_checksum()
328 stored = self.get_stored_checksum()
330 raise ChecksumMismatch(stored, actual)
332 def calculate_checksum(self):
333 """Calculate the SHA1 checksum over this pack index.
335 :return: This is a 20-byte binary digest
337 return make_sha(self._contents[:-20]).digest()
339 def get_pack_checksum(self):
340 """Return the SHA1 checksum stored for the corresponding packfile.
342 :return: 20-byte binary digest
344 return str(self._contents[-40:-20])
346 def get_stored_checksum(self):
347 """Return the SHA1 checksum stored for this index.
349 :return: 20-byte binary digest
351 return str(self._contents[-20:])
353 def object_index(self, sha):
354 """Return the index in to the corresponding packfile for the object.
356 Given the name of an object it will return the offset that object
357 lives at within the corresponding pack file. If the pack file doesn't
358 have the object then None will be returned.
# Accepts a 40-char hex sha and converts it to binary before lookup.
361 sha = hex_to_sha(sha)
362 return self._object_index(sha)
364 def _object_index(self, sha):
367 :param sha: A *binary* SHA string. (20 characters long)_
369 assert len(sha) == 20
# The fan-out table bounds the bisect range to entries sharing sha's
# first byte.
374 start = self._fan_out_table[idx-1]
375 end = self._fan_out_table[idx]
376 i = bisect_find_sha(start, end, sha, self._unpack_name)
379 return self._unpack_offset(i)
382 class PackIndex1(PackIndex):
383 """Version 1 Pack Index."""
385 def __init__(self, filename, file=None, contents=None, size=None):
386 PackIndex.__init__(self, filename, file, contents, size)
# v1 layout: 256 x 4-byte fan-out table at offset 0, then 24-byte entries
# (4-byte offset + 20-byte name).
388 self._fan_out_table = self._read_fan_out_table(0)
390 def _unpack_entry(self, i):
391 (offset, name) = unpack_from(">L20s", self._contents,
392 (0x100 * 4) + (i * 24))
393 return (name, offset, None)
395 def _unpack_name(self, i):
396 offset = (0x100 * 4) + (i * 24) + 4
397 return self._contents[offset:offset+20]
399 def _unpack_offset(self, i):
400 offset = (0x100 * 4) + (i * 24)
401 return unpack_from(">L", self._contents, offset)[0]
403 def _unpack_crc32_checksum(self, i):
404 # Not stored in v1 index files
408 class PackIndex2(PackIndex):
409 """Version 2 Pack Index."""
411 def __init__(self, filename, file=None, contents=None, size=None):
412 PackIndex.__init__(self, filename, file, contents, size)
413 assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
414 (self.version, ) = unpack_from(">L", self._contents, 4)
415 assert self.version == 2, "Version was %d" % self.version
# v2 layout: 8-byte header, fan-out table, then parallel tables of
# 20-byte names, 4-byte crc32s, and 4-byte pack offsets.
416 self._fan_out_table = self._read_fan_out_table(8)
417 self._name_table_offset = 8 + 0x100 * 4
418 self._crc32_table_offset = self._name_table_offset + 20 * len(self)
419 self._pack_offset_table_offset = (self._crc32_table_offset +
422 def _unpack_entry(self, i):
423 return (self._unpack_name(i), self._unpack_offset(i),
424 self._unpack_crc32_checksum(i))
426 def _unpack_name(self, i):
427 offset = self._name_table_offset + i * 20
428 return self._contents[offset:offset+20]
430 def _unpack_offset(self, i):
431 offset = self._pack_offset_table_offset + i * 4
432 return unpack_from(">L", self._contents, offset)[0]
434 def _unpack_crc32_checksum(self, i):
435 return unpack_from(">L", self._contents,
436 self._crc32_table_offset + i * 4)[0]
def read_pack_header(read):
    """Read the header of a pack file.

    :param read: Read function
    :return: Tuple with pack version and number of objects
    """
    # Header is exactly 12 bytes: magic, 4-byte version, 4-byte object count.
    header = read(12)
    assert header[:4] == "PACK"
    (version,) = unpack_from(">L", header, 4)
    assert version in (2, 3), "Version was %d" % version
    (num_objects,) = unpack_from(">L", header, 8)
    return (version, num_objects)
def chunks_length(chunks):
    """Return the total number of bytes in a sequence of chunks."""
    # A builtin generator expression avoids the itertools.imap fallback
    # import and is equivalent on both Python 2 and 3.
    return sum(len(chunk) for chunk in chunks)
def unpack_object(read_all, read_some=None):
    """Unpack a Git object.

    :param read_all: Read function that blocks until the number of requested
        bytes are read.
    :param read_some: Read function that returns at least one byte, but may not
        return the number of bytes requested.
    :return: tuple with type, uncompressed data, compressed size and tail data.
    """
    if read_some is None:
        read_some = read_all
    bytes = take_msb_bytes(read_all)
    # First byte: bits 4-6 are the type, low 4 bits start the size; each
    # continuation byte contributes 7 more size bits.
    type = (bytes[0] >> 4) & 0x07
    size = bytes[0] & 0x0f
    for i, byte in enumerate(bytes[1:]):
        size += (byte & 0x7f) << ((i * 7) + 4)
    raw_base = len(bytes)
    if type == OFS_DELTA:
        bytes = take_msb_bytes(read_all)
        raw_base += len(bytes)
        assert not (bytes[-1] & 0x80)
        delta_base_offset = bytes[0] & 0x7f
        for byte in bytes[1:]:
            delta_base_offset += 1
            delta_base_offset <<= 7
            delta_base_offset += (byte & 0x7f)
        uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
        assert size == chunks_length(uncomp)
        return type, (delta_base_offset, uncomp), comp_len+raw_base, unused
    elif type == REF_DELTA:
        basename = read_all(20)
        raw_base += 20
        uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
        assert size == chunks_length(uncomp)
        return type, (basename, uncomp), comp_len+raw_base, unused
    else:
        uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
        assert chunks_length(uncomp) == size
        return type, uncomp, comp_len+raw_base, unused
def _compute_object_size(value):
    """Compute the size of a unresolved object for use with LRUSizeCache."""
    # Unpack inside the body rather than with a tuple parameter; tuple
    # parameter unpacking was removed by PEP 3113. Callers still pass a
    # single (num, obj) tuple.
    (num, obj) = value
    if num in DELTA_TYPES:
        # Delta objects are (base, chunks) pairs; only the chunks count.
        return chunks_length(obj[1])
    return chunks_length(obj)
# NOTE(review): several method bodies in this class are missing lines in
# this listing (the embedded line numbers skip).
505 class PackStreamReader(object):
506 """Class to read a pack stream.
508 The pack is read from a ReceivableProtocol using read() or recv() as
512 def __init__(self, read_all, read_some=None):
513 self.read_all = read_all
514 if read_some is None:
515 self.read_some = read_all
517 self.read_some = read_some
518 self.sha = make_sha()
520 self._rbuf = StringIO()
521 # trailer is a deque to avoid memory allocation on small reads
522 self._trailer = deque()
524 def _read(self, read, size):
525 """Read up to size bytes using the given callback.
527 As a side effect, update the verifier's hash (excluding the last 20
528 bytes read) and write through to the output file.
530 :param read: The read callback to read from.
531 :param size: The maximum number of bytes to read; the particular
532 behavior is callback-specific.
536 # maintain a trailer of the last 20 bytes we've read
539 tn = len(self._trailer)
# Bytes pushed out of the 20-byte trailer window are folded into the hash.
544 to_pop = max(n + tn - 20, 0)
546 for _ in xrange(to_pop):
547 self.sha.update(self._trailer.popleft())
548 self._trailer.extend(data[-to_add:])
550 # hash everything but the trailer
551 self.sha.update(data[:-to_add])
# NOTE(review): the '_buf_len' / 'offset' method definition lines around
# here are elided from this listing.
557 buf.seek(0, SEEK_END)
564 return self._offset - self._buf_len()
566 def read(self, size):
567 """Read, blocking until size bytes are read."""
568 buf_len = self._buf_len()
570 return self._rbuf.read(size)
571 buf_data = self._rbuf.read()
572 self._rbuf = StringIO()
573 return buf_data + self._read(self.read_all, size - buf_len)
575 def recv(self, size):
576 """Read up to size bytes, blocking until one byte is read."""
577 buf_len = self._buf_len()
579 data = self._rbuf.read(size)
581 self._rbuf = StringIO()
583 return self._read(self.read_some, size)
# NOTE(review): the '__len__' definition line is elided from this listing.
586 return self._num_objects
588 def read_objects(self):
589 """Read the objects in this pack file.
591 :raise AssertionError: if there is an error in the pack format.
592 :raise ChecksumMismatch: if the checksum of the pack contents does not
593 match the checksum in the pack trailer.
594 :raise zlib.error: if an error occurred during zlib decompression.
595 :raise IOError: if an error occurred writing to the output file.
597 pack_version, self._num_objects = read_pack_header(self.read)
598 for i in xrange(self._num_objects):
599 type, uncomp, comp_len, unused = unpack_object(self.read, self.recv)
600 yield type, uncomp, comp_len
602 # prepend any unused data to current read buffer
605 buf.write(self._rbuf.read())
# After all objects, the remaining 20 trailer bytes are the pack checksum.
609 pack_sha = sha_to_hex(''.join([c for c in self._trailer]))
610 calculated_sha = self.sha.hexdigest()
611 if pack_sha != calculated_sha:
612 raise ChecksumMismatch(pack_sha, calculated_sha)
# Iterates (offset, type, obj, crc32) tuples over a PackData's file.
615 class PackObjectIterator(object):
617 def __init__(self, pack, progress=None):
619 self.offset = pack._header_size
621 self.map = pack._file
622 self._progress = progress
# NOTE(review): the '__iter__'/'__len__'/'next' definition lines around
# here are elided from this listing.
631 if self.i == self.num:
633 self.map.seek(self.offset)
634 (type, obj, total_size, unused) = unpack_object(self.map.read)
# Re-read the raw compressed bytes to compute the entry's crc32.
635 self.map.seek(self.offset)
636 crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff
637 ret = (self.offset, type, obj, crc32)
638 self.offset += total_size
639 if self._progress is not None:
640 self._progress(self.i, self.num)
def obj_sha(type, chunks):
    """Compute the SHA for a numeric type and object chunks.

    :param type: Numeric object type
    :param chunks: Iterable of body chunks
    :return: 20-byte binary SHA1 digest
    """
    sha = make_sha()
    # The object header (type + length) is hashed ahead of the body chunks.
    sha.update(object_header(type, chunks_length(chunks)))
    for chunk in chunks:
        sha.update(chunk)
    return sha.digest()
# NOTE(review): several method bodies in this class are missing lines in
# this listing (the embedded line numbers skip).
653 class PackData(object):
654 """The data contained in a packfile.
656 Pack files can be accessed both sequentially for exploding a pack, and
657 directly with the help of an index to retrieve a specific object.
659 The objects within are either complete or a delta aginst another.
661 The header is variable length. If the MSB of each byte is set then it
662 indicates that the subsequent byte is still part of the header.
663 For the first byte the next MS bits are the type, which tells you the type
664 of object, and whether it is a delta. The LS byte is the lowest bits of the
665 size. For each subsequent byte the LS 7 bits are the next MS bits of the
666 size, i.e. the last byte of the header contains the MS bits of the size.
668 For the complete objects the data is stored as zlib deflated data.
669 The size in the header is the uncompressed object size, so to uncompress
670 you need to just keep feeding data to zlib until you get an object back,
671 or it errors on bad data. This is done here by just giving the complete
672 buffer from the start of the deflated object on. This is bad, but until I
673 get mmap sorted out it will have to do.
675 Currently there are no integrity checks done. Also no attempt is made to
676 try and detect the delta case, or a request for an object at the wrong
677 position. It will all just throw a zlib or KeyError.
680 def __init__(self, filename, file=None, size=None):
681 """Create a PackData object representing the pack in the given filename.
683 The file must exist and stay readable until the object is disposed of. It
684 must also stay the same size. It will be mapped whenever needed.
686 Currently there is a restriction on the size of the pack as the python
687 mmap implementation is flawed.
689 self._filename = filename
691 self._header_size = 12
693 self._file = GitFile(self._filename, 'rb')
696 (version, self._num_objects) = read_pack_header(self._file.read)
# Resolved objects are cached by offset, bounded to ~20MB of chunk data.
697 self._offset_cache = LRUSizeCache(1024*1024*20,
698 compute_size=_compute_object_size)
# NOTE(review): '@classmethod' decorator lines for the two constructors
# below are elided from this listing.
702 def from_file(cls, file, size):
703 return cls(str(file), file=file, size=size)
706 def from_path(cls, path):
707 return cls(filename=path)
# NOTE(review): the '_get_size' definition line is elided from this listing.
713 if self._size is not None:
715 self._size = os.path.getsize(self._filename)
716 if self._size < self._header_size:
717 errmsg = ("%s is too small for a packfile (%d < %d)" %
718 (self._filename, self._size, self._header_size))
719 raise AssertionError(errmsg)
# NOTE(review): the '__len__' definition line is elided from this listing.
723 """Returns the number of objects in this pack."""
724 return self._num_objects
726 def calculate_checksum(self):
727 """Calculate the checksum for this pack.
729 :return: 20-byte binary SHA1 digest
# Hashes everything except the trailing 20-byte stored checksum.
733 todo = self._get_size() - 20
735 x = self._file.read(min(todo, 1<<16))
740 def get_ref(self, sha):
741 """Get the object for a ref SHA, only looking in this pack."""
742 # TODO: cache these results
743 if self.pack is None:
745 offset = self.pack.index.object_index(sha)
748 type, obj = self.get_object_at(offset)
749 return offset, type, obj
751 def resolve_object(self, offset, type, obj, get_ref=None):
752 """Resolve an object, possibly resolving deltas when necessary.
754 :return: Tuple with object type and contents.
756 if type not in DELTA_TYPES:
760 get_ref = self.get_ref
761 if type == OFS_DELTA:
762 (delta_offset, delta) = obj
763 # TODO: clean up asserts and replace with nicer error messages
764 assert isinstance(offset, int)
765 assert isinstance(delta_offset, int)
766 base_offset = offset-delta_offset
767 type, base_obj = self.get_object_at(base_offset)
768 assert isinstance(type, int)
769 elif type == REF_DELTA:
770 (basename, delta) = obj
771 assert isinstance(basename, str) and len(basename) == 20
772 base_offset, type, base_obj = get_ref(basename)
773 assert isinstance(type, int)
# Recursively resolve the base, then apply this delta on top of it.
774 type, base_chunks = self.resolve_object(base_offset, type, base_obj)
775 chunks = apply_delta(base_chunks, delta)
776 # TODO(dborowitz): This can result in poor performance if large base
777 # objects are separated from deltas in the pack. We should reorganize
778 # so that we apply deltas to all objects in a chain one after the other
779 # to optimize cache performance.
780 if offset is not None:
781 self._offset_cache[offset] = type, chunks
784 def iterobjects(self, progress=None):
785 return PackObjectIterator(self, progress)
787 def iterentries(self, progress=None):
788 """Yield entries summarizing the contents of this pack.
790 :param progress: Progress function, called with current and total object
792 :yields: tuples with (sha, offset, crc32)
794 for offset, type, obj, crc32 in self.iterobjects(progress=progress):
795 assert isinstance(offset, int)
796 assert isinstance(type, int)
797 assert isinstance(obj, list) or isinstance(obj, tuple)
798 type, obj = self.resolve_object(offset, type, obj)
799 yield obj_sha(type, obj), offset, crc32
801 def sorted_entries(self, progress=None):
802 """Return entries in this pack, sorted by SHA.
804 :param progress: Progress function, called with current and total object
806 :return: List of tuples with (sha, offset, crc32)
808 ret = list(self.iterentries(progress=progress))
812 def create_index_v1(self, filename, progress=None):
813 """Create a version 1 file for this data file.
815 :param filename: Index filename.
816 :param progress: Progress report function
817 :return: Checksum of index file
819 entries = self.sorted_entries(progress=progress)
820 f = GitFile(filename, 'wb')
822 return write_pack_index_v1(f, entries, self.calculate_checksum())
826 def create_index_v2(self, filename, progress=None):
827 """Create a version 2 index file for this data file.
829 :param filename: Index filename.
830 :param progress: Progress report function
831 :return: Checksum of index file
833 entries = self.sorted_entries(progress=progress)
834 f = GitFile(filename, 'wb')
836 return write_pack_index_v2(f, entries, self.calculate_checksum())
840 def create_index(self, filename, progress=None,
842 """Create an index file for this data file.
844 :param filename: Index filename.
845 :param progress: Progress report function
846 :return: Checksum of index file
849 return self.create_index_v1(filename, progress)
851 return self.create_index_v2(filename, progress)
853 raise ValueError("unknown index format %d" % version)
855 def get_stored_checksum(self):
856 """Return the expected checksum stored in this pack."""
857 self._file.seek(self._get_size()-20)
858 return self._file.read(20)
# NOTE(review): the 'def check(self):' line is elided from this listing.
861 """Check the consistency of this pack."""
862 actual = self.calculate_checksum()
863 stored = self.get_stored_checksum()
865 raise ChecksumMismatch(stored, actual)
867 def get_object_at(self, offset):
868 """Given an offset in to the packfile return the object that is there.
870 Using the associated index the location of an object can be looked up,
871 and then the packfile can be asked directly for that object using this
874 if offset in self._offset_cache:
875 return self._offset_cache[offset]
876 assert isinstance(offset, long) or isinstance(offset, int),\
877 "offset was %r" % offset
878 assert offset >= self._header_size
879 self._file.seek(offset)
880 return unpack_object(self._file.read)[:2]
883 class ThinPackData(PackData):
884 """PackData for thin packs, which require an ObjectStore for resolving."""
886 def __init__(self, resolve_ext_ref, *args, **kwargs):
887 super(ThinPackData, self).__init__(*args, **kwargs)
888 self.resolve_ext_ref = resolve_ext_ref
890 def get_ref(self, sha):
891 """Resolve a reference looking in both this pack and the store."""
893 # As part of completing a pack we create a Pack object with a
894 # ThinPackData and a full PackIndex, so check in the index first if
896 # TODO(dborowitz): reevaluate this when the pack completion code is
898 return super(ThinPackData, self).get_ref(sha)
900 type, obj = self.resolve_ext_ref(sha)
901 return None, type, obj
903 def iterentries(self, progress=None):
904 """Yield entries summarizing the contents of this pack.
906 :param progress: Progress function, called with current and
909 This will yield tuples with (sha, offset, crc32)
912 postponed = defaultdict(list)
914 class Postpone(Exception):
915 """Raised to postpone delta resolving."""
917 def __init__(self, sha):
# Helper used as the get_ref callback during resolution; raises Postpone
# when a ref-delta's base hasn't been seen yet.
920 def get_ref_text(sha):
921 assert len(sha) == 20
924 type, obj = self.get_object_at(offset)
925 return offset, type, obj
927 return self.get_ref(sha)
932 todo = chain(self.iterobjects(progress=progress), extra)
933 for (offset, type, obj, crc32) in todo:
934 assert isinstance(offset, int)
936 # Inflate postponed delta
# NOTE(review): get_object_at returns (type, obj); the unpacking below
# looks swapped — confirm against the original source.
937 obj, type = self.get_object_at(offset)
938 assert isinstance(type, int)
939 assert isinstance(obj, list) or isinstance(obj, tuple)
941 type, obj = self.resolve_object(offset, type, obj, get_ref_text)
943 # Save memory by not storing the inflated obj in postponed
944 postponed[e.sha].append((offset, type, None, crc32))
946 sha = obj_sha(type, obj)
948 yield sha, offset, crc32
# Any deltas waiting on this sha can now be retried.
949 extra.extend(postponed.pop(sha, []))
951 raise KeyError([sha_to_hex(h) for h in postponed.keys()])
954 class SHA1Reader(object):
955 """Wrapper around a file-like object that remembers the SHA1 of its data."""
957 def __init__(self, f):
# NOTE(review): the 'self.f = f' line is elided from this listing.
959 self.sha1 = make_sha("")
961 def read(self, num=None):
962 data = self.f.read(num)
963 self.sha1.update(data)
# NOTE(review): the 'check_sha' definition line and 'return data' are
# elided from this listing.
967 stored = self.f.read(20)
968 if stored != self.sha1.digest():
969 raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
972 return self.f.close()
978 class SHA1Writer(object):
979 """Wrapper around a file-like object that remembers the SHA1 of its data."""
981 def __init__(self, f):
# NOTE(review): the 'self.f = f' line is elided from this listing.
983 self.sha1 = make_sha("")
985 def write(self, data):
# Every byte written is folded into the running SHA1.
986 self.sha1.update(data)
# NOTE(review): the 'write_sha'/'close'/'tell' definition lines are
# elided from this listing.
990 sha = self.sha1.digest()
991 assert len(sha) == 20
996 sha = self.write_sha()
1001 return self.f.tell()
def write_pack_object(f, type, object):
    """Write pack object to a file.

    :param f: File to write to
    :param type: Numeric type of the object
    :param object: Object to write
    :return: Tuple with offset at which the object was written, and crc32
    """
    offset = f.tell()
    packed_data_hdr = ""
    if type == OFS_DELTA:
        (delta_base_offset, object) = object
    elif type == REF_DELTA:
        (basename, object) = object
    size = len(object)
    # Variable-length header: low 4 bits of size share the first byte with
    # the type; remaining size bits follow 7 per byte, MSB as continuation.
    c = (type << 4) | (size & 15)
    size >>= 4
    while size:
        packed_data_hdr += (chr(c | 0x80))
        c = size & 0x7f
        size >>= 7
    packed_data_hdr += chr(c)
    if type == OFS_DELTA:
        # Base offset is encoded big-endian, 7 bits per byte, with the
        # "+1 per continuation" bias used by the pack format.
        ret = [delta_base_offset & 0x7f]
        delta_base_offset >>= 7
        while delta_base_offset:
            delta_base_offset -= 1
            ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
            delta_base_offset >>= 7
        packed_data_hdr += "".join([chr(x) for x in ret])
    elif type == REF_DELTA:
        assert len(basename) == 20
        packed_data_hdr += basename
    packed_data = packed_data_hdr + zlib.compress(object)
    f.write(packed_data)
    return (offset, (zlib.crc32(packed_data) & 0xffffffff))
def write_pack(filename, objects, num_objects):
    """Write a new pack data file.

    :param filename: Path to the new pack file (without .pack extension)
    :param objects: Iterable over (object, path) tuples to write
    :param num_objects: Number of objects to write
    :return: Tuple with checksum of pack file and index file
    """
    f = GitFile(filename + ".pack", 'wb')
    try:
        entries, data_sum = write_pack_data(f, objects, num_objects)
    finally:
        f.close()
    # Index entries must be sorted by SHA before writing the index.
    entries.sort()
    f = GitFile(filename + ".idx", 'wb')
    try:
        return data_sum, write_pack_index_v2(f, entries, data_sum)
    finally:
        f.close()
1063 def write_pack_data(f, objects, num_objects, window=10):
1064 """Write a new pack file.
1066 :param filename: The filename of the new pack file.
1067 :param objects: List of objects to write (tuples with object and path)
1068 :return: List with (name, offset, crc32 checksum) entries, pack checksum
1070 recency = list(objects)
1071 # FIXME: Somehow limit delta depth
1072 # FIXME: Make thin-pack optional (its not used when cloning a pack)
1073 # Build a list of objects ordered by the magic Linus heuristic
1074 # This helps us find good objects to diff against us
# NOTE(review): the 'magic = []' initialization line is elided from this
# listing.
1076 for obj, path in recency:
1077 magic.append( (obj.type_num, path, 1, -obj.raw_length(), obj) )
1079 # Build a map of objects and their index in magic - so we can find
1080 # preceeding objects to diff against
# NOTE(review): the 'offs = {}' initialization line is elided from this
# listing.
1082 for i in range(len(magic)):
1083 offs[magic[i][4]] = i
# NOTE(review): the 'entries = []' and 'f = SHA1Writer(f)' lines are
# elided from this listing; f.write_sha() at the end requires the wrapper.
1087 f.write("PACK") # Pack header
1088 f.write(struct.pack(">L", 2)) # Pack version
1089 f.write(struct.pack(">L", num_objects)) # Number of objects in pack
1090 for o, path in recency:
1091 sha1 = o.sha().digest()
1093 raw = o.as_raw_string()
# The delta-compression search below is disabled (commented out); every
# object is written whole ('winner' is the raw object).
1096 #for i in range(offs[o]-window, window):
1097 # if i < 0 or i >= len(offs): continue
1099 # if b.type_num != orig_t: continue
1100 # base = b.as_raw_string()
1101 # delta = create_delta(base, raw)
1102 # if len(delta) < len(winner):
1104 # t = 6 if magic[i][2] == 1 else 7
1105 offset, crc32 = write_pack_object(f, t, winner)
1106 entries.append((sha1, offset, crc32))
1107 return entries, f.write_sha()
def write_pack_index_v1(f, entries, pack_checksum):
    """Write a new pack index file.

    :param f: A file-like object to write to
    :param entries: List of tuples with object name (sha), offset_in_pack,
        and crc32_checksum.
    :param pack_checksum: Checksum of the pack file.
    :return: The SHA of the written index file
    """
    # Wrap the output so the trailing index checksum can be computed.
    f = SHA1Writer(f)
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[0])] += 1
    # Fan-out table: cumulative counts of entries per leading SHA byte.
    for i in range(0x100):
        f.write(struct.pack(">L", fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    for (name, offset, entry_checksum) in entries:
        f.write(struct.pack(">L20s", offset, name))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    return f.write_sha()
1134 def create_delta(base_buf, target_buf):
1135 """Use python difflib to work out how to transform base_buf to target_buf.
1137 :param base_buf: Base buffer
1138 :param target_buf: Target buffer
1140 assert isinstance(base_buf, str)
1141 assert isinstance(target_buf, str)
# NOTE(review): the 'out_buf = ""' initialization line is elided from this
# listing.
1143 # write delta header
# Encodes a size as 7-bits-per-byte little-endian, MSB as continuation.
1144 def encode_size(size):
1149 ret += chr(c | 0x80)
1154 out_buf += encode_size(len(base_buf))
1155 out_buf += encode_size(len(target_buf))
1156 # write out delta opcodes
1157 seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
1158 for opcode, i1, i2, j1, j2 in seq.get_opcodes():
1159 # Git patch opcodes don't care about deletes!
1160 #if opcode == "replace" or opcode == "delete":
1162 if opcode == "equal":
1163 # If they are equal, unpacker will use data from base_buf
1164 # Write out an opcode that says what range to use
# Copy opcode: emit only the non-zero bytes of offset (o) and size (s),
# with flag bits in the opcode byte marking which bytes are present.
1170 scratch += chr((o >> i*8) & 0xff)
1175 scratch += chr((s >> i*8) & 0xff)
1179 if opcode == "replace" or opcode == "insert":
1180 # If we are replacing a range or adding one, then we just
1181 # output it to the stream (prefixed by its size)
# Insert opcodes carry at most 127 bytes of literal data each.
1186 out_buf += target_buf[o:o+127]
1190 out_buf += target_buf[o:o+s]
def apply_delta(src_buf, delta):
    """Based on the similar function in git's patch-delta.c.

    :param src_buf: Source buffer
    :param delta: Delta instructions
    :return: List of chunks making up the patched result
    """
    if type(src_buf) != str:
        src_buf = "".join(src_buf)
    if type(delta) != str:
        delta = "".join(delta)
    out = []
    index = 0
    delta_length = len(delta)
    def get_delta_header_size(delta, index):
        # Sizes are 7 bits per byte, little-endian, MSB as continuation.
        size = 0
        i = 0
        while delta:
            cmd = ord(delta[index])
            index += 1
            size |= (cmd & ~0x80) << i
            i += 7
            if not cmd & 0x80:
                break
        return size, index
    src_size, index = get_delta_header_size(delta, index)
    dest_size, index = get_delta_header_size(delta, index)
    assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
    while index < delta_length:
        cmd = ord(delta[index])
        index += 1
        if cmd & 0x80:
            # Copy opcode: low 4 flag bits select offset bytes, next 3
            # select size bytes; absent bytes are zero.
            cp_off = 0
            for i in range(4):
                if cmd & (1 << i):
                    x = ord(delta[index])
                    index += 1
                    cp_off |= x << (i * 8)
            cp_size = 0
            for i in range(3):
                if cmd & (1 << (4+i)):
                    x = ord(delta[index])
                    index += 1
                    cp_size |= x << (i * 8)
            if cp_size == 0:
                cp_size = 0x10000
            if (cp_off + cp_size < cp_size or
                cp_off + cp_size > src_size or
                cp_size > dest_size):
                break
            out.append(src_buf[cp_off:cp_off+cp_size])
        elif cmd != 0:
            # Insert opcode: the next cmd bytes are literal data.
            out.append(delta[index:index+cmd])
            index += cmd
        else:
            raise ApplyDeltaError("Invalid opcode 0")

    if index != delta_length:
        raise ApplyDeltaError("delta not empty: %r" % delta[index:])

    if dest_size != chunks_length(out):
        raise ApplyDeltaError("dest size incorrect")

    return out
def write_pack_index_v2(f, entries, pack_checksum):
    """Write a new pack index file, version 2 format.

    :param f: File-like object to write to
    :param entries: List of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
    :param pack_checksum: Checksum of the pack file.
    :return: The SHA of the index file written
    """
    # Wrap the stream so a trailing SHA-1 over everything written can be
    # appended by write_sha() at the end.
    f = SHA1Writer(f)
    f.write('\377tOc') # Magic!
    f.write(struct.pack(">L", 2))
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[0])] += 1
    # Fan-out table: entry i holds the cumulative count of objects whose
    # first SHA byte is <= i.
    for i in range(0x100):
        f.write(struct.pack(">L", fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    # v2 stores the SHAs, CRC32s and offsets in three separate tables.
    for (name, offset, entry_checksum) in entries:
        f.write(name)
    for (name, offset, entry_checksum) in entries:
        f.write(struct.pack(">L", entry_checksum))
    for (name, offset, entry_checksum) in entries:
        # FIXME: handle if MSBit is set in offset
        f.write(struct.pack(">L", offset))
    # FIXME: handle table for pack files > 8 Gb
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    return f.write_sha()
class Pack(object):
    """A Git pack object: a pack data file plus its index."""

    def __init__(self, basename):
        # Paths are derived from the shared basename; actual loading of
        # the data and index files is deferred until first access.
        self._basename = basename
        self._data = None
        self._idx = None
        self._idx_path = self._basename + ".idx"
        self._data_path = self._basename + ".pack"
        self._data_load = lambda: PackData(self._data_path)
        self._idx_load = lambda: load_pack_index(self._idx_path)

    @classmethod
    def from_lazy_objects(self, data_fn, idx_fn):
        """Create a new pack object from callables to load pack data and
        index objects."""
        ret = Pack("")
        ret._data_load = data_fn
        ret._idx_load = idx_fn
        return ret

    @classmethod
    def from_objects(self, data, idx):
        """Create a new pack object from pack data and index objects."""
        ret = Pack("")
        ret._data_load = lambda: data
        ret._idx_load = lambda: idx
        return ret

    def name(self):
        """The SHA over the SHAs of the objects in this pack."""
        return self.index.objects_sha1()

    @property
    def data(self):
        """The pack data object being used."""
        if self._data is None:
            self._data = self._data_load()
            self._data.pack = self
            assert len(self.index) == len(self._data)
            # Index and data must refer to the same pack.
            idx_stored_checksum = self.index.get_pack_checksum()
            data_stored_checksum = self._data.get_stored_checksum()
            if idx_stored_checksum != data_stored_checksum:
                raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
                                       sha_to_hex(data_stored_checksum))
        return self._data

    @property
    def index(self):
        """The index being used.

        :note: This may be an in-memory index
        """
        if self._idx is None:
            self._idx = self._idx_load()
        return self._idx

    def close(self):
        # Only the data file holds an open handle; the index may be
        # memory-mapped or in-memory and needs no explicit close here.
        if self._data is not None:
            self._data.close()

    def __eq__(self, other):
        return type(self) == type(other) and self.index == other.index

    def __len__(self):
        """Number of entries in this pack."""
        return len(self.index)

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self._basename)

    def __iter__(self):
        """Iterate over all the sha1s of the objects in this pack."""
        return iter(self.index)

    def check(self):
        """Check the integrity of this pack.

        :raise ChecksumMismatch: if a checksum for the index or data is wrong
        """
        self.index.check()
        self.data.check()
        for obj in self.iterobjects():
            obj.check()
        # TODO: object connectivity checks

    def get_stored_checksum(self):
        """Return the checksum stored at the end of the pack data file."""
        return self.data.get_stored_checksum()

    def __contains__(self, sha1):
        """Check whether this pack contains a particular SHA1."""
        try:
            self.index.object_index(sha1)
            return True
        except KeyError:
            return False

    def get_raw(self, sha1):
        """Return the raw (type number, data string) for a SHA1."""
        offset = self.index.object_index(sha1)
        obj_type, obj = self.data.get_object_at(offset)
        if type(offset) is long:
            offset = int(offset)
        type_num, chunks = self.data.resolve_object(offset, obj_type, obj)
        return type_num, "".join(chunks)

    def __getitem__(self, sha1):
        """Retrieve the specified SHA1."""
        type, uncomp = self.get_raw(sha1)
        return ShaFile.from_raw_string(type, uncomp)

    def iterobjects(self):
        """Iterate over the objects in this pack."""
        for offset, type, obj, crc32 in self.data.iterobjects():
            assert isinstance(offset, int)
            yield ShaFile.from_raw_chunks(
                *self.data.resolve_object(offset, type, obj))
1411 from dulwich._pack import apply_delta, bisect_find_sha