1 # pack.py -- For dealing with packed git objects.
2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3 # Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; version 2
8 # of the License or (at your option) a later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
20 """Classes for dealing with packed git objects.
22 A pack is a compact representation of a bunch of objects, stored
23 using deltas where possible.
25 They have two parts, the pack file, which stores the data, and an index
26 that tells you where the data is.
28 To find an object you look in all of the index files 'til you find a
29 match for the object name. You then use the pointer got from this as
30 a pointer in to the corresponding packfile.
34 from collections import defaultdict
36 from misc import defaultdict
38 from cStringIO import (
41 from collections import (
45 from itertools import (
54 from struct import unpack_from
56 from dulwich.misc import unpack_from
60 from dulwich.errors import (
64 from dulwich.file import GitFile
65 from dulwich.lru_cache import (
68 from dulwich.misc import (
72 from dulwich.objects import (
79 supports_mmap_offset = (sys.version_info[0] >= 3 or
80 (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
86 DELTA_TYPES = (OFS_DELTA, REF_DELTA)
def take_msb_bytes(read):
    """Read bytes marked with most significant bit.

    :param read: Read function returning one byte per call
    :return: List of integer byte values; every byte but the last has the
        MSB (0x80) set, i.e. this reads one MSB-continued varint's raw bytes.
    """
    ret = []
    # Keep pulling single bytes until one arrives with the high bit clear.
    while len(ret) == 0 or ret[-1] & 0x80:
        ret.append(ord(read(1)))
    return ret
def read_zlib_chunks(read_some, dec_size, buffer_size=4096):
    """Read zlib data from a buffer.

    This function requires that the buffer have additional data following the
    compressed data, which is guaranteed to be the case for git pack files.

    :param read_some: Read function that returns at least one byte, but may
        return less than the requested size
    :param dec_size: Expected size of the decompressed buffer
    :param buffer_size: Size of the read buffer
    :return: Tuple with list of chunks, length of compressed data length and
        and unused read data.
    :raise zlib.error: if a decompression error occurred.
    """
    if dec_size <= -1:
        raise ValueError("non-negative zlib data stream size expected")
    obj = zlib.decompressobj()
    ret = []
    fed = 0
    size = 0
    # Decompress until zlib reports trailing (unused) data, which marks the
    # end of the deflate stream.  Truth-test rather than compare against ""
    # so the check holds for both str (py2) and bytes input.
    while not obj.unused_data:
        add = read_some(buffer_size)
        if not add:
            raise zlib.error("EOF before end of zlib stream")
        fed += len(add)
        decomp = obj.decompress(add)
        size += len(decomp)
        ret.append(decomp)
    if size != dec_size:
        raise zlib.error("decompressed data does not match expected size")
    # Everything we fed minus what zlib left untouched is the compressed size.
    comp_len = fed - len(obj.unused_data)
    return ret, comp_len, obj.unused_data
def iter_sha1(iter):
    """Return the hexdigest of the SHA1 over a set of names.

    :param iter: Iterator over string objects
    :return: 40-byte hex sha1 digest
    """
    sha1 = make_sha()
    # Hash every name in iteration order; used e.g. for pack filenames.
    for name in iter:
        sha1.update(name)
    return sha1.hexdigest()
def load_pack_index(path):
    """Load an index file by path.

    :param filename: Path to the index file
    :return: A PackIndex loaded from the given path
    """
    f = GitFile(path, 'rb')
    try:
        return load_pack_index_file(path, f)
    finally:
        # The index constructors read/mmap the contents up front, so the
        # handle must not be leaked once loading is done.
        f.close()
def _load_file_contents(f, size=None):
    # Return (contents, size) for an open file, preferring a read-only mmap
    # over slurping the file into memory when a real fd is available.
    fileno = getattr(f, 'fileno', None)
    # Attempt to use mmap if possible
    if fileno is not None:
        # NOTE(review): `fd` is presumably f.fileno() from an elided line,
        # and the mmap call was presumably wrapped in a try/except fallback
        # -- confirm against history.
        size = os.fstat(fd).st_size
        contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
        return contents, size
    # NOTE(review): fallback `contents`/`size` come from elided f.read() lines.
    return contents, size
def load_pack_index_file(path, f):
    """Load an index file from a file-like object.

    :param path: Path for the index file
    :param f: File-like object
    :return: A PackIndex loaded from the given file
    """
    contents, size = _load_file_contents(f)
    # A v2 index starts with the '\377tOc' magic; v1 has no marker at all.
    if contents[:4] == '\377tOc':
        version = struct.unpack(">L", contents[4:8])[0]
        # NOTE(review): the `if version == 2:` dispatch appears elided here,
        # and the PackIndex2 call below is missing its size=... continuation.
            return PackIndex2(path, file=f, contents=contents,
        raise KeyError("Unknown pack index format %d" % version)
    # No magic marker: treat as a version 1 index.
    return PackIndex1(path, file=f, contents=contents, size=size)
def bisect_find_sha(start, end, sha, unpack_name):
    """Find a SHA in a data blob with sorted SHAs.

    :param start: Start index of range to search
    :param end: End index of range to search
    :param sha: Sha to find
    :param unpack_name: Callback to retrieve SHA by index
    :return: Index of the SHA, or None if it wasn't found
    """
    assert start <= end
    while start < end:
        i = (start + end) // 2
        file_sha = unpack_name(i)
        # Direct comparisons instead of the py2-only cmp(); same ordering.
        if file_sha < sha:
            start = i + 1
        elif file_sha > sha:
            end = i
        else:
            return i
    return None
class PackIndex(object):
    """An index in to a packfile.

    Given a sha id of an object a pack index can tell you the location in the
    packfile of that object if it has it.

    To do the loop it opens the file, and indexes first 256 4 byte groups
    with the first byte of the sha id. The value in the four byte group indexed
    is the end of the group that shares the same starting byte. Subtract one
    from the starting byte and index again to find the start of the group.
    The values are sorted by sha id within the group, so do the math to find
    the start and end offset and then bisect in to find if the value is present.
    """

    def __init__(self, filename, file=None, contents=None, size=None):
        """Create a pack index object.

        Provide it with the name of the index file to consider, and it will map
        it whenever required.
        """
        self._filename = filename
        # Take the size now, so it can be checked each time we map the file to
        # ensure that it hasn't changed.
        # NOTE(review): the three statements below look like alternative
        # branches of an elided if/elif (open from disk, load from `file`,
        # or adopt supplied `contents`/`size`) -- confirm against history.
        self._file = GitFile(filename, 'rb')
        self._contents, self._size = _load_file_contents(file, size)
        self._contents, self._size = (contents, size)

    def __eq__(self, other):
        # Equality is based on listing identical entries in identical order;
        # crc32 checksums are deliberately not compared.
        if not isinstance(other, PackIndex):
        if self._fan_out_table != other._fan_out_table:
        for (name1, _, _), (name2, _, _) in izip(self.iterentries(),
                                                 other.iterentries()):

    def __ne__(self, other):
        return not self.__eq__(other)

        # NOTE(review): body of the elided __len__().
        """Return the number of entries in this pack index."""
        return self._fan_out_table[-1]

    def _unpack_entry(self, i):
        """Unpack the i-th entry in the index file.

        :return: Tuple with object name (SHA), offset in pack file and CRC32
        """
        raise NotImplementedError(self._unpack_entry)

    def _unpack_name(self, i):
        """Unpack the i-th name from the index file."""
        raise NotImplementedError(self._unpack_name)

    def _unpack_offset(self, i):
        """Unpack the i-th object offset from the index file."""
        raise NotImplementedError(self._unpack_offset)

    def _unpack_crc32_checksum(self, i):
        """Unpack the crc32 checksum for the i-th object from the index file."""
        raise NotImplementedError(self._unpack_crc32_checksum)

        # NOTE(review): body of the elided __iter__().
        """Iterate over the SHAs in this pack."""
        return imap(sha_to_hex, self._itersha())

        # NOTE(review): body of the elided _itersha() generator.
        for i in range(len(self)):
            yield self._unpack_name(i)

    def objects_sha1(self):
        """Return the hex SHA1 over all the shas of all objects in this pack.

        :note: This is used for the filename of the pack.
        """
        return iter_sha1(self._itersha())

    def iterentries(self):
        """Iterate over the entries in this pack index.

        :yields: tuples with object name, offset in packfile and crc32 checksum.
        """
        for i in range(len(self)):
            yield self._unpack_entry(i)

    def _read_fan_out_table(self, start_offset):
        # 256 big-endian uint32 cumulative counts, one per first-byte value.
        # NOTE(review): `ret` initialization and return appear elided.
        for i in range(0x100):
            fanout_entry = self._contents[start_offset+i*4:start_offset+(i+1)*4]
            ret.append(struct.unpack(">L", fanout_entry)[0])

        # NOTE(review): body of the elided check() method.
        """Check that the stored checksum matches the actual checksum."""
        actual = self.calculate_checksum()
        stored = self.get_stored_checksum()
        # NOTE(review): presumably raised only when actual != stored.
        raise ChecksumMismatch(stored, actual)

    def calculate_checksum(self):
        """Calculate the SHA1 checksum over this pack index.

        :return: This is a 20-byte binary digest
        """
        # Everything except the trailing 20-byte stored checksum is covered.
        return make_sha(self._contents[:-20]).digest()

    def get_pack_checksum(self):
        """Return the SHA1 checksum stored for the corresponding packfile.

        :return: 20-byte binary digest
        """
        return str(self._contents[-40:-20])

    def get_stored_checksum(self):
        """Return the SHA1 checksum stored for this index.

        :return: 20-byte binary digest
        """
        return str(self._contents[-20:])

    def object_index(self, sha):
        """Return the index in to the corresponding packfile for the object.

        Given the name of an object it will return the offset that object
        lives at within the corresponding pack file. If the pack file doesn't
        have the object then None will be returned.
        """
        # NOTE(review): presumably guarded by an elided `if len(sha) == 40:`
        # so both hex and binary shas are accepted.
            sha = hex_to_sha(sha)
        return self._object_index(sha)

    def _object_index(self, sha):
        """
        :param sha: A *binary* SHA string. (20 characters long)_
        """
        assert len(sha) == 20
        # NOTE(review): `idx` is presumably ord(sha[0]) from an elided line;
        # the fan-out table bounds the bisection to shas sharing that byte.
        start = self._fan_out_table[idx-1]
        end = self._fan_out_table[idx]
        i = bisect_find_sha(start, end, sha, self._unpack_name)
        return self._unpack_offset(i)
class PackIndex1(PackIndex):
    """Version 1 Pack Index."""

    def __init__(self, filename, file=None, contents=None, size=None):
        PackIndex.__init__(self, filename, file, contents, size)
        # v1 indexes start directly with the fan-out table at offset 0.
        self._fan_out_table = self._read_fan_out_table(0)

    def _unpack_entry(self, i):
        # Each v1 entry is 24 bytes -- a 4-byte big-endian offset followed by
        # the 20-byte sha -- located right after the 256-entry fan-out table.
        (offset, name) = unpack_from(">L20s", self._contents,
                                     (0x100 * 4) + (i * 24))
        # v1 stores no per-entry crc32, hence None.
        return (name, offset, None)

    def _unpack_name(self, i):
        # Skip the 4-byte offset to land on the sha.
        offset = (0x100 * 4) + (i * 24) + 4
        return self._contents[offset:offset+20]

    def _unpack_offset(self, i):
        offset = (0x100 * 4) + (i * 24)
        return unpack_from(">L", self._contents, offset)[0]

    def _unpack_crc32_checksum(self, i):
        # Not stored in v1 index files
class PackIndex2(PackIndex):
    """Version 2 Pack Index."""

    def __init__(self, filename, file=None, contents=None, size=None):
        PackIndex.__init__(self, filename, file, contents, size)
        # v2 indexes begin with a 4-byte magic and a 4-byte version number.
        assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
        (self.version, ) = unpack_from(">L", self._contents, 4)
        assert self.version == 2, "Version was %d" % self.version
        self._fan_out_table = self._read_fan_out_table(8)
        # Fixed v2 layout after the fan-out table: name table, then crc32
        # table, then the 32-bit pack-offset table.
        self._name_table_offset = 8 + 0x100 * 4
        self._crc32_table_offset = self._name_table_offset + 20 * len(self)
        # NOTE(review): continuation (`4 * len(self))`) appears elided.
        self._pack_offset_table_offset = (self._crc32_table_offset +

    def _unpack_entry(self, i):
        return (self._unpack_name(i), self._unpack_offset(i),
                self._unpack_crc32_checksum(i))

    def _unpack_name(self, i):
        offset = self._name_table_offset + i * 20
        return self._contents[offset:offset+20]

    def _unpack_offset(self, i):
        offset = self._pack_offset_table_offset + i * 4
        return unpack_from(">L", self._contents, offset)[0]

    def _unpack_crc32_checksum(self, i):
        return unpack_from(">L", self._contents,
                           self._crc32_table_offset + i * 4)[0]
def read_pack_header(read):
    """Read the header of a pack file.

    :param read: Read function
    :return: Tuple with pack version and number of objects
    """
    # Fixed 12-byte header: 4-byte magic, 4-byte version, 4-byte object count.
    header = read(12)
    # b'' literal so the comparison also holds for bytes input (identical to
    # a plain str literal under python 2.6+).
    assert header[:4] == b"PACK"
    (version,) = unpack_from(">L", header, 4)
    assert version in (2, 3), "Version was %d" % version
    (num_objects,) = unpack_from(">L", header, 8)
    return (version, num_objects)
def chunks_length(chunks):
    """Return the total number of bytes in a sequence of chunks.

    :param chunks: Iterable of strings/bytes chunks
    :return: Sum of the chunk lengths
    """
    # Builtin map instead of itertools.imap: identical result, without
    # depending on the py2-only imap name.
    return sum(map(len, chunks))
def unpack_object(read_all, read_some=None):
    """Unpack a Git object.

    :param read_all: Read function that blocks until the number of requested
        bytes is read
    :param read_some: Read function that returns at least one byte, but may not
        return the number of bytes requested.
    :return: tuple with type, uncompressed data, compressed size and tail data.
    """
    # NOTE(review): fallback `read_some = read_all` appears elided.
    if read_some is None:
    # Object header is an MSB-continued varint: bits 4-6 of the first byte
    # are the type, its low nibble the low bits of the size.
    bytes = take_msb_bytes(read_all)
    type = (bytes[0] >> 4) & 0x07
    size = bytes[0] & 0x0f
    for i, byte in enumerate(bytes[1:]):
        size += (byte & 0x7f) << ((i * 7) + 4)
    # Track how many raw (pre-zlib) header bytes were consumed.
    raw_base = len(bytes)
    if type == OFS_DELTA:
        # A second varint encodes the negative offset to the delta base.
        bytes = take_msb_bytes(read_all)
        raw_base += len(bytes)
        assert not (bytes[-1] & 0x80)
        delta_base_offset = bytes[0] & 0x7f
        for byte in bytes[1:]:
            # The +1 per continuation byte removes redundant encodings.
            delta_base_offset += 1
            delta_base_offset <<= 7
            delta_base_offset += (byte & 0x7f)
        uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
        assert size == chunks_length(uncomp)
        return type, (delta_base_offset, uncomp), comp_len+raw_base, unused
    elif type == REF_DELTA:
        # Ref deltas name their base by its 20-byte binary sha.
        basename = read_all(20)
        uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
        assert size == chunks_length(uncomp)
        return type, (basename, uncomp), comp_len+raw_base, unused
    # Non-delta object: payload is just the deflated content.
    uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
    assert chunks_length(uncomp) == size
    return type, uncomp, comp_len+raw_base, unused
# NOTE: tuple parameter unpacking in the signature is python 2 only.
def _compute_object_size((num, obj)):
    """Compute the size of a unresolved object for use with LRUSizeCache."""
    # Delta objects are stored as (base, chunks) pairs; plain objects are
    # just their chunk list.
    if num in DELTA_TYPES:
        return chunks_length(obj[1])
    return chunks_length(obj)
class PackStreamReader(object):
    """Class to read a pack stream.

    The pack is read from a ReceivableProtocol using read() or recv() as
    appropriate.
    """

    def __init__(self, read_all, read_some=None):
        self.read_all = read_all
        if read_some is None:
            self.read_some = read_all
        # NOTE(review): presumably the else-branch of the test above.
        self.read_some = read_some
        self.sha = make_sha()
        self._rbuf = StringIO()
        # trailer is a deque to avoid memory allocation on small reads
        self._trailer = deque()

    def _read(self, read, size):
        """Read up to size bytes using the given callback.

        As a side effect, update the verifier's hash (excluding the last 20
        bytes read) and write through to the output file.

        :param read: The read callback to read from.
        :param size: The maximum number of bytes to read; the particular
            behavior is callback-specific.
        """
        # maintain a trailer of the last 20 bytes we've read
        # NOTE(review): `data`, `n` and `to_add` come from elided lines.
        tn = len(self._trailer)
        to_pop = max(n + tn - 20, 0)
        # Bytes rolling out of the 20-byte window are definitely not the
        # pack trailer, so they can be folded into the running checksum.
        for _ in xrange(to_pop):
            self.sha.update(self._trailer.popleft())
        self._trailer.extend(data[-to_add:])

        # hash everything but the trailer
        self.sha.update(data[:-to_add])

        # NOTE(review): lines from elided _buf_len()/offset helpers.
        buf.seek(0, SEEK_END)
        return self._offset - self._buf_len()

    def read(self, size):
        """Read, blocking until size bytes are read."""
        buf_len = self._buf_len()
        # NOTE(review): presumably guarded by `if buf_len >= size:`.
            return self._rbuf.read(size)
        # Drain the internal buffer, then fetch the rest from the stream.
        buf_data = self._rbuf.read()
        self._rbuf = StringIO()
        return buf_data + self._read(self.read_all, size - buf_len)

    def recv(self, size):
        """Read up to size bytes, blocking until one byte is read."""
        buf_len = self._buf_len()
        data = self._rbuf.read(size)
        self._rbuf = StringIO()
        return self._read(self.read_some, size)

        # NOTE(review): body of the elided __len__().
        return self._num_objects

    def read_objects(self):
        """Read the objects in this pack file.

        :raise AssertionError: if there is an error in the pack format.
        :raise ChecksumMismatch: if the checksum of the pack contents does not
            match the checksum in the pack trailer.
        :raise zlib.error: if an error occurred during zlib decompression.
        :raise IOError: if an error occurred writing to the output file.
        """
        pack_version, self._num_objects = read_pack_header(self.read)
        for i in xrange(self._num_objects):
            type, uncomp, comp_len, unused = unpack_object(self.read, self.recv)
            yield type, uncomp, comp_len

            # prepend any unused data to current read buffer
            buf.write(self._rbuf.read())

        # The trailer deque now holds the pack's stored checksum; compare it
        # to what was hashed along the way.
        pack_sha = sha_to_hex(''.join([c for c in self._trailer]))
        calculated_sha = self.sha.hexdigest()
        if pack_sha != calculated_sha:
            raise ChecksumMismatch(pack_sha, calculated_sha)
class PackObjectIterator(object):
    # Sequentially yields (offset, type, obj, crc32) tuples by walking a
    # PackData's underlying file from just past the header.

    def __init__(self, pack, progress=None):
        self.offset = pack._header_size
        self.map = pack._file
        self._progress = progress

        # NOTE(review): the lines below form the body of an elided next().
        if self.i == self.num:
        self.map.seek(self.offset)
        (type, obj, total_size, unused) = unpack_object(self.map.read)
        # Re-read the record's raw bytes to compute its crc32.
        self.map.seek(self.offset)
        crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff
        ret = (self.offset, type, obj, crc32)
        self.offset += total_size
        if self._progress is not None:
            self._progress(self.i, self.num)
def obj_sha(type, chunks):
    """Compute the SHA for a numeric type and object chunks."""
    # NOTE(review): `sha` creation and the digest return appear elided; the
    # hash covers the canonical object header followed by the chunk data.
    sha.update(object_header(type, chunks_length(chunks)))
class PackData(object):
    """The data contained in a packfile.

    Pack files can be accessed both sequentially for exploding a pack, and
    directly with the help of an index to retrieve a specific object.

    The objects within are either complete or a delta aginst another.

    The header is variable length. If the MSB of each byte is set then it
    indicates that the subsequent byte is still part of the header.
    For the first byte the next MS bits are the type, which tells you the type
    of object, and whether it is a delta. The LS byte is the lowest bits of the
    size. For each subsequent byte the LS 7 bits are the next MS bits of the
    size, i.e. the last byte of the header contains the MS bits of the size.

    For the complete objects the data is stored as zlib deflated data.
    The size in the header is the uncompressed object size, so to uncompress
    you need to just keep feeding data to zlib until you get an object back,
    or it errors on bad data. This is done here by just giving the complete
    buffer from the start of the deflated object on. This is bad, but until I
    get mmap sorted out it will have to do.

    Currently there are no integrity checks done. Also no attempt is made to
    try and detect the delta case, or a request for an object at the wrong
    position. It will all just throw a zlib or KeyError.
    """

    def __init__(self, filename, file=None, size=None):
        """Create a PackData object representing the pack in the given filename.

        The file must exist and stay readable until the object is disposed of. It
        must also stay the same size. It will be mapped whenever needed.

        Currently there is a restriction on the size of the pack as the python
        mmap implementation is flawed.
        """
        self._filename = filename
        self._header_size = 12
        self._file = GitFile(self._filename, 'rb')
        (version, self._num_objects) = read_pack_header(self._file.read)
        # Cache resolved objects keyed by pack offset, bounded at 20MB of
        # chunk data.
        self._offset_cache = LRUSizeCache(1024*1024*20,
            compute_size=_compute_object_size)

    def from_file(cls, file, size):
        # NOTE(review): presumably decorated @classmethod on an elided line.
        return cls(str(file), file=file, size=size)

    def from_path(cls, path):
        # NOTE(review): presumably decorated @classmethod on an elided line.
        return cls(filename=path)

        # NOTE(review): body of an elided lazy size accessor (_get_size);
        # stats the file once and validates it can hold at least a header.
        if self._size is not None:
        self._size = os.path.getsize(self._filename)
        if self._size < self._header_size:
            errmsg = ("%s is too small for a packfile (%d < %d)" %
                (self._filename, self._size, self._header_size))
            raise AssertionError(errmsg)

        # NOTE(review): body of the elided __len__().
        """Returns the number of objects in this pack."""
        return self._num_objects

    def calculate_checksum(self):
        """Calculate the checksum for this pack.

        :return: 20-byte binary SHA1 digest
        """
        # Hash the whole file except the trailing 20-byte stored checksum,
        # reading in bounded chunks.
        todo = self._get_size() - 20
        x = self._file.read(min(todo, 1<<16))

    def get_ref(self, sha):
        """Get the object for a ref SHA, only looking in this pack."""
        # TODO: cache these results
        if self.pack is None:
        offset = self.pack.index.object_index(sha)
        type, obj = self.get_object_at(offset)
        return offset, type, obj

    def resolve_object(self, offset, type, obj, get_ref=None):
        """Resolve an object, possibly resolving deltas when necessary.

        :return: Tuple with object type and contents.
        """
        if type not in DELTA_TYPES:
            # NOTE(review): the non-delta early return and the
            # `if get_ref is None:` guard appear elided around here.
            get_ref = self.get_ref
        if type == OFS_DELTA:
            (delta_offset, delta) = obj
            # TODO: clean up asserts and replace with nicer error messages
            assert isinstance(offset, int)
            assert isinstance(delta_offset, int)
            # Offset deltas point backwards relative to this object's offset.
            base_offset = offset-delta_offset
            type, base_obj = self.get_object_at(base_offset)
            assert isinstance(type, int)
        elif type == REF_DELTA:
            (basename, delta) = obj
            assert isinstance(basename, str) and len(basename) == 20
            base_offset, type, base_obj = get_ref(basename)
            assert isinstance(type, int)
        # Recursively resolve the base, then layer this delta on top.
        type, base_chunks = self.resolve_object(base_offset, type, base_obj)
        chunks = apply_delta(base_chunks, delta)
        # TODO(dborowitz): This can result in poor performance if large base
        # objects are separated from deltas in the pack. We should reorganize
        # so that we apply deltas to all objects in a chain one after the other
        # to optimize cache performance.
        if offset is not None:
            self._offset_cache[offset] = type, chunks

    def iterobjects(self, progress=None):
        # Sequential iteration over (offset, type, raw obj, crc32) tuples.
        return PackObjectIterator(self, progress)

    def iterentries(self, progress=None):
        """Yield entries summarizing the contents of this pack.

        :param progress: Progress function, called with current and total object
            count.
        :yields: tuples with (sha, offset, crc32)
        """
        for offset, type, obj, crc32 in self.iterobjects(progress=progress):
            assert isinstance(offset, int)
            assert isinstance(type, int)
            assert isinstance(obj, list) or isinstance(obj, tuple)
            type, obj = self.resolve_object(offset, type, obj)
            yield obj_sha(type, obj), offset, crc32

    def sorted_entries(self, progress=None):
        """Return entries in this pack, sorted by SHA.

        :param progress: Progress function, called with current and total object
            count.
        :return: List of tuples with (sha, offset, crc32)
        """
        ret = list(self.iterentries(progress=progress))
        # NOTE(review): the in-place sort and return appear elided here.

    def create_index_v1(self, filename, progress=None):
        """Create a version 1 file for this data file.

        :param filename: Index filename.
        :param progress: Progress report function
        """
        entries = self.sorted_entries(progress=progress)
        write_pack_index_v1(filename, entries, self.calculate_checksum())

    def create_index_v2(self, filename, progress=None):
        """Create a version 2 index file for this data file.

        :param filename: Index filename.
        :param progress: Progress report function
        """
        entries = self.sorted_entries(progress=progress)
        write_pack_index_v2(filename, entries, self.calculate_checksum())

    # NOTE(review): the signature continuation (presumably `version=2):`)
    # and the if/elif/else version dispatch appear elided below.
    def create_index(self, filename, progress=None,
        """Create an index file for this data file.

        :param filename: Index filename.
        :param progress: Progress report function
        """
            self.create_index_v1(filename, progress)
            self.create_index_v2(filename, progress)
            raise ValueError("unknown index format %d" % version)

    def get_stored_checksum(self):
        """Return the expected checksum stored in this pack."""
        # The trailing 20 bytes of the pack are its SHA1 checksum.
        self._file.seek(self._get_size()-20)
        return self._file.read(20)

        # NOTE(review): body of the elided check() method.
        """Check the consistency of this pack."""
        actual = self.calculate_checksum()
        stored = self.get_stored_checksum()
        # NOTE(review): presumably raised only when actual != stored.
        raise ChecksumMismatch(stored, actual)

    def get_object_at(self, offset):
        """Given an offset in to the packfile return the object that is there.

        Using the associated index the location of an object can be looked up,
        and then the packfile can be asked directly for that object using this
        function.
        """
        if offset in self._offset_cache:
            return self._offset_cache[offset]
        assert isinstance(offset, long) or isinstance(offset, int),\
                "offset was %r" % offset
        assert offset >= self._header_size
        self._file.seek(offset)
        # Drop the trailing (comp_len, unused) members of the tuple.
        return unpack_object(self._file.read)[:2]
class ThinPackData(PackData):
    """PackData for thin packs, which require an ObjectStore for resolving."""

    def __init__(self, resolve_ext_ref, *args, **kwargs):
        super(ThinPackData, self).__init__(*args, **kwargs)
        # Callback used to find delta bases that live outside this pack.
        self.resolve_ext_ref = resolve_ext_ref

    def get_ref(self, sha):
        """Resolve a reference looking in both this pack and the store."""
        # As part of completing a pack we create a Pack object with a
        # ThinPackData and a full PackIndex, so check in the index first if
        # possible.
        # TODO(dborowitz): reevaluate this when the pack completion code is
        # rewritten.
        # NOTE(review): presumably a try/except KeyError wraps the two
        # branches below; on a miss the external resolver is consulted and
        # no in-pack offset is available (hence None).
            return super(ThinPackData, self).get_ref(sha)
        type, obj = self.resolve_ext_ref(sha)
        return None, type, obj

    def iterentries(self, progress=None):
        """Yield entries summarizing the contents of this pack.

        :param progress: Progress function, called with current and
            total object count.

        This will yield tuples with (sha, offset, crc32)
        """
        # Deltas whose base has not been seen yet are postponed and retried
        # once their base sha has been yielded.
        postponed = defaultdict(list)

        class Postpone(Exception):
            """Raised to postpone delta resolving."""

            # NOTE(review): `self.sha = sha` body appears elided.
            def __init__(self, sha):

        def get_ref_text(sha):
            assert len(sha) == 20
            # NOTE(review): the in-index lookup branch and the surrounding
            # try/except (raising Postpone on a miss) appear elided.
                type, obj = self.get_object_at(offset)
                return offset, type, obj
            return self.get_ref(sha)

        # NOTE(review): `extra` is presumably initialized on an elided line.
        todo = chain(self.iterobjects(progress=progress), extra)
        for (offset, type, obj, crc32) in todo:
            assert isinstance(offset, int)
                # Inflate postponed delta
                obj, type = self.get_object_at(offset)
                assert isinstance(type, int)
            assert isinstance(obj, list) or isinstance(obj, tuple)
                type, obj = self.resolve_object(offset, type, obj, get_ref_text)
                # Save memory by not storing the inflated obj in postponed
                postponed[e.sha].append((offset, type, None, crc32))
                sha = obj_sha(type, obj)
                yield sha, offset, crc32
                extra.extend(postponed.pop(sha, []))
        # Anything still postponed has a base missing from this pack.
        raise KeyError([sha_to_hex(h) for h in postponed.keys()])
class SHA1Reader(object):
    """Wrapper around a file-like object that remembers the SHA1 of its data."""

    def __init__(self, f):
        # NOTE(review): storing of `f` (self.f = f) appears elided.
        self.sha1 = make_sha("")

    def read(self, num=None):
        data = self.f.read(num)
        # Fold everything read through this wrapper into the running hash.
        self.sha1.update(data)
        # NOTE(review): the `return data` appears elided.

        # NOTE(review): body of an elided checksum-verification method:
        # compare the next 20 bytes of the file against the running digest.
        stored = self.f.read(20)
        if stored != self.sha1.digest():
            raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))

        # NOTE(review): body of an elided close() delegate.
        return self.f.close()
class SHA1Writer(object):
    """Wrapper around a file-like object that remembers the SHA1 of its data."""

    def __init__(self, f):
        # NOTE(review): storing of `f` appears elided.
        self.sha1 = make_sha("")

    def write(self, data):
        # Hash everything written through this wrapper.
        self.sha1.update(data)
        # NOTE(review): the pass-through write to self.f appears elided.

        # NOTE(review): body of an elided write_sha(): append the digest
        # to the underlying file and return it.
        sha = self.sha1.digest()
        assert len(sha) == 20

        # NOTE(review): body of an elided close(): write the sha, close the
        # file, and return the digest.
        sha = self.write_sha()
def write_pack_object(f, type, object):
    """Write pack object to a file.

    :param f: File to write to
    :param type: Numeric type of the object
    :param object: Object to write
    :return: Tuple with offset at which the object was written, and crc32
    """
    packed_data_hdr = ""
    # Delta payloads arrive as (base, data) pairs; split them apart first.
    if type == OFS_DELTA:
        (delta_base_offset, object) = object
    elif type == REF_DELTA:
        (basename, object) = object
    # Varint header: 3-bit type plus low nibble of the size in the first
    # byte.  NOTE(review): `size` initialization and the `while size:`
    # continuation loop appear elided around here.
    c = (type << 4) | (size & 15)
        packed_data_hdr += (chr(c | 0x80))
    packed_data_hdr += chr(c)
    if type == OFS_DELTA:
        # Encode the base offset with the +1-per-continuation-byte scheme,
        # most significant group first.
        ret = [delta_base_offset & 0x7f]
        delta_base_offset >>= 7
        while delta_base_offset:
            delta_base_offset -= 1
            ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
            delta_base_offset >>= 7
        packed_data_hdr += "".join([chr(x) for x in ret])
    elif type == REF_DELTA:
        assert len(basename) == 20
        packed_data_hdr += basename
    # The header is written raw; only the payload is deflated.
    packed_data = packed_data_hdr + zlib.compress(object)
    f.write(packed_data)
    # NOTE(review): `offset` (the file position before writing) appears
    # to come from an elided line.
    return (offset, (zlib.crc32(packed_data) & 0xffffffff))
def write_pack(filename, objects, num_objects):
    """Write a new pack data file.

    :param filename: Path to the new pack file (without .pack extension)
    :param objects: Iterable over (object, path) tuples to write
    :param num_objects: Number of objects to write
    """
    f = GitFile(filename + ".pack", 'wb')
    # NOTE(review): a try/finally (closing f) appears elided around the
    # data write; the entries are presumably sorted before indexing.
        entries, data_sum = write_pack_data(f, objects, num_objects)
    write_pack_index_v2(filename + ".idx", entries, data_sum)
def write_pack_data(f, objects, num_objects, window=10):
    """Write a new pack file.

    :param filename: The filename of the new pack file.
    :param objects: List of objects to write (tuples with object and path)
    :return: List with (name, offset, crc32 checksum) entries, pack checksum
    """
    recency = list(objects)
    # FIXME: Somehow limit delta depth
    # FIXME: Make thin-pack optional (its not used when cloning a pack)
    # Build a list of objects ordered by the magic Linus heuristic
    # This helps us find good objects to diff against us
    # NOTE(review): `magic` initialization appears elided.
    for obj, path in recency:
        magic.append( (obj.type_num, path, 1, -obj.raw_length(), obj) )
    # Build a map of objects and their index in magic - so we can find
    # preceeding objects to diff against
    # NOTE(review): `offs` initialization appears elided.
    for i in range(len(magic)):
        offs[magic[i][4]] = i
    # NOTE(review): `entries` init and SHA1Writer wrapping appear elided;
    # write_sha() at the end implies f is a SHA1Writer here.
    f.write("PACK")               # Pack header
    f.write(struct.pack(">L", 2)) # Pack version
    f.write(struct.pack(">L", num_objects)) # Number of objects in pack
    for o, path in recency:
        sha1 = o.sha().digest()
        raw = o.as_raw_string()
        # Delta-compression search is disabled (commented out); objects are
        # written whole.  NOTE(review): `t`/`winner` come from elided lines.
        #for i in range(offs[o]-window, window):
        #    if i < 0 or i >= len(offs): continue
        #    if b.type_num != orig_t: continue
        #    base = b.as_raw_string()
        #    delta = create_delta(base, raw)
        #    if len(delta) < len(winner):
        #    t = 6 if magic[i][2] == 1 else 7
        offset, crc32 = write_pack_object(f, t, winner)
        entries.append((sha1, offset, crc32))
    return entries, f.write_sha()
def write_pack_index_v1(filename, entries, pack_checksum):
    """Write a new pack index file.

    :param filename: The filename of the new pack index file.
    :param entries: List of tuples with object name (sha), offset_in_pack,
        and crc32_checksum.
    :param pack_checksum: Checksum of the pack file.
    """
    f = GitFile(filename, 'wb')
    # Count objects per leading sha byte, then make the counts cumulative
    # while writing the 256-entry fan-out table.
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[0])] += 1
    for i in range(0x100):
        f.write(struct.pack(">L", fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    # v1 entries: 4-byte offset followed by the 20-byte binary sha.
    for (name, offset, entry_checksum) in entries:
        f.write(struct.pack(">L20s", offset, name))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
def create_delta(base_buf, target_buf):
    """Use python difflib to work out how to transform base_buf to target_buf.

    :param base_buf: Base buffer
    :param target_buf: Target buffer
    """
    assert isinstance(base_buf, str)
    assert isinstance(target_buf, str)
    # write delta header
    def encode_size(size):
        # NOTE(review): the varint loop is largely elided; it emits 7 bits
        # per byte with the MSB flagging continuation.
            ret += chr(c | 0x80)
    # The delta stream starts with the source and target sizes.
    # NOTE(review): `out_buf` initialization appears elided.
    out_buf += encode_size(len(base_buf))
    out_buf += encode_size(len(target_buf))
    # write out delta opcodes
    seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
    for opcode, i1, i2, j1, j2 in seq.get_opcodes():
        # Git patch opcodes don't care about deletes!
        #if opcode == "replace" or opcode == "delete":
        if opcode == "equal":
            # If they are equal, unpacker will use data from base_buf
            # Write out an opcode that says what range to use
            # NOTE(review): copy-opcode assembly is elided; `o`/`s` are the
            # copy offset and size, emitted little-endian one byte at a time.
                scratch += chr((o >> i*8) & 0xff)
                scratch += chr((s >> i*8) & 0xff)
        if opcode == "replace" or opcode == "insert":
            # If we are replacing a range or adding one, then we just
            # output it to the stream (prefixed by its size)
            # NOTE(review): insert opcodes cap each literal run at 127 bytes.
                out_buf += target_buf[o:o+127]
                out_buf += target_buf[o:o+s]
def apply_delta(src_buf, delta):
    """Based on the similar function in git's patch-delta.c.

    :param src_buf: Source buffer
    :param delta: Delta instructions
    """
    # Accept either a single string or a list of chunks for both arguments.
    if type(src_buf) != str:
        src_buf = "".join(src_buf)
    if type(delta) != str:
        delta = "".join(delta)
    delta_length = len(delta)

    def get_delta_header_size(delta, index):
        # Decode an MSB-continued varint starting at `index`.
        # NOTE(review): loop framing and return appear elided.
        cmd = ord(delta[index])
        size |= (cmd & ~0x80) << i
    # The delta stream opens with the expected source and target sizes.
    src_size, index = get_delta_header_size(delta, index)
    dest_size, index = get_delta_header_size(delta, index)
    assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
    while index < delta_length:
        cmd = ord(delta[index])
        # Copy-from-base opcode (MSB set): optional offset/size bytes
        # follow, selected by individual low bits of cmd.
            x = ord(delta[index])
            cp_off |= x << (i * 8)
            if cmd & (1 << (4+i)):
                x = ord(delta[index])
                cp_size |= x << (i * 8)
        # Reject copies that overflow or fall outside the source buffer.
        if (cp_off + cp_size < cp_size or
            cp_off + cp_size > src_size or
            cp_size > dest_size):
        out.append(src_buf[cp_off:cp_off+cp_size])
        # Literal-insert opcode (MSB clear): the next cmd bytes are data.
        out.append(delta[index:index+cmd])
        # Opcode 0 is reserved and must not appear.
        raise ApplyDeltaError("Invalid opcode 0")
    if index != delta_length:
        raise ApplyDeltaError("delta not empty: %r" % delta[index:])
    if dest_size != chunks_length(out):
        raise ApplyDeltaError("dest size incorrect")
def write_pack_index_v2(filename, entries, pack_checksum):
    """Write a new pack index file.

    :param filename: The filename of the new pack index file.
    :param entries: List of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
    :param pack_checksum: Checksum of the pack file.
    """
    f = GitFile(filename, 'wb')
    # v2 header: magic marker plus version number.
    f.write('\377tOc') # Magic!
    f.write(struct.pack(">L", 2))
    # Count objects per leading sha byte, then make the counts cumulative
    # while writing the 256-entry fan-out table.
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[0])] += 1
    for i in range(0x100):
        f.write(struct.pack(">L", fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    # Names table.  NOTE(review): the f.write(name) body appears elided.
    for (name, offset, entry_checksum) in entries:
    # CRC32 table, one 32-bit value per entry.
    for (name, offset, entry_checksum) in entries:
        f.write(struct.pack(">L", entry_checksum))
    # 32-bit offsets table.
    for (name, offset, entry_checksum) in entries:
        # FIXME: handle if MSBit is set in offset
        f.write(struct.pack(">L", offset))
    # FIXME: handle table for pack files > 8 Gb
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
1280 """A Git pack object."""
    def __init__(self, basename):
        # A pack is a lazily-loaded pair of files sharing one basename:
        # the data file (.pack) and its index (.idx).
        self._basename = basename
        self._data_path = self._basename + ".pack"
        self._idx_path = self._basename + ".idx"
    def from_objects(self, data, idx):
        """Create a new pack object from pack data and index objects."""
        # NOTE(review): presumably a classmethod; decorator and body appear
        # elided here -- confirm against history.
1299 """The SHA over the SHAs of the objects in this pack."""
1300 return self.index.objects_sha1()
1304 """The pack data object being used."""
1305 if self._data is None:
1306 self._data = PackData(self._data_path)
1307 self._data.pack = self
1308 assert len(self.index) == len(self._data)
1309 idx_stored_checksum = self.index.get_pack_checksum()
1310 data_stored_checksum = self._data.get_stored_checksum()
1311 if idx_stored_checksum != data_stored_checksum:
1312 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
1313 sha_to_hex(data_stored_checksum))
1318 """The index being used.
1320 :note: This may be an in-memory index
1322 if self._idx is None:
1323 self._idx = load_pack_index(self._idx_path)
1327 if self._data is not None:
    def __eq__(self, other):
        # Packs compare equal when their indexes do (same objects, same
        # order); the data files are not compared directly.
        return type(self) == type(other) and self.index == other.index
1335 """Number of entries in this pack."""
1336 return len(self.index)
1339 return "%s(%r)" % (self.__class__.__name__, self._basename)
1342 """Iterate over all the sha1s of the objects in this pack."""
1343 return iter(self.index)
1346 """Check the integrity of this pack.
1348 :raise ChecksumMismatch: if a checksum for the index or data is wrong
1352 for obj in self.iterobjects():
1354 # TODO: object connectivity checks
    def get_stored_checksum(self):
        # Delegate to the data file's trailing 20-byte checksum.
        return self.data.get_stored_checksum()
    def __contains__(self, sha1):
        """Check whether this pack contains a particular SHA1."""
        # NOTE(review): object_index raising on a miss is presumably caught
        # by an elided try/except with True/False returns -- confirm.
            self.index.object_index(sha1)
    def get_raw(self, sha1):
        # Look up the pack offset via the index, inflate the stored record,
        # resolve any delta chain and return (type_num, full contents).
        offset = self.index.object_index(sha1)
        obj_type, obj = self.data.get_object_at(offset)
        # Normalize to a plain int so downstream isinstance checks pass.
        if type(offset) is long:
            offset = int(offset)
        type_num, chunks = self.data.resolve_object(offset, obj_type, obj)
        return type_num, "".join(chunks)
    def __getitem__(self, sha1):
        """Retrieve the specified SHA1."""
        # Resolve to raw contents, then build a full ShaFile object.
        type, uncomp = self.get_raw(sha1)
        return ShaFile.from_raw_string(type, uncomp)
    def iterobjects(self):
        """Iterate over the objects in this pack."""
        for offset, type, obj, crc32 in self.data.iterobjects():
            assert isinstance(offset, int)
            # Resolve deltas before constructing each ShaFile.
            yield ShaFile.from_raw_chunks(
                *self.data.resolve_object(offset, type, obj))
1389 from dulwich._pack import apply_delta, bisect_find_sha