# pack.py -- For dealing with packed git objects.
# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
# Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; version 2
# of the License or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA  02110-1301, USA.
20 """Classes for dealing with packed git objects.
22 A pack is a compact representation of a bunch of objects, stored
23 using deltas where possible.
25 They have two parts, the pack file, which stores the data, and an index
26 that tells you where the data is.
28 To find an object you look in all of the index files 'til you find a
29 match for the object name. You then use the pointer got from this as
30 a pointer in to the corresponding packfile.
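
# Illustrative sketch (not part of the original code) of the lookup described
# above, using the classes defined later in this module; the paths and sha
# used here are hypothetical:
#
#   index = load_pack_index("pack-1234.idx")
#   offset = index.object_index(hex_sha)
#   pack_data = PackData("pack-1234.pack")
#   type, obj = pack_data.get_object_at(offset)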

try:
    from collections import defaultdict
except ImportError:
    from misc import defaultdict

from itertools import (
    chain,
    imap,
    izip,
    )
import difflib
import mmap
import os
import struct
try:
    from struct import unpack_from
except ImportError:
    from dulwich.misc import unpack_from
import sys
import zlib

from dulwich.errors import (
    ApplyDeltaError,
    ChecksumMismatch,
    )
from dulwich.file import GitFile
from dulwich.lru_cache import (
    LRUSizeCache,
    )
from dulwich.objects import (
    ShaFile,
    hex_to_sha,
    sha_to_hex,
    )
from dulwich.misc import (
    make_sha,
    )


supports_mmap_offset = (sys.version_info[0] >= 3 or
        (sys.version_info[0] == 2 and sys.version_info[1] >= 6))


def take_msb_bytes(read):
    """Read bytes marked with most significant bit.

    :param read: Read function
    """
    ret = []
    while len(ret) == 0 or ret[-1] & 0x80:
        ret.append(ord(read(1)))
    return ret


def read_zlib_chunks(read, buffer_size=4096):
    """Read chunks of zlib data from a buffer.

    :param read: Read function
    :return: Tuple with list of chunks, length of compressed data and
        unused read data
    """
    obj = zlib.decompressobj()
    ret = []
    fed = 0
    while obj.unused_data == "":
        add = read(buffer_size)
        if len(add) < buffer_size:
            # EOF: the extra byte forces the decompress object to report the
            # end of the zlib stream through unused_data.
            add += "Z"
        fed += len(add)
        ret.append(obj.decompress(add))
    comp_len = fed - len(obj.unused_data)
    return ret, comp_len, obj.unused_data


def read_zlib(read, dec_size):
    """Read zlib-compressed data from a buffer.

    :param read: Read function
    :param dec_size: Size of the decompressed buffer
    :return: Uncompressed buffer, compressed buffer length and unused read
        data
    """
    ret, comp_len, unused = read_zlib_chunks(read)
    x = "".join(ret)
    assert len(x) == dec_size
    return x, comp_len, unused
122 """Return the hexdigest of the SHA1 over a set of names.
124 :param iter: Iterator over string objects
125 :return: 40-byte hex sha1 digest
130 return sha1.hexdigest()


def load_pack_index(path):
    """Load an index file by path.

    :param path: Path to the index file
    """
    f = GitFile(path, 'rb')
    return load_pack_index_file(path, f)


def load_pack_index_file(path, f):
    """Load an index file from a file-like object.

    :param path: Path for the index file
    :param f: File-like object
    """
    if f.read(4) == '\377tOc':
        version = struct.unpack(">L", f.read(4))[0]
        if version == 2:
            f.seek(0)
            return PackIndex2(path, file=f)
        else:
            raise KeyError("Unknown pack index format %d" % version)
    else:
        f.seek(0)
        return PackIndex1(path, file=f)
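

# Illustrative sketch (not part of the original code): how a caller might use
# load_pack_index() to map an object name to a pack offset. The path and sha
# passed in are hypothetical.
def _example_index_lookup(idx_path, hex_sha):
    """Return the offset of hex_sha within the pack described by idx_path."""
    index = load_pack_index(idx_path)
    assert index.check()
    return index.object_index(hex_sha)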


def bisect_find_sha(start, end, sha, unpack_name):
    """Find a SHA in a data blob with sorted SHAs.

    :param start: Start index of range to search
    :param end: End index of range to search
    :param sha: Sha to find
    :param unpack_name: Callback to retrieve SHA by index
    :return: Index of the SHA, or None if it wasn't found
    """
    assert start <= end
    while start <= end:
        i = (start + end) / 2
        file_sha = unpack_name(i)
        x = cmp(file_sha, sha)
        if x < 0:
            start = i + 1
        elif x > 0:
            end = i - 1
        else:
            return i
    return None


class PackIndex(object):
    """An index into a packfile.

    Given a sha id of an object a pack index can tell you the location in the
    packfile of that object if it has it.

    To do the lookup it opens the file, and indexes the first 256 4-byte
    groups by the first byte of the sha id. The value in each 4-byte group is
    the end of the group of entries whose sha ids share that starting byte.
    Subtract one from the starting byte and index again to find the start of
    the group. The values are sorted by sha id within each group, so the math
    gives the start and end offsets, and a bisect inside that range tells
    whether the value is present.
    """
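
    # Illustrative sketch (not part of the original code) of the fan-out
    # lookup described above, given a 20-byte binary sha:
    #
    #   idx = ord(sha[0])                  # first byte selects the group
    #   start = fan_out_table[idx - 1]     # or 0 when idx == 0
    #   end = fan_out_table[idx]           # cumulative count marks group end
    #   # bisect over the sorted names between start and end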

    def __init__(self, filename, file=None, size=None):
        """Create a pack index object.

        Provide it with the name of the index file to consider, and it will
        map it whenever required.
        """
        self._filename = filename
        # Take the size now, so it can be checked each time we map the file to
        # ensure that it hasn't changed.
        if file is None:
            self._file = GitFile(filename, 'rb')
        else:
            self._file = file
        fileno = getattr(self._file, 'fileno', None)
        if fileno is not None:
            fd = self._file.fileno()
            if size is None:
                self._size = os.fstat(fd).st_size
            else:
                self._size = size
            self._contents = mmap.mmap(fd, self._size,
                access=mmap.ACCESS_READ)
        else:
            self._contents = self._file.read()
            self._size = len(self._contents)

    def __eq__(self, other):
        if not isinstance(other, PackIndex):
            return False

        if self._fan_out_table != other._fan_out_table:
            return False

        for (name1, _, _), (name2, _, _) in izip(self.iterentries(),
                                                 other.iterentries()):
            if name1 != name2:
                return False
        return True

    def __ne__(self, other):
        return not self.__eq__(other)
243 """Return the number of entries in this pack index."""
244 return self._fan_out_table[-1]

    def _unpack_entry(self, i):
        """Unpack the i-th entry in the index file.

        :return: Tuple with object name (SHA), offset in pack file and
            CRC32 checksum (if known)."""
        raise NotImplementedError(self._unpack_entry)

    def _unpack_name(self, i):
        """Unpack the i-th name from the index file."""
        raise NotImplementedError(self._unpack_name)

    def _unpack_offset(self, i):
        """Unpack the i-th object offset from the index file."""
        raise NotImplementedError(self._unpack_offset)

    def _unpack_crc32_checksum(self, i):
        """Unpack the crc32 checksum for the i-th object from the index
        file."""
        raise NotImplementedError(self._unpack_crc32_checksum)
266 """Iterate over the SHAs in this pack."""
267 return imap(sha_to_hex, self._itersha())
270 for i in range(len(self)):
271 yield self._unpack_name(i)

    def objects_sha1(self):
        """Return the hex SHA1 over all the shas of all objects in this pack.

        :note: This is used for the filename of the pack.
        """
        return iter_sha1(self._itersha())

    def iterentries(self):
        """Iterate over the entries in this pack index.

        Will yield tuples with object name, offset in packfile and crc32
        checksum.
        """
        for i in range(len(self)):
            yield self._unpack_entry(i)

    def _read_fan_out_table(self, start_offset):
        ret = []
        for i in range(0x100):
            ret.append(struct.unpack(">L",
                self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
        return ret
295 """Check that the stored checksum matches the actual checksum."""
296 # TODO: Check pack contents, too
297 return self.calculate_checksum() == self.get_stored_checksum()

    def calculate_checksum(self):
        """Calculate the SHA1 checksum over this pack index.

        :return: This is a 20-byte binary digest
        """
        return make_sha(self._contents[:-20]).digest()

    def get_pack_checksum(self):
        """Return the SHA1 checksum stored for the corresponding packfile.

        :return: 20-byte binary digest
        """
        return str(self._contents[-40:-20])

    def get_stored_checksum(self):
        """Return the SHA1 checksum stored for this index.

        :return: 20-byte binary digest
        """
        return str(self._contents[-20:])

    def object_index(self, sha):
        """Return the offset into the corresponding packfile for the object.

        Given the name of an object it will return the offset that object
        lives at within the corresponding pack file. If the pack file doesn't
        have the object then a KeyError is raised.
        """
        if len(sha) == 40:
            sha = hex_to_sha(sha)
        return self._object_index(sha)

    def _object_index(self, sha):
        """See object_index.

        :param sha: A *binary* SHA string (20 characters long).
        """
        assert len(sha) == 20
        idx = ord(sha[0])
        if idx == 0:
            start = 0
        else:
            start = self._fan_out_table[idx-1]
        end = self._fan_out_table[idx]
        i = bisect_find_sha(start, end, sha, self._unpack_name)
        if i is None:
            raise KeyError(sha)
        return self._unpack_offset(i)


class PackIndex1(PackIndex):
    """Version 1 Pack Index."""

    def __init__(self, filename, file=None, size=None):
        PackIndex.__init__(self, filename, file, size)
        self.version = 1
        self._fan_out_table = self._read_fan_out_table(0)

    def _unpack_entry(self, i):
        (offset, name) = unpack_from(">L20s", self._contents,
            (0x100 * 4) + (i * 24))
        return (name, offset, None)

    def _unpack_name(self, i):
        offset = (0x100 * 4) + (i * 24) + 4
        return self._contents[offset:offset+20]

    def _unpack_offset(self, i):
        offset = (0x100 * 4) + (i * 24)
        return unpack_from(">L", self._contents, offset)[0]

    def _unpack_crc32_checksum(self, i):
        # Not stored in v1 index files
        return None


class PackIndex2(PackIndex):
    """Version 2 Pack Index."""

    def __init__(self, filename, file=None, size=None):
        PackIndex.__init__(self, filename, file, size)
        assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
        (self.version, ) = unpack_from(">L", self._contents, 4)
        assert self.version == 2, "Version was %d" % self.version
        self._fan_out_table = self._read_fan_out_table(8)
        self._name_table_offset = 8 + 0x100 * 4
        self._crc32_table_offset = self._name_table_offset + 20 * len(self)
        self._pack_offset_table_offset = (self._crc32_table_offset +
            4 * len(self))

    def _unpack_entry(self, i):
        return (self._unpack_name(i), self._unpack_offset(i),
                self._unpack_crc32_checksum(i))

    def _unpack_name(self, i):
        offset = self._name_table_offset + i * 20
        return self._contents[offset:offset+20]

    def _unpack_offset(self, i):
        offset = self._pack_offset_table_offset + i * 4
        return unpack_from(">L", self._contents, offset)[0]

    def _unpack_crc32_checksum(self, i):
        return unpack_from(">L", self._contents,
                           self._crc32_table_offset + i * 4)[0]


def read_pack_header(f):
    """Read the header of a pack file.

    :param f: File-like object to read from
    """
    header = f.read(12)
    assert header[:4] == "PACK"
    (version,) = unpack_from(">L", header, 4)
    assert version in (2, 3), "Version was %d" % version
    (num_objects,) = unpack_from(">L", header, 8)
    return (version, num_objects)


def unpack_object(read):
    """Unpack a Git object.

    :return: tuple with type, uncompressed data, compressed size and
        unused read data
    """
    bytes = take_msb_bytes(read)
    type = (bytes[0] >> 4) & 0x07
    size = bytes[0] & 0x0f
    for i, byte in enumerate(bytes[1:]):
        size += (byte & 0x7f) << ((i * 7) + 4)
    raw_base = len(bytes)
    if type == 6: # offset delta
        bytes = take_msb_bytes(read)
        raw_base += len(bytes)
        assert not (bytes[-1] & 0x80)
        delta_base_offset = bytes[0] & 0x7f
        for byte in bytes[1:]:
            delta_base_offset += 1
            delta_base_offset <<= 7
            delta_base_offset += (byte & 0x7f)
        uncomp, comp_len, unused = read_zlib(read, size)
        assert size == len(uncomp)
        return type, (delta_base_offset, uncomp), comp_len+raw_base, unused
    elif type == 7: # ref delta
        basename = read(20)
        raw_base += 20
        uncomp, comp_len, unused = read_zlib(read, size)
        assert size == len(uncomp)
        return type, (basename, uncomp), comp_len+raw_base, unused
    uncomp, comp_len, unused = read_zlib(read, size)
    assert len(uncomp) == size
    return type, uncomp, comp_len+raw_base, unused
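

# Illustrative sketch (not part of the original code): decoding the
# variable-length type/size header by hand, the same way unpack_object()
# does. The two header bytes used here are made up for the example.
def _example_decode_object_header():
    """Return (type, size) for a hand-built two-byte object header."""
    from cStringIO import StringIO
    read = StringIO("\x95\x0a").read
    bytes = take_msb_bytes(read)        # [0x95, 0x0a]
    type = (bytes[0] >> 4) & 0x07       # 1 == commit
    size = bytes[0] & 0x0f              # low nibble holds the low size bits
    for i, byte in enumerate(bytes[1:]):
        size += (byte & 0x7f) << ((i * 7) + 4)
    return type, size                   # (1, 165)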


def _compute_object_size((num, obj)):
    """Compute the size of an unresolved object for use with LRUSizeCache."""
    if num in (6, 7):
        return len(obj[1])
    assert isinstance(obj, str)
    return len(obj)


class PackData(object):
    """The data contained in a packfile.

    Pack files can be accessed both sequentially for exploding a pack, and
    directly with the help of an index to retrieve a specific object.

    The objects within are either complete or a delta against another.

    The header is variable length. If the MSB of each byte is set then it
    indicates that the subsequent byte is still part of the header.
    For the first byte the next most significant three bits are the type,
    which tells you the type of object, and whether it is a delta. The
    lowest four bits hold the lowest bits of the size. For each subsequent
    byte the lower seven bits are the next most significant bits of the size,
    i.e. the last byte of the header contains the most significant bits of
    the size.

    For the complete objects the data is stored as zlib deflated data.
    The size in the header is the uncompressed object size, so to uncompress
    you need to just keep feeding data to zlib until you get an object back,
    or it errors on bad data. This is done here by just giving the complete
    buffer from the start of the deflated object on. This is bad, but until I
    get mmap sorted out it will have to do.

    Currently there are no integrity checks done. Also no attempt is made to
    try and detect the delta case, or a request for an object at the wrong
    position.  It will all just throw a zlib or KeyError.
    """
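
    # Illustrative sketch (not part of the original code): random access to a
    # single object, given an offset obtained from the pack's index. The path
    # and the get_ref callback are hypothetical.
    #
    #   data = PackData("pack-1234.pack")
    #   type, obj = data.get_object_at(offset)
    #   type, text = data.resolve_object(offset, type, obj, get_ref)
    #   # get_ref is only consulted for ref-delta objects (thin packs)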

    def __init__(self, filename, file=None, size=None):
        """Create a PackData object that represents the pack in the given
        filename.

        The file must exist and stay readable until the object is disposed
        of. It must also stay the same size. It will be mapped whenever
        needed.

        Currently there is a restriction on the size of the pack as the
        python mmap implementation is flawed.
        """
        self._filename = filename
        self._size = size
        self._header_size = 12
        if file is None:
            self._file = GitFile(self._filename, 'rb')
        else:
            self._file = file
        (version, self._num_objects) = read_pack_header(self._file)
        self._offset_cache = LRUSizeCache(1024*1024*20,
            compute_size=_compute_object_size)

    @classmethod
    def from_file(cls, file, size):
        return cls(str(file), file=file, size=size)

    @classmethod
    def from_path(cls, path):
        return cls(filename=path)

    def _get_size(self):
        if self._size is not None:
            return self._size
        self._size = os.path.getsize(self._filename)
        assert self._size >= self._header_size, (
            "%s is too small for a packfile (%d < %d)" %
            (self._filename, self._size, self._header_size))
        return self._size
531 """Returns the number of objects in this pack."""
532 return self._num_objects

    def calculate_checksum(self):
        """Calculate the checksum for this pack.

        :return: 20-byte binary SHA1 digest
        """
        s = make_sha()
        self._file.seek(0)
        todo = self._get_size() - 20
        while todo > 0:
            x = self._file.read(min(todo, 1<<16))
            s.update(x)
            todo -= len(x)
        return s.digest()

    def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
        """Resolve an object, possibly resolving deltas when necessary.

        :return: Tuple with object type and contents.
        """
        if type not in (6, 7): # Not a delta
            return type, obj

        if get_offset is None:
            get_offset = self.get_object_at

        if type == 6: # offset delta
            (delta_offset, delta) = obj
            assert isinstance(delta_offset, int)
            assert isinstance(delta, str)
            base_offset = offset-delta_offset
            type, base_obj = get_offset(base_offset)
            assert isinstance(type, int)
        elif type == 7: # ref delta
            (basename, delta) = obj
            assert isinstance(basename, str) and len(basename) == 20
            assert isinstance(delta, str)
            type, base_obj = get_ref(basename)
            assert isinstance(type, int)
            # Can't be an ofs delta, as we wouldn't know the base offset
            base_offset = None
        type, base_text = self.resolve_object(base_offset, type, base_obj,
            get_ref)
        if base_offset is not None:
            self._offset_cache[base_offset] = type, base_text
        ret = (type, apply_delta(base_text, delta))
        return ret

    def iterobjects(self, progress=None):

        class ObjectIterator(object):

            def __init__(self, pack):
                self.i = 0
                self.offset = pack._header_size
                self.num = len(pack)
                self.map = pack._file

            def __iter__(self):
                return self

            def __len__(self):
                return self.num

            def next(self):
                if self.i == self.num:
                    raise StopIteration
                self.map.seek(self.offset)
                (type, obj, total_size, unused) = unpack_object(self.map.read)
                self.map.seek(self.offset)
                crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff
                ret = (self.offset, type, obj, crc32)
                self.offset += total_size
                if progress:
                    progress(self.i, self.num)
                self.i += 1
                return ret
        return ObjectIterator(self)

    def iterentries(self, ext_resolve_ref=None, progress=None):
        """Yield entries summarizing the contents of this pack.

        :param ext_resolve_ref: Optional function to resolve base
            objects (in case this is a thin pack)
        :param progress: Progress function, called with current and
            total object count

        This will yield tuples with (sha, offset, crc32)
        """
        found = {}
        postponed = defaultdict(list)
        class Postpone(Exception):
            """Raised to postpone delta resolving."""

        def get_ref_text(sha):
            assert len(sha) == 20
            if sha in found:
                return self.get_object_at(found[sha])
            if ext_resolve_ref:
                try:
                    return ext_resolve_ref(sha)
                except KeyError:
                    pass
            raise Postpone, (sha, )
        extra = []
        todo = chain(self.iterobjects(progress=progress), extra)
        for (offset, type, obj, crc32) in todo:
            assert isinstance(offset, int)
            assert isinstance(type, int)
            assert isinstance(obj, tuple) or isinstance(obj, str)
            try:
                type, obj = self.resolve_object(offset, type, obj,
                    get_ref_text)
            except Postpone, (sha, ):
                postponed[sha].append((offset, type, obj))
            else:
                shafile = ShaFile.from_raw_string(type, obj)
                sha = shafile.sha().digest()
                found[sha] = offset
                yield sha, offset, crc32
                extra.extend(postponed.get(sha, []))
        if postponed:
            raise KeyError([sha_to_hex(h) for h in postponed.keys()])

    def sorted_entries(self, resolve_ext_ref=None, progress=None):
        """Return entries in this pack, sorted by SHA.

        :param resolve_ext_ref: Optional function to resolve base
            objects (in case this is a thin pack)
        :param progress: Progress function, called with current and
            total object count
        :return: List of tuples with (sha, offset, crc32)
        """
        ret = list(self.iterentries(resolve_ext_ref, progress=progress))
        ret.sort()
        return ret

    def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
        """Create a version 1 index file for this data file.

        :param filename: Index filename.
        :param resolve_ext_ref: Function to use for resolving externally
            referenced SHA1s (for thin packs)
        :param progress: Progress report function
        """
        entries = self.sorted_entries(resolve_ext_ref, progress=progress)
        write_pack_index_v1(filename, entries, self.calculate_checksum())

    def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
        """Create a version 2 index file for this data file.

        :param filename: Index filename.
        :param resolve_ext_ref: Function to use for resolving externally
            referenced SHA1s (for thin packs)
        :param progress: Progress report function
        """
        entries = self.sorted_entries(resolve_ext_ref, progress=progress)
        write_pack_index_v2(filename, entries, self.calculate_checksum())

    def create_index(self, filename, resolve_ext_ref=None, progress=None,
                     version=2):
        """Create an index file for this data file.

        :param filename: Index filename.
        :param resolve_ext_ref: Function to use for resolving externally
            referenced SHA1s (for thin packs)
        :param progress: Progress report function
        """
        if version == 1:
            self.create_index_v1(filename, resolve_ext_ref, progress)
        elif version == 2:
            self.create_index_v2(filename, resolve_ext_ref, progress)
        else:
            raise ValueError("unknown index format %d" % version)

    def get_stored_checksum(self):
        """Return the expected checksum stored in this pack."""
        self._file.seek(self._get_size()-20)
        return self._file.read(20)
713 """Check the consistency of this pack."""
714 return (self.calculate_checksum() == self.get_stored_checksum())

    def get_object_at(self, offset):
        """Given an offset into the packfile return the object that is there.

        Using the associated index the location of an object can be looked
        up, and then the packfile can be asked directly for that object using
        this method.
        """
        if offset in self._offset_cache:
            return self._offset_cache[offset]
        assert isinstance(offset, long) or isinstance(offset, int),\
                "offset was %r" % offset
        assert offset >= self._header_size
        self._file.seek(offset)
        return unpack_object(self._file.read)[:2]


class SHA1Reader(object):
    """Wrapper around a file-like object that remembers the SHA1 of
    the data read from it."""

    def __init__(self, f):
        self.f = f
        self.sha1 = make_sha("")

    def read(self, num=None):
        data = self.f.read(num)
        self.sha1.update(data)
        return data

    def check_sha(self):
        stored = self.f.read(20)
        if stored != self.sha1.digest():
            raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))

    def close(self):
        return self.f.close()


class SHA1Writer(object):
    """Wrapper around a file-like object that remembers the SHA1 of
    the data written to it."""

    def __init__(self, f):
        self.f = f
        self.sha1 = make_sha("")

    def write(self, data):
        self.sha1.update(data)
        self.f.write(data)

    def write_sha(self):
        sha = self.sha1.digest()
        assert len(sha) == 20
        self.f.write(sha)
        return sha

    def close(self):
        sha = self.write_sha()
        self.f.close()
        return sha

    def tell(self):
        return self.f.tell()


def write_pack_object(f, type, object):
    """Write pack object to a file.

    :param f: File to write to
    :param type: Numeric type of the object
    :param object: Object to write
    :return: Tuple with offset at which the object was written, and crc32
    """
    offset = f.tell()
    packed_data_hdr = ""
    if type == 6: # offset delta
        (delta_base_offset, object) = object
    elif type == 7: # ref delta
        (basename, object) = object
    size = len(object)
    c = (type << 4) | (size & 15)
    size >>= 4
    while size:
        packed_data_hdr += (chr(c | 0x80))
        c = size & 0x7f
        size >>= 7
    packed_data_hdr += chr(c)
    if type == 6: # offset delta
        ret = [delta_base_offset & 0x7f]
        delta_base_offset >>= 7
        while delta_base_offset:
            delta_base_offset -= 1
            ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
            delta_base_offset >>= 7
        packed_data_hdr += "".join([chr(x) for x in ret])
    elif type == 7: # ref delta
        assert len(basename) == 20
        packed_data_hdr += basename
    packed_data = packed_data_hdr + zlib.compress(object)
    f.write(packed_data)
    return (offset, (zlib.crc32(packed_data) & 0xffffffff))


def write_pack(filename, objects, num_objects):
    """Write a new pack data file.

    :param filename: Path to the new pack file (without .pack extension)
    :param objects: Iterable over (object, path) tuples to write
    :param num_objects: Number of objects to write
    """
    f = GitFile(filename + ".pack", 'wb')
    try:
        entries, data_sum = write_pack_data(f, objects, num_objects)
    finally:
        f.close()
    entries.sort()
    write_pack_index_v2(filename + ".idx", entries, data_sum)
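

# Illustrative sketch (not part of the original code): writing a tiny pack
# (and its index) containing a single blob. Blob comes from dulwich.objects;
# the basename passed in is hypothetical.
def _example_write_small_pack(basename):
    """Write basename.pack and basename.idx containing one blob."""
    from dulwich.objects import Blob
    blob = Blob.from_string("hello world")
    write_pack(basename, [(blob, None)], 1)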


def write_pack_data(f, objects, num_objects, window=10):
    """Write a new pack file.

    :param f: File to write the pack data to
    :param objects: List of objects to write (tuples with object and path)
    :return: List with (name, offset, crc32 checksum) entries, pack checksum
    """
    recency = list(objects)
    # FIXME: Somehow limit delta depth
    # FIXME: Make thin-pack optional (it's not used when cloning a pack)
    # Build a list of objects ordered by the magic Linus heuristic
    # This helps us find good objects to diff against.
    magic = []
    for obj, path in recency:
        magic.append( (obj.type, path, 1, -len(obj.as_raw_string()), obj) )
    magic.sort()
    # Build a map of objects and their index in magic - so we can find
    # preceding objects to diff against
    offs = {}
    for i in range(len(magic)):
        offs[magic[i][4]] = i
    # Write the pack
    entries = []
    f = SHA1Writer(f)
    f.write("PACK")               # Pack header
    f.write(struct.pack(">L", 2)) # Pack version
    f.write(struct.pack(">L", num_objects)) # Number of objects in pack
    for o, path in recency:
        sha1 = o.sha().digest()
        orig_t = o.type
        raw = o.as_raw_string()
        winner = raw
        t = orig_t
        #for i in range(offs[o]-window, window):
        #    if i < 0 or i >= len(offs): continue
        #    b = magic[i][4]
        #    if b.type != orig_t: continue
        #    base = b.as_raw_string()
        #    delta = create_delta(base, raw)
        #    if len(delta) < len(winner):
        #        winner = delta
        #        t = 6 if magic[i][2] == 1 else 7
        offset, crc32 = write_pack_object(f, t, winner)
        entries.append((sha1, offset, crc32))
    return entries, f.write_sha()


def write_pack_index_v1(filename, entries, pack_checksum):
    """Write a new pack index file.

    :param filename: The filename of the new pack index file.
    :param entries: List of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
    :param pack_checksum: Checksum of the pack file.
    """
    f = GitFile(filename, 'wb')
    f = SHA1Writer(f)
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[0])] += 1
    # Fan-out table
    for i in range(0x100):
        f.write(struct.pack(">L", fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    for (name, offset, entry_checksum) in entries:
        f.write(struct.pack(">L20s", offset, name))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    f.close()
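

# Illustrative note (not part of the original code): the fan-out table written
# above is cumulative. For example, with three entries whose names start with
# the bytes 0x00, 0x00 and 0x01, the first cells written are 2, 3, 3, 3, ...
# (cell i holds the number of entries whose first byte is <= i).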


def create_delta(base_buf, target_buf):
    """Use python difflib to work out how to transform base_buf to target_buf.

    :param base_buf: Base buffer
    :param target_buf: Target buffer
    """
    assert isinstance(base_buf, str)
    assert isinstance(target_buf, str)
    out_buf = ""
    # write delta header
    def encode_size(size):
        ret = ""
        c = size & 0x7f
        size >>= 7
        while size:
            ret += chr(c | 0x80)
            c = size & 0x7f
            size >>= 7
        ret += chr(c)
        return ret
    out_buf += encode_size(len(base_buf))
    out_buf += encode_size(len(target_buf))
    # write out delta opcodes
    seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
    for opcode, i1, i2, j1, j2 in seq.get_opcodes():
        # Git patch opcodes don't care about deletes!
        #if opcode == "replace" or opcode == "delete":
        #    pass
        if opcode == "equal":
            # If they are equal, unpacker will use data from base_buf
            # Write out an opcode that says what range to use
            scratch = ""
            op = 0x80
            o = i1
            for i in range(4):
                if o & 0xff << i*8:
                    scratch += chr((o >> i*8) & 0xff)
                    op |= 1 << i
            s = i2 - i1
            for i in range(2):
                if s & 0xff << i*8:
                    scratch += chr((s >> i*8) & 0xff)
                    op |= 1 << (4+i)
            out_buf += chr(op)
            out_buf += scratch
        if opcode == "replace" or opcode == "insert":
            # If we are replacing a range or adding one, then we just
            # output it to the stream (prefixed by its size)
            s = j2 - j1
            o = j1
            while s > 127:
                out_buf += chr(127)
                out_buf += target_buf[o:o+127]
                s -= 127
                o += 127
            out_buf += chr(s)
            out_buf += target_buf[o:o+s]
    return out_buf
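

# Illustrative note (not part of the original code): a "copy" opcode in the
# resulting delta has its most significant bit set, and its low seven bits say
# which offset and size bytes follow. For example the byte sequence
# 0x91 0x00 0x0a means "copy 10 bytes from offset 0 of the base buffer"
# (bit 0 => one offset byte follows, bit 4 => one size byte follows).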


def apply_delta(src_buf, delta):
    """Based on the similar function in git's patch-delta.c.

    :param src_buf: Source buffer
    :param delta: Delta instructions
    """
    assert isinstance(src_buf, str), "was %r" % (src_buf,)
    assert isinstance(delta, str)
    out = []
    index = 0
    delta_length = len(delta)
    def get_delta_header_size(delta, index):
        size = 0
        i = 0
        while True:
            cmd = ord(delta[index])
            index += 1
            size |= (cmd & ~0x80) << i
            i += 7
            if not cmd & 0x80:
                break
        return size, index
    src_size, index = get_delta_header_size(delta, index)
    dest_size, index = get_delta_header_size(delta, index)
    assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
    while index < delta_length:
        cmd = ord(delta[index])
        index += 1
        if cmd & 0x80:
            cp_off = 0
            for i in range(4):
                if cmd & (1 << i):
                    x = ord(delta[index])
                    index += 1
                    cp_off |= x << (i * 8)
            cp_size = 0
            for i in range(3):
                if cmd & (1 << (4+i)):
                    x = ord(delta[index])
                    index += 1
                    cp_size |= x << (i * 8)
            if cp_size == 0:
                cp_size = 0x10000
            if (cp_off + cp_size < cp_size or
                cp_off + cp_size > src_size or
                cp_size > dest_size):
                break
            out.append(src_buf[cp_off:cp_off+cp_size])
        elif cmd != 0:
            out.append(delta[index:index+cmd])
            index += cmd
        else:
            raise ApplyDeltaError("Invalid opcode 0")

    if index != delta_length:
        raise ApplyDeltaError("delta not empty: %r" % delta[index:])

    out = ''.join(out)
    if dest_size != len(out):
        raise ApplyDeltaError("dest size incorrect")

    return out
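

# Illustrative sketch (not part of the original code): create_delta() and
# apply_delta() round-trip a pair of strings.
def _example_delta_roundtrip():
    """Delta two strings against each other and apply the result."""
    base = "the quick brown fox"
    target = "the quick brown fox jumps over the lazy dog"
    delta = create_delta(base, target)
    assert apply_delta(base, delta) == target
    return delta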


def write_pack_index_v2(filename, entries, pack_checksum):
    """Write a new pack index file.

    :param filename: The filename of the new pack index file.
    :param entries: List of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
    :param pack_checksum: Checksum of the pack file.
    """
    f = GitFile(filename, 'wb')
    f = SHA1Writer(f)
    f.write('\377tOc') # Magic!
    f.write(struct.pack(">L", 2))
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[0])] += 1
    # Fan-out table
    for i in range(0x100):
        f.write(struct.pack(">L", fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    for (name, offset, entry_checksum) in entries:
        f.write(name)
    for (name, offset, entry_checksum) in entries:
        f.write(struct.pack(">L", entry_checksum))
    for (name, offset, entry_checksum) in entries:
        # FIXME: handle if MSBit is set in offset
        f.write(struct.pack(">L", offset))
    # FIXME: handle table for pack files > 8 Gb
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    f.close()
1065 """A Git pack object."""
1067 def __init__(self, basename):
1068 self._basename = basename
1069 self._data_path = self._basename + ".pack"
1070 self._idx_path = self._basename + ".idx"
1075 def from_objects(self, data, idx):
1076 """Create a new pack object from pack data and index objects."""
1083 """The SHA over the SHAs of the objects in this pack."""
1084 return self.index.objects_sha1()
1088 """The pack data object being used."""
1089 if self._data is None:
1090 self._data = PackData(self._data_path)
1091 assert len(self.index) == len(self._data)
1092 idx_stored_checksum = self.index.get_pack_checksum()
1093 data_stored_checksum = self._data.get_stored_checksum()
1094 if idx_stored_checksum != data_stored_checksum:
1095 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
1096 sha_to_hex(data_stored_checksum))
1101 """The index being used.
1103 :note: This may be an in-memory index
1105 if self._idx is None:
1106 self._idx = load_pack_index(self._idx_path)
1110 if self._data is not None:

    def __eq__(self, other):
        return type(self) == type(other) and self.index == other.index

    def __len__(self):
        """Number of entries in this pack."""
        return len(self.index)

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self._basename)

    def __iter__(self):
        """Iterate over all the sha1s of the objects in this pack."""
        return iter(self.index)

    def check(self):
        """Check the integrity of this pack."""
        if not self.index.check():
            return False
        if not self.data.check():
            return False
        return True

    def get_stored_checksum(self):
        return self.data.get_stored_checksum()

    def __contains__(self, sha1):
        """Check whether this pack contains a particular SHA1."""
        try:
            self.index.object_index(sha1)
            return True
        except KeyError:
            return False

    def get_raw(self, sha1, resolve_ref=None):
        offset = self.index.object_index(sha1)
        obj_type, obj = self.data.get_object_at(offset)
        if type(offset) is long:
            offset = int(offset)
        if resolve_ref is None:
            resolve_ref = self.get_raw
        return self.data.resolve_object(offset, obj_type, obj, resolve_ref)

    def __getitem__(self, sha1):
        """Retrieve the specified SHA1."""
        type, uncomp = self.get_raw(sha1)
        return ShaFile.from_raw_string(type, uncomp)

    def iterobjects(self, get_raw=None):
        """Iterate over the objects in this pack."""
        if get_raw is None:
            get_raw = self.get_raw
        for offset, type, obj, crc32 in self.data.iterobjects():
            assert isinstance(offset, int)
            yield ShaFile.from_raw_string(
                *self.data.resolve_object(offset, type, obj, get_raw))
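

# Illustrative sketch (not part of the original code): reading a single object
# out of an on-disk pack through the Pack class. The basename and sha passed
# in are hypothetical.
def _example_read_object(basename, hex_sha):
    """Return the ShaFile stored as hex_sha in basename.pack/basename.idx."""
    pack = Pack(basename)
    if hex_sha not in pack:
        raise KeyError(hex_sha)
    return pack[hex_sha]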


try:
    from dulwich._pack import apply_delta, bisect_find_sha
except ImportError:
    pass