# pack.py -- For dealing with packed git objects.
2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
# Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; version 2
8 # of the License or (at your option) a later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
20 """Classes for dealing with packed git objects.
22 A pack is a compact representation of a bunch of objects, stored
23 using deltas where possible.
25 They have two parts, the pack file, which stores the data, and an index
26 that tells you where the data is.
28 To find an object you look in all of the index files 'til you find a
29 match for the object name. You then use the pointer got from this as
30 a pointer in to the corresponding packfile.
34 from collections import defaultdict
36 from misc import defaultdict
39 from itertools import (
48 from struct import unpack_from
50 from dulwich.misc import unpack_from
54 from dulwich.errors import (
58 from dulwich.file import GitFile
59 from dulwich.lru_cache import (
62 from dulwich.objects import (
67 from dulwich.misc import (
# True when the running interpreter's mmap supports an offset argument:
# any Python 3, or Python 2.6 and later.
supports_mmap_offset = sys.version_info >= (2, 6)
def take_msb_bytes(read):
    """Read bytes marked with most significant bit.

    :param read: Read function
    :return: List of integer byte values; every byte but the last has its
        MSB (0x80 continuation bit) set.
    """
    # This copy had lost the accumulator setup and the return; restore them.
    ret = []
    # Keep reading single bytes while the previous byte's continuation bit
    # is set (or nothing has been read yet).
    while len(ret) == 0 or ret[-1] & 0x80:
        ret.append(ord(read(1)))
    return ret
def read_zlib_chunks(read, buffer_size=4096):
    """Read chunks of zlib data from a buffer.

    :param read: Read function
    :return: Tuple with list of chunks and length of
        compressed data length
    """
    obj = zlib.decompressobj()
    # NOTE(review): this copy looks truncated -- the accumulator setup
    # (presumably ``ret = []`` and ``fed = 0``), the short-read handling
    # under the ``if`` below, and the final ``return ret, comp_len`` are
    # absent; confirm against the upstream file.
    # Loop until zlib reports bytes past the end of the compressed stream.
    while obj.unused_data == "":
        add = read(buffer_size)
        if len(add) < buffer_size:
            # NOTE(review): short read -- upstream presumably handled EOF here.
        ret.append(obj.decompress(add))
    # Compressed length = total bytes fed minus trailing bytes zlib ignored.
    comp_len = fed-len(obj.unused_data)
def read_zlib(read, dec_size):
    """Read zlib-compressed data from a buffer.

    :param read: Read function
    :param dec_size: Size of the decompressed buffer
    :return: Uncompressed buffer and compressed buffer length.
    """
    ret, comp_len = read_zlib_chunks(read)
    # Restored: the join producing ``x`` and the final return had been lost
    # from this copy (the assert below referenced ``x`` unassigned).
    x = "".join(ret)
    # The caller knows the uncompressed size from the object header; a
    # mismatch indicates a corrupt pack.
    assert len(x) == dec_size
    return x, comp_len
def iter_sha1(iter):
    """Return the hexdigest of the SHA1 over a set of names.

    :param iter: Iterator over string objects
    :return: 40-byte hex sha1 digest
    """
    # Restored: the def header and the update loop had been lost from this
    # copy, leaving only the docstring and the trailing return.
    sha1 = make_sha()
    for name in iter:
        sha1.update(name)
    return sha1.hexdigest()
def load_pack_index(path):
    """Load an index file by path.

    :param path: Path to the index file
    :return: A pack index object for the file at ``path``
    """
    f = GitFile(path, 'rb')
    return load_pack_index_file(path, f)
def load_pack_index_file(path, f):
    """Load an index file from a file-like object.

    :param path: Path for the index file
    :param f: File-like object
    """
    # A v2+ index starts with the magic '\377tOc'; v1 indexes have no header.
    if f.read(4) == '\377tOc':
        version = struct.unpack(">L", f.read(4))[0]
        # NOTE(review): this copy appears truncated -- upstream checks
        # ``version == 2`` (rewinding f) before constructing PackIndex2, and
        # the raise/return below belong to separate branches; confirm the
        # missing control flow against the original file.
        return PackIndex2(path, file=f)
    raise KeyError("Unknown pack index format %d" % version)
    return PackIndex1(path, file=f)
def bisect_find_sha(start, end, sha, unpack_name):
    """Find a SHA in a data blob with sorted SHAs.

    :param start: Start index of range to search
    :param end: End index of range to search
    :param sha: Sha to find
    :param unpack_name: Callback to retrieve SHA by index
    :return: Index of the SHA, or None if it wasn't found
    """
    # NOTE(review): the binary-search loop driving this body (computing a
    # midpoint ``i`` and narrowing [start, end] on the comparison result)
    # is missing from this copy; only the probe and compare remain.
    file_sha = unpack_name(i)
    # cmp() is Python 2 only: negative/zero/positive three-way compare.
    x = cmp(file_sha, sha)
class PackIndex(object):
    """An index in to a packfile.

    Given a sha id of an object a pack index can tell you the location in the
    packfile of that object if it has it.

    To do the loop it opens the file, and indexes first 256 4 byte groups
    with the first byte of the sha id. The value in the four byte group indexed
    is the end of the group that shares the same starting byte. Subtract one
    from the starting byte and index again to find the start of the group.
    The values are sorted by sha id within the group, so do the math to find
    the start and end offset and then bisect in to find if the value is present.
    """

    def __init__(self, filename, file=None, size=None):
        """Create a pack index object.

        Provide it with the name of the index file to consider, and it will map
        it whenever required.
        """
        self._filename = filename
        # Take the size now, so it can be checked each time we map the file to
        # ensure that it hasn't changed.
        # NOTE(review): upstream presumably only opens a fresh file when
        # ``file`` is None; that guard appears to be missing from this copy.
        self._file = GitFile(filename, 'rb')
        # Prefer mmap when the file object exposes a real descriptor.
        fileno = getattr(self._file, 'fileno', None)
        if fileno is not None:
            fd = self._file.fileno()
            # NOTE(review): upstream presumably honours the ``size`` argument
            # and only stats when it is None -- confirm.
            self._size = os.fstat(fd).st_size
            self._contents = mmap.mmap(fd, self._size,
                access=mmap.ACCESS_READ)
        # NOTE(review): the in-memory fallback below appears to belong under
        # an ``else:`` branch that is missing from this copy.
        self._contents = self._file.read()
        self._size = len(self._contents)

    def __eq__(self, other):
        # Two indexes are equal when they hold the same entries in order.
        if not isinstance(other, PackIndex):
            # NOTE(review): ``return False`` appears to be missing here.
        if self._fan_out_table != other._fan_out_table:
            # NOTE(review): ``return False`` appears to be missing here.
        for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
            # NOTE(review): the per-entry name comparison and the final
            # ``return True`` appear to be missing from this copy.

    def __ne__(self, other):
        return not self.__eq__(other)

    # NOTE(review): a ``def __len__(self):`` header appears to be missing
    # above this docstring.
        """Return the number of entries in this pack index."""
        return self._fan_out_table[-1]

    def _unpack_entry(self, i):
        """Unpack the i-th entry in the index file.

        :return: Tuple with object name (SHA), offset in pack file and
            CRC32 checksum (if known)."""
        raise NotImplementedError(self._unpack_entry)

    def _unpack_name(self, i):
        """Unpack the i-th name from the index file."""
        raise NotImplementedError(self._unpack_name)

    def _unpack_offset(self, i):
        """Unpack the i-th object offset from the index file."""
        raise NotImplementedError(self._unpack_offset)

    def _unpack_crc32_checksum(self, i):
        """Unpack the crc32 checksum for the i-th object from the index file."""
        raise NotImplementedError(self._unpack_crc32_checksum)

    # NOTE(review): a ``def __iter__(self):`` header appears to be missing.
        """Iterate over the SHAs in this pack."""
        return imap(sha_to_hex, self._itersha())

    # NOTE(review): a ``def _itersha(self):`` header appears to be missing.
        for i in range(len(self)):
            yield self._unpack_name(i)

    def objects_sha1(self):
        """Return the hex SHA1 over all the shas of all objects in this pack.

        :note: This is used for the filename of the pack.
        """
        return iter_sha1(self._itersha())

    def iterentries(self):
        """Iterate over the entries in this pack index.

        Will yield tuples with object name, offset in packfile and crc32 checksum.
        """
        for i in range(len(self)):
            yield self._unpack_entry(i)

    def _read_fan_out_table(self, start_offset):
        # Read 256 big-endian 32-bit cumulative counts, one per possible
        # first byte of a SHA.
        # NOTE(review): ``ret = []`` and ``return ret`` appear to be missing.
        for i in range(0x100):
            ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])

    # NOTE(review): a ``def check(self):`` header appears to be missing.
        """Check that the stored checksum matches the actual checksum."""
        # TODO: Check pack contents, too
        return self.calculate_checksum() == self.get_stored_checksum()

    def calculate_checksum(self):
        """Calculate the SHA1 checksum over this pack index.

        :return: This is a 20-byte binary digest
        """
        # Everything except the trailing 20-byte stored checksum is covered.
        return make_sha(self._contents[:-20]).digest()

    def get_pack_checksum(self):
        """Return the SHA1 checksum stored for the corresponding packfile.

        :return: 20-byte binary digest
        """
        return str(self._contents[-40:-20])

    def get_stored_checksum(self):
        """Return the SHA1 checksum stored for this index.

        :return: 20-byte binary digest
        """
        return str(self._contents[-20:])

    def object_index(self, sha):
        """Return the index in to the corresponding packfile for the object.

        Given the name of an object it will return the offset that object lives
        at within the corresponding pack file. If the pack file doesn't have the
        object then None will be returned.
        """
        # NOTE(review): upstream presumably converts only 40-char hex SHAs
        # here; confirm the missing length guard.
        sha = hex_to_sha(sha)
        return self._object_index(sha)

    def _object_index(self, sha):
        """See object_index.

        :param sha: A *binary* SHA string. (20 characters long)_
        """
        assert len(sha) == 20
        # NOTE(review): the fan-out lookup (``idx = ord(sha[0])`` and the
        # idx == 0 special case setting ``start``) appears to be missing.
        start = self._fan_out_table[idx-1]
        end = self._fan_out_table[idx]
        i = bisect_find_sha(start, end, sha, self._unpack_name)
        # NOTE(review): upstream presumably raises KeyError when i is None.
        return self._unpack_offset(i)
class PackIndex1(PackIndex):
    """Version 1 Pack Index."""

    def __init__(self, filename, file=None, size=None):
        PackIndex.__init__(self, filename, file, size)
        self.version = 1  # recorded for parity with PackIndex2.version
        # v1 indexes have no header: the fan-out table starts at offset 0.
        self._fan_out_table = self._read_fan_out_table(0)

    def _unpack_entry(self, i):
        # v1 entries are 24 bytes after the 256*4-byte fan-out table:
        # a 4-byte big-endian offset followed by the 20-byte SHA.
        (offset, name) = unpack_from(">L20s", self._contents,
            (0x100 * 4) + (i * 24))
        return (name, offset, None)

    def _unpack_name(self, i):
        offset = (0x100 * 4) + (i * 24) + 4
        return self._contents[offset:offset+20]

    def _unpack_offset(self, i):
        offset = (0x100 * 4) + (i * 24)
        return unpack_from(">L", self._contents, offset)[0]

    def _unpack_crc32_checksum(self, i):
        # Not stored in v1 index files; this method had lost its body
        # (comment-only, a syntax error) -- restore the explicit None,
        # matching the None crc32 field returned by _unpack_entry.
        return None
class PackIndex2(PackIndex):
    """Version 2 Pack Index."""

    def __init__(self, filename, file=None, size=None):
        PackIndex.__init__(self, filename, file, size)
        assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
        (self.version, ) = unpack_from(">L", self._contents, 4)
        assert self.version == 2, "Version was %d" % self.version
        self._fan_out_table = self._read_fan_out_table(8)
        # Layout after the 8-byte header: 256-entry fan-out table, then the
        # name table, then per-entry CRC32s, then per-entry pack offsets.
        self._name_table_offset = 8 + 0x100 * 4
        self._crc32_table_offset = self._name_table_offset + 20 * len(self)
        self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)

    def _unpack_entry(self, i):
        # Assemble the (name, offset, crc32) triple from the three tables.
        name = self._unpack_name(i)
        offset = self._unpack_offset(i)
        crc32 = self._unpack_crc32_checksum(i)
        return (name, offset, crc32)

    def _unpack_name(self, i):
        # Names are fixed 20-byte binary SHAs, packed back to back.
        start = self._name_table_offset + i * 20
        return self._contents[start:start+20]

    def _unpack_offset(self, i):
        return unpack_from(">L", self._contents,
            self._pack_offset_table_offset + i * 4)[0]

    def _unpack_crc32_checksum(self, i):
        pos = self._crc32_table_offset + i * 4
        return unpack_from(">L", self._contents, pos)[0]
def read_pack_header(f):
    """Read the header of a pack file.

    :param f: File-like object to read from
    :return: Tuple of (pack version, number of objects)
    """
    # Restored: the read of the fixed 12-byte header (4-byte "PACK" magic,
    # 4-byte version, 4-byte object count) had been lost from this copy --
    # the unpack_from offsets 4 and 8 below require exactly these 12 bytes.
    header = f.read(12)
    assert header[:4] == "PACK"
    (version,) = unpack_from(">L", header, 4)
    # Only pack stream versions 2 and 3 exist.
    assert version in (2, 3), "Version was %d" % version
    (num_objects,) = unpack_from(">L", header, 8)
    return (version, num_objects)
def unpack_object(read):
    """Unpack a Git object.

    :param read: Read function
    :return: tuple with type, uncompressed data and compressed size
    """
    bytes = take_msb_bytes(read)
    # First header byte: type in bits 4-6, low 4 bits of the size.
    type = (bytes[0] >> 4) & 0x07
    size = bytes[0] & 0x0f
    # Each continuation byte contributes 7 more (higher) size bits.
    for i, byte in enumerate(bytes[1:]):
        size += (byte & 0x7f) << ((i * 7) + 4)
    raw_base = len(bytes)
    if type == 6: # offset delta
        bytes = take_msb_bytes(read)
        raw_base += len(bytes)
        assert not (bytes[-1] & 0x80)
        # Decode git's variable-length negative-offset encoding.
        delta_base_offset = bytes[0] & 0x7f
        for byte in bytes[1:]:
            delta_base_offset += 1
            delta_base_offset <<= 7
            delta_base_offset += (byte & 0x7f)
        uncomp, comp_len = read_zlib(read, size)
        assert size == len(uncomp)
        return type, (delta_base_offset, uncomp), comp_len+raw_base
    elif type == 7: # ref delta
        # Fixed: this line read ``map.read(20)`` but no ``map`` exists in
        # this scope -- the function's read callback is the data source.
        basename = read(20)
        raw_base += 20
        uncomp, comp_len = read_zlib(read, size)
        assert size == len(uncomp)
        return type, (basename, uncomp), comp_len+raw_base
    else:
        # Complete (non-delta) object.
        uncomp, comp_len = read_zlib(read, size)
        assert len(uncomp) == size
        return type, uncomp, comp_len+raw_base
# NOTE: tuple-parameter syntax below is Python 2 only.
def _compute_object_size((num, obj)):
    """Compute the size of a unresolved object for use with LRUSizeCache.
    """
    # NOTE(review): this copy looks truncated -- upstream presumably returns
    # the delta payload size for delta types (6, 7) before this assert, and
    # ends with ``return len(obj)``.
    assert isinstance(obj, str)
class PackData(object):
    """The data contained in a packfile.

    Pack files can be accessed both sequentially for exploding a pack, and
    directly with the help of an index to retrieve a specific object.

    The objects within are either complete or a delta against another.

    The header is variable length. If the MSB of each byte is set then it
    indicates that the subsequent byte is still part of the header.
    For the first byte the next MS bits are the type, which tells you the type
    of object, and whether it is a delta. The LS byte is the lowest bits of the
    size. For each subsequent byte the LS 7 bits are the next MS bits of the
    size, i.e. the last byte of the header contains the MS bits of the size.

    For the complete objects the data is stored as zlib deflated data.
    The size in the header is the uncompressed object size, so to uncompress
    you need to just keep feeding data to zlib until you get an object back,
    or it errors on bad data. This is done here by just giving the complete
    buffer from the start of the deflated object on. This is bad, but until I
    get mmap sorted out it will have to do.

    Currently there are no integrity checks done. Also no attempt is made to try
    and detect the delta case, or a request for an object at the wrong position.
    It will all just throw a zlib or KeyError.
    """

    def __init__(self, filename, file=None, size=None):
        """Create a PackData object that represents the pack in the given filename.

        The file must exist and stay readable until the object is disposed of. It
        must also stay the same size. It will be mapped whenever needed.

        Currently there is a restriction on the size of the pack as the python
        mmap implementation is flawed.
        """
        self._filename = filename
        # NOTE(review): this copy looks truncated -- upstream presumably
        # records ``self._size = size`` and reads the pack header here.
        self._header_size = 12
        self._file = GitFile(self._filename, 'rb')
        # Cache of resolved objects, bounded by approximate byte size.
        self._offset_cache = LRUSizeCache(1024*1024*20,
            compute_size=_compute_object_size)

    # NOTE(review): a ``@classmethod`` decorator appears to be missing here.
    def from_file(cls, file, size):
        return cls(str(file), file=file, size=size)

    # NOTE(review): a ``@classmethod`` decorator appears to be missing here.
    def from_path(cls, path):
        return cls(filename=path)

    # NOTE(review): a ``def _get_size(self):`` header (with its lazy
    # ``if self._size is None:`` guard and a return) appears to be missing
    # around the following two lines.
        self._size = os.path.getsize(self._filename)
        assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (self._filename, self._size, self._header_size)

    def _read_header(self):
        # Parse version and object count from the 12-byte pack header.
        (version, self._num_objects) = read_pack_header(self._file)

    # NOTE(review): a ``def __len__(self):`` header appears to be missing.
        """Returns the number of objects in this pack."""
        return self._num_objects

    def calculate_checksum(self):
        """Calculate the checksum for this pack.

        :return: 20-byte binary SHA1 digest
        """
        # NOTE(review): truncated -- upstream presumably seeks to the start,
        # reads everything but the trailing 20-byte checksum in 64KiB chunks
        # feeding a SHA1 object, and returns its digest.
        todo = self._get_size() - 20
        x = self._file.read(min(todo, 1<<16))

    def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
        """Resolve an object, possibly resolving deltas when necessary.

        :return: Tuple with object type and contents.
        """
        if type not in (6, 7): # Not a delta
            # NOTE(review): ``return type, obj`` appears to be missing here.
        if get_offset is None:
            get_offset = self.get_object_at

        if type == 6: # offset delta
            (delta_offset, delta) = obj
            assert isinstance(delta_offset, int)
            assert isinstance(delta, str)
            # Offset deltas address their base relative to this object.
            base_offset = offset-delta_offset
            type, base_obj = get_offset(base_offset)
            assert isinstance(type, int)
        elif type == 7: # ref delta
            (basename, delta) = obj
            assert isinstance(basename, str) and len(basename) == 20
            assert isinstance(delta, str)
            type, base_obj = get_ref(basename)
            assert isinstance(type, int)
            # Can't be a ofs delta, as we wouldn't know the base offset
            # NOTE(review): ``base_offset = None`` appears to be missing.
        # Recursively resolve the base, then apply our delta on top.
        type, base_text = self.resolve_object(base_offset, type, base_obj, get_ref)
        if base_offset is not None:
            # Cache the resolved base so long delta chains stay cheap.
            self._offset_cache[base_offset] = type, base_text
        ret = (type, apply_delta(base_text, delta))
        # NOTE(review): ``return ret`` appears to be missing here.

    def iterobjects(self, progress=None):
        # Sequential iterator yielding (offset, type, obj, crc32) for every
        # object in the pack, in file order.
        class ObjectIterator(object):

            def __init__(self, pack):
                # NOTE(review): counters (``self.i`` / ``self.num``)
                # appear to be missing from this copy.
                self.offset = pack._header_size
                self.map = pack._file

            # NOTE(review): the ``__iter__``/``__len__``/``next`` method
            # headers appear to be missing around the following body.
                if self.i == self.num:
                    # NOTE(review): ``raise StopIteration`` appears missing.
                self.map.seek(self.offset)
                (type, obj, total_size) = unpack_object(self.map.read)
                # Re-read the raw record bytes to compute their CRC32.
                self.map.seek(self.offset)
                crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff
                ret = (self.offset, type, obj, crc32)
                self.offset += total_size
                progress(self.i, self.num)
        return ObjectIterator(self)

    def iterentries(self, ext_resolve_ref=None, progress=None):
        """Yield entries summarizing the contents of this pack.

        :param ext_resolve_ref: Optional function to resolve base
            objects (in case this is a thin pack)
        :param progress: Progress function, called with current and
            total object count.

        This will yield tuples with (sha, offset, crc32)
        """
        # NOTE(review): initialization of ``found``/``extra`` and the
        # try/except plumbing appear to be missing from this copy.
        postponed = defaultdict(list)

        class Postpone(Exception):
            """Raised to postpone delta resolving."""

        # NOTE(review): the guards selecting between the three exits below
        # (already-seen base, external resolver, postpone) are missing.
        def get_ref_text(sha):
            assert len(sha) == 20
            return self.get_object_at(found[sha])
            return ext_resolve_ref(sha)
            raise Postpone, (sha, )
        todo = chain(self.iterobjects(progress=progress), extra)
        for (offset, type, obj, crc32) in todo:
            assert isinstance(offset, int)
            assert isinstance(type, int)
            assert isinstance(obj, tuple) or isinstance(obj, str)
            type, obj = self.resolve_object(offset, type, obj, get_ref_text)
            except Postpone, (sha, ):
                # Base not seen yet; retry once it appears later in the pack.
                postponed[sha].append((offset, type, obj))
            shafile = ShaFile.from_raw_string(type, obj)
            sha = shafile.sha().digest()
            yield sha, offset, crc32
            extra.extend(postponed.get(sha, []))
        # NOTE(review): upstream presumably raises only ``if postponed:`` --
        # leftovers mean a thin pack with no resolver.
        raise KeyError([sha_to_hex(h) for h in postponed.keys()])

    def sorted_entries(self, resolve_ext_ref=None, progress=None):
        """Return entries in this pack, sorted by SHA.

        :param ext_resolve_ref: Optional function to resolve base
            objects (in case this is a thin pack)
        :param progress: Progress function, called with current and
            total object count.
        :return: List of tuples with (sha, offset, crc32)
        """
        ret = list(self.iterentries(resolve_ext_ref, progress=progress))
        # NOTE(review): ``ret.sort()`` and ``return ret`` appear missing.

    def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
        """Create a version 1 file for this data file.

        :param filename: Index filename.
        :param resolve_ext_ref: Function to use for resolving externally referenced
            SHA1s (for thin packs)
        :param progress: Progress report function
        """
        entries = self.sorted_entries(resolve_ext_ref, progress=progress)
        write_pack_index_v1(filename, entries, self.calculate_checksum())

    def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
        """Create a version 2 index file for this data file.

        :param filename: Index filename.
        :param resolve_ext_ref: Function to use for resolving externally referenced
            SHA1s (for thin packs)
        :param progress: Progress report function
        """
        entries = self.sorted_entries(resolve_ext_ref, progress=progress)
        write_pack_index_v2(filename, entries, self.calculate_checksum())

    def create_index(self, filename, resolve_ext_ref=None, progress=None,
        # NOTE(review): the signature continuation (presumably
        # ``version=2):``) appears to be missing from this copy.
        """Create an index file for this data file.

        :param filename: Index filename.
        :param resolve_ext_ref: Function to use for resolving externally referenced
            SHA1s (for thin packs)
        :param progress: Progress report function
        """
        # NOTE(review): the version-dispatch ``if``/``elif``/``else``
        # structure around these three statements appears to be missing.
        self.create_index_v1(filename, resolve_ext_ref, progress)
        self.create_index_v2(filename, resolve_ext_ref, progress)
        raise ValueError("unknown index format %d" % version)

    def get_stored_checksum(self):
        """Return the expected checksum stored in this pack."""
        # The pack's SHA1 lives in the final 20 bytes of the file.
        self._file.seek(self._get_size()-20)
        return self._file.read(20)

    # NOTE(review): a ``def check(self):`` header appears to be missing.
        """Check the consistency of this pack."""
        return (self.calculate_checksum() == self.get_stored_checksum())

    def get_object_at(self, offset):
        """Given an offset in to the packfile return the object that is there.

        Using the associated index the location of an object can be looked up, and
        then the packfile can be asked directly for that object using this
        method.
        """
        if offset in self._offset_cache:
            return self._offset_cache[offset]
        # ``long`` check is Python 2 only; v2 indexes can yield long offsets.
        assert isinstance(offset, long) or isinstance(offset, int),\
            "offset was %r" % offset
        assert offset >= self._header_size
        self._file.seek(offset)
        return unpack_object(self._file.read)[:2]
class SHA1Reader(object):
    """Wrapper around a file-like object that remembers the SHA1 of
    the data read from it."""

    def __init__(self, f):
        # NOTE(review): ``self.f = f`` appears to be missing from this copy.
        self.sha1 = make_sha("")

    def read(self, num=None):
        data = self.f.read(num)
        # Fold everything read through us into the running digest.
        self.sha1.update(data)
        # NOTE(review): ``return data`` appears to be missing here.

    # NOTE(review): a ``def check_sha(self):`` header appears to be missing
    # above the following body.
        stored = self.f.read(20)
        if stored != self.sha1.digest():
            raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))

    # NOTE(review): a ``def close(self):`` header appears to be missing.
        return self.f.close()
class SHA1Writer(object):
    """Wrapper around a file-like object that remembers the SHA1 of
    the data written to it."""

    def __init__(self, f):
        # NOTE(review): ``self.f = f`` appears to be missing from this copy.
        self.sha1 = make_sha("")

    def write(self, data):
        # Fold everything written through us into the running digest.
        self.sha1.update(data)
        # NOTE(review): the forwarding ``self.f.write(data)`` appears missing.

    # NOTE(review): a ``def write_sha(self):`` header appears to be missing;
    # upstream presumably writes the digest to the file and returns it.
        sha = self.sha1.digest()
        assert len(sha) == 20

    # NOTE(review): a ``def close(self):`` header appears to be missing;
    # upstream presumably closes self.f and returns the sha.
        sha = self.write_sha()
def write_pack_object(f, type, object):
    """Write pack object to a file.

    :param f: File to write to
    :param o: Object to write
    :return: Tuple with offset at which the object was written, and crc32
    """
    # NOTE(review): several lines look missing from this copy: capturing the
    # current write offset, ``size = len(object)``, initializing
    # ``packed_data_hdr``, and the MSB size-continuation loop between the
    # two header-byte appends below. Confirm against upstream.
    if type == 6: # offset delta
        (delta_base_offset, object) = object
    elif type == 7: # ref delta
        (basename, object) = object
    # First header byte: type in bits 4-6, low 4 bits of the size.
    c = (type << 4) | (size & 15)
    packed_data_hdr += (chr(c | 0x80))
    packed_data_hdr += chr(c)
    if type == 6: # offset delta
        # Encode the base offset in git's variable-length big-endian form,
        # most significant group first, continuation bit on all but the last.
        ret = [delta_base_offset & 0x7f]
        delta_base_offset >>= 7
        while delta_base_offset:
            delta_base_offset -= 1
            ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
            delta_base_offset >>= 7
        packed_data_hdr += "".join([chr(x) for x in ret])
    elif type == 7: # ref delta
        assert len(basename) == 20
        packed_data_hdr += basename
    packed_data = packed_data_hdr + zlib.compress(object)
    # NOTE(review): the ``f.write(packed_data)`` call and the offset capture
    # appear to be missing before this return.
    return (offset, (zlib.crc32(packed_data) & 0xffffffff))
def write_pack(filename, objects, num_objects):
    """Write a new pack data file.

    :param filename: Path to the new pack file (without .pack extension)
    :param objects: Iterable over (object, path) tuples to write
    :param num_objects: Number of objects to write
    """
    f = GitFile(filename + ".pack", 'wb')
    # NOTE(review): upstream presumably wraps the write in try/finally and
    # closes ``f``, and sorts ``entries`` before writing the index.
    entries, data_sum = write_pack_data(f, objects, num_objects)
    write_pack_index_v2(filename + ".idx", entries, data_sum)
def write_pack_data(f, objects, num_objects, window=10):
    """Write a new pack file.

    :param filename: The filename of the new pack file.
    :param objects: List of objects to write (tuples with object and path)
    :return: List with (name, offset, crc32 checksum) entries, pack checksum
    """
    recency = list(objects)
    # FIXME: Somehow limit delta depth
    # FIXME: Make thin-pack optional (its not used when cloning a pack)
    # Build a list of objects ordered by the magic Linus heuristic
    # This helps us find good objects to diff against us
    # NOTE(review): ``magic = []`` initialization appears to be missing.
    for obj, path in recency:
        magic.append( (obj.type, path, 1, -len(obj.as_raw_string()), obj) )
    # Build a map of objects and their index in magic - so we can find preceding objects
    # NOTE(review): a ``magic.sort()`` and ``offs = {}`` appear to be missing.
    for i in range(len(magic)):
        offs[magic[i][4]] = i
    # NOTE(review): wrapping ``f`` in SHA1Writer and ``entries = []``
    # initialization appear to be missing before the header writes.
    f.write("PACK") # Pack header
    f.write(struct.pack(">L", 2)) # Pack version
    f.write(struct.pack(">L", num_objects)) # Number of objects in pack
    for o, path in recency:
        sha1 = o.sha().digest()
        raw = o.as_raw_string()
        # NOTE(review): initialization of ``winner``/``t`` (defaulting to
        # the raw object) appears to be missing; the delta-search loop below
        # is commented out.
        #for i in range(offs[o]-window, window):
        #  if i < 0 or i >= len(offs): continue
        #  if b.type != orig_t: continue
        #  base = b.as_raw_string()
        #  delta = create_delta(base, raw)
        #  if len(delta) < len(winner):
        #    t = 6 if magic[i][2] == 1 else 7
        offset, crc32 = write_pack_object(f, t, winner)
        entries.append((sha1, offset, crc32))
    # The SHA1Writer wrapper supplies the trailing pack checksum.
    return entries, f.write_sha()
def write_pack_index_v1(filename, entries, pack_checksum):
    """Write a new pack index file.

    :param filename: The filename of the new pack index file.
    :param entries: List of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
    :param pack_checksum: Checksum of the pack file.
    """
    f = GitFile(filename, 'wb')
    # NOTE(review): upstream presumably wraps ``f`` in SHA1Writer inside a
    # try/finally that closes it.
    # Count SHAs per first byte; accumulated below into the cumulative
    # fan-out table the index format requires.
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[0])] += 1
    # Write the 256-entry fan-out table, accumulating as we go.
    for i in range(0x100):
        f.write(struct.pack(">L", fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    # v1 entries: 4-byte offset followed by the 20-byte SHA.
    for (name, offset, entry_checksum) in entries:
        f.write(struct.pack(">L20s", offset, name))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
def create_delta(base_buf, target_buf):
    """Use python difflib to work out how to transform base_buf to target_buf.

    :param base_buf: Base buffer
    :param target_buf: Target buffer
    """
    assert isinstance(base_buf, str)
    assert isinstance(target_buf, str)
    # NOTE(review): ``out_buf = ""`` initialization appears to be missing.
    # Delta header: variable-length encoded sizes of both buffers.
    def encode_size(size):
        # NOTE(review): the body of encode_size is missing from this copy.
    out_buf += encode_size(len(base_buf))
    out_buf += encode_size(len(target_buf))
    # write out delta opcodes
    seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
    for opcode, i1, i2, j1, j2 in seq.get_opcodes():
        # Git patch opcodes don't care about deletes!
        #if opcode == "replace" or opcode == "delete":
        if opcode == "equal":
            # If they are equal, unpacker will use data from base_buf
            # Write out an opcode that says what range to use
            # NOTE(review): the copy-opcode assembly (building ``scratch``
            # from offset ``o`` and size ``s`` bytes) is partially missing.
            scratch += chr((o >> i*8) & 0xff)
            scratch += chr((s >> i*8) & 0xff)
        if opcode == "replace" or opcode == "insert":
            # If we are replacing a range or adding one, then we just
            # output it to the stream (prefixed by its size)
            # NOTE(review): the 127-byte chunking loop around these two
            # writes is partially missing from this copy.
            out_buf += target_buf[o:o+127]
            out_buf += target_buf[o:o+s]
    # NOTE(review): ``return out_buf`` appears to be missing.
def apply_delta(src_buf, delta):
    """Based on the similar function in git's patch-delta.c.

    :param src_buf: Source buffer
    :param delta: Delta instructions
    """
    assert isinstance(src_buf, str), "was %r" % (src_buf,)
    assert isinstance(delta, str)
    # NOTE(review): output accumulator and ``index = 0`` initialization
    # appear to be missing from this copy.
    delta_length = len(delta)
    # Decode git's variable-length size encoding starting at ``index``.
    def get_delta_header_size(delta, index):
        # NOTE(review): loop setup (size/i/index handling) and the return
        # are missing from this copy.
        cmd = ord(delta[index])
        size |= (cmd & ~0x80) << i
    src_size, index = get_delta_header_size(delta, index)
    dest_size, index = get_delta_header_size(delta, index)
    assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
    while index < delta_length:
        cmd = ord(delta[index])
        # NOTE(review): the copy-command branch structure (testing
        # ``cmd & 0x80``, the per-bit offset/size reads, and the index
        # increments) is partially missing around the lines below.
        x = ord(delta[index])
        cp_off |= x << (i * 8)
        if cmd & (1 << (4+i)):
            x = ord(delta[index])
            cp_size |= x << (i * 8)
        # Bounds check mirroring patch-delta.c's overflow guards.
        if (cp_off + cp_size < cp_size or
            cp_off + cp_size > src_size or
            cp_size > dest_size):
            # NOTE(review): upstream presumably breaks out of the loop here.
        out.append(src_buf[cp_off:cp_off+cp_size])
        out.append(delta[index:index+cmd])
        # Opcode 0 is reserved and invalid in the delta stream.
        raise ApplyDeltaError("Invalid opcode 0")
    if index != delta_length:
        raise ApplyDeltaError("delta not empty: %r" % delta[index:])
    # NOTE(review): upstream presumably joins ``out`` into a string before
    # this length comparison, and returns it afterwards.
    if dest_size != len(out):
        raise ApplyDeltaError("dest size incorrect")
def write_pack_index_v2(filename, entries, pack_checksum):
    """Write a new pack index file.

    :param filename: The filename of the new pack index file.
    :param entries: List of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
    :param pack_checksum: Checksum of the pack file.
    """
    f = GitFile(filename, 'wb')
    # NOTE(review): upstream presumably wraps ``f`` in SHA1Writer inside a
    # try/finally that closes it.
    f.write('\377tOc') # Magic!
    f.write(struct.pack(">L", 2))
    # Count SHAs per first byte; accumulated below into the cumulative
    # fan-out table the index format requires.
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[0])] += 1
    # Write the 256-entry fan-out table, accumulating as we go.
    for i in range(0x100):
        f.write(struct.pack(">L", fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    # Name table: 20-byte binary SHAs back to back.
    for (name, offset, entry_checksum) in entries:
        # NOTE(review): ``f.write(name)`` appears to be missing here.
    # CRC32 table, one 4-byte value per entry.
    for (name, offset, entry_checksum) in entries:
        f.write(struct.pack(">L", entry_checksum))
    # Offset table, one 4-byte value per entry.
    for (name, offset, entry_checksum) in entries:
        # FIXME: handle if MSBit is set in offset
        f.write(struct.pack(">L", offset))
    # FIXME: handle table for pack files > 8 Gb
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
# NOTE(review): the ``class Pack(object):`` header appears to be missing
# above this docstring.
    """A Git pack object."""

    def __init__(self, basename):
        self._basename = basename
        # Derived paths for the two halves of a pack: data and index.
        self._data_path = self._basename + ".pack"
        self._idx_path = self._basename + ".idx"
        # NOTE(review): lazy ``self._data = None`` / ``self._idx = None``
        # initialization appears to be missing from this copy.
    def from_objects(self, data, idx):
        """Create a new pack object from pack data and index objects."""
        # NOTE(review): the body is missing from this copy; upstream this
        # is presumably a classmethod that builds a Pack from the parts.

    # NOTE(review): a ``def name(self):`` header (likely a property)
    # appears to be missing above this docstring.
        """The SHA over the SHAs of the objects in this pack."""
        return self.index.objects_sha1()
    # NOTE(review): a ``@property`` / ``def data(self):`` header appears to
    # be missing above this docstring.
        """The pack data object being used."""
        if self._data is None:
            self._data = PackData(self._data_path)
            # Sanity-check that index and data describe the same pack.
            assert len(self.index) == len(self._data)
            idx_stored_checksum = self.index.get_pack_checksum()
            data_stored_checksum = self._data.get_stored_checksum()
            if idx_stored_checksum != data_stored_checksum:
                raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
                    sha_to_hex(data_stored_checksum))
        # NOTE(review): ``return self._data`` appears to be missing.
    # NOTE(review): a ``@property`` / ``def index(self):`` header appears
    # to be missing above this docstring.
        """The index being used.

        :note: This may be an in-memory index
        """
        if self._idx is None:
            self._idx = load_pack_index(self._idx_path)
        # NOTE(review): ``return self._idx`` appears to be missing.
    # NOTE(review): a ``def close(self):`` header appears to be missing
    # above this guard; upstream presumably closes the open data file here.
        if self._data is not None:

    def __eq__(self, other):
        # Packs compare equal when their indexes (object sets) do.
        return type(self) == type(other) and self.index == other.index

    # NOTE(review): a ``def __len__(self):`` header appears to be missing.
        """Number of entries in this pack."""
        return len(self.index)

    # NOTE(review): a ``def __repr__(self):`` header appears to be missing.
        return "%s(%r)" % (self.__class__.__name__, self._basename)

    # NOTE(review): a ``def __iter__(self):`` header appears to be missing.
        """Iterate over all the sha1s of the objects in this pack."""
        return iter(self.index)
    # NOTE(review): a ``def check(self):`` header appears to be missing.
        """Check the integrity of this pack."""
        if not self.index.check():
            # NOTE(review): ``return False`` appears to be missing here.
        if not self.data.check():
            # NOTE(review): ``return False`` (and a final ``return True``)
            # appear to be missing here.

    def get_stored_checksum(self):
        # Delegate to the pack data's trailing checksum.
        return self.data.get_stored_checksum()

    def __contains__(self, sha1):
        """Check whether this pack contains a particular SHA1."""
        # NOTE(review): upstream presumably wraps this lookup in
        # try/except KeyError, returning True/False accordingly.
        self.index.object_index(sha1)
1144 def get_raw(self, sha1, resolve_ref=None):
1145 offset = self.index.object_index(sha1)
1146 obj_type, obj = self.data.get_object_at(offset)
1147 if type(offset) is long:
1148 offset = int(offset)
1149 if resolve_ref is None:
1150 resolve_ref = self.get_raw
1151 return self.data.resolve_object(offset, obj_type, obj, resolve_ref)
1153 def __getitem__(self, sha1):
1154 """Retrieve the specified SHA1."""
1155 type, uncomp = self.get_raw(sha1)
1156 return ShaFile.from_raw_string(type, uncomp)
    def iterobjects(self, get_raw=None):
        """Iterate over the objects in this pack."""
        # NOTE(review): an ``if get_raw is None:`` guard appears to be
        # missing above this default assignment.
            get_raw = self.get_raw
        for offset, type, obj, crc32 in self.data.iterobjects():
            assert isinstance(offset, int)
            # Resolve any deltas, then parse into a ShaFile object.
            yield ShaFile.from_raw_string(
                *self.data.resolve_object(offset, type, obj, get_raw))
1169 from dulwich._pack import apply_delta, bisect_find_sha