1 # pack.py -- For dealing with packed git objects.
2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3 # Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; version 2
8 # of the License or (at your option) a later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
20 """Classes for dealing with packed git objects.
22 A pack is a compact representation of a bunch of objects, stored
23 using deltas where possible.
25 They have two parts, the pack file, which stores the data, and an index
26 that tells you where the data is.
28 To find an object you look in all of the index files 'til you find a
29 match for the object name. You then use the pointer got from this as
a pointer into the corresponding packfile.
"""
34 from collections import defaultdict
36 from misc import defaultdict
39 from itertools import (
48 from struct import unpack_from
50 from dulwich.misc import unpack_from
54 from dulwich.errors import (
58 from dulwich.file import GitFile
59 from dulwich.lru_cache import (
62 from dulwich.objects import (
67 from dulwich.misc import (
71 supports_mmap_offset = (sys.version_info[0] >= 3 or
72 (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
def take_msb_bytes(read):
    """Read bytes marked with most significant bit.

    Reads one byte at a time until a byte without the MSB (0x80) set is
    seen; that terminating byte is included in the result.

    :param read: Read function (returns a single byte per call)
    :return: List of integer byte values, last one with MSB clear
    """
    ret = []
    while len(ret) == 0 or ret[-1] & 0x80:
        ret.append(ord(read(1)))
    return ret
def read_zlib_chunks(read, buffer_size=4096):
    """Read chunks of zlib data from a buffer.

    Feeds data from ``read`` into a zlib decompressor until the end of the
    compressed stream (signalled by ``unused_data`` becoming non-empty) or
    until ``read`` is exhausted.

    :param read: Read function
    :param buffer_size: Number of bytes to request per read
    :return: Tuple with list of uncompressed chunks, length of the
        compressed data, and unused (over-read) trailing data
    """
    obj = zlib.decompressobj()
    ret = []
    fed = 0
    # unused_data stays empty until the zlib stream's end marker is seen.
    while not obj.unused_data:
        add = read(buffer_size)
        if not add:
            # EOF before the stream terminator; stop rather than loop forever.
            break
        fed += len(add)
        ret.append(obj.decompress(add))
    # Everything fed minus the over-read tail is the compressed length.
    comp_len = fed - len(obj.unused_data)
    return ret, comp_len, obj.unused_data
107 """Return the hexdigest of the SHA1 over a set of names.
109 :param iter: Iterator over string objects
110 :return: 40-byte hex sha1 digest
115 return sha1.hexdigest()
def load_pack_index(path):
    """Load an index file by path.

    :param path: Path to the index file
    :return: A PackIndex instance (version detected from the file header)
    """
    f = GitFile(path, 'rb')
    return load_pack_index_file(path, f)
def load_pack_index_file(path, f):
    """Load an index file from a file-like object.

    :param path: Path for the index file
    :param f: File-like object, positioned at the start of the index
    :return: A PackIndex1 or PackIndex2 instance
    :raises KeyError: If a v2+ header with an unknown version is found
    """
    # v2+ indexes start with the magic '\377tOc'; v1 has no header.
    if f.read(4) == '\377tOc':
        version = struct.unpack(">L", f.read(4))[0]
        if version == 2:
            return PackIndex2(path, file=f)
        else:
            raise KeyError("Unknown pack index format %d" % version)
    else:
        return PackIndex1(path, file=f)
def bisect_find_sha(start, end, sha, unpack_name):
    """Find a SHA in a data blob with sorted SHAs.

    :param start: Start index of range to search
    :param end: End index of range to search
    :param sha: Sha to find
    :param unpack_name: Callback to retrieve SHA by index
    :return: Index of the SHA, or None if it wasn't found
    """
    assert start <= end
    while start <= end:
        i = (start + end) // 2
        file_sha = unpack_name(i)
        # Direct comparisons instead of the Python2-only cmp() builtin.
        if file_sha < sha:
            start = i + 1
        elif file_sha > sha:
            end = i - 1
        else:
            return i
    return None
class PackIndex(object):
    """An index in to a packfile.

    Given a sha id of an object a pack index can tell you the location in the
    packfile of that object if it has it.

    To do the loop it opens the file, and indexes first 256 4 byte groups
    with the first byte of the sha id. The value in the four byte group indexed
    is the end of the group that shares the same starting byte. Subtract one
    from the starting byte and index again to find the start of the group.
    The values are sorted by sha id within the group, so do the math to find
    the start and end offset and then bisect in to find if the value is present.
    """

    def __init__(self, filename, file=None, size=None):
        """Create a pack index object.

        Provide it with the name of the index file to consider, and it will map
        it whenever required.
        """
        self._filename = filename
        # Take the size now, so it can be checked each time we map the file to
        # ensure that it hasn't changed.
        self._file = GitFile(filename, 'rb')
        # Prefer mmap when the file object exposes a real file descriptor.
        fileno = getattr(self._file, 'fileno', None)
        if fileno is not None:
            fd = self._file.fileno()
            self._size = os.fstat(fd).st_size
            self._contents = mmap.mmap(fd, self._size,
                access=mmap.ACCESS_READ)
            # NOTE(review): the non-mmap fallback's else branch appears to be
            # elided in this copy; the unconditional read below looks wrong.
            self._contents = self._file.read()
            self._size = len(self._contents)

    def __eq__(self, other):
        # Equal when both indexes list the same entries in the same order.
        # NOTE(review): the bodies of these guards appear elided in this copy.
        if not isinstance(other, PackIndex):
        if self._fan_out_table != other._fan_out_table:
        for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):

    def __ne__(self, other):
        return not self.__eq__(other)

        # NOTE(review): the `def __len__(self):` line appears elided here.
        """Return the number of entries in this pack index."""
        return self._fan_out_table[-1]

    def _unpack_entry(self, i):
        """Unpack the i-th entry in the index file.

        :return: Tuple with object name (SHA), offset in pack file and
            CRC32 checksum (if known)."""
        raise NotImplementedError(self._unpack_entry)

    def _unpack_name(self, i):
        """Unpack the i-th name from the index file."""
        raise NotImplementedError(self._unpack_name)

    def _unpack_offset(self, i):
        """Unpack the i-th object offset from the index file."""
        raise NotImplementedError(self._unpack_offset)

    def _unpack_crc32_checksum(self, i):
        """Unpack the crc32 checksum for the i-th object from the index file."""
        raise NotImplementedError(self._unpack_crc32_checksum)

        # NOTE(review): the `def __iter__(self):` line appears elided here.
        """Iterate over the SHAs in this pack."""
        return imap(sha_to_hex, self._itersha())

        # NOTE(review): the `def _itersha(self):` line appears elided here.
        for i in range(len(self)):
            yield self._unpack_name(i)

    def objects_sha1(self):
        """Return the hex SHA1 over all the shas of all objects in this pack.

        :note: This is used for the filename of the pack.
        """
        return iter_sha1(self._itersha())

    def iterentries(self):
        """Iterate over the entries in this pack index.

        Will yield tuples with object name, offset in packfile and crc32 checksum.
        """
        for i in range(len(self)):
            yield self._unpack_entry(i)

    def _read_fan_out_table(self, start_offset):
        # 256 big-endian 32-bit cumulative counts, one per first-byte bucket.
        # NOTE(review): `ret = []` / `return ret` appear elided in this copy.
        for i in range(0x100):
            ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])

        # NOTE(review): the `def check(self):` line appears elided here.
        """Check that the stored checksum matches the actual checksum."""
        # TODO: Check pack contents, too
        return self.calculate_checksum() == self.get_stored_checksum()

    def calculate_checksum(self):
        """Calculate the SHA1 checksum over this pack index.

        :return: This is a 20-byte binary digest
        """
        return make_sha(self._contents[:-20]).digest()

    def get_pack_checksum(self):
        """Return the SHA1 checksum stored for the corresponding packfile.

        :return: 20-byte binary digest
        """
        return str(self._contents[-40:-20])

    def get_stored_checksum(self):
        """Return the SHA1 checksum stored for this index.

        :return: 20-byte binary digest
        """
        return str(self._contents[-20:])

    def object_index(self, sha):
        """Return the index in to the corresponding packfile for the object.

        Given the name of an object it will return the offset that object lives
        at within the corresponding pack file. If the pack file doesn't have the
        object then None will be returned.
        """
        # Accept hex shas and normalize to binary.
        sha = hex_to_sha(sha)
        return self._object_index(sha)

    def _object_index(self, sha):
        """See object_index.

        :param sha: A *binary* SHA string. (20 characters long)_
        """
        assert len(sha) == 20
        # Fan-out gives the bucket [start, end) for the sha's first byte.
        start = self._fan_out_table[idx-1]
        end = self._fan_out_table[idx]
        i = bisect_find_sha(start, end, sha, self._unpack_name)
        return self._unpack_offset(i)
class PackIndex1(PackIndex):
    """Version 1 Pack Index.

    Layout: 256-entry fan-out table (4 bytes each), then per object a
    4-byte offset followed by the 20-byte binary SHA (24 bytes per entry).
    """

    def __init__(self, filename, file=None, size=None):
        PackIndex.__init__(self, filename, file, size)
        self.version = 1
        # v1 indexes have no header; the fan-out table starts at offset 0.
        self._fan_out_table = self._read_fan_out_table(0)

    def _unpack_entry(self, i):
        (offset, name) = unpack_from(">L20s", self._contents,
            (0x100 * 4) + (i * 24))
        return (name, offset, None)

    def _unpack_name(self, i):
        # Skip the 4-byte offset that precedes each name.
        offset = (0x100 * 4) + (i * 24) + 4
        return self._contents[offset:offset+20]

    def _unpack_offset(self, i):
        offset = (0x100 * 4) + (i * 24)
        return unpack_from(">L", self._contents, offset)[0]

    def _unpack_crc32_checksum(self, i):
        # Not stored in v1 index files
        return None
class PackIndex2(PackIndex):
    """Version 2 Pack Index.

    Layout: 4-byte magic, 4-byte version, 256-entry fan-out table, then
    three parallel tables: 20-byte names, 4-byte crc32 checksums, and
    4-byte pack offsets.
    """

    def __init__(self, filename, file=None, size=None):
        PackIndex.__init__(self, filename, file, size)
        assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
        (self.version, ) = unpack_from(">L", self._contents, 4)
        assert self.version == 2, "Version was %d" % self.version
        self._fan_out_table = self._read_fan_out_table(8)
        # Precompute the start of each parallel table.
        self._name_table_offset = 8 + 0x100 * 4
        self._crc32_table_offset = self._name_table_offset + 20 * len(self)
        self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)

    def _unpack_entry(self, i):
        return (self._unpack_name(i), self._unpack_offset(i),
            self._unpack_crc32_checksum(i))

    def _unpack_name(self, i):
        offset = self._name_table_offset + i * 20
        return self._contents[offset:offset+20]

    def _unpack_offset(self, i):
        offset = self._pack_offset_table_offset + i * 4
        return unpack_from(">L", self._contents, offset)[0]

    def _unpack_crc32_checksum(self, i):
        return unpack_from(">L", self._contents,
            self._crc32_table_offset + i * 4)[0]
def read_pack_header(f):
    """Read the header of a pack file.

    :param f: File-like object to read from
    :return: Tuple of (pack version, number of objects)
    """
    header = f.read(12)
    # b"PACK" is identical to "PACK" under Python 2 and keeps this check
    # correct when the header is read as bytes.
    assert header[:4] == b"PACK"
    (version,) = unpack_from(">L", header, 4)
    assert version in (2, 3), "Version was %d" % version
    (num_objects,) = unpack_from(">L", header, 8)
    return (version, num_objects)
def chunks_length(chunks):
    """Return the total number of bytes in a list of chunks.

    :param chunks: Iterable of strings/byte strings
    :return: Sum of the chunk lengths
    """
    # builtin map behaves identically to the Python2-only itertools.imap here.
    return sum(map(len, chunks))
def unpack_object(read):
    """Unpack a Git object.

    :param read: Read function
    :return: tuple with type, uncompressed data as chunks, compressed size and
        unused read data
    """
    bytes = take_msb_bytes(read)
    # First byte: bits 4-6 are the type, low 4 bits start the size.
    type = (bytes[0] >> 4) & 0x07
    size = bytes[0] & 0x0f
    # Remaining header bytes each contribute 7 more size bits, little-endian.
    for i, byte in enumerate(bytes[1:]):
        size += (byte & 0x7f) << ((i * 7) + 4)
    raw_base = len(bytes)
    if type == 6: # offset delta
        bytes = take_msb_bytes(read)
        raw_base += len(bytes)
        assert not (bytes[-1] & 0x80)
        # Git's offset encoding: each continuation adds an implicit +1.
        delta_base_offset = bytes[0] & 0x7f
        for byte in bytes[1:]:
            delta_base_offset += 1
            delta_base_offset <<= 7
            delta_base_offset += (byte & 0x7f)
        uncomp, comp_len, unused = read_zlib_chunks(read, size)
        assert size == chunks_length(uncomp)
        return type, (delta_base_offset, uncomp), comp_len+raw_base, unused
    elif type == 7: # ref delta
        basename = read(20)
        raw_base += 20
        uncomp, comp_len, unused = read_zlib_chunks(read, size)
        assert size == chunks_length(uncomp)
        return type, (basename, uncomp), comp_len+raw_base, unused
    else:
        uncomp, comp_len, unused = read_zlib_chunks(read, size)
        assert chunks_length(uncomp) == size
        return type, uncomp, comp_len+raw_base, unused
445 def _compute_object_size((num, obj)):
446 """Compute the size of a unresolved object for use with LRUSizeCache.
450 assert isinstance(obj, str)
class PackData(object):
    """The data contained in a packfile.

    Pack files can be accessed both sequentially for exploding a pack, and
    directly with the help of an index to retrieve a specific object.

    The objects within are either complete or a delta against another.

    The header is variable length. If the MSB of each byte is set then it
    indicates that the subsequent byte is still part of the header.
    For the first byte the next MS bits are the type, which tells you the type
    of object, and whether it is a delta. The LS byte is the lowest bits of the
    size. For each subsequent byte the LS 7 bits are the next MS bits of the
    size, i.e. the last byte of the header contains the MS bits of the size.

    For the complete objects the data is stored as zlib deflated data.
    The size in the header is the uncompressed object size, so to uncompress
    you need to just keep feeding data to zlib until you get an object back,
    or it errors on bad data. This is done here by just giving the complete
    buffer from the start of the deflated object on. This is bad, but until I
    get mmap sorted out it will have to do.

    Currently there are no integrity checks done. Also no attempt is made to try
    and detect the delta case, or a request for an object at the wrong position.
    It will all just throw a zlib or KeyError.
    """

    def __init__(self, filename, file=None, size=None):
        """Create a PackData object that represents the pack in the given filename.

        The file must exist and stay readable until the object is disposed of. It
        must also stay the same size. It will be mapped whenever needed.

        Currently there is a restriction on the size of the pack as the python
        mmap implementation is flawed.
        """
        self._filename = filename
        self._header_size = 12
        self._file = GitFile(self._filename, 'rb')
        (version, self._num_objects) = read_pack_header(self._file)
        # Cache resolved objects, bounded by total payload size (20 MB).
        self._offset_cache = LRUSizeCache(1024*1024*20,
            compute_size=_compute_object_size)

    def from_file(cls, file, size):
        return cls(str(file), file=file, size=size)

    def from_path(cls, path):
        return cls(filename=path)

        # NOTE(review): the `def _get_size(self):` line appears elided here.
        if self._size is not None:
        self._size = os.path.getsize(self._filename)
        assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (self._filename, self._size, self._header_size)

        # NOTE(review): the `def __len__(self):` line appears elided here.
        """Returns the number of objects in this pack."""
        return self._num_objects

    def calculate_checksum(self):
        """Calculate the checksum for this pack.

        :return: 20-byte binary SHA1 digest
        """
        # Hash everything except the trailing 20-byte stored checksum.
        todo = self._get_size() - 20
        x = self._file.read(min(todo, 1<<16))

    def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
        """Resolve an object, possibly resolving deltas when necessary.

        :return: Tuple with object type and contents.
        """
        if type not in (6, 7): # Not a delta
        if get_offset is None:
            get_offset = self.get_object_at

        if type == 6: # offset delta
            (delta_offset, delta) = obj
            assert isinstance(delta_offset, int)
            assert isinstance(delta, str)
            # The base lives earlier in this same pack file.
            base_offset = offset-delta_offset
            type, base_obj = get_offset(base_offset)
            assert isinstance(type, int)
        elif type == 7: # ref delta
            (basename, delta) = obj
            assert isinstance(basename, str) and len(basename) == 20
            assert isinstance(delta, str)
            type, base_obj = get_ref(basename)
            assert isinstance(type, int)
            # Can't be a ofs delta, as we wouldn't know the base offset
        # Recursively resolve the base, then apply this delta on top.
        type, base_chunks = self.resolve_object(base_offset, type, base_obj,
            if base_offset is not None:
                self._offset_cache[base_offset] = type, base_chunks
        return (type, apply_delta(base_chunks, delta))

    def iterobjects(self, progress=None):

        class ObjectIterator(object):

            def __init__(self, pack):
                self.offset = pack._header_size
                self.map = pack._file

                # NOTE(review): the `def next(self):` line appears elided here.
                if self.i == self.num:
                self.map.seek(self.offset)
                (type, obj, total_size, unused) = unpack_object(self.map.read)
                # Re-read the raw bytes to compute the per-entry crc32.
                self.map.seek(self.offset)
                crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff
                ret = (self.offset, type, obj, crc32)
                self.offset += total_size
                    progress(self.i, self.num)

        return ObjectIterator(self)

    def iterentries(self, ext_resolve_ref=None, progress=None):
        """Yield entries summarizing the contents of this pack.

        :param ext_resolve_ref: Optional function to resolve base
            objects (in case this is a thin pack)
        :param progress: Progress function, called with current and
            total object count

        This will yield tuples with (sha, offset, crc32)
        """
        postponed = defaultdict(list)
        class Postpone(Exception):
            """Raised to postpone delta resolving."""

        def get_ref_text(sha):
            assert len(sha) == 20
            return self.get_object_at(found[sha])
            return ext_resolve_ref(sha)
            raise Postpone, (sha, )
        todo = chain(self.iterobjects(progress=progress), extra)
        for (offset, type, obj, crc32) in todo:
            assert isinstance(offset, int)
            assert isinstance(type, int)
            assert isinstance(obj, list) or isinstance(obj, str)
            type, obj = self.resolve_object(offset, type, obj, get_ref_text)
            except Postpone, (sha, ):
                postponed[sha].append((offset, type, obj))
            shafile = ShaFile.from_raw_chunks(type, obj)
            sha = shafile.sha().digest()
            yield sha, offset, crc32
            # Deltas whose base was this object can now be resolved.
            extra.extend(postponed.get(sha, []))
        raise KeyError([sha_to_hex(h) for h in postponed.keys()])

    def sorted_entries(self, resolve_ext_ref=None, progress=None):
        """Return entries in this pack, sorted by SHA.

        :param resolve_ext_ref: Optional function to resolve base
            objects (in case this is a thin pack)
        :param progress: Progress function, called with current and
            total object count
        :return: List of tuples with (sha, offset, crc32)
        """
        ret = list(self.iterentries(resolve_ext_ref, progress=progress))

    def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
        """Create a version 1 file for this data file.

        :param filename: Index filename.
        :param resolve_ext_ref: Function to use for resolving externally referenced
            SHA1s (for thin packs)
        :param progress: Progress report function
        """
        entries = self.sorted_entries(resolve_ext_ref, progress=progress)
        write_pack_index_v1(filename, entries, self.calculate_checksum())

    def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
        """Create a version 2 index file for this data file.

        :param filename: Index filename.
        :param resolve_ext_ref: Function to use for resolving externally referenced
            SHA1s (for thin packs)
        :param progress: Progress report function
        """
        entries = self.sorted_entries(resolve_ext_ref, progress=progress)
        write_pack_index_v2(filename, entries, self.calculate_checksum())

    def create_index(self, filename, resolve_ext_ref=None, progress=None,
        """Create an index file for this data file.

        :param filename: Index filename.
        :param resolve_ext_ref: Function to use for resolving externally referenced
            SHA1s (for thin packs)
        :param progress: Progress report function
        """
        self.create_index_v1(filename, resolve_ext_ref, progress)
        self.create_index_v2(filename, resolve_ext_ref, progress)
        raise ValueError("unknown index format %d" % version)

    def get_stored_checksum(self):
        """Return the expected checksum stored in this pack."""
        self._file.seek(self._get_size()-20)
        return self._file.read(20)

        # NOTE(review): the `def check(self):` line appears elided here.
        """Check the consistency of this pack."""
        return (self.calculate_checksum() == self.get_stored_checksum())

    def get_object_at(self, offset):
        """Given an offset in to the packfile return the object that is there.

        Using the associated index the location of an object can be looked up, and
        then the packfile can be asked directly for that object using this
        function.
        """
        if offset in self._offset_cache:
            return self._offset_cache[offset]
        assert isinstance(offset, long) or isinstance(offset, int),\
            "offset was %r" % offset
        assert offset >= self._header_size
        self._file.seek(offset)
        return unpack_object(self._file.read)[:2]
class SHA1Reader(object):
    """Wrapper around a file-like object that remembers the SHA1 of
    the data read from it."""

    def __init__(self, f):
        self.f = f
        self.sha1 = make_sha("")

    def read(self, num=None):
        data = self.f.read(num)
        self.sha1.update(data)
        return data

    def check_sha(self):
        """Compare the running digest with the trailing stored checksum.

        :raises ChecksumMismatch: If the stored checksum does not match
        """
        stored = self.f.read(20)
        if stored != self.sha1.digest():
            raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))

    def close(self):
        return self.f.close()

    def tell(self):
        return self.f.tell()
class SHA1Writer(object):
    """Wrapper around a file-like object that remembers the SHA1 of
    the data written to it."""

    def __init__(self, f):
        self.f = f
        self.sha1 = make_sha("")

    def write(self, data):
        self.sha1.update(data)
        self.f.write(data)

    def write_sha(self):
        """Append the running digest to the stream and return it."""
        sha = self.sha1.digest()
        assert len(sha) == 20
        self.f.write(sha)
        return sha

    def close(self):
        # Finalize by writing the checksum trailer before closing.
        sha = self.write_sha()
        self.f.close()
        return sha

    def tell(self):
        return self.f.tell()
def write_pack_object(f, type, object):
    """Write pack object to a file.

    :param f: File to write to
    :param type: Numeric object type (6 = offset delta, 7 = ref delta)
    :param object: Object payload; for deltas a (base, data) tuple
    :return: Tuple with offset at which the object was written, and crc32
    """
    offset = f.tell()
    if type == 6: # offset delta
        (delta_base_offset, object) = object
    elif type == 7: # ref delta
        (basename, object) = object
    # Variable-length header: low 4 bits of size share the first byte
    # with the type; remaining size bits follow 7 at a time, MSB-continued.
    hdr = bytearray()
    size = len(object)
    c = (type << 4) | (size & 15)
    size >>= 4
    while size:
        hdr.append(c | 0x80)
        c = size & 0x7f
        size >>= 7
    hdr.append(c)
    if type == 6: # offset delta
        # Git's offset encoding: continuation bytes carry an implicit +1.
        ret = [delta_base_offset & 0x7f]
        delta_base_offset >>= 7
        while delta_base_offset:
            delta_base_offset -= 1
            ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
            delta_base_offset >>= 7
        hdr.extend(ret)
    elif type == 7: # ref delta
        assert len(basename) == 20
        hdr += basename
    packed_data = bytes(hdr) + zlib.compress(object)
    f.write(packed_data)
    return (offset, (zlib.crc32(packed_data) & 0xffffffff))
def write_pack(filename, objects, num_objects):
    """Write a new pack data file.

    :param filename: Path to the new pack file (without .pack extension)
    :param objects: Iterable over (object, path) tuples to write
    :param num_objects: Number of objects to write
    """
    f = GitFile(filename + ".pack", 'wb')
    try:
        entries, data_sum = write_pack_data(f, objects, num_objects)
    finally:
        f.close()
    # The index wants the entries sorted by SHA.
    entries.sort()
    write_pack_index_v2(filename + ".idx", entries, data_sum)
def write_pack_data(f, objects, num_objects, window=10):
    """Write a new pack file.

    :param f: File-like object to write the pack to
    :param objects: List of objects to write (tuples with object and path)
    :param num_objects: Number of objects to write
    :param window: Delta search window (currently unused; see commented code)
    :return: List with (name, offset, crc32 checksum) entries, pack checksum
    """
    recency = list(objects)
    # FIXME: Somehow limit delta depth
    # FIXME: Make thin-pack optional (its not used when cloning a pack)
    # Build a list of objects ordered by the magic Linus heuristic
    # This helps us find good objects to diff against us
    for obj, path in recency:
        magic.append( (obj.type, path, 1, -obj.raw_length(), obj) )
    # Build a map of objects and their index in magic - so we can find preceding objects
    for i in range(len(magic)):
        offs[magic[i][4]] = i
    f.write("PACK")               # Pack header
    f.write(struct.pack(">L", 2)) # Pack version
    f.write(struct.pack(">L", num_objects)) # Number of objects in pack
    for o, path in recency:
        sha1 = o.sha().digest()
        raw = o.as_raw_string()
        # Delta-compression search is disabled; every object is written whole.
        #for i in range(offs[o]-window, window):
        #    if i < 0 or i >= len(offs): continue
        #    if b.type != orig_t: continue
        #    base = b.as_raw_string()
        #    delta = create_delta(base, raw)
        #    if len(delta) < len(winner):
        #t = 6 if magic[i][2] == 1 else 7
        offset, crc32 = write_pack_object(f, t, winner)
        entries.append((sha1, offset, crc32))
    return entries, f.write_sha()
def write_pack_index_v1(filename, entries, pack_checksum):
    """Write a new pack index file.

    :param filename: The filename of the new pack index file.
    :param entries: List of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
    :param pack_checksum: Checksum of the pack file.
    """
    f = GitFile(filename, 'wb')
    # Wrap so the index's own trailing SHA1 is computed as we write.
    f = SHA1Writer(f)
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[0])] += 1
    # Fan-out table: cumulative counts per leading byte.
    for i in range(0x100):
        f.write(struct.pack(">L", fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    for (name, offset, entry_checksum) in entries:
        f.write(struct.pack(">L20s", offset, name))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    f.close()
def create_delta(base_buf, target_buf):
    """Use python difflib to work out how to transform base_buf to target_buf.

    :param base_buf: Base buffer
    :param target_buf: Target buffer
    :return: Git-format delta byte string
    """
    assert isinstance(base_buf, str)
    assert isinstance(target_buf, str)
    # Encode a size as a variable-length MSB-continued integer.
    # NOTE(review): the body of encode_size appears elided in this copy.
    def encode_size(size):
    out_buf += encode_size(len(base_buf))
    out_buf += encode_size(len(target_buf))
    # write out delta opcodes
    seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
    for opcode, i1, i2, j1, j2 in seq.get_opcodes():
        # Git patch opcodes don't care about deletes!
        #if opcode == "replace" or opcode == "delete":
        if opcode == "equal":
            # If they are equal, unpacker will use data from base_buf
            # Write out an opcode that says what range to use
            scratch += chr((o >> i*8) & 0xff)
            scratch += chr((s >> i*8) & 0xff)
        if opcode == "replace" or opcode == "insert":
            # If we are replacing a range or adding one, then we just
            # output it to the stream (prefixed by its size)
            # Insert opcodes can carry at most 127 bytes of literal data.
            out_buf += target_buf[o:o+127]
            out_buf += target_buf[o:o+s]
def apply_delta(src_buf, delta):
    """Based on the similar function in git's patch-delta.c.

    :param src_buf: Source buffer
    :param delta: Delta instructions
    :return: List of chunks making up the reconstructed target
    """
    if type(src_buf) != str:
        src_buf = "".join(src_buf)
    assert isinstance(delta, str)
    delta_length = len(delta)
    # Decode a variable-length MSB-continued size from the delta header.
    def get_delta_header_size(delta, index):
        cmd = ord(delta[index])
        size |= (cmd & ~0x80) << i
    src_size, index = get_delta_header_size(delta, index)
    dest_size, index = get_delta_header_size(delta, index)
    assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
    while index < delta_length:
        cmd = ord(delta[index])
        # Copy opcode: flag bits of cmd select which offset/size bytes follow.
        x = ord(delta[index])
        cp_off |= x << (i * 8)
        if cmd & (1 << (4+i)):
            x = ord(delta[index])
            cp_size |= x << (i * 8)
        # Reject copies that overflow or fall outside the source buffer.
        if (cp_off + cp_size < cp_size or
            cp_off + cp_size > src_size or
            cp_size > dest_size):
        out.append(src_buf[cp_off:cp_off+cp_size])
        # Insert opcode: cmd itself is the literal byte count.
        out.append(delta[index:index+cmd])
        raise ApplyDeltaError("Invalid opcode 0")
    if index != delta_length:
        raise ApplyDeltaError("delta not empty: %r" % delta[index:])
    if dest_size != chunks_length(out):
        raise ApplyDeltaError("dest size incorrect")
def write_pack_index_v2(filename, entries, pack_checksum):
    """Write a new pack index file.

    :param filename: The filename of the new pack index file.
    :param entries: List of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
    :param pack_checksum: Checksum of the pack file.
    """
    f = GitFile(filename, 'wb')
    # Wrap so the index's own trailing SHA1 is computed as we write.
    f = SHA1Writer(f)
    f.write('\377tOc') # Magic!
    f.write(struct.pack(">L", 2))
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[0])] += 1
    # Fan-out table: cumulative counts per leading byte.
    for i in range(0x100):
        f.write(struct.pack(">L", fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    # v2 stores names, crc32s and offsets in three parallel tables.
    for (name, offset, entry_checksum) in entries:
        f.write(name)
    for (name, offset, entry_checksum) in entries:
        f.write(struct.pack(">L", entry_checksum))
    for (name, offset, entry_checksum) in entries:
        # FIXME: handle if MSBit is set in offset
        f.write(struct.pack(">L", offset))
    # FIXME: handle table for pack files > 8 Gb
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    f.close()
1054 """A Git pack object."""
1056 def __init__(self, basename):
1057 self._basename = basename
1058 self._data_path = self._basename + ".pack"
1059 self._idx_path = self._basename + ".idx"
1064 def from_objects(self, data, idx):
1065 """Create a new pack object from pack data and index objects."""
1072 """The SHA over the SHAs of the objects in this pack."""
1073 return self.index.objects_sha1()
1077 """The pack data object being used."""
1078 if self._data is None:
1079 self._data = PackData(self._data_path)
1080 assert len(self.index) == len(self._data)
1081 idx_stored_checksum = self.index.get_pack_checksum()
1082 data_stored_checksum = self._data.get_stored_checksum()
1083 if idx_stored_checksum != data_stored_checksum:
1084 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
1085 sha_to_hex(data_stored_checksum))
1090 """The index being used.
1092 :note: This may be an in-memory index
1094 if self._idx is None:
1095 self._idx = load_pack_index(self._idx_path)
1099 if self._data is not None:
1103 def __eq__(self, other):
1104 return type(self) == type(other) and self.index == other.index
1107 """Number of entries in this pack."""
1108 return len(self.index)
1111 return "%s(%r)" % (self.__class__.__name__, self._basename)
1114 """Iterate over all the sha1s of the objects in this pack."""
1115 return iter(self.index)
1118 """Check the integrity of this pack."""
1119 if not self.index.check():
1121 if not self.data.check():
1125 def get_stored_checksum(self):
1126 return self.data.get_stored_checksum()
1128 def __contains__(self, sha1):
1129 """Check whether this pack contains a particular SHA1."""
1131 self.index.object_index(sha1)
1136 def get_raw(self, sha1, resolve_ref=None):
1137 offset = self.index.object_index(sha1)
1138 obj_type, obj = self.data.get_object_at(offset)
1139 if type(offset) is long:
1140 offset = int(offset)
1141 if resolve_ref is None:
1142 resolve_ref = self.get_raw
1143 kind, chunks = self.data.resolve_object(offset, obj_type, obj,
1145 return kind, "".join(chunks)
1147 def __getitem__(self, sha1):
1148 """Retrieve the specified SHA1."""
1149 type, uncomp = self.get_raw(sha1)
1150 return ShaFile.from_raw_string(type, uncomp)
1152 def iterobjects(self, get_raw=None):
1153 """Iterate over the objects in this pack."""
1155 get_raw = self.get_raw
1156 for offset, type, obj, crc32 in self.data.iterobjects():
1157 assert isinstance(offset, int)
1158 type, obj = self.data.resolve_object(offset, type, obj, get_raw)
1159 yield ShaFile.from_raw_chunks(type, obj)
1163 from dulwich._pack import apply_delta, bisect_find_sha