1 # pack.py -- For dealing with packed git objects.
2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3 # Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; version 2
8 # of the License or (at your option) a later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 """Classes for dealing with packed git objects.
22 A pack is a compact representation of a bunch of objects, stored
23 using deltas where possible.
25 They have two parts, the pack file, which stores the data, and an index
26 that tells you where the data is.
28 To find an object you look in all of the index files 'til you find a
29 match for the object name. You then use the pointer got from this as
30 a pointer in to the corresponding packfile.
34 from collections import defaultdict
36 from misc import defaultdict
38 from itertools import chain, imap, izip
43 from struct import unpack_from
45 from dulwich.misc import unpack_from
50 from dulwich.errors import (
54 from dulwich.lru_cache import (
57 from dulwich.objects import (
62 from dulwich.misc import (
# mmap() grew a usable offset parameter in Python 2.6; tuple comparison
# against (2, 6) covers 2.6+ and every 3.x release in one expression.
supports_mmap_offset = sys.version_info >= (2, 6)
def take_msb_bytes(map, offset):
    """Read bytes from *map* starting at *offset*, marked by most significant bit.

    Bytes are consumed while the previous byte has its MSB (0x80) set; the
    final byte of the run is the first one with the MSB clear.

    :param map: Mappable/sliceable byte sequence (e.g. an mmap'd pack file)
    :param offset: Position of the first byte to read
    :return: List of integer byte values, last one with MSB clear
    """
    ret = []
    while len(ret) == 0 or ret[-1] & 0x80:
        ret.append(ord(map[offset]))
        offset += 1
    return ret
def read_zlib_chunks(data, offset, dec_size):
    """Decompress a zlib stream embedded in a larger buffer.

    Feeds 1024-byte slices of *data* (starting at *offset*) to a
    decompressobj until it reports trailing unused data, i.e. the end of
    the deflate stream.

    :param data: Buffer containing the compressed stream
    :param offset: Start of the compressed stream within *data*
    :param dec_size: Expected decompressed size (not verified here)
    :return: Tuple of (list of decompressed chunks, compressed length)
    """
    obj = zlib.decompressobj()
    ret = []
    fed = 0
    while obj.unused_data == "":
        base = offset+fed
        add = data[base:base+1024]
        if len(add) < 1024:
            # Near the end of the buffer: append a junk byte so the
            # decompressor reports unused_data and the loop terminates.
            add += "Z"
        fed += len(add)
        ret.append(obj.decompress(add))
    # Compressed length is everything we fed minus the trailing bytes the
    # decompressor did not consume.
    comp_len = fed-len(obj.unused_data)
    return ret, comp_len
def read_zlib(data, offset, dec_size):
    """Decompress a zlib stream and return it as a single string.

    :param data: Buffer containing the compressed stream
    :param offset: Start of the compressed stream within *data*
    :param dec_size: Expected decompressed size, asserted against the result
    :return: Tuple of (decompressed data, compressed length)
    """
    ret, comp_len = read_zlib_chunks(data, offset, dec_size)
    x = "".join(ret)
    assert len(x) == dec_size
    return x, comp_len
def iter_sha1(iter):
    """Return the hexdigest of the SHA1 over a set of names.

    :param iter: Iterator over string objects
    :return: 40-byte hex sha1 digest
    """
    sha1 = make_sha()
    for name in iter:
        sha1.update(name)
    return sha1.hexdigest()
def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
    """Simple wrapper for mmap() which always supports the offset parameter.

    The area is mapped from the start of the file and the requested offset
    is handed back to the caller, so indexing ``mem[returned_offset:]``
    addresses the wanted region regardless of platform mmap-offset support.

    :param f: File object.
    :param offset: Offset in the file, from the beginning of the file.
    :param size: Size of the mmap'ed area
    :param access: Access mechanism.
    :return: Tuple of (MMAP'd area, offset into it where the data starts).
    """
    mem = mmap.mmap(f.fileno(), size+offset, access=access)
    return mem, offset
def load_pack_index(filename):
    """Load an index file by path, auto-detecting its version.

    A v2+ index starts with the magic '\\377tOc'; anything else is treated
    as a bare v1 fan-out table.

    :param filename: Path of the index file
    :return: A PackIndex1 or PackIndex2 instance
    :raise KeyError: If the magic is present but the version is unsupported
    """
    f = open(filename, 'rb')
    if f.read(4) == '\377tOc':
        version = struct.unpack(">L", f.read(4))[0]
        if version == 2:
            f.seek(0)
            return PackIndex2(filename, file=f)
        else:
            raise KeyError("Unknown pack index format %d" % version)
    else:
        f.seek(0)
        return PackIndex1(filename, file=f)
def bisect_find_sha(start, end, sha, unpack_name):
    """Find a SHA in a sorted index range using binary search.

    :param start: Start index of the range to search (inclusive)
    :param end: End index of the range to search (inclusive)
    :param sha: 20-byte binary SHA to look for
    :param unpack_name: Callable returning the binary SHA at a given index
    :return: Index of the entry, or None when not found
    """
    assert start <= end
    while start <= end:
        i = (start + end)/2
        file_sha = unpack_name(i)
        x = cmp(file_sha, sha)
        if x < 0:
            start = i + 1
        elif x > 0:
            end = i - 1
        else:
            return i
    return None
class PackIndex(object):
    """An index in to a packfile.

    Given a sha id of an object a pack index can tell you the location in the
    packfile of that object if it has it.

    To do the loop it opens the file, and indexes first 256 4 byte groups
    with the first byte of the sha id. The value in the four byte group indexed
    is the end of the group that shares the same starting byte. Subtract one
    from the starting byte and index again to find the start of the group.
    The values are sorted by sha id within the group, so do the math to find
    the start and end offset and then bisect in to find if the value is present.
    """

    def __init__(self, filename, file=None):
        """Create a pack index object.

        Provide it with the name of the index file to consider, and it will map
        it whenever required.
        """
        self._filename = filename
        # Take the size now, so it can be checked each time we map the file to
        # ensure that it hasn't changed.
        self._size = os.path.getsize(filename)
        if file is None:
            self._file = open(filename, 'rb')
        else:
            self._file = file
        # Map the whole index from offset 0, so the returned map offset must
        # be zero and self._contents can be indexed with file offsets directly.
        self._contents, map_offset = simple_mmap(self._file, 0, self._size)
        assert map_offset == 0

    def __eq__(self, other):
        # Indexes compare equal when they list the same object names in the
        # same order; pack offsets and CRC32s are deliberately not compared.
        if not isinstance(other, PackIndex):
            return False
        if self._fan_out_table != other._fan_out_table:
            return False
        for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
            if name1 != name2:
                return False
        return True

    def close(self):
        self._file.close()

    def __len__(self):
        """Return the number of entries in this pack index."""
        return self._fan_out_table[-1]

    def _unpack_entry(self, i):
        """Unpack the i-th entry in the index file.

        :return: Tuple with object name (SHA), offset in pack file and
            CRC32 checksum (if known)."""
        raise NotImplementedError(self._unpack_entry)

    def _unpack_name(self, i):
        """Unpack the i-th name from the index file."""
        raise NotImplementedError(self._unpack_name)

    def _unpack_offset(self, i):
        """Unpack the i-th object offset from the index file."""
        raise NotImplementedError(self._unpack_offset)

    def _unpack_crc32_checksum(self, i):
        """Unpack the crc32 checksum for the i-th object from the index file."""
        raise NotImplementedError(self._unpack_crc32_checksum)

    def __iter__(self):
        """Iterate over the hex SHAs of the objects in this index."""
        return imap(sha_to_hex, self._itersha())

    def _itersha(self):
        """Yield the binary SHA of every object, in index order."""
        for i in range(len(self)):
            yield self._unpack_name(i)

    def objects_sha1(self):
        """Return the hex SHA1 over all the shas of all objects in this pack.

        :note: This is used for the filename of the pack.
        """
        return iter_sha1(self._itersha())

    def iterentries(self):
        """Iterate over the entries in this pack index.

        Will yield tuples with object name, offset in packfile and crc32 checksum.
        """
        for i in range(len(self)):
            yield self._unpack_entry(i)

    def _read_fan_out_table(self, start_offset):
        """Read the 256-entry fan-out table starting at *start_offset*."""
        ret = []
        for i in range(0x100):
            ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
        return ret

    def check(self):
        """Check that the stored checksum matches the actual checksum."""
        return self.calculate_checksum() == self.get_stored_checksum()

    def calculate_checksum(self):
        """Calculate the SHA1 checksum over this pack index.

        :return: This is a 20-byte binary digest
        """
        return make_sha(self._contents[:-20]).digest()

    def get_pack_checksum(self):
        """Return the SHA1 checksum stored for the corresponding packfile.

        :return: 20-byte binary digest
        """
        return str(self._contents[-40:-20])

    def get_stored_checksum(self):
        """Return the SHA1 checksum stored for this index.

        :return: 20-byte binary digest
        """
        return str(self._contents[-20:])

    def object_index(self, sha):
        """Return the index in to the corresponding packfile for the object.

        Given the name of an object it will return the offset that object lives
        at within the corresponding pack file.

        :raise KeyError: If the pack file doesn't have the object.
        """
        if len(sha) == 40:
            sha = hex_to_sha(sha)
        return self._object_index(sha)

    def _object_index(self, sha):
        """See object_index.

        :param sha: A *binary* SHA string. (20 characters long)_
        """
        assert len(sha) == 20
        idx = ord(sha[0])
        if idx == 0:
            start = 0
        else:
            start = self._fan_out_table[idx-1]
        end = self._fan_out_table[idx]
        i = bisect_find_sha(start, end, sha, self._unpack_name)
        if i is None:
            raise KeyError(sha)
        return self._unpack_offset(i)
class PackIndex1(PackIndex):
    """Version 1 Pack Index."""

    def __init__(self, filename, file=None):
        PackIndex.__init__(self, filename, file)
        self.version = 1
        # v1 indexes start directly with the fan-out table.
        self._fan_out_table = self._read_fan_out_table(0)

    def _unpack_entry(self, i):
        # Each v1 entry is 24 bytes: 4-byte offset followed by 20-byte name.
        (offset, name) = unpack_from(">L20s", self._contents,
            (0x100 * 4) + (i * 24))
        return (name, offset, None)

    def _unpack_name(self, i):
        offset = (0x100 * 4) + (i * 24) + 4
        return self._contents[offset:offset+20]

    def _unpack_offset(self, i):
        offset = (0x100 * 4) + (i * 24)
        return unpack_from(">L", self._contents, offset)[0]

    def _unpack_crc32_checksum(self, i):
        # Not stored in v1 index files
        return None
class PackIndex2(PackIndex):
    """Version 2 Pack Index."""

    def __init__(self, filename, file=None):
        PackIndex.__init__(self, filename, file)
        assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
        (self.version, ) = unpack_from(">L", self._contents, 4)
        assert self.version == 2, "Version was %d" % self.version
        self._fan_out_table = self._read_fan_out_table(8)
        # The tables follow one another after the 8-byte header: the 256-entry
        # fan-out table, then the sorted name table, then per-object CRC32s,
        # then per-object pack offsets.
        self._name_table_offset = 8 + 0x100 * 4
        self._crc32_table_offset = self._name_table_offset + 20 * len(self)
        self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)

    def _unpack_entry(self, i):
        # Assemble (name, pack offset, crc32) from the three parallel tables.
        name = self._unpack_name(i)
        offset = self._unpack_offset(i)
        crc32 = self._unpack_crc32_checksum(i)
        return (name, offset, crc32)

    def _unpack_name(self, i):
        # 20-byte binary SHA from the sorted name table.
        start = self._name_table_offset + i * 20
        return self._contents[start:start+20]

    def _unpack_offset(self, i):
        # 4-byte big-endian pack-file offset.
        start = self._pack_offset_table_offset + i * 4
        return unpack_from(">L", self._contents, start)[0]

    def _unpack_crc32_checksum(self, i):
        # 4-byte big-endian CRC32 of the packed representation.
        start = self._crc32_table_offset + i * 4
        return unpack_from(">L", self._contents, start)[0]
def read_pack_header(f):
    """Read the header of a pack file.

    :param f: File-like object to read from
    :return: Tuple of (pack version, number of objects)
    """
    header = f.read(12)
    assert header[:4] == "PACK"
    (version,) = unpack_from(">L", header, 4)
    assert version in (2, 3), "Version was %d" % version
    (num_objects,) = unpack_from(">L", header, 8)
    return (version, num_objects)
def read_pack_tail(f):
    """Read the trailing 20-byte SHA1 checksum of a pack file.

    :param f: File-like object positioned at the checksum
    :return: 1-tuple with the 20-byte stored checksum
    """
    return (f.read(20),)
def unpack_object(map, offset=0):
    """Unpack a Git object.

    :param map: Mappable byte buffer containing the pack data
    :param offset: Offset of the object header within *map*
    :return: tuple with type, uncompressed data and compressed size
    """
    bytes = take_msb_bytes(map, offset)
    type = (bytes[0] >> 4) & 0x07
    size = bytes[0] & 0x0f
    # Remaining header bytes each contribute 7 more significant size bits.
    for i, byte in enumerate(bytes[1:]):
        size += (byte & 0x7f) << ((i * 7) + 4)
    raw_base = len(bytes)
    if type == 6: # offset delta
        bytes = take_msb_bytes(map, raw_base + offset)
        assert not (bytes[-1] & 0x80)
        delta_base_offset = bytes[0] & 0x7f
        for byte in bytes[1:]:
            # Git's "offset encoding": each continuation adds one before
            # shifting, so multi-byte values have no redundant encodings.
            delta_base_offset += 1
            delta_base_offset <<= 7
            delta_base_offset += (byte & 0x7f)
        raw_base += len(bytes)
        uncomp, comp_len = read_zlib(map, offset + raw_base, size)
        assert size == len(uncomp)
        return type, (delta_base_offset, uncomp), comp_len+raw_base
    elif type == 7: # ref delta
        basename = map[offset+raw_base:offset+raw_base+20]
        uncomp, comp_len = read_zlib(map, offset+raw_base+20, size)
        assert size == len(uncomp)
        return type, (basename, uncomp), comp_len+raw_base+20
    else:
        uncomp, comp_len = read_zlib(map, offset+raw_base, size)
        assert len(uncomp) == size
        return type, uncomp, comp_len+raw_base
414 def compute_object_size((num, obj)):
415 """Compute the size of a unresolved object for use with LRUSizeCache.
419 assert isinstance(obj, str)
423 class PackData(object):
424 """The data contained in a packfile.
426 Pack files can be accessed both sequentially for exploding a pack, and
427 directly with the help of an index to retrieve a specific object.
429 The objects within are either complete or a delta aginst another.
431 The header is variable length. If the MSB of each byte is set then it
432 indicates that the subsequent byte is still part of the header.
433 For the first byte the next MS bits are the type, which tells you the type
434 of object, and whether it is a delta. The LS byte is the lowest bits of the
435 size. For each subsequent byte the LS 7 bits are the next MS bits of the
436 size, i.e. the last byte of the header contains the MS bits of the size.
438 For the complete objects the data is stored as zlib deflated data.
439 The size in the header is the uncompressed object size, so to uncompress
440 you need to just keep feeding data to zlib until you get an object back,
441 or it errors on bad data. This is done here by just giving the complete
442 buffer from the start of the deflated object on. This is bad, but until I
443 get mmap sorted out it will have to do.
445 Currently there are no integrity checks done. Also no attempt is made to try
446 and detect the delta case, or a request for an object at the wrong position.
447 It will all just throw a zlib or KeyError.
450 def __init__(self, filename):
451 """Create a PackData object that represents the pack in the given filename.
453 The file must exist and stay readable until the object is disposed of. It
454 must also stay the same size. It will be mapped whenever needed.
456 Currently there is a restriction on the size of the pack as the python
457 mmap implementation is flawed.
459 self._filename = filename
460 assert os.path.exists(filename), "%s is not a packfile" % filename
461 self._size = os.path.getsize(filename)
462 self._header_size = 12
463 assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (filename, self._size, self._header_size)
464 self._file = open(self._filename, 'rb')
466 self._offset_cache = LRUSizeCache(1024*1024*20,
467 compute_size=compute_object_size)
472 def _read_header(self):
473 (version, self._num_objects) = read_pack_header(self._file)
474 self._file.seek(self._size-20)
475 (self._stored_checksum,) = read_pack_tail(self._file)
478 """Returns the number of objects in this pack."""
479 return self._num_objects
481 def calculate_checksum(self):
482 """Calculate the checksum for this pack.
484 :return: 20-byte binary SHA1 digest
486 map, map_offset = simple_mmap(self._file, 0, self._size - 20)
488 return make_sha(map[map_offset:self._size-20]).digest()
492 def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
493 """Resolve an object, possibly resolving deltas when necessary.
495 :return: Tuple with object type and contents.
497 if type not in (6, 7): # Not a delta
500 if get_offset is None:
501 get_offset = self.get_object_at
503 if type == 6: # offset delta
504 (delta_offset, delta) = obj
505 assert isinstance(delta_offset, int)
506 assert isinstance(delta, str)
507 base_offset = offset-delta_offset
508 type, base_obj = get_offset(base_offset)
509 assert isinstance(type, int)
510 elif type == 7: # ref delta
511 (basename, delta) = obj
512 assert isinstance(basename, str) and len(basename) == 20
513 assert isinstance(delta, str)
514 type, base_obj = get_ref(basename)
515 assert isinstance(type, int)
516 # Can't be a ofs delta, as we wouldn't know the base offset
519 type, base_text = self.resolve_object(base_offset, type, base_obj, get_ref)
520 if base_offset is not None:
521 self._offset_cache[base_offset] = type, base_text
522 ret = (type, apply_delta(base_text, delta))
525 def iterobjects(self, progress=None):
527 class ObjectIterator(object):
529 def __init__(self, pack):
531 self.offset = pack._header_size
533 self.map, _ = simple_mmap(pack._file, 0, pack._size)
545 if self.i == self.num:
547 (type, obj, total_size) = unpack_object(self.map, self.offset)
548 crc32 = zlib.crc32(self.map[self.offset:self.offset+total_size]) & 0xffffffff
549 ret = (self.offset, type, obj, crc32)
550 self.offset += total_size
552 progress(self.i, self.num)
555 return ObjectIterator(self)
557 def iterentries(self, ext_resolve_ref=None, progress=None):
559 postponed = defaultdict(list)
560 class Postpone(Exception):
561 """Raised to postpone delta resolving."""
563 def get_ref_text(sha):
564 assert len(sha) == 20
566 return self.get_object_at(found[sha])
569 return ext_resolve_ref(sha)
572 raise Postpone, (sha, )
574 todo = chain(self.iterobjects(progress=progress), extra)
575 for (offset, type, obj, crc32) in todo:
576 assert isinstance(offset, int)
577 assert isinstance(type, int)
578 assert isinstance(obj, tuple) or isinstance(obj, str)
580 type, obj = self.resolve_object(offset, type, obj, get_ref_text)
581 except Postpone, (sha, ):
582 postponed[sha].append((offset, type, obj))
584 shafile = ShaFile.from_raw_string(type, obj)
585 sha = shafile.sha().digest()
587 yield sha, offset, crc32
588 extra.extend(postponed.get(sha, []))
590 raise KeyError([sha_to_hex(h) for h in postponed.keys()])
592 def sorted_entries(self, resolve_ext_ref=None, progress=None):
593 ret = list(self.iterentries(resolve_ext_ref, progress=progress))
597 def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
598 """Create a version 1 file for this data file.
600 :param filename: Index filename.
601 :param resolve_ext_ref: Function to use for resolving externally referenced
602 SHA1s (for thin packs)
603 :param progress: Progress report function
605 entries = self.sorted_entries(resolve_ext_ref, progress=progress)
606 write_pack_index_v1(filename, entries, self.calculate_checksum())
608 def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
609 """Create a version 2 index file for this data file.
611 :param filename: Index filename.
612 :param resolve_ext_ref: Function to use for resolving externally referenced
613 SHA1s (for thin packs)
614 :param progress: Progress report function
616 entries = self.sorted_entries(resolve_ext_ref, progress=progress)
617 write_pack_index_v2(filename, entries, self.calculate_checksum())
619 def create_index(self, filename, resolve_ext_ref=None, progress=None,
621 """Create an index file for this data file.
623 :param filename: Index filename.
624 :param resolve_ext_ref: Function to use for resolving externally referenced
625 SHA1s (for thin packs)
626 :param progress: Progress report function
629 self.create_index_v1(filename, resolve_ext_ref, progress)
631 self.create_index_v2(filename, resolve_ext_ref, progress)
633 raise ValueError("unknown index format %d" % version)
635 def get_stored_checksum(self):
636 return self._stored_checksum
639 return (self.calculate_checksum() == self.get_stored_checksum())
641 def get_object_at(self, offset):
642 """Given an offset in to the packfile return the object that is there.
644 Using the associated index the location of an object can be looked up, and
645 then the packfile can be asked directly for that object using this
648 if offset in self._offset_cache:
649 return self._offset_cache[offset]
650 assert isinstance(offset, long) or isinstance(offset, int),\
651 "offset was %r" % offset
652 assert offset >= self._header_size
653 map, map_offset = simple_mmap(self._file, offset, self._size-offset)
655 ret = unpack_object(map, map_offset)[:2]
class SHA1Reader(object):
    """Wrapper around a file-like object that remembers the SHA1 of
    the data read from it."""

    def __init__(self, f):
        self.f = f
        self.sha1 = make_sha("")

    def read(self, num=None):
        data = self.f.read(num)
        self.sha1.update(data)
        return data

    def check_sha(self):
        """Read the trailing 20-byte checksum and compare with the running SHA1.

        :raise ChecksumMismatch: if the stored digest differs
        """
        stored = self.f.read(20)
        if stored != self.sha1.digest():
            raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))

    def close(self):
        return self.f.close()

    def tell(self):
        return self.f.tell()
class SHA1Writer(object):
    """Wrapper around a file-like object that remembers the SHA1 of
    the data written to it."""

    def __init__(self, f):
        self.f = f
        self.sha1 = make_sha("")

    def write(self, data):
        self.sha1.update(data)
        self.f.write(data)

    def write_sha(self):
        """Append the running SHA1 digest to the file and return it."""
        sha = self.sha1.digest()
        assert len(sha) == 20
        self.f.write(sha)
        return sha

    def close(self):
        """Write the trailing checksum, close the file and return the digest."""
        sha = self.write_sha()
        self.f.close()
        return sha

    def tell(self):
        return self.f.tell()
def write_pack_object(f, type, object):
    """Write pack object to a file.

    :param f: File to write to
    :param type: Pack object type number (6 = offset delta, 7 = ref delta)
    :param object: Object data to write; for deltas a (base, delta) tuple
    :return: Tuple with offset at which the object was written, and crc32
    """
    if type == 6: # offset delta
        (delta_base_offset, object) = object
    elif type == 7: # ref delta
        (basename, object) = object
    size = len(object)
    packed_data_hdr = ""
    # Variable-length size header: low 4 bits of the size share the first
    # byte with the type; each further byte carries 7 more bits.
    c = (type << 4) | (size & 15)
    size >>= 4
    while size:
        packed_data_hdr += (chr(c | 0x80))
        c = size & 0x7f
        size >>= 7
    packed_data_hdr += chr(c)
    if type == 6: # offset delta
        ret = [delta_base_offset & 0x7f]
        delta_base_offset >>= 7
        while delta_base_offset:
            delta_base_offset -= 1
            ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
            delta_base_offset >>= 7
        packed_data_hdr += "".join([chr(x) for x in ret])
    elif type == 7: # ref delta
        assert len(basename) == 20
        packed_data_hdr += basename
    packed_data = packed_data_hdr + zlib.compress(object)
    f.write(packed_data)
    return (f.tell(), (zlib.crc32(packed_data) & 0xffffffff))
def write_pack(filename, objects, num_objects):
    """Write a new pack data file.

    :param filename: Path to the new pack file (without .pack extension)
    :param objects: Iterable over (object, path) tuples to write
    :param num_objects: Number of objects to write
    """
    f = open(filename + ".pack", 'wb')
    try:
        entries, data_sum = write_pack_data(f, objects, num_objects)
    finally:
        f.close()
    entries.sort()
    write_pack_index_v2(filename + ".idx", entries, data_sum)
def write_pack_data(f, objects, num_objects, window=10):
    """Write a new pack file.

    :param f: File to write the pack to
    :param objects: List of objects to write (tuples with object and path)
    :param num_objects: Number of objects to write
    :param window: Delta search window (currently unused; see below)
    :return: List with (name, offset, crc32 checksum) entries, pack checksum
    """
    recency = list(objects)
    # FIXME: Somehow limit delta depth
    # FIXME: Make thin-pack optional (its not used when cloning a pack)
    # Build a list of objects ordered by the magic Linus heuristic
    # This helps us find good objects to diff against us
    magic = []
    for obj, path in recency:
        magic.append( (obj.type, path, 1, -len(obj.as_raw_string()), obj) )
    magic.sort()
    # Build a map of objects and their index in magic - so we can find preceding objects
    # to diff against
    offs = {}
    for i in range(len(magic)):
        offs[magic[i][4]] = i
    # Write the pack
    entries = []
    f = SHA1Writer(f)
    f.write("PACK")               # Pack header
    f.write(struct.pack(">L", 2)) # Pack version
    f.write(struct.pack(">L", num_objects)) # Number of objects in pack
    for o, path in recency:
        sha1 = o.sha().digest()
        orig_t = o.type
        raw = o.as_raw_string()
        winner = raw
        t = orig_t
        # Delta compression disabled for now; every object is written whole.
        #for i in range(offs[o]-window, window):
        #    if i < 0 or i >= len(offs): continue
        #    b = magic[i][4]
        #    if b.type != orig_t: continue
        #    base = b.as_raw_string()
        #    delta = create_delta(base, raw)
        #    if len(delta) < len(winner):
        #        winner = delta
        #        t = 6 if magic[i][2] == 1 else 7
        offset, crc32 = write_pack_object(f, t, winner)
        entries.append((sha1, offset, crc32))
    return entries, f.write_sha()
def write_pack_index_v1(filename, entries, pack_checksum):
    """Write a new pack index file.

    :param filename: The filename of the new pack index file.
    :param entries: List of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
    :param pack_checksum: Checksum of the pack file.
    """
    f = open(filename, 'wb')
    f = SHA1Writer(f)
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[0])] += 1
    # Fan-out table: running totals so entry i is the number of objects
    # whose first byte is <= i.
    for i in range(0x100):
        f.write(struct.pack(">L", fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    for (name, offset, entry_checksum) in entries:
        f.write(struct.pack(">L20s", offset, name))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    f.close()
def create_delta(base_buf, target_buf):
    """Use python difflib to work out how to transform base_buf to target_buf.

    :param base_buf: Base buffer
    :param target_buf: Target buffer
    :return: Git-style delta string
    """
    assert isinstance(base_buf, str)
    assert isinstance(target_buf, str)
    out_buf = ""
    # write delta header (little-endian base-128 sizes)
    def encode_size(size):
        ret = ""
        c = size & 0x7f
        size >>= 7
        while size:
            ret += chr(c | 0x80)
            c = size & 0x7f
            size >>= 7
        ret += chr(c)
        return ret
    out_buf += encode_size(len(base_buf))
    out_buf += encode_size(len(target_buf))
    # write out delta opcodes
    seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
    for opcode, i1, i2, j1, j2 in seq.get_opcodes():
        # Git patch opcodes don't care about deletes!
        #if opcode == "replace" or opcode == "delete":
        #    pass
        if opcode == "equal":
            # If they are equal, unpacker will use data from base_buf
            # Write out an opcode that says what range to use
            scratch = ""
            op = 0x80
            o = i1
            for i in range(4):
                if o & 0xff << i*8:
                    # Emit the i-th little-endian byte of the copy offset.
                    # (Shift by i*8 bits and mask to a byte - shifting by
                    # only i bits would corrupt offsets above 255.)
                    scratch += chr((o >> i*8) & 0xff)
                    op |= 1 << i
            s = i2 - i1
            for i in range(2):
                if s & 0xff << i*8:
                    scratch += chr((s >> i*8) & 0xff)
                    op |= 1 << (4+i)
            out_buf += chr(op)
            out_buf += scratch
        if opcode == "replace" or opcode == "insert":
            # If we are replacing a range or adding one, then we just
            # output it to the stream (prefixed by its size)
            s = j2 - j1
            o = j1
            while s > 127:
                out_buf += chr(127)
                out_buf += target_buf[o:o+127]
                s -= 127
                o += 127
            out_buf += chr(s)
            out_buf += target_buf[o:o+s]
    return out_buf
def apply_delta(src_buf, delta):
    """Based on the similar function in git's patch-delta.c.

    :param src_buf: Source buffer
    :param delta: Delta instructions
    :return: Reconstructed target buffer
    :raise ApplyDeltaError: on malformed delta input
    """
    assert isinstance(src_buf, str), "was %r" % (src_buf,)
    assert isinstance(delta, str)
    out = []
    index = 0
    delta_length = len(delta)
    def get_delta_header_size(delta, index):
        # Little-endian base-128 varint, as used for the two size headers.
        size = 0
        i = 0
        while True:
            cmd = ord(delta[index])
            index += 1
            size |= (cmd & ~0x80) << i
            i += 7
            if not cmd & 0x80:
                break
        return size, index
    src_size, index = get_delta_header_size(delta, index)
    dest_size, index = get_delta_header_size(delta, index)
    assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
    while index < delta_length:
        cmd = ord(delta[index])
        index += 1
        if cmd & 0x80:
            # Copy-from-source opcode: low 4 bits flag offset bytes, next
            # 3 bits flag size bytes, both little-endian.
            cp_off = 0
            for i in range(4):
                if cmd & (1 << i):
                    x = ord(delta[index])
                    index += 1
                    cp_off |= x << (i * 8)
            cp_size = 0
            for i in range(3):
                if cmd & (1 << (4+i)):
                    x = ord(delta[index])
                    index += 1
                    cp_size |= x << (i * 8)
            if cp_size == 0:
                # Per git's delta format, a zero size means 0x10000 bytes.
                cp_size = 0x10000
            if (cp_off + cp_size < cp_size or
                cp_off + cp_size > src_size or
                cp_size > dest_size):
                break
            out.append(src_buf[cp_off:cp_off+cp_size])
        elif cmd != 0:
            # Literal insert: cmd is the number of raw bytes that follow.
            out.append(delta[index:index+cmd])
            index += cmd
        else:
            raise ApplyDeltaError("Invalid opcode 0")

    if index != delta_length:
        raise ApplyDeltaError("delta not empty: %r" % delta[index:])

    out = "".join(out)
    if dest_size != len(out):
        raise ApplyDeltaError("dest size incorrect")

    return out
def write_pack_index_v2(filename, entries, pack_checksum):
    """Write a new pack index file.

    :param filename: The filename of the new pack index file.
    :param entries: List of tuples with object name (sha), offset_in_pack, and
        crc32_checksum.
    :param pack_checksum: Checksum of the pack file.
    """
    f = open(filename, 'wb')
    f = SHA1Writer(f)
    f.write('\377tOc') # Magic!
    f.write(struct.pack(">L", 2))
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[0])] += 1
    # Fan-out table: running totals so entry i is the number of objects
    # whose first byte is <= i.
    for i in range(0x100):
        f.write(struct.pack(">L", fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    for (name, offset, entry_checksum) in entries:
        f.write(name)
    for (name, offset, entry_checksum) in entries:
        f.write(struct.pack(">L", entry_checksum))
    for (name, offset, entry_checksum) in entries:
        # FIXME: handle if MSBit is set in offset
        f.write(struct.pack(">L", offset))
    # FIXME: handle table for pack files > 8 Gb
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    f.close()
class Pack(object):
    """A Git pack object."""

    def __init__(self, basename):
        self._basename = basename
        self._data_path = self._basename + ".pack"
        self._idx_path = self._basename + ".idx"
        # Data and index are loaded lazily via the properties below.
        self._data = None
        self._idx = None

    def from_objects(self, data, idx):
        """Create a new pack object from pack data and index objects."""
        ret = Pack("")
        ret._data = data
        ret._idx = idx
        return ret

    def name(self):
        """The SHA over the SHAs of the objects in this pack."""
        return self.index.objects_sha1()

    @property
    def data(self):
        """The pack data object being used."""
        if self._data is None:
            self._data = PackData(self._data_path)
            assert len(self.index) == len(self._data)
            idx_stored_checksum = self.index.get_pack_checksum()
            data_stored_checksum = self._data.get_stored_checksum()
            if idx_stored_checksum != data_stored_checksum:
                raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
                                       sha_to_hex(data_stored_checksum))
        return self._data

    @property
    def index(self):
        """The index being used.

        :note: This may be an in-memory index
        """
        if self._idx is None:
            self._idx = load_pack_index(self._idx_path)
        return self._idx

    def close(self):
        if self._data is not None:
            self._data.close()
        self.index.close()

    def __eq__(self, other):
        return type(self) == type(other) and self.index == other.index

    def __len__(self):
        """Number of entries in this pack."""
        return len(self.index)

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self._basename)

    def __iter__(self):
        """Iterate over all the sha1s of the objects in this pack."""
        return iter(self.index)

    def check(self):
        """Check the integrity of this pack."""
        if not self.index.check():
            return False
        if not self.data.check():
            return False
        return True

    def get_stored_checksum(self):
        return self.data.get_stored_checksum()

    def __contains__(self, sha1):
        """Check whether this pack contains a particular SHA1."""
        try:
            self.index.object_index(sha1)
            return True
        except KeyError:
            return False

    def get_raw(self, sha1, resolve_ref=None):
        """Return (type, data) for the object named *sha1*, resolving deltas."""
        offset = self.index.object_index(sha1)
        obj_type, obj = self.data.get_object_at(offset)
        if type(offset) is long:
            offset = int(offset)
        if resolve_ref is None:
            resolve_ref = self.get_raw
        return self.data.resolve_object(offset, obj_type, obj, resolve_ref)

    def __getitem__(self, sha1):
        """Retrieve the specified SHA1."""
        type, uncomp = self.get_raw(sha1)
        return ShaFile.from_raw_string(type, uncomp)

    def iterobjects(self, get_raw=None):
        """Iterate over the objects in this pack."""
        if get_raw is None:
            get_raw = self.get_raw
        for offset, type, obj, crc32 in self.data.iterobjects():
            assert isinstance(offset, int)
            yield ShaFile.from_raw_string(
                *self.data.resolve_object(offset, type, obj, get_raw))
def load_packs(path):
    """Yield a Pack object for every pack file found in *path*.

    :param path: Directory to scan; yields nothing if it does not exist.
    """
    if not os.path.exists(path):
        return
    for name in os.listdir(path):
        if name.startswith("pack-") and name.endswith(".pack"):
            # The basename (without .pack) also locates the matching .idx.
            yield Pack(os.path.join(path, name[:-len(".pack")]))
1109 from dulwich._pack import apply_delta, bisect_find_sha