# pack.py -- For dealing with packed git objects.
# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
# Copyright (C) 2008 Jelmer Vernooij <jelmer@samba.org>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; version 2
# of the License or (at your option) a later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA  02110-1301, USA.
20 """Classes for dealing with packed git objects.
22 A pack is a compact representation of a bunch of objects, stored
23 using deltas where possible.
25 They have two parts, the pack file, which stores the data, and an index
26 that tells you where the data is.
28 To find an object you look in all of the index files 'til you find a
29 match for the object name. You then use the pointer got from this as
30 a pointer in to the corresponding packfile.
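
# Example of the lookup flow described above (a sketch; the pack
# basename and sha value are made up, the Pack class is defined at the
# bottom of this module):
#
#   p = Pack("pack-deadbeef")             # opens pack-deadbeef.{idx,pack}
#   offset = p.idx.object_index(sha)      # None if this pack lacks sha
#   if offset is not None:
#       type, obj = p.data.get_object_at(offset)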

try:
    from collections import defaultdict
except ImportError:
    class defaultdict(dict):
        """Minimal backport of collections.defaultdict for python < 2.5."""

        def __init__(self, default_factory=None, *a, **kw):
            if (default_factory is not None and
                not hasattr(default_factory, '__call__')):
                raise TypeError('first argument must be callable')
            dict.__init__(self, *a, **kw)
            self.default_factory = default_factory

        def __getitem__(self, key):
            try:
                return dict.__getitem__(self, key)
            except KeyError:
                return self.__missing__(key)

        def __missing__(self, key):
            if self.default_factory is None:
                raise KeyError(key)
            self[key] = value = self.default_factory()
            return value

        def __reduce__(self):
            if self.default_factory is None:
                args = tuple()
            else:
                args = self.default_factory,
            return type(self), args, None, None, self.items()

        def copy(self):
            return self.__copy__()

        def __copy__(self):
            return type(self)(self.default_factory, self)

        def __deepcopy__(self, memo):
            import copy
            return type(self)(self.default_factory,
                              copy.deepcopy(self.items()))

        def __repr__(self):
            return 'defaultdict(%s, %s)' % (self.default_factory,
                                            dict.__repr__(self))
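
# Sketch: the stdlib and fallback defaultdict behave the same way:
#
#   d = defaultdict(list)
#   d['missing'].append(1)     # absent key is created via list()
#   assert d['missing'] == [1]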

from itertools import imap, izip

import difflib
import mmap
import os
import sha
import struct
import sys
import zlib

from objects import (
        ShaFile,
        hex_to_sha,
        sha_to_hex,
        )
from errors import ApplyDeltaError


def unpack_from(fmt, buf, offset=0):
    """Unpack fmt from buf at offset (struct.unpack_from for python < 2.5)."""
    b = buf[offset:offset+struct.calcsize(fmt)]
    return struct.unpack(fmt, b)
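
# Sketch (byte string invented): a big-endian unsigned long at offset 0:
#
#   unpack_from(">L", "\x00\x00\x00\x2a")   # -> (42,)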

# mmap() only grew an offset argument in python 2.6.
supports_mmap_offset = (sys.version_info[0] >= 3 or
        (sys.version_info[0] == 2 and sys.version_info[1] >= 6))


def take_msb_bytes(map, offset):
    """Read bytes from map at offset until one with the MSB unset is seen."""
    ret = []
    while len(ret) == 0 or ret[-1] & 0x80:
        ret.append(ord(map[offset]))
        offset += 1
    return ret
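
# Sketch (bytes invented): 0x95 has its MSB set, so reading continues
# into 0x0b (MSB clear) and then stops:
#
#   take_msb_bytes("\x95\x0b", 0)   # -> [0x95, 0x0b]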


def read_zlib(data, offset, dec_size):
    """Inflate the zlib stream that starts at offset in data.

    :return: Tuple with the decompressed text and the compressed length.
    """
    obj = zlib.decompressobj()
    x = ""
    fed = 0
    while obj.unused_data == "":
        base = offset+fed
        add = data[base:base+1024]
        if len(add) < 1024:
            add += "Z"  # pad so unused_data becomes non-empty at stream end
        fed += len(add)
        x += obj.decompress(add)
    assert len(x) == dec_size
    comp_len = fed-len(obj.unused_data)
    return x, comp_len
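
# Sketch: given a buffer that begins with a deflated stream, read_zlib()
# returns the inflated text plus the compressed length, so a caller can
# skip to whatever follows the stream:
#
#   comp = zlib.compress("hello")
#   text, comp_len = read_zlib(comp + "trailing", 0, 5)
#   assert text == "hello" and comp_len == len(comp)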


def iter_sha1(iter):
    """Return the hexdigest of the SHA1 over a set of names."""
    sha1 = sha.sha("")
    for name in iter:
        sha1.update(name)
    return sha1.hexdigest()


MAX_MMAP_SIZE = 256 * 1024 * 1024


def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
    """Simple wrapper for mmap() which always supports the offset parameter.

    :param f: File object.
    :param offset: Offset in the file, from the beginning of the file.
    :param size: Size of the mmap'ed area
    :param access: Access mechanism.
    :return: MMAP'd area.
    """
    if offset+size > MAX_MMAP_SIZE and not supports_mmap_offset:
        raise AssertionError("%s is larger than 256 meg, and this version "
            "of Python does not support the offset argument to mmap()."
            % f.name)
    if supports_mmap_offset:
        return mmap.mmap(f.fileno(), size, access=access, offset=offset)
    else:
        class ArraySkipper(object):
            """Wrapper around an array-like that hides the first offset bytes."""

            def __init__(self, array, offset):
                self.array = array
                self.offset = offset

            def __getslice__(self, i, j):
                return self.array[i+self.offset:j+self.offset]

            def __getitem__(self, i):
                return self.array[i+self.offset]

            def __len__(self):
                return len(self.array) - self.offset

            def __str__(self):
                return str(self.array[self.offset:])

        mem = mmap.mmap(f.fileno(), size+offset, access=access)
        return ArraySkipper(mem, offset)
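
# Sketch of how the rest of this module uses simple_mmap() (the path
# here is hypothetical):
#
#   f = open("objects/pack/pack-deadbeef.pack", 'rb')
#   map = simple_mmap(f, 12, os.path.getsize(f.name) - 12)
#   (type, obj, total_size) = unpack_object(map)   # defined below
#   f.close()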


def resolve_object(offset, type, obj, get_ref, get_offset):
    """Resolve an object, possibly resolving deltas when necessary."""
    if not type in (6, 7): # Not a delta
        return type, obj

    if type == 6: # offset delta
        (delta_offset, delta) = obj
        assert isinstance(delta_offset, int)
        assert isinstance(delta, str)
        offset = offset-delta_offset
        type, base_obj = get_offset(offset)
        assert isinstance(type, int)
    elif type == 7: # ref delta
        (basename, delta) = obj
        assert isinstance(basename, str) and len(basename) == 20
        assert isinstance(delta, str)
        type, base_obj = get_ref(basename)
        assert isinstance(type, int)
    type, base_text = resolve_object(offset, type, base_obj, get_ref, get_offset)
    return type, apply_delta(base_text, delta)
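
# Sketch: the recursion above walks a delta chain back to a full object
# (types 1-4), then replays each delta with apply_delta(). For a ref
# delta a call looks like this (the two callables are hypothetical
# stand-ins for PackIndex/PackData lookups):
#
#   type, text = resolve_object(offset, 7, (base_sha, delta),
#                               my_get_ref, my_get_offset)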


class PackIndex(object):
    """An index in to a packfile.

    Given a sha id of an object a pack index can tell you the location in the
    packfile of that object if it has it.

    To do the lookup it maps the file, and treats the first 256 4-byte groups
    as a fan-out table indexed by the first byte of the sha id. The value in
    the group indexed is the end of the group of entries that share the same
    starting byte. Subtract one from the starting byte and index again to find
    the start of the group. The entries are sorted by sha id within the group,
    so the group can be bisected to find out if the value is present.
    """
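
    # Worked example of the fan-out lookup (numbers invented): for a sha
    # whose first byte is 0xab,
    #
    #   start = self._fan_out_table[0xaa]   # entries before the 0xab group
    #   end = self._fan_out_table[0xab]     # end of the 0xab group
    #
    # and the sha, if present, sits at an index i with start <= i < end,
    # found by bisection (see _object_index below).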

    def __init__(self, filename):
        """Create a pack index object.

        Provide it with the name of the index file to consider, and it will map
        it whenever required.
        """
        self._filename = filename
        # Take the size now, so it can be checked each time we map the file to
        # ensure that it hasn't changed.
        self._size = os.path.getsize(filename)
        self._file = open(filename, 'r')
        self._contents = simple_mmap(self._file, 0, self._size)
        if self._contents[:4] != '\377tOc':
            self.version = 1
            self._fan_out_table = self._read_fan_out_table(0)
        else:
            (self.version, ) = unpack_from(">L", self._contents, 4)
            assert self.version in (2,), "Version was %d" % self.version
            self._fan_out_table = self._read_fan_out_table(8)
            self._name_table_offset = 8 + 0x100 * 4
            self._crc32_table_offset = self._name_table_offset + 20 * len(self)
            self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)

    def __eq__(self, other):
        if type(self) != type(other):
            return False

        if self._fan_out_table != other._fan_out_table:
            return False

        for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
            if name1 != name2:
                return False
        return True

    def __len__(self):
        """Return the number of entries in this pack index."""
        return self._fan_out_table[-1]

    def _unpack_entry(self, i):
        """Unpack the i-th entry in the index file.

        :return: Tuple with object name (SHA), offset in pack file and
            CRC32 checksum (if known)."""
        if self.version == 1:
            (offset, name) = unpack_from(">L20s", self._contents,
                (0x100 * 4) + (i * 24))
            return (name, offset, None)
        else:
            return (self._unpack_name(i), self._unpack_offset(i),
                    self._unpack_crc32_checksum(i))

    def _unpack_name(self, i):
        if self.version == 1:
            return self._unpack_entry(i)[0]
        else:
            return unpack_from("20s", self._contents,
                self._name_table_offset + i * 20)[0]

    def _unpack_offset(self, i):
        if self.version == 1:
            return self._unpack_entry(i)[1]
        else:
            return unpack_from(">L", self._contents,
                self._pack_offset_table_offset + i * 4)[0]

    def _unpack_crc32_checksum(self, i):
        if self.version == 1:
            return None
        else:
            return unpack_from(">L", self._contents,
                self._crc32_table_offset + i * 4)[0]

    def __iter__(self):
        return imap(sha_to_hex, self._itersha())

    def _itersha(self):
        for i in range(len(self)):
            yield self._unpack_name(i)

    def objects_sha1(self):
        return iter_sha1(self._itersha())

    def iterentries(self):
        """Iterate over the entries in this pack index.

        Will yield tuples with object name, offset in packfile and crc32 checksum.
        """
        for i in range(len(self)):
            yield self._unpack_entry(i)

    def _read_fan_out_table(self, start_offset):
        ret = []
        for i in range(0x100):
            ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
        return ret

    def close(self):
        self._file.close()

    def check(self):
        """Check that the stored checksum matches the actual checksum."""
        return self.calculate_checksum() == self.get_stored_checksums()[1]

    def calculate_checksum(self):
        """Calculate the SHA1 checksum over the mapped contents, sans trailer."""
        return sha.sha(self._contents[:-20]).digest()

    def get_stored_checksums(self):
        """Return the SHA1 checksums stored for the corresponding packfile and
        this header file itself."""
        return str(self._contents[-40:-20]), str(self._contents[-20:])

    def object_index(self, sha):
        """Return the index in to the corresponding packfile for the object.

        Given the name of an object it will return the offset that object lives
        at within the corresponding pack file. If the pack file doesn't have the
        object then None will be returned.
        """
        size = os.path.getsize(self._filename)
        assert size == self._size, "Pack index %s has changed size, I don't " \
             "like that" % self._filename
        if len(sha) == 40:
            sha = hex_to_sha(sha)
        return self._object_index(sha)

    def _object_index(self, sha):
        """See object_index."""
        idx = ord(sha[0])
        if idx == 0:
            start = 0
        else:
            start = self._fan_out_table[idx-1]
        end = self._fan_out_table[idx]
        while start < end:
            i = (start + end) / 2
            file_sha = self._unpack_name(i)
            if file_sha < sha:
                start = i + 1
            elif file_sha > sha:
                end = i
            else:
                return self._unpack_offset(i)
        return None


def read_pack_header(f):
    header = f.read(12)
    assert header[:4] == "PACK"
    (version,) = unpack_from(">L", header, 4)
    assert version in (2, 3), "Version was %d" % version
    (num_objects,) = unpack_from(">L", header, 8)
    return (version, num_objects)


def read_pack_tail(f):
    return (f.read(20),)
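
# Sketch: the pack header is always 12 bytes (the stream below is
# constructed just for illustration):
#
#   from StringIO import StringIO
#   f = StringIO("PACK" + struct.pack(">LL", 2, 3))
#   read_pack_header(f)   # -> (2, 3)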


def unpack_object(map):
    """Unpack the object header and data at the start of map.

    :return: Tuple with type, uncompressed data (or delta tuple) and the
        total number of bytes consumed.
    """
    bytes = take_msb_bytes(map, 0)
    type = (bytes[0] >> 4) & 0x07
    size = bytes[0] & 0x0f
    for i, byte in enumerate(bytes[1:]):
        size += (byte & 0x7f) << ((i * 7) + 4)
    raw_base = len(bytes)
    if type == 6: # offset delta
        bytes = take_msb_bytes(map, raw_base)
        assert not (bytes[-1] & 0x80)
        delta_base_offset = bytes[0] & 0x7f
        for byte in bytes[1:]:
            delta_base_offset += 1
            delta_base_offset <<= 7
            delta_base_offset += (byte & 0x7f)
        raw_base += len(bytes)
        uncomp, comp_len = read_zlib(map, raw_base, size)
        assert size == len(uncomp)
        return type, (delta_base_offset, uncomp), comp_len+raw_base
    elif type == 7: # ref delta
        basename = map[raw_base:raw_base+20]
        uncomp, comp_len = read_zlib(map, raw_base+20, size)
        assert size == len(uncomp)
        return type, (basename, uncomp), comp_len+raw_base+20
    else:
        uncomp, comp_len = read_zlib(map, raw_base, size)
        assert len(uncomp) == size
        return type, uncomp, comp_len+raw_base
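
# Worked example of the variable-length header (bytes invented): for a
# first byte 0x95 followed by 0x0b,
#
#   type = (0x95 >> 4) & 0x07                      # -> 1 (a commit)
#   size = (0x95 & 0x0f) + ((0x0b & 0x7f) << 4)    # -> 5 + 176 = 181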


class PackData(object):
    """The data contained in a packfile.

    Pack files can be accessed both sequentially for exploding a pack, and
    directly with the help of an index to retrieve a specific object.

    The objects within are either complete or a delta against another.

    The header is variable length. If the MSB of each byte is set then it
    indicates that the subsequent byte is still part of the header.
    For the first byte the next MS bits are the type, which tells you the type
    of object, and whether it is a delta. The LS byte is the lowest bits of the
    size. For each subsequent byte the LS 7 bits are the next MS bits of the
    size, i.e. the last byte of the header contains the MS bits of the size.

    For the complete objects the data is stored as zlib deflated data.
    The size in the header is the uncompressed object size, so to uncompress
    you need to just keep feeding data to zlib until you get an object back,
    or it errors on bad data. This is done here by just giving the complete
    buffer from the start of the deflated object on. This is bad, but until I
    get mmap sorted out it will have to do.

    Currently there are no integrity checks done. Also no attempt is made to
    try and detect the delta case, or a request for an object at the wrong
    position. It will all just throw a zlib or KeyError.
    """
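
    # Sketch of the two access patterns (the path is hypothetical):
    #
    #   data = PackData("objects/pack/pack-deadbeef.pack")
    #   entries = data.sorted_entries()        # sequential, resolves deltas
    #   type, obj = data.get_object_at(12)     # direct, via a known offset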

    def __init__(self, filename):
        """Create a PackData object that represents the pack in the given filename.

        The file must exist and stay readable until the object is disposed of. It
        must also stay the same size. It will be mapped whenever needed.

        Currently there is a restriction on the size of the pack as the python
        mmap implementation is flawed.
        """
        self._filename = filename
        assert os.path.exists(filename), "%s is not a packfile" % filename
        self._size = os.path.getsize(filename)
        self._header_size = 12
        assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (filename, self._size, self._header_size)
        self._read_header()

    def _read_header(self):
        f = open(self._filename, 'rb')
        try:
            (version, self._num_objects) = \
                    read_pack_header(f)
            f.seek(self._size-20)
            (self._stored_checksum,) = read_pack_tail(f)
        finally:
            f.close()

    def __len__(self):
        """Returns the number of objects in this pack."""
        return self._num_objects

    def calculate_checksum(self):
        f = open(self._filename, 'rb')
        try:
            map = simple_mmap(f, 0, self._size)
            return sha.sha(map[:-20]).digest()
        finally:
            f.close()

    def iterobjects(self):
        offset = self._header_size
        f = open(self._filename, 'rb')
        for i in range(len(self)):
            map = simple_mmap(f, offset, self._size-offset)
            (type, obj, total_size) = unpack_object(map)
            yield offset, type, obj
            offset += total_size
        f.close()

    def iterentries(self, ext_resolve_ref=None):
        found = {}
        at = {}
        postponed = defaultdict(list)
        class Postpone(Exception):
            """Raised to postpone delta resolving."""

        def get_ref_text(sha):
            if sha in found:
                return found[sha]
            if ext_resolve_ref:
                try:
                    return ext_resolve_ref(sha)
                except KeyError:
                    pass
            raise Postpone, (sha, )
        todo = list(self.iterobjects())
        while todo:
            (offset, type, obj) = todo.pop(0)
            at[offset] = (type, obj)
            assert isinstance(offset, int)
            assert isinstance(type, int)
            assert isinstance(obj, tuple) or isinstance(obj, str)
            try:
                type, obj = resolve_object(offset, type, obj, get_ref_text,
                    at.__getitem__)
            except Postpone, (sha, ):
                postponed[sha].append((offset, type, obj))
            else:
                shafile = ShaFile.from_raw_string(type, obj)
                sha = shafile.sha().digest()
                found[sha] = (type, obj)
                yield sha, offset, shafile.crc32()
                todo += postponed.get(sha, [])
        if postponed:
            raise KeyError([sha_to_hex(h) for h in postponed.keys()])

    def sorted_entries(self, resolve_ext_ref=None):
        ret = list(self.iterentries(resolve_ext_ref))
        ret.sort()
        return ret

    def create_index_v1(self, filename, resolve_ext_ref=None):
        entries = self.sorted_entries(resolve_ext_ref)
        write_pack_index_v1(filename, entries, self.calculate_checksum())

    def create_index_v2(self, filename, resolve_ext_ref=None):
        entries = self.sorted_entries(resolve_ext_ref)
        write_pack_index_v2(filename, entries, self.calculate_checksum())

    def get_stored_checksum(self):
        return self._stored_checksum

    def close(self):
        """No-op: files are opened and closed per operation."""
        pass

    def check(self):
        return (self.calculate_checksum() == self.get_stored_checksum())

    def get_object_at(self, offset):
        """Given an offset in to the packfile return the object that is there.

        Using the associated index the location of an object can be looked up, and
        then the packfile can be asked directly for that object using this
        function.
        """
        assert isinstance(offset, long) or isinstance(offset, int),\
                "offset was %r" % offset
        assert offset >= self._header_size
        size = os.path.getsize(self._filename)
        assert size == self._size, "Pack data %s has changed size, I don't " \
             "like that" % self._filename
        f = open(self._filename, 'rb')
        try:
            map = simple_mmap(f, offset, size-offset)
            return unpack_object(map)[:2]
        finally:
            f.close()


class SHA1Writer(object):
    """File-like wrapper that SHA1-sums everything written through it."""

    def __init__(self, f):
        self.f = f
        self.sha1 = sha.sha("")

    def write(self, data):
        self.sha1.update(data)
        self.f.write(data)

    def write_sha(self):
        sha = self.sha1.digest()
        assert len(sha) == 20
        self.f.write(sha)
        return sha

    def close(self):
        sha = self.write_sha()
        self.f.close()
        return sha

    def tell(self):
        return self.f.tell()
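
# Sketch: wrapping a file keeps a running SHA1 of everything written, so
# the 20-byte trailer of packs and indexes can be emitted at the end
# (the path is hypothetical):
#
#   f = SHA1Writer(open("/tmp/example.pack", 'w'))
#   f.write("PACK")
#   digest = f.close()   # appends the trailer and returns it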


def write_pack_object(f, type, object):
    """Write pack object to a file.

    :param f: File to write to
    :param type: Numeric type of the object
    :param object: Object to write
    :return: Offset at which the object was written
    """
    offset = f.tell()
    if type == 6: # offset delta
        (delta_base_offset, object) = object
    elif type == 7: # ref delta
        (basename, object) = object
    size = len(object)
    c = (type << 4) | (size & 15)
    size >>= 4
    while size:
        f.write(chr(c | 0x80))
        c = size & 0x7f
        size >>= 7
    f.write(chr(c))
    if type == 6: # offset delta
        ret = [delta_base_offset & 0x7f]
        delta_base_offset >>= 7
        while delta_base_offset:
            delta_base_offset -= 1
            ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
            delta_base_offset >>= 7
        f.write("".join([chr(x) for x in ret]))
    elif type == 7: # ref delta
        assert len(basename) == 20
        f.write(basename)
    f.write(zlib.compress(object))
    return offset


def write_pack(filename, objects, num_objects):
    f = open(filename + ".pack", 'w')
    try:
        entries, data_sum = write_pack_data(f, objects, num_objects)
    finally:
        f.close()
    entries.sort()
    write_pack_index_v2(filename + ".idx", entries, data_sum)
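
# Sketch (path and objects hypothetical): writing two objects produces
# /tmp/mypack.pack plus a matching /tmp/mypack.idx:
#
#   write_pack("/tmp/mypack", [(obj1, None), (obj2, None)], 2)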


def write_pack_data(f, objects, num_objects, window=10):
    """Write a new pack file.

    :param f: File to write the pack data to.
    :param objects: List of objects to write (tuples with object and path)
    :return: List with (name, offset, crc32 checksum) entries, pack checksum
    """
    recency = list(objects)
    # FIXME: Somehow limit delta depth
    # FIXME: Make thin-pack optional (it's not used when cloning a pack)
    # Build a list of objects ordered by the magic Linus heuristic
    # This helps us find good objects to diff against us
    magic = []
    for obj, path in recency:
        magic.append( (obj.type, path, 1, -len(obj.as_raw_string()[1]), obj) )
    magic.sort()
    # Build a map of objects and their index in magic - so we can find
    # preceding objects to diff against
    offs = {}
    for i in range(len(magic)):
        offs[magic[i][4]] = i
    # Write the pack
    entries = []
    f = SHA1Writer(f)
    f.write("PACK")               # Pack header
    f.write(struct.pack(">L", 2)) # Pack version
    f.write(struct.pack(">L", num_objects)) # Number of objects in pack
    for o, path in recency:
        sha1 = o.sha().digest()
        crc32 = o.crc32()
        orig_t, raw = o.as_raw_string()
        winner = raw
        t = orig_t
        #for i in range(offs[o]-window, window):
        #    if i < 0 or i >= len(offs): continue
        #    b = magic[i][4]
        #    if b.type != orig_t: continue
        #    _, base = b.as_raw_string()
        #    delta = create_delta(base, raw)
        #    if len(delta) < len(winner):
        #        winner = delta
        #        t = 6 if magic[i][2] == 1 else 7
        offset = write_pack_object(f, t, winner)
        entries.append((sha1, offset, crc32))
    return entries, f.write_sha()


def write_pack_index_v1(filename, entries, pack_checksum):
    """Write a new pack index file.

    :param filename: The filename of the new pack index file.
    :param entries: List of tuples with object name (sha), offset_in_pack, and
            crc32_checksum.
    :param pack_checksum: Checksum of the pack file.
    """
    f = open(filename, 'w')
    f = SHA1Writer(f)
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[0])] += 1
    # Fan-out table
    for i in range(0x100):
        f.write(struct.pack(">L", fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    for (name, offset, entry_checksum) in entries:
        f.write(struct.pack(">L20s", offset, name))
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    f.close()
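
# Worked example of the fan-out accumulation above (entries invented):
# two names starting with '\x00' and one starting with '\x02' give
# per-byte counts {0: 2, 2: 1}; the loop turns those into the cumulative
# values actually written: 2, 2, 3, 3, ..., 3.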


def create_delta(base_buf, target_buf):
    """Use python difflib to work out how to transform base_buf to target_buf"""
    assert isinstance(base_buf, str)
    assert isinstance(target_buf, str)
    out_buf = ""
    # write delta header
    def encode_size(size):
        ret = ""
        c = size & 0x7f
        size >>= 7
        while size:
            ret += chr(c | 0x80)
            c = size & 0x7f
            size >>= 7
        ret += chr(c)
        return ret
    out_buf += encode_size(len(base_buf))
    out_buf += encode_size(len(target_buf))
    # write out delta opcodes
    seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
    for opcode, i1, i2, j1, j2 in seq.get_opcodes():
        # Git patch opcodes don't care about deletes!
        #if opcode == "replace" or opcode == "delete":
        #    pass
        if opcode == "equal":
            # If they are equal, unpacker will use data from base_buf
            # Write out an opcode that says what range to use
            scratch = ""
            op = 0x80
            o = i1
            for i in range(4):
                if o & 0xff << i*8:
                    scratch += chr((o >> i*8) & 0xff)
                    op |= 1 << i
            s = i2 - i1
            for i in range(2):
                if s & 0xff << i*8:
                    scratch += chr((s >> i*8) & 0xff)
                    op |= 1 << (4+i)
            out_buf += chr(op)
            out_buf += scratch
        if opcode == "replace" or opcode == "insert":
            # If we are replacing a range or adding one, then we just
            # output it to the stream (prefixed by its size)
            s = j2 - j1
            o = j1
            while s > 127:
                out_buf += chr(127)
                out_buf += target_buf[o:o+127]
                s -= 127
                o += 127
            out_buf += chr(s)
            out_buf += target_buf[o:o+s]
    return out_buf


def apply_delta(src_buf, delta):
    """Based on the similar function in git's patch-delta.c."""
    assert isinstance(src_buf, str), "was %r" % (src_buf,)
    assert isinstance(delta, str)
    out = ""
    index = 0
    delta_length = len(delta)
    def pop(delta, index):
        ret = delta[index]
        index += 1
        return ord(ret), index
    def get_delta_header_size(delta, index):
        size = 0
        i = 0
        while True:
            cmd, index = pop(delta, index)
            size |= (cmd & ~0x80) << i
            i += 7
            if not cmd & 0x80:
                break
        return size, index
    src_size, index = get_delta_header_size(delta, index)
    dest_size, index = get_delta_header_size(delta, index)
    assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
    while index < delta_length:
        cmd, index = pop(delta, index)
        if cmd & 0x80:
            # Copy a range out of src_buf; offset and size bytes are
            # optional, selected by the low bits of cmd.
            cp_off = 0
            for i in range(4):
                if cmd & (1 << i):
                    x, index = pop(delta, index)
                    cp_off |= x << (i * 8)
            cp_size = 0
            for i in range(3):
                if cmd & (1 << (4+i)):
                    x, index = pop(delta, index)
                    cp_size |= x << (i * 8)
            if cp_size == 0:
                cp_size = 0x10000
            if (cp_off + cp_size < cp_size or
                cp_off + cp_size > src_size or
                cp_size > dest_size):
                break
            out += src_buf[cp_off:cp_off+cp_size]
        elif cmd != 0:
            # Insert the next cmd bytes literally from the delta.
            out += delta[index:index+cmd]
            index += cmd
        else:
            raise ApplyDeltaError("Invalid opcode 0")

    if index != delta_length:
        raise ApplyDeltaError("delta not empty: %r" % delta[index:])

    if dest_size != len(out):
        raise ApplyDeltaError("dest size incorrect")

    return out


def write_pack_index_v2(filename, entries, pack_checksum):
    """Write a new pack index file.

    :param filename: The filename of the new pack index file.
    :param entries: List of tuples with object name (sha), offset_in_pack, and
            crc32_checksum.
    :param pack_checksum: Checksum of the pack file.
    """
    f = open(filename, 'w')
    f = SHA1Writer(f)
    f.write('\377tOc') # Magic!
    f.write(struct.pack(">L", 2))
    fan_out_table = defaultdict(lambda: 0)
    for (name, offset, entry_checksum) in entries:
        fan_out_table[ord(name[0])] += 1
    # Fan-out table
    for i in range(0x100):
        f.write(struct.pack(">L", fan_out_table[i]))
        fan_out_table[i+1] += fan_out_table[i]
    for (name, offset, entry_checksum) in entries:
        f.write(name)
    for (name, offset, entry_checksum) in entries:
        f.write(struct.pack(">l", entry_checksum))
    for (name, offset, entry_checksum) in entries:
        # FIXME: handle if MSBit is set in offset
        f.write(struct.pack(">L", offset))
    # FIXME: handle table for pack files > 8 Gb
    assert len(pack_checksum) == 20
    f.write(pack_checksum)
    f.close()


class Pack(object):
    """A readable git pack, made up of a .pack data file and a .idx index."""

    def __init__(self, basename):
        self._basename = basename
        self._data_path = self._basename + ".pack"
        self._idx_path = self._basename + ".idx"
        self._data = None
        self._idx = None

    def name(self):
        return self.idx.objects_sha1()

    @property
    def data(self):
        if self._data is None:
            self._data = PackData(self._data_path)
            assert len(self.idx) == len(self._data)
            assert self.idx.get_stored_checksums()[0] == self._data.get_stored_checksum()
        return self._data

    @property
    def idx(self):
        if self._idx is None:
            self._idx = PackIndex(self._idx_path)
        return self._idx

    def close(self):
        if self._data is not None:
            self._data.close()
        self.idx.close()

    def __eq__(self, other):
        return type(self) == type(other) and self.idx == other.idx

    def __len__(self):
        """Number of entries in this pack."""
        return len(self.idx)

    def __repr__(self):
        return "Pack(%r)" % self._basename

    def __iter__(self):
        """Iterate over all the sha1s of the objects in this pack."""
        return iter(self.idx)

    def check(self):
        return self.idx.check() and self.data.check()

    def get_stored_checksum(self):
        return self.data.get_stored_checksum()

    def __contains__(self, sha1):
        """Check whether this pack contains a particular SHA1."""
        return (self.idx.object_index(sha1) is not None)

    def get_raw(self, sha1, resolve_ref=None):
        offset = self.idx.object_index(sha1)
        if offset is None:
            raise KeyError(sha1)

        type, obj = self.data.get_object_at(offset)
        if isinstance(offset, long):
            offset = int(offset)
        assert isinstance(offset, int)
        return resolve_object(offset, type, obj, resolve_ref,
            self.data.get_object_at)

    def __getitem__(self, sha1):
        """Retrieve the specified SHA1."""
        type, uncomp = self.get_raw(sha1)
        return ShaFile.from_raw_string(type, uncomp)

    def iterobjects(self, get_raw=None):
        if get_raw is None:
            def get_raw(sha):
                raise KeyError(sha)
        for offset, type, obj in self.data.iterobjects():
            assert isinstance(offset, int)
            yield ShaFile.from_raw_string(
                    *resolve_object(offset, type, obj, get_raw,
                self.data.get_object_at))


def load_packs(path):
    if not os.path.exists(path):
        return
    for name in os.listdir(path):
        if name.startswith("pack-") and name.endswith(".pack"):
            yield Pack(os.path.join(path, name[:-len(".pack")]))
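
# Sketch: iterating over every object in a repository's packs (the path
# is hypothetical):
#
#   for pack in load_packs(".git/objects/pack"):
#       for obj in pack.iterobjects():
#           print obj.sha().hexdigest()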