1 # pack.py -- For dealing with packed git objects.
2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3 # Copyright (C) 2008 Jelmer Vernooij <jelmer@samba.org>
4 # The code is loosely based on that in the sha1_file.c file from git itself,
5 # which is Copyright (C) Linus Torvalds, 2005 and distributed under the
8 # This program is free software; you can redistribute it and/or
9 # modify it under the terms of the GNU General Public License
10 # as published by the Free Software Foundation; version 2
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with this program; if not, write to the Free Software
20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23 """Classes for dealing with packed git objects.
25 A pack is a compact representation of a bunch of objects, stored
26 using deltas where possible.
28 They have two parts, the pack file, which stores the data, and an index
29 that tells you where the data is.
31 To find an object you look in all of the index files 'til you find a
32 match for the object name. You then use the pointer got from this as
33 a pointer in to the corresponding packfile.
36 from collections import defaultdict
38 from itertools import imap, izip
49 from errors import ApplyDeltaError
51 supports_mmap_offset = (sys.version_info[0] >= 3 or
52 (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
def take_msb_bytes(map, offset):
    """Read bytes from map, starting at offset, until a byte without the
    most-significant bit (0x80) set is read.

    :param map: Indexable byte container (string / mmap'ed area).
    :param offset: Offset to start reading at.
    :return: List of integer byte values; only the last one has MSB clear.
    """
    # The listing had lost the accumulator init, offset increment and
    # return; restored so the loop terminates and yields its result.
    ret = []
    while len(ret) == 0 or ret[-1] & 0x80:
        ret.append(ord(map[offset]))
        offset += 1
    return ret
def read_zlib(data, offset, dec_size):
    """Decompress a zlib stream embedded in a buffer.

    Feeds 1024-byte chunks to a decompressobj until it reports unused
    trailing data, i.e. until the deflate stream has ended.

    :param data: Buffer containing the compressed stream.
    :param offset: Offset at which the compressed stream starts.
    :param dec_size: Expected size of the decompressed data.
    :return: Tuple of (decompressed string, compressed length consumed).
    """
    obj = zlib.decompressobj()
    x = ""
    fed = 0
    # unused_data becomes non-empty once bytes past the end of the
    # deflate stream have been fed in.
    while obj.unused_data == "":
        base = offset + fed
        add = data[base:base+1024]
        fed += len(add)
        x += obj.decompress(add)
    assert len(x) == dec_size
    # Bytes fed minus the trailing bytes zlib did not consume.
    comp_len = fed - len(obj.unused_data)
    return x, comp_len
def iter_sha1(shas):
    """Return the hex SHA1 digest over an iterable of binary sha strings.

    :param shas: Iterable of sha strings to fold into one digest.
    :return: 40-character hex digest string.
    """
    # NOTE(review): the def line and loop were missing from the listing;
    # reconstructed from the call site (objects_sha1 passes _itersha()).
    sha = hashlib.sha1()
    for name in shas:
        sha.update(name)
    return sha.hexdigest()
def hex_to_sha(hex):
    """Convert a hex string to a binary sha string.

    :param hex: Hex string (two hex digits per output byte).
    :return: String whose characters are the decoded byte values.
    """
    # The def line, accumulator init and return were missing from the
    # listing; restored.
    ret = ""
    for i in range(0, len(hex), 2):
        ret += chr(int(hex[i:i+2], 16))
    return ret
def sha_to_hex(sha):
    """Convert a binary sha string to a hex sha string.

    :param sha: String of raw byte values.
    :return: Lowercase hex string, two digits per input byte.
    """
    # The def line, accumulator init and return were missing from the
    # listing; restored.
    ret = ""
    for i in sha:
        ret += "%02x" % ord(i)
    return ret
98 MAX_MMAP_SIZE = 256 * 1024 * 1024
def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
    """Simple wrapper for mmap() which always supports the offset parameter.

    :param f: File object.
    :param offset: Offset in the file, from the beginning of the file.
    :param size: Size of the mmap'ed area
    :param access: Access mechanism.
    :return: MMAP'd area.
    """
    if offset+size > MAX_MMAP_SIZE and not supports_mmap_offset:
        # Bug fix: the original message contained a "%s" placeholder but
        # never applied the % operator, so it printed literally.
        raise AssertionError(
            "%d bytes is larger than 256 meg, and this version of Python "
            "does not support the offset argument to mmap()."
            % (offset+size))
    if supports_mmap_offset:
        return mmap.mmap(f.fileno(), size, access=access, offset=offset)
    else:
        # Fallback for Pythons without mmap offset support: map from the
        # start of the file and wrap the map in an offset-skipping view.
        class ArraySkipper(object):
            """Array-like view that skips the first offset entries."""

            def __init__(self, array, offset):
                self.array = array
                self.offset = offset

            def __getslice__(self, i, j):
                return self.array[i+self.offset:j+self.offset]

            def __getitem__(self, i):
                return self.array[i+self.offset]

            def __len__(self):
                return len(self.array) - self.offset

            def __str__(self):
                return str(self.array[self.offset:])

        mem = mmap.mmap(f.fileno(), size+offset, access=access)
        return ArraySkipper(mem, offset)
def resolve_object(offset, type, obj, get_ref, get_offset):
    """Resolve an object, possibly resolving deltas when necessary.

    :param offset: Offset of the object within the pack file.
    :param type: Pack object type (6 = offset delta, 7 = ref delta).
    :param obj: Raw object body, or a (base, delta) tuple for deltas.
    :param get_ref: Callback returning (type, obj) for a binary sha.
    :param get_offset: Callback returning (type, obj) for a pack offset.
    :return: Tuple of (resolved type, resolved object text).
    """
    if not type in (6, 7):  # Not a delta -- nothing to resolve.
        return type, obj

    if type == 6:  # offset delta: base lives earlier in the same pack
        (delta_offset, delta) = obj
        assert isinstance(delta_offset, int)
        assert isinstance(delta, str)
        offset = offset-delta_offset
        type, base_obj = get_offset(offset)
        assert isinstance(type, int)
    elif type == 7:  # ref delta: base is referenced by its binary sha
        (basename, delta) = obj
        assert isinstance(basename, str) and len(basename) == 20
        assert isinstance(delta, str)
        type, base_obj = get_ref(basename)
        assert isinstance(type, int)
    # Recurse in case the base is itself a delta, then apply ours on top.
    type, base_text = resolve_object(offset, type, base_obj, get_ref, get_offset)
    return type, apply_delta(base_text, delta)
161 class PackIndex(object):
162 """An index in to a packfile.
164 Given a sha id of an object a pack index can tell you the location in the
165 packfile of that object if it has it.
167 To do the loop it opens the file, and indexes first 256 4 byte groups
168 with the first byte of the sha id. The value in the four byte group indexed
169 is the end of the group that shares the same starting byte. Subtract one
170 from the starting byte and index again to find the start of the group.
171 The values are sorted by sha id within the group, so do the math to find
172 the start and end offset and then bisect in to find if the value is present.
# NOTE(review): this listing is gappy -- several original lines are missing
# between the numbered lines below -- so the code is documented as-is and
# behaviour claims are hedged where the supporting line is absent.
175 def __init__(self, filename):
176 """Create a pack index object.
178 Provide it with the name of the index file to consider, and it will map
179 it whenever required.
# The index file is mmap'ed here and kept mapped via self._contents.
181 self._filename = filename
182 # Take the size now, so it can be checked each time we map the file to
183 # ensure that it hasn't changed.
184 self._size = os.path.getsize(filename)
185 self._file = open(filename, 'r')
186 self._contents = simple_mmap(self._file, 0, self._size)
# A v2 index begins with the magic '\377tOc'; anything else is presumably
# treated as version 1 (the line setting self.version = 1 is missing here).
187 if self._contents[:4] != '\377tOc':
189 self._fan_out_table = self._read_fan_out_table(0)
191 (self.version, ) = struct.unpack_from(">L", self._contents, 4)
192 assert self.version in (2,), "Version was %d" % self.version
193 self._fan_out_table = self._read_fan_out_table(8)
# v2 layout: header(8) + fan-out(256*4) + names(20*n) + crc32(4*n) + offsets.
194 self._name_table_offset = 8 + 0x100 * 4
195 self._crc32_table_offset = self._name_table_offset + 20 * len(self)
196 self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
198 def __eq__(self, other):
199 if type(self) != type(other):
202 if self._fan_out_table != other._fan_out_table:
# Entry-by-entry name comparison (izip is the Python 2 itertools variant).
205 for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
# The fan-out table's last slot is the cumulative count of all entries.
214 """Return the number of entries in this pack index."""
215 return self._fan_out_table[-1]
217 def _unpack_entry(self, i):
218 """Unpack the i-th entry in the index file.
220 :return: Tuple with object name (SHA), offset in pack file and
221 CRC32 checksum (if known)."""
222 if self.version == 1:
223 (offset, name) = struct.unpack_from(">L20s", self._contents,
224 (0x100 * 4) + (i * 24))
225 return (name, offset, None)
227 return (self._unpack_name(i), self._unpack_offset(i),
228 self._unpack_crc32_checksum(i))
230 def _unpack_name(self, i):
231 if self.version == 1:
232 return self._unpack_entry(i)[0]
# v2 keeps all 20-byte names contiguously in the name table.
234 return struct.unpack_from("20s", self._contents,
235 self._name_table_offset + i * 20)[0]
237 def _unpack_offset(self, i):
238 if self.version == 1:
239 return self._unpack_entry(i)[1]
241 return struct.unpack_from(">L", self._contents,
242 self._pack_offset_table_offset + i * 4)[0]
244 def _unpack_crc32_checksum(self, i):
# v1 indexes store no CRC32 (the return None line is missing from this
# listing but implied by _unpack_entry above).
245 if self.version == 1:
248 return struct.unpack_from(">L", self._contents,
249 self._crc32_table_offset + i * 4)[0]
# __iter__ (def line missing here) maps binary shas to hex form.
252 return imap(sha_to_hex, self._itersha())
255 for i in range(len(self)):
256 yield self._unpack_name(i)
258 def objects_sha1(self):
259 return iter_sha1(self._itersha())
261 def iterentries(self):
262 """Iterate over the entries in this pack index.
264 Will yield tuples with object name, offset in packfile and crc32 checksum.
266 for i in range(len(self)):
267 yield self._unpack_entry(i)
269 def _read_fan_out_table(self, start_offset):
# Reads 256 big-endian 32-bit cumulative counts, one per first byte value.
271 for i in range(0x100):
272 ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
276 """Check that the stored checksum matches the actual checksum."""
277 return self.calculate_checksum() == self.get_stored_checksums()[1]
279 def calculate_checksum(self):
280 f = open(self._filename, 'r')
# SHA1 over everything except the trailing 20-byte checksum itself.
282 return hashlib.sha1(self._contents[:-20]).digest()
286 def get_stored_checksums(self):
287 """Return the SHA1 checksums stored for the corresponding packfile and
288 this header file itself."""
289 return str(self._contents[-40:-20]), str(self._contents[-20:])
291 def object_index(self, sha):
292 """Return the index in to the corresponding packfile for the object.
294 Given the name of an object it will return the offset that object lives
295 at within the corresponding pack file. If the pack file doesn't have the
296 object then None will be returned.
298 size = os.path.getsize(self._filename)
299 assert size == self._size, "Pack index %s has changed size, I don't " \
300 "like that" % self._filename
# Presumably hex input is converted to binary before lookup; a guard line
# may be missing here -- TODO confirm against the full source.
302 sha = hex_to_sha(sha)
303 return self._object_index(sha)
305 def _object_index(self, sha):
306 """See object_index"""
# Binary search between the fan-out bounds for sha's first byte; the
# surrounding setup and loop lines are missing from this listing.
311 start = self._fan_out_table[idx-1]
312 end = self._fan_out_table[idx]
316 file_sha = self._unpack_name(i)
322 return self._unpack_offset(i)
326 class PackData(object):
327 """The data contained in a packfile.
329 Pack files can be accessed both sequentially for exploding a pack, and
330 directly with the help of an index to retrieve a specific object.
332 The objects within are either complete or a delta against another.
334 The header is variable length. If the MSB of each byte is set then it
335 indicates that the subsequent byte is still part of the header.
336 For the first byte the next MS bits are the type, which tells you the type
337 of object, and whether it is a delta. The LS byte is the lowest bits of the
338 size. For each subsequent byte the LS 7 bits are the next MS bits of the
339 size, i.e. the last byte of the header contains the MS bits of the size.
341 For the complete objects the data is stored as zlib deflated data.
342 The size in the header is the uncompressed object size, so to uncompress
343 you need to just keep feeding data to zlib until you get an object back,
344 or it errors on bad data. This is done here by just giving the complete
345 buffer from the start of the deflated object on. This is bad, but until I
346 get mmap sorted out it will have to do.
348 Currently there are no integrity checks done. Also no attempt is made to try
349 and detect the delta case, or a request for an object at the wrong position.
350 It will all just throw a zlib or KeyError.
# NOTE(review): this listing is gappy; missing lines are noted where the
# remaining code visibly depends on them.
353 def __init__(self, filename):
354 """Create a PackData object that represents the pack in the given filename.
356 The file must exist and stay readable until the object is disposed of. It
357 must also stay the same size. It will be mapped whenever needed.
359 Currently there is a restriction on the size of the pack as the python
360 mmap implementation is flawed.
362 self._filename = filename
363 assert os.path.exists(filename), "%s is not a packfile" % filename
364 self._size = os.path.getsize(filename)
365 assert self._size >= 12, "%s is too small for a packfile" % filename
366 self._header_size = self._read_header()
368 def _read_header(self):
369 f = open(self._filename, 'rb')
# Reads the 12-byte header (lines missing here) and, while the file is
# open, the trailing 20-byte checksum.
372 f.seek(self._size-20)
373 self._stored_checksum = f.read(20)
376 assert header[:4] == "PACK"
377 (version,) = struct.unpack_from(">L", header, 4)
378 assert version in (2, 3), "Version was %d" % version
379 (self._num_objects,) = struct.unpack_from(">L", header, 8)
380 return 12 # Header size
# __len__ (def line missing from this listing):
383 """Returns the number of objects in this pack."""
384 return self._num_objects
386 def calculate_checksum(self):
# SHA1 over the whole file except the trailing checksum.
387 f = open(self._filename, 'rb')
389 map = simple_mmap(f, 0, self._size)
390 return hashlib.sha1(map[:-20]).digest()
394 def iterobjects(self):
395 offset = self._header_size
396 f = open(self._filename, 'rb')
397 for i in range(len(self)):
398 map = simple_mmap(f, offset, self._size-offset)
399 (type, obj, total_size) = self._unpack_object(map)
# The offset advance (offset += total_size, presumably) is missing here.
400 yield offset, type, obj
404 def iterentries(self):
# Resolves every object, postponing ref deltas whose base has not been
# seen yet and retrying them once the base appears.
407 postponed = defaultdict(list)
408 class Postpone(Exception):
409 """Raised to postpone delta resolving."""
411 def get_ref_text(sha):
# Falls back to Postpone when the base sha is not yet known (the lookup
# lines before the raise are missing from this listing).
414 raise Postpone, (sha, )
415 todo = list(self.iterobjects())
417 (offset, type, obj) = todo.pop(0)
418 at[offset] = (type, obj)
419 assert isinstance(offset, int)
420 assert isinstance(type, int)
421 assert isinstance(obj, tuple) or isinstance(obj, str)
423 type, obj = resolve_object(offset, type, obj, get_ref_delta,
425 except Postpone, (sha, ):
426 postponed[sha].append((offset, type, obj))
428 shafile = ShaFile.from_raw_string(type, obj)
429 sha = shafile.sha().digest()
430 found[sha] = (type, obj)
431 yield sha, offset, shafile.crc32()
# Anything that was waiting on this sha can now be resolved.
432 todo += postponed.get(sha, [])
434 raise KeyError([sha_to_hex(h) for h in postponed.keys()])
436 def sorted_entries(self):
437 ret = list(self.iterentries())
# The sort and return lines are missing from this listing.
441 def create_index_v1(self, filename):
442 entries = self.sorted_entries()
443 write_pack_index_v1(filename, entries, self.calculate_checksum())
445 def create_index_v2(self, filename):
446 entries = self.sorted_entries()
447 write_pack_index_v2(filename, entries, self.calculate_checksum())
449 def get_stored_checksum(self):
450 return self._stored_checksum
# check() (def line missing from this listing):
453 return (self.calculate_checksum() == self.get_stored_checksum())
455 def get_object_at(self, offset):
456 """Given an offset in to the packfile return the object that is there.
458 Using the associated index the location of an object can be looked up, and
459 then the packfile can be asked directly for that object using this
462 assert isinstance(offset, long) or isinstance(offset, int),\
463 "offset was %r" % offset
464 assert offset >= self._header_size
465 size = os.path.getsize(self._filename)
466 assert size == self._size, "Pack data %s has changed size, I don't " \
467 "like that" % self._filename
468 f = open(self._filename, 'rb')
470 map = simple_mmap(f, offset, size-offset)
471 return self._unpack_object(map)[:2]
475 def _unpack_object(self, map):
# Variable-length header: low nibble of byte 0 plus 7 bits per MSB byte.
476 bytes = take_msb_bytes(map, 0)
477 type = (bytes[0] >> 4) & 0x07
478 size = bytes[0] & 0x0f
479 for i, byte in enumerate(bytes[1:]):
480 size += (byte & 0x7f) << ((i * 7) + 4)
481 raw_base = len(bytes)
482 if type == 6: # offset delta
483 bytes = take_msb_bytes(map, raw_base)
484 assert not (bytes[-1] & 0x80)
485 delta_base_offset = bytes[0] & 0x7f
486 for byte in bytes[1:]:
487 delta_base_offset += 1
488 delta_base_offset <<= 7
489 delta_base_offset += (byte & 0x7f)
491 uncomp, comp_len = read_zlib(map, raw_base, size)
492 assert size == len(uncomp)
493 return type, (delta_base_offset, uncomp), comp_len+raw_base
494 elif type == 7: # ref delta
# 20-byte binary sha of the base object precedes the compressed delta.
495 basename = map[raw_base:raw_base+20]
496 uncomp, comp_len = read_zlib(map, raw_base+20, size)
497 assert size == len(uncomp)
498 return type, (basename, uncomp), comp_len+raw_base+20
500 uncomp, comp_len = read_zlib(map, raw_base, size)
501 assert len(uncomp) == size
502 return type, uncomp, comp_len+raw_base
505 class SHA1Writer(object):
# File-like wrapper that maintains a running SHA1 of everything written.
# NOTE(review): several lines (the write-through to the wrapped file,
# write_sha/close bodies) are missing from this listing.
507 def __init__(self, f):
509 self.sha1 = hashlib.sha1("")
511 def write(self, data):
# Fold the data into the running digest (the pass-through write to the
# wrapped file is on a missing line).
512 self.sha1.update(data)
516 sha = self.sha1.digest()
517 assert len(sha) == 20
522 sha = self.write_sha()
530 def write_pack_object(f, type, object):
531 """Write pack object to a file.
533 :param f: File to write to
534 :param object: Object to write
# NOTE(review): size computation and the size-header loop lines are
# missing from this listing.
537 if type == 6: # offset delta
538 (delta_base_offset, object) = object
539 elif type == 7: # ref delta
540 (basename, object) = object
# First header byte: type in bits 4-6, low 4 bits of the size.
542 c = (type << 4) | (size & 15)
545 f.write(chr(c | 0x80))
549 if type == 6: # offset delta
# Encode the base offset in git's variable-length big-endian form.
550 ret = [delta_base_offset & 0x7f]
551 delta_base_offset >>= 7
552 while delta_base_offset:
553 delta_base_offset -= 1
554 ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
555 delta_base_offset >>= 7
556 f.write("".join([chr(x) for x in ret]))
557 elif type == 7: # ref delta
558 assert len(basename) == 20
560 f.write(zlib.compress(object))
564 def write_pack(filename, objects, num_objects):
# Write a pack data file plus its v2 index, using `filename` as the
# basename for both the .pack and .idx files.
565 f = open(filename + ".pack", 'w')
567 entries, data_sum = write_pack_data(f, objects, num_objects)
571 write_pack_index_v2(filename + ".idx", entries, data_sum)
574 def write_pack_data(f, objects, num_objects):
575 """Write a new pack file.
577 :param f: File object to write the pack data to.
578 :param objects: List of objects to write.
579 :return: List with (name, offset, crc32 checksum) entries, pack checksum
# NOTE(review): the SHA1Writer wrapping, entries init, loop header and
# crc32 computation are on lines missing from this listing.
583 f.write("PACK") # Pack header
584 f.write(struct.pack(">L", 2)) # Pack version
585 f.write(struct.pack(">L", num_objects)) # Number of objects in pack
587 sha1 = o.sha().digest()
590 t, o = o.as_raw_string()
591 offset = write_pack_object(f, t, o)
592 entries.append((sha1, offset, crc32))
593 return entries, f.write_sha()
596 def write_pack_index_v1(filename, entries, pack_checksum):
597 """Write a new pack index file.
599 :param filename: The filename of the new pack index file.
600 :param entries: List of tuples with object name (sha), offset_in_pack, and
602 :param pack_checksum: Checksum of the pack file.
604 f = open(filename, 'w')
# Count entries per leading sha byte, then write the cumulative fan-out.
606 fan_out_table = defaultdict(lambda: 0)
607 for (name, offset, entry_checksum) in entries:
608 fan_out_table[ord(name[0])] += 1
610 for i in range(0x100):
611 f.write(struct.pack(">L", fan_out_table[i]))
612 fan_out_table[i+1] += fan_out_table[i]
# v1 body: each entry is a 4-byte offset followed by the 20-byte name.
613 for (name, offset, entry_checksum) in entries:
614 f.write(struct.pack(">L20s", offset, name))
615 assert len(pack_checksum) == 20
616 f.write(pack_checksum)
620 def apply_delta(src_buf, delta):
621 """Based on the similar function in git's patch-delta.c."""
# NOTE(review): this listing is heavily gappy (the pop() helper's def,
# loop headers and several branches are missing); documented as-is.
622 assert isinstance(src_buf, str), "was %r" % (src_buf,)
623 assert isinstance(delta, str)
# pop(): take one byte off the front of the delta (def line missing).
628 return ord(ret), delta
629 def get_delta_header_size(delta):
# Decode a variable-length size: 7 bits per byte, MSB = continuation.
633 cmd, delta = pop(delta)
634 size |= (cmd & ~0x80) << i
639 src_size, delta = get_delta_header_size(delta)
640 dest_size, delta = get_delta_header_size(delta)
641 assert src_size == len(src_buf)
643 cmd, delta = pop(delta)
# Copy opcode: gather optional offset bytes, then optional size bytes.
648 x, delta = pop(delta)
649 cp_off |= x << (i * 8)
652 if cmd & (1 << (4+i)):
653 x, delta = pop(delta)
654 cp_size |= x << (i * 8)
# Bounds check against overflow and out-of-range copies.
657 if (cp_off + cp_size < cp_size or
658 cp_off + cp_size > src_size or
659 cp_size > dest_size):
661 out += src_buf[cp_off:cp_off+cp_size]
666 raise ApplyDeltaError("Invalid opcode 0")
669 raise ApplyDeltaError("delta not empty: %r" % delta)
671 if dest_size != len(out):
672 raise ApplyDeltaError("dest size incorrect")
677 def write_pack_index_v2(filename, entries, pack_checksum):
678 """Write a new pack index file.
680 :param filename: The filename of the new pack index file.
681 :param entries: List of tuples with object name (sha), offset_in_pack, and
683 :param pack_checksum: Checksum of the pack file.
685 f = open(filename, 'w')
# NOTE(review): the SHA1Writer wrapping and the '\377tOc' magic write are
# on lines missing from this listing; version word follows the magic.
688 f.write(struct.pack(">L", 2))
689 fan_out_table = defaultdict(lambda: 0)
690 for (name, offset, entry_checksum) in entries:
691 fan_out_table[ord(name[0])] += 1
693 for i in range(0x100):
694 f.write(struct.pack(">L", fan_out_table[i]))
695 fan_out_table[i+1] += fan_out_table[i]
# Name table pass (the f.write(name) line is missing from this listing).
696 for (name, offset, entry_checksum) in entries:
# ">l" is signed: Python 2's zlib.crc32 can return negative values.
698 for (name, offset, entry_checksum) in entries:
699 f.write(struct.pack(">l", entry_checksum))
700 for (name, offset, entry_checksum) in entries:
701 # FIXME: handle if MSBit is set in offset
702 f.write(struct.pack(">L", offset))
703 # FIXME: handle table for pack files > 8 Gb
704 assert len(pack_checksum) == 20
705 f.write(pack_checksum)
711 def __init__(self, basename):
# Derive the .pack and .idx paths from the shared basename; both are
# opened lazily elsewhere.
712 self._basename = basename
713 self._data_path = self._basename + ".pack"
714 self._idx_path = self._basename + ".idx"
719 return self.idx.objects_sha1()
723 if self._data is None:
724 self._data = PackData(self._data_path)
725 assert len(self.idx) == len(self._data)
726 assert self.idx.get_stored_checksums()[0] == self._data.get_stored_checksum()
731 if self._idx is None:
732 self._idx = PackIndex(self._idx_path)
736 if self._data is not None:
def __eq__(self, other):
    """Two packs compare equal when they are the same type and their
    indices are equal."""
    if type(self) != type(other):
        return False
    return self.idx == other.idx
744 """Number of entries in this pack."""
748 return "Pack(%r)" % self._basename
751 """Iterate over all the sha1s of the objects in this pack."""
752 return iter(self.idx)
755 return self.idx.check() and self.data.check()
757 def get_stored_checksum(self):
# Delegates to the pack data file's stored (trailing) checksum.
758 return self.data.get_stored_checksum()
def __contains__(self, sha1):
    """Check whether this pack contains a particular SHA1."""
    offset = self.idx.object_index(sha1)
    return offset is not None
764 def _get_text(self, sha1):
# Look up the object's pack offset via the index, then resolve any
# delta chain using this pack's own lookups.
765 offset = self.idx.object_index(sha1)
# NOTE(review): lines handling a None offset (object not in this pack)
# are missing from this listing -- presumably a KeyError is raised.
769 type, obj = self.data.get_object_at(offset)
770 assert isinstance(offset, int)
771 return resolve_object(offset, type, obj, self._get_text,
772 self.data.get_object_at)
def __getitem__(self, sha1):
    """Retrieve the specified SHA1."""
    obj_type, raw = self._get_text(sha1)
    return ShaFile.from_raw_string(obj_type, raw)
def iterobjects(self):
    """Yield a ShaFile for every object stored in this pack, resolving
    deltas as needed."""
    get_at = self.data.get_object_at
    for off, obj_type, raw in self.data.iterobjects():
        assert isinstance(off, int)
        resolved = resolve_object(off, obj_type, raw, self._get_text, get_at)
        yield ShaFile.from_raw_string(*resolved)
def load_packs(path):
    """Yield a Pack for every pack file found in the given directory.

    :param path: Directory to scan; yields nothing if it does not exist.
    """
    if not os.path.exists(path):
        # Restored: the early return after this check was on a line
        # missing from the listing; without it, listdir would raise.
        return
    for name in os.listdir(path):
        if name.startswith("pack-") and name.endswith(".pack"):
            yield Pack(os.path.join(path, name[:-len(".pack")]))