dulwich/pack.py

   1 # pack.py -- For dealing wih packed git objects.
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copryight (C) 2008 Jelmer Vernooij <jelmer@samba.org>
   4 # The code is loosely based on that in the sha1_file.c file from git itself,
   5 # which is Copyright (C) Linus Torvalds, 2005 and distributed under the
   6 # GPL version 2.
   7 #
   8 # This program is free software; you can redistribute it and/or
   9 # modify it under the terms of the GNU General Public License
  10 # as published by the Free Software Foundation; version 2
  11 # of the License.
  12 #
  13 # This program is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with this program; if not, write to the Free Software
  20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  21 # MA  02110-1301, USA.
  22
  23 """Classes for dealing with packed git objects.
  24
  25 A pack is a compact representation of a bunch of objects, stored
  26 using deltas where possible.
  27
  28 They have two parts, the pack file, which stores the data, and an index
  29 that tells you where the data is.
  30
  31 To find an object you look in all of the index files 'til you find a
  32 match for the object name. You then use the pointer got from this as
  33 a pointer in to the corresponding packfile.
  34 """
  35
  36 from collections import defaultdict
  37 import hashlib
  38 from itertools import imap, izip
  39 import mmap
  40 import os
  41 import sha
  42 import struct
  43 import sys
  44 import zlib
  45 import difflib
  46
  47 from objects import (
  48         ShaFile,
  49         hex_to_sha,
  50         sha_to_hex,
  51         )
  52 from errors import ApplyDeltaError
  53
  54 supports_mmap_offset = (sys.version_info[0] >= 3 or
  55         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
  56
  57
  58 def take_msb_bytes(map, offset):
  59     ret = []
  60     while len(ret) == 0 or ret[-1] & 0x80:
  61         ret.append(ord(map[offset]))
  62         offset += 1
  63     return ret
  64
  65
  66 def read_zlib(data, offset, dec_size):
  67     obj = zlib.decompressobj()
  68     x = ""
  69     fed = 0
  70     while obj.unused_data == "":
  71         base = offset+fed
  72         add = data[base:base+1024]
  73         fed += len(add)
  74         x += obj.decompress(add)
  75     assert len(x) == dec_size
  76     comp_len = fed-len(obj.unused_data)
  77     return x, comp_len
  78
  79
  80 def iter_sha1(iter):
  81     sha = hashlib.sha1()
  82     for name in iter:
  83         sha.update(name)
  84     return sha.hexdigest()
  85
  86
  87 MAX_MMAP_SIZE = 256 * 1024 * 1024
  88
  89 def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
  90     """Simple wrapper for mmap() which always supports the offset parameter.
  91
  92     :param f: File object.
  93     :param offset: Offset in the file, from the beginning of the file.
  94     :param size: Size of the mmap'ed area
  95     :param access: Access mechanism.
  96     :return: MMAP'd area.
  97     """
  98     if offset+size > MAX_MMAP_SIZE and not supports_mmap_offset:
  99         raise AssertionError("%s is larger than 256 meg, and this version "
 100             "of Python does not support the offset argument to mmap().")
 101     if supports_mmap_offset:
 102         return mmap.mmap(f.fileno(), size, access=access, offset=offset)
 103     else:
 104         class ArraySkipper(object):
 105
 106             def __init__(self, array, offset):
 107                 self.array = array
 108                 self.offset = offset
 109
 110             def __getslice__(self, i, j):
 111                 return self.array[i+self.offset:j+self.offset]
 112
 113             def __getitem__(self, i):
 114                 return self.array[i+self.offset]
 115
 116             def __len__(self):
 117                 return len(self.array) - self.offset
 118
 119             def __str__(self):
 120                 return str(self.array[self.offset:])
 121
 122         mem = mmap.mmap(f.fileno(), size+offset, access=access)
 123         if offset == 0:
 124             return mem
 125         return ArraySkipper(mem, offset)
 126
 127
 128 def resolve_object(offset, type, obj, get_ref, get_offset):
 129   """Resolve an object, possibly resolving deltas when necessary."""
 130   if not type in (6, 7): # Not a delta
 131      return type, obj
 132
 133   if type == 6: # offset delta
 134      (delta_offset, delta) = obj
 135      assert isinstance(delta_offset, int)
 136      assert isinstance(delta, str)
 137      offset = offset-delta_offset
 138      type, base_obj = get_offset(offset)
 139      assert isinstance(type, int)
 140   elif type == 7: # ref delta
 141      (basename, delta) = obj
 142      assert isinstance(basename, str) and len(basename) == 20
 143      assert isinstance(delta, str)
 144      type, base_obj = get_ref(basename)
 145      assert isinstance(type, int)
 146   type, base_text = resolve_object(offset, type, base_obj, get_ref, get_offset)
 147   return type, apply_delta(base_text, delta)
 148
 149
 150 class PackIndex(object):
 151   """An index in to a packfile.
 152
 153   Given a sha id of an object a pack index can tell you the location in the
 154   packfile of that object if it has it.
 155
 156   To do the loop it opens the file, and indexes first 256 4 byte groups
 157   with the first byte of the sha id. The value in the four byte group indexed
 158   is the end of the group that shares the same starting byte. Subtract one
 159   from the starting byte and index again to find the start of the group.
 160   The values are sorted by sha id within the group, so do the math to find
 161   the start and end offset and then bisect in to find if the value is present.
 162   """
 163
 164   def __init__(self, filename):
 165     """Create a pack index object.
 166
 167     Provide it with the name of the index file to consider, and it will map
 168     it whenever required.
 169     """
 170     self._filename = filename
 171     # Take the size now, so it can be checked each time we map the file to
 172     # ensure that it hasn't changed.
 173     self._size = os.path.getsize(filename)
 174     self._file = open(filename, 'r')
 175     self._contents = simple_mmap(self._file, 0, self._size)
 176     if self._contents[:4] != '\377tOc':
 177         self.version = 1
 178         self._fan_out_table = self._read_fan_out_table(0)
 179     else:
 180         (self.version, ) = struct.unpack_from(">L", self._contents, 4)
 181         assert self.version in (2,), "Version was %d" % self.version
 182         self._fan_out_table = self._read_fan_out_table(8)
 183         self._name_table_offset = 8 + 0x100 * 4
 184         self._crc32_table_offset = self._name_table_offset + 20 * len(self)
 185         self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
 186
 187   def __eq__(self, other):
 188     if type(self) != type(other):
 189         return False
 190
 191     if self._fan_out_table != other._fan_out_table:
 192         return False
 193
 194     for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
 195         if name1 != name2:
 196             return False
 197     return True
 198
 199   def close(self):
 200     self._file.close()
 201
 202   def __len__(self):
 203     """Return the number of entries in this pack index."""
 204     return self._fan_out_table[-1]
 205
 206   def _unpack_entry(self, i):
 207     """Unpack the i-th entry in the index file.
 208
 209     :return: Tuple with object name (SHA), offset in pack file and
 210           CRC32 checksum (if known)."""
 211     if self.version == 1:
 212         (offset, name) = struct.unpack_from(">L20s", self._contents,
 213             (0x100 * 4) + (i * 24))
 214         return (name, offset, None)
 215     else:
 216         return (self._unpack_name(i), self._unpack_offset(i),
 217                 self._unpack_crc32_checksum(i))
 218
 219   def _unpack_name(self, i):
 220     if self.version == 1:
 221         return self._unpack_entry(i)[0]
 222     else:
 223         return struct.unpack_from("20s", self._contents,
 224                                   self._name_table_offset + i * 20)[0]
 225
 226   def _unpack_offset(self, i):
 227     if self.version == 1:
 228         return self._unpack_entry(i)[1]
 229     else:
 230         return struct.unpack_from(">L", self._contents,
 231                                   self._pack_offset_table_offset + i * 4)[0]
 232
 233   def _unpack_crc32_checksum(self, i):
 234     if self.version == 1:
 235         return None
 236     else:
 237         return struct.unpack_from(">L", self._contents,
 238                                   self._crc32_table_offset + i * 4)[0]
 239
 240   def __iter__(self):
 241       return imap(sha_to_hex, self._itersha())
 242
 243   def _itersha(self):
 244     for i in range(len(self)):
 245         yield self._unpack_name(i)
 246
 247   def objects_sha1(self):
 248     return iter_sha1(self._itersha())
 249
 250   def iterentries(self):
 251     """Iterate over the entries in this pack index.
 252
 253     Will yield tuples with object name, offset in packfile and crc32 checksum.
 254     """
 255     for i in range(len(self)):
 256         yield self._unpack_entry(i)
 257
 258   def _read_fan_out_table(self, start_offset):
 259     ret = []
 260     for i in range(0x100):
 261         ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
 262     return ret
 263
 264   def check(self):
 265     """Check that the stored checksum matches the actual checksum."""
 266     return self.calculate_checksum() == self.get_stored_checksums()[1]
 267
 268   def calculate_checksum(self):
 269     f = open(self._filename, 'r')
 270     try:
 271         return hashlib.sha1(self._contents[:-20]).digest()
 272     finally:
 273         f.close()
 274
 275   def get_stored_checksums(self):
 276     """Return the SHA1 checksums stored for the corresponding packfile and
 277     this header file itself."""
 278     return str(self._contents[-40:-20]), str(self._contents[-20:])
 279
 280   def object_index(self, sha):
 281     """Return the index in to the corresponding packfile for the object.
 282
 283     Given the name of an object it will return the offset that object lives
 284     at within the corresponding pack file. If the pack file doesn't have the
 285     object then None will be returned.
 286     """
 287     size = os.path.getsize(self._filename)
 288     assert size == self._size, "Pack index %s has changed size, I don't " \
 289          "like that" % self._filename
 290     if len(sha) == 40:
 291         sha = hex_to_sha(sha)
 292     return self._object_index(sha)
 293
 294   def _object_index(self, sha):
 295       """See object_index"""
 296       idx = ord(sha[0])
 297       if idx == 0:
 298           start = 0
 299       else:
 300           start = self._fan_out_table[idx-1]
 301       end = self._fan_out_table[idx]
 302       assert start <= end
 303       while start <= end:
 304         i = (start + end)/2
 305         file_sha = self._unpack_name(i)
 306         if file_sha < sha:
 307           start = i + 1
 308         elif file_sha > sha:
 309           end = i - 1
 310         else:
 311           return self._unpack_offset(i)
 312       return None
 313
 314
 315 def read_pack_header(f):
 316     header = f.read(12)
 317     assert header[:4] == "PACK"
 318     (version,) = struct.unpack_from(">L", header, 4)
 319     assert version in (2, 3), "Version was %d" % version
 320     (num_objects,) = struct.unpack_from(">L", header, 8)
 321     return (version, num_objects)
 322
 323
 324 def read_pack_tail(f):
 325     return (f.read(20),)
 326
 327
 328 def unpack_object(map):
 329     bytes = take_msb_bytes(map, 0)
 330     type = (bytes[0] >> 4) & 0x07
 331     size = bytes[0] & 0x0f
 332     for i, byte in enumerate(bytes[1:]):
 333       size += (byte & 0x7f) << ((i * 7) + 4)
 334     raw_base = len(bytes)
 335     if type == 6: # offset delta
 336         bytes = take_msb_bytes(map, raw_base)
 337         assert not (bytes[-1] & 0x80)
 338         delta_base_offset = bytes[0] & 0x7f
 339         for byte in bytes[1:]:
 340             delta_base_offset += 1
 341             delta_base_offset <<= 7
 342             delta_base_offset += (byte & 0x7f)
 343         raw_base+=len(bytes)
 344         uncomp, comp_len = read_zlib(map, raw_base, size)
 345         assert size == len(uncomp)
 346         return type, (delta_base_offset, uncomp), comp_len+raw_base
 347     elif type == 7: # ref delta
 348         basename = map[raw_base:raw_base+20]
 349         uncomp, comp_len = read_zlib(map, raw_base+20, size)
 350         assert size == len(uncomp)
 351         return type, (basename, uncomp), comp_len+raw_base+20
 352     else:
 353         uncomp, comp_len = read_zlib(map, raw_base, size)
 354         assert len(uncomp) == size
 355         return type, uncomp, comp_len+raw_base
 356
 357
 358 class PackData(object):
 359   """The data contained in a packfile.
 360
 361   Pack files can be accessed both sequentially for exploding a pack, and
 362   directly with the help of an index to retrieve a specific object.
 363
 364   The objects within are either complete or a delta aginst another.
 365
 366   The header is variable length. If the MSB of each byte is set then it
 367   indicates that the subsequent byte is still part of the header.
 368   For the first byte the next MS bits are the type, which tells you the type
 369   of object, and whether it is a delta. The LS byte is the lowest bits of the
 370   size. For each subsequent byte the LS 7 bits are the next MS bits of the
 371   size, i.e. the last byte of the header contains the MS bits of the size.
 372
 373   For the complete objects the data is stored as zlib deflated data.
 374   The size in the header is the uncompressed object size, so to uncompress
 375   you need to just keep feeding data to zlib until you get an object back,
 376   or it errors on bad data. This is done here by just giving the complete
 377   buffer from the start of the deflated object on. This is bad, but until I
 378   get mmap sorted out it will have to do.
 379
 380   Currently there are no integrity checks done. Also no attempt is made to try
 381   and detect the delta case, or a request for an object at the wrong position.
 382   It will all just throw a zlib or KeyError.
 383   """
 384
 385   def __init__(self, filename):
 386     """Create a PackData object that represents the pack in the given filename.
 387
 388     The file must exist and stay readable until the object is disposed of. It
 389     must also stay the same size. It will be mapped whenever needed.
 390
 391     Currently there is a restriction on the size of the pack as the python
 392     mmap implementation is flawed.
 393     """
 394     self._filename = filename
 395     assert os.path.exists(filename), "%s is not a packfile" % filename
 396     self._size = os.path.getsize(filename)
 397     self._header_size = 12
 398     assert self._size >= self._header_size, "%s is too small for a packfile" % filename
 399     self._read_header()
 400
 401   def _read_header(self):
 402     f = open(self._filename, 'rb')
 403     try:
 404         (version, self._num_objects) = \
 405                 read_pack_header(f)
 406         f.seek(self._size-20)
 407         (self._stored_checksum,) = read_pack_tail(f)
 408     finally:
 409         f.close()
 410
 411   def __len__(self):
 412       """Returns the number of objects in this pack."""
 413       return self._num_objects
 414
 415   def calculate_checksum(self):
 416     f = open(self._filename, 'rb')
 417     try:
 418         map = simple_mmap(f, 0, self._size)
 419         return hashlib.sha1(map[:-20]).digest()
 420     finally:
 421         f.close()
 422
 423   def iterobjects(self):
 424     offset = self._header_size
 425     f = open(self._filename, 'rb')
 426     for i in range(len(self)):
 427         map = simple_mmap(f, offset, self._size-offset)
 428         (type, obj, total_size) = unpack_object(map)
 429         yield offset, type, obj
 430         offset += total_size
 431     f.close()
 432
 433   def iterentries(self, ext_resolve_ref=None):
 434     found = {}
 435     at = {}
 436     postponed = defaultdict(list)
 437     class Postpone(Exception):
 438         """Raised to postpone delta resolving."""
 439
 440     def get_ref_text(sha):
 441         if sha in found:
 442             return found[sha]
 443         if ext_resolve_ref:
 444             try:
 445                 return ext_resolve_ref(sha)
 446             except KeyError:
 447                 pass
 448         raise Postpone, (sha, )
 449     todo = list(self.iterobjects())
 450     while todo:
 451       (offset, type, obj) = todo.pop(0)
 452       at[offset] = (type, obj)
 453       assert isinstance(offset, int)
 454       assert isinstance(type, int)
 455       assert isinstance(obj, tuple) or isinstance(obj, str)
 456       try:
 457         type, obj = resolve_object(offset, type, obj, get_ref_text,
 458             at.__getitem__)
 459       except Postpone, (sha, ):
 460         postponed[sha].append((offset, type, obj))
 461       else:
 462         shafile = ShaFile.from_raw_string(type, obj)
 463         sha = shafile.sha().digest()
 464         found[sha] = (type, obj)
 465         yield sha, offset, shafile.crc32()
 466         todo += postponed.get(sha, [])
 467     if postponed:
 468         raise KeyError([sha_to_hex(h) for h in postponed.keys()])
 469
 470   def sorted_entries(self, resolve_ext_ref=None):
 471     ret = list(self.iterentries(resolve_ext_ref))
 472     ret.sort()
 473     return ret
 474
 475   def create_index_v1(self, filename):
 476     entries = self.sorted_entries()
 477     write_pack_index_v1(filename, entries, self.calculate_checksum())
 478
 479   def create_index_v2(self, filename):
 480     entries = self.sorted_entries()
 481     write_pack_index_v2(filename, entries, self.calculate_checksum())
 482
 483   def get_stored_checksum(self):
 484     return self._stored_checksum
 485
 486   def check(self):
 487     return (self.calculate_checksum() == self.get_stored_checksum())
 488
 489   def get_object_at(self, offset):
 490     """Given an offset in to the packfile return the object that is there.
 491
 492     Using the associated index the location of an object can be looked up, and
 493     then the packfile can be asked directly for that object using this
 494     function.
 495     """
 496     assert isinstance(offset, long) or isinstance(offset, int),\
 497             "offset was %r" % offset
 498     assert offset >= self._header_size
 499     size = os.path.getsize(self._filename)
 500     assert size == self._size, "Pack data %s has changed size, I don't " \
 501          "like that" % self._filename
 502     f = open(self._filename, 'rb')
 503     try:
 504       map = simple_mmap(f, offset, size-offset)
 505       return unpack_object(map)[:2]
 506     finally:
 507       f.close()
 508
 509
 510 class SHA1Writer(object):
 511
 512     def __init__(self, f):
 513         self.f = f
 514         self.sha1 = hashlib.sha1("")
 515
 516     def write(self, data):
 517         self.sha1.update(data)
 518         self.f.write(data)
 519
 520     def write_sha(self):
 521         sha = self.sha1.digest()
 522         assert len(sha) == 20
 523         self.f.write(sha)
 524         return sha
 525
 526     def close(self):
 527         sha = self.write_sha()
 528         self.f.close()
 529         return sha
 530
 531     def tell(self):
 532         return self.f.tell()
 533
 534
 535 def write_pack_object(f, type, object):
 536     """Write pack object to a file.
 537
 538     :param f: File to write to
 539     :param o: Object to write
 540     """
 541     ret = f.tell()
 542     if type == 6: # ref delta
 543         (delta_base_offset, object) = object
 544     elif type == 7: # offset delta
 545         (basename, object) = object
 546     size = len(object)
 547     c = (type << 4) | (size & 15)
 548     size >>= 4
 549     while size:
 550         f.write(chr(c | 0x80))
 551         c = size & 0x7f
 552         size >>= 7
 553     f.write(chr(c))
 554     if type == 6: # offset delta
 555         ret = [delta_base_offset & 0x7f]
 556         delta_base_offset >>= 7
 557         while delta_base_offset:
 558             delta_base_offset -= 1
 559             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
 560             delta_base_offset >>= 7
 561         f.write("".join([chr(x) for x in ret]))
 562     elif type == 7: # ref delta
 563         assert len(basename) == 20
 564         f.write(basename)
 565     f.write(zlib.compress(object))
 566     return f.tell()
 567
 568
 569 def write_pack(filename, objects, num_objects):
 570     f = open(filename + ".pack", 'w')
 571     try:
 572         entries, data_sum = write_pack_data(f, objects, num_objects)
 573     finally:
 574         f.close()
 575     entries.sort()
 576     write_pack_index_v2(filename + ".idx", entries, data_sum)
 577
 578
 579 def write_pack_data(f, objects, num_objects, window=10):
 580     """Write a new pack file.
 581
 582     :param filename: The filename of the new pack file.
 583     :param objects: List of objects to write.
 584     :return: List with (name, offset, crc32 checksum) entries, pack checksum
 585     """
 586     recency = list(objects)
 587     # FIXME: Somehow limit delta depth
 588     # FIXME: Make thin-pack optional (its not used when cloning a pack)
 589     # Build a list of objects ordered by the magic Linus heuristic
 590     # This helps us find good objects to diff against us
 591     magic = []
 592     for o in recency:
 593         magic.append( (o._num_type, "filename", 1, -len(o.as_raw_string()[1]), o) )
 594     magic.sort()
 595     # Build a map of objects and their index in magic - so we can find preceeding objects
 596     # to diff against
 597     offs = {}
 598     for i in range(len(magic)):
 599         offs[magic[i][4]] = i
 600     # Write the pack
 601     entries = []
 602     f = SHA1Writer(f)
 603     f.write("PACK")               # Pack header
 604     f.write(struct.pack(">L", 2)) # Pack version
 605     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
 606     for o in recency:
 607         sha1 = o.sha().digest()
 608         crc32 = o.crc32()
 609         orig_t, raw = o.as_raw_string()
 610         winner = raw
 611         t = orig_t
 612         #for i in range(offs[o]-window, window):
 613         #    if i < 0 or i >= len(offs): continue
 614         #    b = magic[i][4]
 615         #    if b._num_type != orig_t: continue
 616         #    _, base = b.as_raw_string()
 617         #    delta = create_delta(base, raw)
 618         #    if len(delta) < len(winner):
 619         #        winner = delta
 620         #        t = 6 if magic[i][2] == 1 else 7
 621         offset = write_pack_object(f, t, winner)
 622         entries.append((sha1, offset, crc32))
 623     return entries, f.write_sha()
 624
 625
 626 def write_pack_index_v1(filename, entries, pack_checksum):
 627     """Write a new pack index file.
 628
 629     :param filename: The filename of the new pack index file.
 630     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 631             crc32_checksum.
 632     :param pack_checksum: Checksum of the pack file.
 633     """
 634     f = open(filename, 'w')
 635     f = SHA1Writer(f)
 636     fan_out_table = defaultdict(lambda: 0)
 637     for (name, offset, entry_checksum) in entries:
 638         fan_out_table[ord(name[0])] += 1
 639     # Fan-out table
 640     for i in range(0x100):
 641         f.write(struct.pack(">L", fan_out_table[i]))
 642         fan_out_table[i+1] += fan_out_table[i]
 643     for (name, offset, entry_checksum) in entries:
 644         f.write(struct.pack(">L20s", offset, name))
 645     assert len(pack_checksum) == 20
 646     f.write(pack_checksum)
 647     f.close()
 648
 649
 650 def create_delta(base_buf, target_buf):
 651     """Use python difflib to work out how to transform base_buf to target_buf"""
 652     assert isinstance(base_buf, str)
 653     assert isinstance(target_buf, str)
 654     out_buf = ""
 655     # write delta header
 656     def encode_size(size):
 657         ret = ""
 658         c = size & 0x7f
 659         size >>= 7
 660         while size:
 661             ret += chr(c | 0x80)
 662             c = size & 0x7f
 663             size >>= 7
 664         ret += chr(c)
 665         return ret
 666     out_buf += encode_size(len(base_buf))
 667     out_buf += encode_size(len(target_buf))
 668     # write out delta opcodes
 669     seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
 670     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
 671         # Git patch opcodes don't care about deletes!
 672         #if opcode == "replace" or opcode == "delete":
 673         #    pass
 674         if opcode == "equal":
 675             # If they are equal, unpacker will use data from base_buf
 676             # Write out an opcode that says what range to use
 677             scratch = ""
 678             op = 0x80
 679             o = i1
 680             for i in range(4):
 681                 if o & 0xff << i*8:
 682                     scratch += chr(o >> i)
 683                     op |= 1 << i
 684             s = i2 - i1
 685             for i in range(2):
 686                 if s & 0xff << i*8:
 687                     scratch += chr(s >> i)
 688                     op |= 1 << (4+i)
 689             out_buf += chr(op)
 690             out_buf += scratch
 691         if opcode == "replace" or opcode == "insert":
 692             # If we are replacing a range or adding one, then we just
 693             # output it to the stream (prefixed by its size)
 694             s = j2 - j1
 695             o = j1
 696             while s > 127:
 697                 out_buf += chr(127)
 698                 out_buf += target_buf[o:o+127]
 699                 s -= 127
 700                 o += 127
 701             out_buf += chr(s)
 702             out_buf += target_buf[o:o+s]
 703     return out_buf
 704
 705
 706 def apply_delta(src_buf, delta):
 707     """Based on the similar function in git's patch-delta.c."""
 708     assert isinstance(src_buf, str), "was %r" % (src_buf,)
 709     assert isinstance(delta, str)
 710     out = ""
 711     def pop(delta):
 712         ret = delta[0]
 713         delta = delta[1:]
 714         return ord(ret), delta
 715     def get_delta_header_size(delta):
 716         size = 0
 717         i = 0
 718         while delta:
 719             cmd, delta = pop(delta)
 720             size |= (cmd & ~0x80) << i
 721             i += 7
 722             if not cmd & 0x80:
 723                 break
 724         return size, delta
 725     src_size, delta = get_delta_header_size(delta)
 726     dest_size, delta = get_delta_header_size(delta)
 727     assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
 728     while delta:
 729         cmd, delta = pop(delta)
 730         if cmd & 0x80:
 731             cp_off = 0
 732             for i in range(4):
 733                 if cmd & (1 << i):
 734                     x, delta = pop(delta)
 735                     cp_off |= x << (i * 8)
 736             cp_size = 0
 737             for i in range(3):
 738                 if cmd & (1 << (4+i)):
 739                     x, delta = pop(delta)
 740                     cp_size |= x << (i * 8)
 741             if cp_size == 0:
 742                 cp_size = 0x10000
 743             if (cp_off + cp_size < cp_size or
 744                 cp_off + cp_size > src_size or
 745                 cp_size > dest_size):
 746                 break
 747             out += src_buf[cp_off:cp_off+cp_size]
 748         elif cmd != 0:
 749             out += delta[:cmd]
 750             delta = delta[cmd:]
 751         else:
 752             raise ApplyDeltaError("Invalid opcode 0")
 753
 754     if delta != "":
 755         raise ApplyDeltaError("delta not empty: %r" % delta)
 756
 757     if dest_size != len(out):
 758         raise ApplyDeltaError("dest size incorrect")
 759
 760     return out
 761
 762
 763 def write_pack_index_v2(filename, entries, pack_checksum):
 764     """Write a new pack index file.
 765
 766     :param filename: The filename of the new pack index file.
 767     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 768             crc32_checksum.
 769     :param pack_checksum: Checksum of the pack file.
 770     """
 771     f = open(filename, 'w')
 772     f = SHA1Writer(f)
 773     f.write('\377tOc') # Magic!
 774     f.write(struct.pack(">L", 2))
 775     fan_out_table = defaultdict(lambda: 0)
 776     for (name, offset, entry_checksum) in entries:
 777         fan_out_table[ord(name[0])] += 1
 778     # Fan-out table
 779     for i in range(0x100):
 780         f.write(struct.pack(">L", fan_out_table[i]))
 781         fan_out_table[i+1] += fan_out_table[i]
 782     for (name, offset, entry_checksum) in entries:
 783         f.write(name)
 784     for (name, offset, entry_checksum) in entries:
 785         f.write(struct.pack(">l", entry_checksum))
 786     for (name, offset, entry_checksum) in entries:
 787         # FIXME: handle if MSBit is set in offset
 788         f.write(struct.pack(">L", offset))
 789     # FIXME: handle table for pack files > 8 Gb
 790     assert len(pack_checksum) == 20
 791     f.write(pack_checksum)
 792     f.close()
 793
 794
 795 class Pack(object):
 796
 797     def __init__(self, basename):
 798         self._basename = basename
 799         self._data_path = self._basename + ".pack"
 800         self._idx_path = self._basename + ".idx"
 801         self._data = None
 802         self._idx = None
 803
 804     def name(self):
 805         return self.idx.objects_sha1()
 806
 807     @property
 808     def data(self):
 809         if self._data is None:
 810             self._data = PackData(self._data_path)
 811             assert len(self.idx) == len(self._data)
 812             assert self.idx.get_stored_checksums()[0] == self._data.get_stored_checksum()
 813         return self._data
 814
 815     @property
 816     def idx(self):
 817         if self._idx is None:
 818             self._idx = PackIndex(self._idx_path)
 819         return self._idx
 820
 821     def close(self):
 822         if self._data is not None:
 823             self._data.close()
 824         self.idx.close()
 825
 826     def __eq__(self, other):
 827         return type(self) == type(other) and self.idx == other.idx
 828
 829     def __len__(self):
 830         """Number of entries in this pack."""
 831         return len(self.idx)
 832
 833     def __repr__(self):
 834         return "Pack(%r)" % self._basename
 835
 836     def __iter__(self):
 837         """Iterate over all the sha1s of the objects in this pack."""
 838         return iter(self.idx)
 839
 840     def check(self):
 841         return self.idx.check() and self.data.check()
 842
 843     def get_stored_checksum(self):
 844         return self.data.get_stored_checksum()
 845
 846     def __contains__(self, sha1):
 847         """Check whether this pack contains a particular SHA1."""
 848         return (self.idx.object_index(sha1) is not None)
 849
 850     def get_raw(self, sha1, resolve_ref=None):
 851         if resolve_ref is None:
 852             resolve_ref = self.get_raw
 853         offset = self.idx.object_index(sha1)
 854         if offset is None:
 855             raise KeyError(sha1)
 856
 857         type, obj = self.data.get_object_at(offset)
 858         assert isinstance(offset, int)
 859         return resolve_object(offset, type, obj, resolve_ref,
 860             self.data.get_object_at)
 861
 862     def __getitem__(self, sha1):
 863         """Retrieve the specified SHA1."""
 864         type, uncomp = self.get_raw(sha1)
 865         return ShaFile.from_raw_string(type, uncomp)
 866
 867     def iterobjects(self):
 868         for offset, type, obj in self.data.iterobjects():
 869             assert isinstance(offset, int)
 870             yield ShaFile.from_raw_string(
 871                     *resolve_object(offset, type, obj, self.get_raw,
 872                 self.data.get_object_at))
 873
 874
 875 def load_packs(path):
 876     if not os.path.exists(path):
 877         return
 878     for name in os.listdir(path):
 879         if name.startswith("pack-") and name.endswith(".pack"):
 880             yield Pack(os.path.join(path, name[:-len(".pack")]))
 881