dulwich/pack.py

   1 # pack.py -- For dealing wih packed git objects.
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copryight (C) 2008 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20 """Classes for dealing with packed git objects.
  21
  22 A pack is a compact representation of a bunch of objects, stored
  23 using deltas where possible.
  24
  25 They have two parts, the pack file, which stores the data, and an index
  26 that tells you where the data is.
  27
  28 To find an object you look in all of the index files 'til you find a
  29 match for the object name. You then use the pointer got from this as
  30 a pointer in to the corresponding packfile.
  31 """
  32
  33 try:
  34     from collections import defaultdict
  35 except:
  36     class defaultdict(dict):
  37         def __init__(self, default_factory=None, *a, **kw):
  38             if (default_factory is not None and
  39                 not hasattr(default_factory, '__call__')):
  40                 raise TypeError('first argument must be callable')
  41             dict.__init__(self, *a, **kw)
  42             self.default_factory = default_factory
  43         def __getitem__(self, key):
  44             try:
  45                 return dict.__getitem__(self, key)
  46             except KeyError:
  47                 return self.__missing__(key)
  48         def __missing__(self, key):
  49             if self.default_factory is None:
  50                 raise KeyError(key)
  51             self[key] = value = self.default_factory()
  52             return value
  53         def __reduce__(self):
  54             if self.default_factory is None:
  55                 args = tuple()
  56             else:
  57                 args = self.default_factory,
  58             return type(self), args, None, None, self.items()
  59         def copy(self):
  60             return self.__copy__()
  61         def __copy__(self):
  62             return type(self)(self.default_factory, self)
  63         def __deepcopy__(self, memo):
  64             import copy
  65             return type(self)(self.default_factory,
  66                               copy.deepcopy(self.items()))
  67         def __repr__(self):
  68             return 'defaultdict(%s, %s)' % (self.default_factory,
  69                                             dict.__repr__(self))
  70 from itertools import imap, izip
  71 import mmap
  72 import os
  73 import sha
  74 import struct
  75 import sys
  76 import zlib
  77 import difflib
  78
  79 from objects import (
  80         ShaFile,
  81         hex_to_sha,
  82         sha_to_hex,
  83         )
  84 from errors import ApplyDeltaError
  85
  86 def unpack_from(fmt, buf, offset=0):
  87   b = buf[offset:offset+struct.calcsize(fmt)]
  88   return struct.unpack(fmt, b)
  89
  90
  91 supports_mmap_offset = (sys.version_info[0] >= 3 or
  92         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
  93
  94
  95 def take_msb_bytes(map, offset):
  96     ret = []
  97     while len(ret) == 0 or ret[-1] & 0x80:
  98         ret.append(ord(map[offset]))
  99         offset += 1
 100     return ret
 101
 102
 103 def read_zlib(data, offset, dec_size):
 104     obj = zlib.decompressobj()
 105     x = ""
 106     fed = 0
 107     while obj.unused_data == "":
 108         base = offset+fed
 109         add = data[base:base+1024]
 110         if len(add) < 1024:
 111             add += "Z"
 112         fed += len(add)
 113         x += obj.decompress(add)
 114     assert len(x) == dec_size
 115     comp_len = fed-len(obj.unused_data)
 116     return x, comp_len
 117
 118
 119 def iter_sha1(iter):
 120     sha1 = sha.sha()
 121     for name in iter:
 122         sha1.update(name)
 123     return sha1.hexdigest()
 124
 125
 126 MAX_MMAP_SIZE = 256 * 1024 * 1024
 127
 128 def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
 129     """Simple wrapper for mmap() which always supports the offset parameter.
 130
 131     :param f: File object.
 132     :param offset: Offset in the file, from the beginning of the file.
 133     :param size: Size of the mmap'ed area
 134     :param access: Access mechanism.
 135     :return: MMAP'd area.
 136     """
 137     if offset+size > MAX_MMAP_SIZE and not supports_mmap_offset:
 138         raise AssertionError("%s is larger than 256 meg, and this version "
 139             "of Python does not support the offset argument to mmap().")
 140     if supports_mmap_offset:
 141         return mmap.mmap(f.fileno(), size, access=access, offset=offset)
 142     else:
 143         class ArraySkipper(object):
 144
 145             def __init__(self, array, offset):
 146                 self.array = array
 147                 self.offset = offset
 148
 149             def __getslice__(self, i, j):
 150                 return self.array[i+self.offset:j+self.offset]
 151
 152             def __getitem__(self, i):
 153                 return self.array[i+self.offset]
 154
 155             def __len__(self):
 156                 return len(self.array) - self.offset
 157
 158             def __str__(self):
 159                 return str(self.array[self.offset:])
 160
 161         mem = mmap.mmap(f.fileno(), size+offset, access=access)
 162         if offset == 0:
 163             return mem
 164         return ArraySkipper(mem, offset)
 165
 166
 167 def resolve_object(offset, type, obj, get_ref, get_offset):
 168   """Resolve an object, possibly resolving deltas when necessary."""
 169   if not type in (6, 7): # Not a delta
 170      return type, obj
 171
 172   if type == 6: # offset delta
 173      (delta_offset, delta) = obj
 174      assert isinstance(delta_offset, int)
 175      assert isinstance(delta, str)
 176      offset = offset-delta_offset
 177      type, base_obj = get_offset(offset)
 178      assert isinstance(type, int)
 179   elif type == 7: # ref delta
 180      (basename, delta) = obj
 181      assert isinstance(basename, str) and len(basename) == 20
 182      assert isinstance(delta, str)
 183      type, base_obj = get_ref(basename)
 184      assert isinstance(type, int)
 185   type, base_text = resolve_object(offset, type, base_obj, get_ref, get_offset)
 186   return type, apply_delta(base_text, delta)
 187
 188
 189 class PackIndex(object):
 190   """An index in to a packfile.
 191
 192   Given a sha id of an object a pack index can tell you the location in the
 193   packfile of that object if it has it.
 194
 195   To do the loop it opens the file, and indexes first 256 4 byte groups
 196   with the first byte of the sha id. The value in the four byte group indexed
 197   is the end of the group that shares the same starting byte. Subtract one
 198   from the starting byte and index again to find the start of the group.
 199   The values are sorted by sha id within the group, so do the math to find
 200   the start and end offset and then bisect in to find if the value is present.
 201   """
 202
 203   def __init__(self, filename):
 204     """Create a pack index object.
 205
 206     Provide it with the name of the index file to consider, and it will map
 207     it whenever required.
 208     """
 209     self._filename = filename
 210     # Take the size now, so it can be checked each time we map the file to
 211     # ensure that it hasn't changed.
 212     self._size = os.path.getsize(filename)
 213     self._file = open(filename, 'r')
 214     self._contents = simple_mmap(self._file, 0, self._size)
 215     if self._contents[:4] != '\377tOc':
 216         self.version = 1
 217         self._fan_out_table = self._read_fan_out_table(0)
 218     else:
 219         (self.version, ) = unpack_from(">L", self._contents, 4)
 220         assert self.version in (2,), "Version was %d" % self.version
 221         self._fan_out_table = self._read_fan_out_table(8)
 222         self._name_table_offset = 8 + 0x100 * 4
 223         self._crc32_table_offset = self._name_table_offset + 20 * len(self)
 224         self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
 225
 226   def __eq__(self, other):
 227     if type(self) != type(other):
 228         return False
 229
 230     if self._fan_out_table != other._fan_out_table:
 231         return False
 232
 233     for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
 234         if name1 != name2:
 235             return False
 236     return True
 237
 238   def close(self):
 239     self._file.close()
 240
 241   def __len__(self):
 242     """Return the number of entries in this pack index."""
 243     return self._fan_out_table[-1]
 244
 245   def _unpack_entry(self, i):
 246     """Unpack the i-th entry in the index file.
 247
 248     :return: Tuple with object name (SHA), offset in pack file and
 249           CRC32 checksum (if known)."""
 250     if self.version == 1:
 251         (offset, name) = unpack_from(">L20s", self._contents,
 252             (0x100 * 4) + (i * 24))
 253         return (name, offset, None)
 254     else:
 255         return (self._unpack_name(i), self._unpack_offset(i),
 256                 self._unpack_crc32_checksum(i))
 257
 258   def _unpack_name(self, i):
 259     if self.version == 1:
 260         return self._unpack_entry(i)[0]
 261     else:
 262         return unpack_from("20s", self._contents,
 263                                   self._name_table_offset + i * 20)[0]
 264
 265   def _unpack_offset(self, i):
 266     if self.version == 1:
 267         return self._unpack_entry(i)[1]
 268     else:
 269         return unpack_from(">L", self._contents,
 270                                   self._pack_offset_table_offset + i * 4)[0]
 271
 272   def _unpack_crc32_checksum(self, i):
 273     if self.version == 1:
 274         return None
 275     else:
 276         return unpack_from(">L", self._contents,
 277                                   self._crc32_table_offset + i * 4)[0]
 278
 279   def __iter__(self):
 280       return imap(sha_to_hex, self._itersha())
 281
 282   def _itersha(self):
 283     for i in range(len(self)):
 284         yield self._unpack_name(i)
 285
 286   def objects_sha1(self):
 287     return iter_sha1(self._itersha())
 288
 289   def iterentries(self):
 290     """Iterate over the entries in this pack index.
 291
 292     Will yield tuples with object name, offset in packfile and crc32 checksum.
 293     """
 294     for i in range(len(self)):
 295         yield self._unpack_entry(i)
 296
 297   def _read_fan_out_table(self, start_offset):
 298     ret = []
 299     for i in range(0x100):
 300         ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
 301     return ret
 302
 303   def check(self):
 304     """Check that the stored checksum matches the actual checksum."""
 305     return self.calculate_checksum() == self.get_stored_checksums()[1]
 306
 307   def calculate_checksum(self):
 308     f = open(self._filename, 'r')
 309     try:
 310         return sha.sha(self._contents[:-20]).digest()
 311     finally:
 312         f.close()
 313
 314   def get_stored_checksums(self):
 315     """Return the SHA1 checksums stored for the corresponding packfile and
 316     this header file itself."""
 317     return str(self._contents[-40:-20]), str(self._contents[-20:])
 318
 319   def object_index(self, sha):
 320     """Return the index in to the corresponding packfile for the object.
 321
 322     Given the name of an object it will return the offset that object lives
 323     at within the corresponding pack file. If the pack file doesn't have the
 324     object then None will be returned.
 325     """
 326     size = os.path.getsize(self._filename)
 327     assert size == self._size, "Pack index %s has changed size, I don't " \
 328          "like that" % self._filename
 329     if len(sha) == 40:
 330         sha = hex_to_sha(sha)
 331     return self._object_index(sha)
 332
 333   def _object_index(self, sha):
 334       """See object_index"""
 335       idx = ord(sha[0])
 336       if idx == 0:
 337           start = 0
 338       else:
 339           start = self._fan_out_table[idx-1]
 340       end = self._fan_out_table[idx]
 341       assert start <= end
 342       while start <= end:
 343         i = (start + end)/2
 344         file_sha = self._unpack_name(i)
 345         if file_sha < sha:
 346           start = i + 1
 347         elif file_sha > sha:
 348           end = i - 1
 349         else:
 350           return self._unpack_offset(i)
 351       return None
 352
 353
 354 def read_pack_header(f):
 355     header = f.read(12)
 356     assert header[:4] == "PACK"
 357     (version,) = unpack_from(">L", header, 4)
 358     assert version in (2, 3), "Version was %d" % version
 359     (num_objects,) = unpack_from(">L", header, 8)
 360     return (version, num_objects)
 361
 362
 363 def read_pack_tail(f):
 364     return (f.read(20),)
 365
 366
 367 def unpack_object(map):
 368     bytes = take_msb_bytes(map, 0)
 369     type = (bytes[0] >> 4) & 0x07
 370     size = bytes[0] & 0x0f
 371     for i, byte in enumerate(bytes[1:]):
 372       size += (byte & 0x7f) << ((i * 7) + 4)
 373     raw_base = len(bytes)
 374     if type == 6: # offset delta
 375         bytes = take_msb_bytes(map, raw_base)
 376         assert not (bytes[-1] & 0x80)
 377         delta_base_offset = bytes[0] & 0x7f
 378         for byte in bytes[1:]:
 379             delta_base_offset += 1
 380             delta_base_offset <<= 7
 381             delta_base_offset += (byte & 0x7f)
 382         raw_base+=len(bytes)
 383         uncomp, comp_len = read_zlib(map, raw_base, size)
 384         assert size == len(uncomp)
 385         return type, (delta_base_offset, uncomp), comp_len+raw_base
 386     elif type == 7: # ref delta
 387         basename = map[raw_base:raw_base+20]
 388         uncomp, comp_len = read_zlib(map, raw_base+20, size)
 389         assert size == len(uncomp)
 390         return type, (basename, uncomp), comp_len+raw_base+20
 391     else:
 392         uncomp, comp_len = read_zlib(map, raw_base, size)
 393         assert len(uncomp) == size
 394         return type, uncomp, comp_len+raw_base
 395
 396
 397 class PackData(object):
 398   """The data contained in a packfile.
 399
 400   Pack files can be accessed both sequentially for exploding a pack, and
 401   directly with the help of an index to retrieve a specific object.
 402
 403   The objects within are either complete or a delta aginst another.
 404
 405   The header is variable length. If the MSB of each byte is set then it
 406   indicates that the subsequent byte is still part of the header.
 407   For the first byte the next MS bits are the type, which tells you the type
 408   of object, and whether it is a delta. The LS byte is the lowest bits of the
 409   size. For each subsequent byte the LS 7 bits are the next MS bits of the
 410   size, i.e. the last byte of the header contains the MS bits of the size.
 411
 412   For the complete objects the data is stored as zlib deflated data.
 413   The size in the header is the uncompressed object size, so to uncompress
 414   you need to just keep feeding data to zlib until you get an object back,
 415   or it errors on bad data. This is done here by just giving the complete
 416   buffer from the start of the deflated object on. This is bad, but until I
 417   get mmap sorted out it will have to do.
 418
 419   Currently there are no integrity checks done. Also no attempt is made to try
 420   and detect the delta case, or a request for an object at the wrong position.
 421   It will all just throw a zlib or KeyError.
 422   """
 423
 424   def __init__(self, filename):
 425     """Create a PackData object that represents the pack in the given filename.
 426
 427     The file must exist and stay readable until the object is disposed of. It
 428     must also stay the same size. It will be mapped whenever needed.
 429
 430     Currently there is a restriction on the size of the pack as the python
 431     mmap implementation is flawed.
 432     """
 433     self._filename = filename
 434     assert os.path.exists(filename), "%s is not a packfile" % filename
 435     self._size = os.path.getsize(filename)
 436     self._header_size = 12
 437     assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (filename, self._size, self._header_size)
 438     self._read_header()
 439
 440   def _read_header(self):
 441     f = open(self._filename, 'rb')
 442     try:
 443         (version, self._num_objects) = \
 444                 read_pack_header(f)
 445         f.seek(self._size-20)
 446         (self._stored_checksum,) = read_pack_tail(f)
 447     finally:
 448         f.close()
 449
 450   def __len__(self):
 451       """Returns the number of objects in this pack."""
 452       return self._num_objects
 453
 454   def calculate_checksum(self):
 455     f = open(self._filename, 'rb')
 456     try:
 457         map = simple_mmap(f, 0, self._size)
 458         return sha.sha(map[:-20]).digest()
 459     finally:
 460         f.close()
 461
 462   def iterobjects(self):
 463     offset = self._header_size
 464     f = open(self._filename, 'rb')
 465     for i in range(len(self)):
 466         map = simple_mmap(f, offset, self._size-offset)
 467         (type, obj, total_size) = unpack_object(map)
 468         yield offset, type, obj
 469         offset += total_size
 470     f.close()
 471
 472   def iterentries(self, ext_resolve_ref=None):
 473     found = {}
 474     at = {}
 475     postponed = defaultdict(list)
 476     class Postpone(Exception):
 477         """Raised to postpone delta resolving."""
 478
 479     def get_ref_text(sha):
 480         if sha in found:
 481             return found[sha]
 482         if ext_resolve_ref:
 483             try:
 484                 return ext_resolve_ref(sha)
 485             except KeyError:
 486                 pass
 487         raise Postpone, (sha, )
 488     todo = list(self.iterobjects())
 489     while todo:
 490       (offset, type, obj) = todo.pop(0)
 491       at[offset] = (type, obj)
 492       assert isinstance(offset, int)
 493       assert isinstance(type, int)
 494       assert isinstance(obj, tuple) or isinstance(obj, str)
 495       try:
 496         type, obj = resolve_object(offset, type, obj, get_ref_text,
 497             at.__getitem__)
 498       except Postpone, (sha, ):
 499         postponed[sha].append((offset, type, obj))
 500       else:
 501         shafile = ShaFile.from_raw_string(type, obj)
 502         sha = shafile.sha().digest()
 503         found[sha] = (type, obj)
 504         yield sha, offset, shafile.crc32()
 505         todo += postponed.get(sha, [])
 506     if postponed:
 507         raise KeyError([sha_to_hex(h) for h in postponed.keys()])
 508
 509   def sorted_entries(self, resolve_ext_ref=None):
 510     ret = list(self.iterentries(resolve_ext_ref))
 511     ret.sort()
 512     return ret
 513
 514   def create_index_v1(self, filename, resolve_ext_ref=None):
 515     entries = self.sorted_entries(resolve_ext_ref)
 516     write_pack_index_v1(filename, entries, self.calculate_checksum())
 517
 518   def create_index_v2(self, filename, resolve_ext_ref=None):
 519     entries = self.sorted_entries(resolve_ext_ref)
 520     write_pack_index_v2(filename, entries, self.calculate_checksum())
 521
 522   def get_stored_checksum(self):
 523     return self._stored_checksum
 524
 525   def check(self):
 526     return (self.calculate_checksum() == self.get_stored_checksum())
 527
 528   def get_object_at(self, offset):
 529     """Given an offset in to the packfile return the object that is there.
 530
 531     Using the associated index the location of an object can be looked up, and
 532     then the packfile can be asked directly for that object using this
 533     function.
 534     """
 535     assert isinstance(offset, long) or isinstance(offset, int),\
 536             "offset was %r" % offset
 537     assert offset >= self._header_size
 538     size = os.path.getsize(self._filename)
 539     assert size == self._size, "Pack data %s has changed size, I don't " \
 540          "like that" % self._filename
 541     f = open(self._filename, 'rb')
 542     try:
 543       map = simple_mmap(f, offset, size-offset)
 544       return unpack_object(map)[:2]
 545     finally:
 546       f.close()
 547
 548
 549 class SHA1Writer(object):
 550
 551     def __init__(self, f):
 552         self.f = f
 553         self.sha1 = sha.sha("")
 554
 555     def write(self, data):
 556         self.sha1.update(data)
 557         self.f.write(data)
 558
 559     def write_sha(self):
 560         sha = self.sha1.digest()
 561         assert len(sha) == 20
 562         self.f.write(sha)
 563         return sha
 564
 565     def close(self):
 566         sha = self.write_sha()
 567         self.f.close()
 568         return sha
 569
 570     def tell(self):
 571         return self.f.tell()
 572
 573
 574 def write_pack_object(f, type, object):
 575     """Write pack object to a file.
 576
 577     :param f: File to write to
 578     :param o: Object to write
 579     """
 580     ret = f.tell()
 581     if type == 6: # ref delta
 582         (delta_base_offset, object) = object
 583     elif type == 7: # offset delta
 584         (basename, object) = object
 585     size = len(object)
 586     c = (type << 4) | (size & 15)
 587     size >>= 4
 588     while size:
 589         f.write(chr(c | 0x80))
 590         c = size & 0x7f
 591         size >>= 7
 592     f.write(chr(c))
 593     if type == 6: # offset delta
 594         ret = [delta_base_offset & 0x7f]
 595         delta_base_offset >>= 7
 596         while delta_base_offset:
 597             delta_base_offset -= 1
 598             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
 599             delta_base_offset >>= 7
 600         f.write("".join([chr(x) for x in ret]))
 601     elif type == 7: # ref delta
 602         assert len(basename) == 20
 603         f.write(basename)
 604     f.write(zlib.compress(object))
 605     return f.tell()
 606
 607
 608 def write_pack(filename, objects, num_objects):
 609     f = open(filename + ".pack", 'w')
 610     try:
 611         entries, data_sum = write_pack_data(f, objects, num_objects)
 612     finally:
 613         f.close()
 614     entries.sort()
 615     write_pack_index_v2(filename + ".idx", entries, data_sum)
 616
 617
 618 def write_pack_data(f, objects, num_objects, window=10):
 619     """Write a new pack file.
 620
 621     :param filename: The filename of the new pack file.
 622     :param objects: List of objects to write (tuples with object and path)
 623     :return: List with (name, offset, crc32 checksum) entries, pack checksum
 624     """
 625     recency = list(objects)
 626     # FIXME: Somehow limit delta depth
 627     # FIXME: Make thin-pack optional (its not used when cloning a pack)
 628     # Build a list of objects ordered by the magic Linus heuristic
 629     # This helps us find good objects to diff against us
 630     magic = []
 631     for obj, path in recency:
 632         magic.append( (obj.type, path, 1, -len(obj.as_raw_string()[1]), obj) )
 633     magic.sort()
 634     # Build a map of objects and their index in magic - so we can find preceeding objects
 635     # to diff against
 636     offs = {}
 637     for i in range(len(magic)):
 638         offs[magic[i][4]] = i
 639     # Write the pack
 640     entries = []
 641     f = SHA1Writer(f)
 642     f.write("PACK")               # Pack header
 643     f.write(struct.pack(">L", 2)) # Pack version
 644     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
 645     for o, path in recency:
 646         sha1 = o.sha().digest()
 647         crc32 = o.crc32()
 648         orig_t, raw = o.as_raw_string()
 649         winner = raw
 650         t = orig_t
 651         #for i in range(offs[o]-window, window):
 652         #    if i < 0 or i >= len(offs): continue
 653         #    b = magic[i][4]
 654         #    if b.type != orig_t: continue
 655         #    _, base = b.as_raw_string()
 656         #    delta = create_delta(base, raw)
 657         #    if len(delta) < len(winner):
 658         #        winner = delta
 659         #        t = 6 if magic[i][2] == 1 else 7
 660         offset = write_pack_object(f, t, winner)
 661         entries.append((sha1, offset, crc32))
 662     return entries, f.write_sha()
 663
 664
 665 def write_pack_index_v1(filename, entries, pack_checksum):
 666     """Write a new pack index file.
 667
 668     :param filename: The filename of the new pack index file.
 669     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 670             crc32_checksum.
 671     :param pack_checksum: Checksum of the pack file.
 672     """
 673     f = open(filename, 'w')
 674     f = SHA1Writer(f)
 675     fan_out_table = defaultdict(lambda: 0)
 676     for (name, offset, entry_checksum) in entries:
 677         fan_out_table[ord(name[0])] += 1
 678     # Fan-out table
 679     for i in range(0x100):
 680         f.write(struct.pack(">L", fan_out_table[i]))
 681         fan_out_table[i+1] += fan_out_table[i]
 682     for (name, offset, entry_checksum) in entries:
 683         f.write(struct.pack(">L20s", offset, name))
 684     assert len(pack_checksum) == 20
 685     f.write(pack_checksum)
 686     f.close()
 687
 688
 689 def create_delta(base_buf, target_buf):
 690     """Use python difflib to work out how to transform base_buf to target_buf"""
 691     assert isinstance(base_buf, str)
 692     assert isinstance(target_buf, str)
 693     out_buf = ""
 694     # write delta header
 695     def encode_size(size):
 696         ret = ""
 697         c = size & 0x7f
 698         size >>= 7
 699         while size:
 700             ret += chr(c | 0x80)
 701             c = size & 0x7f
 702             size >>= 7
 703         ret += chr(c)
 704         return ret
 705     out_buf += encode_size(len(base_buf))
 706     out_buf += encode_size(len(target_buf))
 707     # write out delta opcodes
 708     seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
 709     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
 710         # Git patch opcodes don't care about deletes!
 711         #if opcode == "replace" or opcode == "delete":
 712         #    pass
 713         if opcode == "equal":
 714             # If they are equal, unpacker will use data from base_buf
 715             # Write out an opcode that says what range to use
 716             scratch = ""
 717             op = 0x80
 718             o = i1
 719             for i in range(4):
 720                 if o & 0xff << i*8:
 721                     scratch += chr(o >> i)
 722                     op |= 1 << i
 723             s = i2 - i1
 724             for i in range(2):
 725                 if s & 0xff << i*8:
 726                     scratch += chr(s >> i)
 727                     op |= 1 << (4+i)
 728             out_buf += chr(op)
 729             out_buf += scratch
 730         if opcode == "replace" or opcode == "insert":
 731             # If we are replacing a range or adding one, then we just
 732             # output it to the stream (prefixed by its size)
 733             s = j2 - j1
 734             o = j1
 735             while s > 127:
 736                 out_buf += chr(127)
 737                 out_buf += target_buf[o:o+127]
 738                 s -= 127
 739                 o += 127
 740             out_buf += chr(s)
 741             out_buf += target_buf[o:o+s]
 742     return out_buf
 743
 744
 745 def apply_delta(src_buf, delta):
 746     """Based on the similar function in git's patch-delta.c."""
 747     assert isinstance(src_buf, str), "was %r" % (src_buf,)
 748     assert isinstance(delta, str)
 749     out = ""
 750     index = 0
 751     delta_length = len(delta)
 752     def pop(delta, index):
 753         ret = delta[index]
 754         index += 1
 755         return ord(ret), index
 756     def get_delta_header_size(delta, index):
 757         size = 0
 758         i = 0
 759         while delta:
 760             cmd, index = pop(delta, index)
 761             size |= (cmd & ~0x80) << i
 762             i += 7
 763             if not cmd & 0x80:
 764                 break
 765         return size, index
 766     src_size, index = get_delta_header_size(delta, index)
 767     dest_size, index = get_delta_header_size(delta, index)
 768     assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
 769     while index < delta_length:
 770         cmd, index = pop(delta, index)
 771         if cmd & 0x80:
 772             cp_off = 0
 773             for i in range(4):
 774                 if cmd & (1 << i):
 775                     x, index = pop(delta, index)
 776                     cp_off |= x << (i * 8)
 777             cp_size = 0
 778             for i in range(3):
 779                 if cmd & (1 << (4+i)):
 780                     x, index = pop(delta, index)
 781                     cp_size |= x << (i * 8)
 782             if cp_size == 0:
 783                 cp_size = 0x10000
 784             if (cp_off + cp_size < cp_size or
 785                 cp_off + cp_size > src_size or
 786                 cp_size > dest_size):
 787                 break
 788             out += src_buf[cp_off:cp_off+cp_size]
 789         elif cmd != 0:
 790             out += delta[index:index+cmd]
 791             index += cmd
 792         else:
 793             raise ApplyDeltaError("Invalid opcode 0")
 794
 795     if index != delta_length:
 796         raise ApplyDeltaError("delta not empty: %r" % delta[index:])
 797
 798     if dest_size != len(out):
 799         raise ApplyDeltaError("dest size incorrect")
 800
 801     return out
 802
 803
 804 def write_pack_index_v2(filename, entries, pack_checksum):
 805     """Write a new pack index file.
 806
 807     :param filename: The filename of the new pack index file.
 808     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 809             crc32_checksum.
 810     :param pack_checksum: Checksum of the pack file.
 811     """
 812     f = open(filename, 'w')
 813     f = SHA1Writer(f)
 814     f.write('\377tOc') # Magic!
 815     f.write(struct.pack(">L", 2))
 816     fan_out_table = defaultdict(lambda: 0)
 817     for (name, offset, entry_checksum) in entries:
 818         fan_out_table[ord(name[0])] += 1
 819     # Fan-out table
 820     for i in range(0x100):
 821         f.write(struct.pack(">L", fan_out_table[i]))
 822         fan_out_table[i+1] += fan_out_table[i]
 823     for (name, offset, entry_checksum) in entries:
 824         f.write(name)
 825     for (name, offset, entry_checksum) in entries:
 826         f.write(struct.pack(">l", entry_checksum))
 827     for (name, offset, entry_checksum) in entries:
 828         # FIXME: handle if MSBit is set in offset
 829         f.write(struct.pack(">L", offset))
 830     # FIXME: handle table for pack files > 8 Gb
 831     assert len(pack_checksum) == 20
 832     f.write(pack_checksum)
 833     f.close()
 834
 835
 836 class Pack(object):
 837
 838     def __init__(self, basename):
 839         self._basename = basename
 840         self._data_path = self._basename + ".pack"
 841         self._idx_path = self._basename + ".idx"
 842         self._data = None
 843         self._idx = None
 844
 845     def name(self):
 846         return self.idx.objects_sha1()
 847
 848     @property
 849     def data(self):
 850         if self._data is None:
 851             self._data = PackData(self._data_path)
 852             assert len(self.idx) == len(self._data)
 853             assert self.idx.get_stored_checksums()[0] == self._data.get_stored_checksum()
 854         return self._data
 855
 856     @property
 857     def idx(self):
 858         if self._idx is None:
 859             self._idx = PackIndex(self._idx_path)
 860         return self._idx
 861
 862     def close(self):
 863         if self._data is not None:
 864             self._data.close()
 865         self.idx.close()
 866
 867     def __eq__(self, other):
 868         return type(self) == type(other) and self.idx == other.idx
 869
 870     def __len__(self):
 871         """Number of entries in this pack."""
 872         return len(self.idx)
 873
 874     def __repr__(self):
 875         return "Pack(%r)" % self._basename
 876
 877     def __iter__(self):
 878         """Iterate over all the sha1s of the objects in this pack."""
 879         return iter(self.idx)
 880
 881     def check(self):
 882         return self.idx.check() and self.data.check()
 883
 884     def get_stored_checksum(self):
 885         return self.data.get_stored_checksum()
 886
 887     def __contains__(self, sha1):
 888         """Check whether this pack contains a particular SHA1."""
 889         return (self.idx.object_index(sha1) is not None)
 890
 891     def get_raw(self, sha1, resolve_ref=None):
 892         offset = self.idx.object_index(sha1)
 893         if offset is None:
 894             raise KeyError(sha1)
 895
 896         type, obj = self.data.get_object_at(offset)
 897         if isinstance(offset, long):
 898           offset = int(offset)
 899         assert isinstance(offset, int)
 900         return resolve_object(offset, type, obj, resolve_ref,
 901             self.data.get_object_at)
 902
 903     def __getitem__(self, sha1):
 904         """Retrieve the specified SHA1."""
 905         type, uncomp = self.get_raw(sha1)
 906         return ShaFile.from_raw_string(type, uncomp)
 907
 908     def iterobjects(self, get_raw=None):
 909         if get_raw is None:
 910             def get_raw(x):
 911                 raise KeyError(x)
 912         for offset, type, obj in self.data.iterobjects():
 913             assert isinstance(offset, int)
 914             yield ShaFile.from_raw_string(
 915                     *resolve_object(offset, type, obj,
 916                         get_raw,
 917                     self.data.get_object_at))
 918
 919
 920 def load_packs(path):
 921     if not os.path.exists(path):
 922         return
 923     for name in os.listdir(path):
 924         if name.startswith("pack-") and name.endswith(".pack"):
 925             yield Pack(os.path.join(path, name[:-len(".pack")]))
 926