dulwich/pack.py

   1 # pack.py -- For dealing wih packed git objects.
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copryight (C) 2008 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20 """Classes for dealing with packed git objects.
  21
  22 A pack is a compact representation of a bunch of objects, stored
  23 using deltas where possible.
  24
  25 They have two parts, the pack file, which stores the data, and an index
  26 that tells you where the data is.
  27
  28 To find an object you look in all of the index files 'til you find a
  29 match for the object name. You then use the pointer got from this as
  30 a pointer in to the corresponding packfile.
  31 """
  32
  33 try:
  34     from collections import defaultdict
  35 except ImportError:
  36     from misc import defaultdict
  37
  38 from itertools import imap, izip
  39 import mmap
  40 import os
  41 import struct
  42 try:
  43     from struct import unpack_from
  44 except ImportError:
  45     from dulwich.misc import unpack_from
  46 import sys
  47 import zlib
  48 import difflib
  49
  50 from dulwich.errors import (
  51     ApplyDeltaError,
  52     ChecksumMismatch,
  53     )
  54 from dulwich.lru_cache import (
  55     LRUSizeCache,
  56     )
  57 from dulwich.objects import (
  58     ShaFile,
  59     hex_to_sha,
  60     sha_to_hex,
  61     )
  62 from dulwich.misc import make_sha
  63
  64 supports_mmap_offset = (sys.version_info[0] >= 3 or
  65         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
  66
  67
  68 def take_msb_bytes(map, offset):
  69     ret = []
  70     while len(ret) == 0 or ret[-1] & 0x80:
  71         ret.append(ord(map[offset]))
  72         offset += 1
  73     return ret
  74
  75
  76 def read_zlib(data, offset, dec_size):
  77     obj = zlib.decompressobj()
  78     x = ""
  79     fed = 0
  80     while obj.unused_data == "":
  81         base = offset+fed
  82         add = data[base:base+1024]
  83         if len(add) < 1024:
  84             add += "Z"
  85         fed += len(add)
  86         x += obj.decompress(add)
  87     assert len(x) == dec_size
  88     comp_len = fed-len(obj.unused_data)
  89     return x, comp_len
  90
  91
  92 def iter_sha1(iter):
  93     sha1 = make_sha()
  94     for name in iter:
  95         sha1.update(name)
  96     return sha1.hexdigest()
  97
  98
  99 MAX_MMAP_SIZE = 1024 * 1024 * 1024
 100
 101 def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
 102     """Simple wrapper for mmap() which always supports the offset parameter.
 103
 104     :param f: File object.
 105     :param offset: Offset in the file, from the beginning of the file.
 106     :param size: Size of the mmap'ed area
 107     :param access: Access mechanism.
 108     :return: MMAP'd area.
 109     """
 110     if offset+size > MAX_MMAP_SIZE and not supports_mmap_offset:
 111         raise AssertionError("%s is larger than 256 meg, and this version "
 112             "of Python does not support the offset argument to mmap().")
 113     if supports_mmap_offset:
 114         return mmap.mmap(f.fileno(), size, access=access, offset=offset)
 115     else:
 116         class ArraySkipper(object):
 117
 118             def __init__(self, array, offset):
 119                 self.array = array
 120                 self.offset = offset
 121
 122             def __getslice__(self, i, j):
 123                 return self.array[i+self.offset:j+self.offset]
 124
 125             def __getitem__(self, i):
 126                 return self.array[i+self.offset]
 127
 128             def __len__(self):
 129                 return len(self.array) - self.offset
 130
 131             def __str__(self):
 132                 return str(self.array[self.offset:])
 133
 134         mem = mmap.mmap(f.fileno(), size+offset, access=access)
 135         if offset == 0:
 136             return mem
 137         return ArraySkipper(mem, offset)
 138
 139
 140 class PackIndex(object):
 141     """An index in to a packfile.
 142
 143     Given a sha id of an object a pack index can tell you the location in the
 144     packfile of that object if it has it.
 145
 146     To do the loop it opens the file, and indexes first 256 4 byte groups
 147     with the first byte of the sha id. The value in the four byte group indexed
 148     is the end of the group that shares the same starting byte. Subtract one
 149     from the starting byte and index again to find the start of the group.
 150     The values are sorted by sha id within the group, so do the math to find
 151     the start and end offset and then bisect in to find if the value is present.
 152     """
 153
 154     def __init__(self, filename):
 155         """Create a pack index object.
 156
 157         Provide it with the name of the index file to consider, and it will map
 158         it whenever required.
 159         """
 160         self._filename = filename
 161         # Take the size now, so it can be checked each time we map the file to
 162         # ensure that it hasn't changed.
 163         self._size = os.path.getsize(filename)
 164         self._file = open(filename, 'r')
 165         self._contents = simple_mmap(self._file, 0, self._size)
 166         if self._contents[:4] != '\377tOc':
 167             self.version = 1
 168             self._fan_out_table = self._read_fan_out_table(0)
 169         else:
 170             (self.version, ) = unpack_from(">L", self._contents, 4)
 171             assert self.version in (2,), "Version was %d" % self.version
 172             self._fan_out_table = self._read_fan_out_table(8)
 173             self._name_table_offset = 8 + 0x100 * 4
 174             self._crc32_table_offset = self._name_table_offset + 20 * len(self)
 175             self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
 176
 177     def __eq__(self, other):
 178         if type(self) != type(other):
 179             return False
 180
 181         if self._fan_out_table != other._fan_out_table:
 182             return False
 183
 184         for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
 185             if name1 != name2:
 186                 return False
 187         return True
 188
 189     def close(self):
 190         self._file.close()
 191
 192     def __len__(self):
 193         """Return the number of entries in this pack index."""
 194         return self._fan_out_table[-1]
 195
 196     def _unpack_entry(self, i):
 197         """Unpack the i-th entry in the index file.
 198
 199         :return: Tuple with object name (SHA), offset in pack file and
 200               CRC32 checksum (if known)."""
 201         if self.version == 1:
 202             (offset, name) = unpack_from(">L20s", self._contents,
 203                 (0x100 * 4) + (i * 24))
 204             return (name, offset, None)
 205         else:
 206             return (self._unpack_name(i), self._unpack_offset(i),
 207                     self._unpack_crc32_checksum(i))
 208
 209     def _unpack_name(self, i):
 210         if self.version == 1:
 211             return self._unpack_entry(i)[0]
 212         else:
 213             return unpack_from("20s", self._contents,
 214                                       self._name_table_offset + i * 20)[0]
 215
 216     def _unpack_offset(self, i):
 217         if self.version == 1:
 218             return self._unpack_entry(i)[1]
 219         else:
 220             return unpack_from(">L", self._contents,
 221                                       self._pack_offset_table_offset + i * 4)[0]
 222
 223     def _unpack_crc32_checksum(self, i):
 224         if self.version == 1:
 225             return None
 226         else:
 227             return unpack_from(">L", self._contents,
 228                                       self._crc32_table_offset + i * 4)[0]
 229
 230     def __iter__(self):
 231         return imap(sha_to_hex, self._itersha())
 232
 233     def _itersha(self):
 234         for i in range(len(self)):
 235             yield self._unpack_name(i)
 236
 237     def objects_sha1(self):
 238         return iter_sha1(self._itersha())
 239
 240     def iterentries(self):
 241         """Iterate over the entries in this pack index.
 242
 243         Will yield tuples with object name, offset in packfile and crc32 checksum.
 244         """
 245         for i in range(len(self)):
 246             yield self._unpack_entry(i)
 247
 248     def _read_fan_out_table(self, start_offset):
 249         ret = []
 250         for i in range(0x100):
 251             ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
 252         return ret
 253
 254     def check(self):
 255         """Check that the stored checksum matches the actual checksum."""
 256         return self.calculate_checksum() == self.get_stored_checksum()
 257
 258     def calculate_checksum(self):
 259         f = open(self._filename, 'r')
 260         try:
 261             return make_sha(self._contents[:-20]).digest()
 262         finally:
 263             f.close()
 264
 265     def get_pack_checksum(self):
 266         """Return the SHA1 checksum stored for the corresponding packfile."""
 267         return str(self._contents[-40:-20])
 268
 269     def get_stored_checksum(self):
 270         """Return the SHA1 checksum stored for this index."""
 271         return str(self._contents[-20:])
 272
 273     def object_index(self, sha):
 274         """Return the index in to the corresponding packfile for the object.
 275
 276         Given the name of an object it will return the offset that object lives
 277         at within the corresponding pack file. If the pack file doesn't have the
 278         object then None will be returned.
 279         """
 280         size = os.path.getsize(self._filename)
 281         assert size == self._size, "Pack index %s has changed size, I don't " \
 282              "like that" % self._filename
 283         if len(sha) == 40:
 284             sha = hex_to_sha(sha)
 285         return self._object_index(sha)
 286
 287     def _object_index(self, sha):
 288         """See object_index"""
 289         idx = ord(sha[0])
 290         if idx == 0:
 291             start = 0
 292         else:
 293             start = self._fan_out_table[idx-1]
 294         end = self._fan_out_table[idx]
 295         assert start <= end
 296         while start <= end:
 297             i = (start + end)/2
 298             file_sha = self._unpack_name(i)
 299             if file_sha < sha:
 300                 start = i + 1
 301             elif file_sha > sha:
 302                 end = i - 1
 303             else:
 304                 return self._unpack_offset(i)
 305         return None
 306
 307
 308 def read_pack_header(f):
 309     header = f.read(12)
 310     assert header[:4] == "PACK"
 311     (version,) = unpack_from(">L", header, 4)
 312     assert version in (2, 3), "Version was %d" % version
 313     (num_objects,) = unpack_from(">L", header, 8)
 314     return (version, num_objects)
 315
 316
 317 def read_pack_tail(f):
 318     return (f.read(20),)
 319
 320
 321 def unpack_object(map):
 322     bytes = take_msb_bytes(map, 0)
 323     type = (bytes[0] >> 4) & 0x07
 324     size = bytes[0] & 0x0f
 325     for i, byte in enumerate(bytes[1:]):
 326         size += (byte & 0x7f) << ((i * 7) + 4)
 327     raw_base = len(bytes)
 328     if type == 6: # offset delta
 329         bytes = take_msb_bytes(map, raw_base)
 330         assert not (bytes[-1] & 0x80)
 331         delta_base_offset = bytes[0] & 0x7f
 332         for byte in bytes[1:]:
 333             delta_base_offset += 1
 334             delta_base_offset <<= 7
 335             delta_base_offset += (byte & 0x7f)
 336         raw_base+=len(bytes)
 337         uncomp, comp_len = read_zlib(map, raw_base, size)
 338         assert size == len(uncomp)
 339         return type, (delta_base_offset, uncomp), comp_len+raw_base
 340     elif type == 7: # ref delta
 341         basename = map[raw_base:raw_base+20]
 342         uncomp, comp_len = read_zlib(map, raw_base+20, size)
 343         assert size == len(uncomp)
 344         return type, (basename, uncomp), comp_len+raw_base+20
 345     else:
 346         uncomp, comp_len = read_zlib(map, raw_base, size)
 347         assert len(uncomp) == size
 348         return type, uncomp, comp_len+raw_base
 349
 350
 351 def compute_object_size((num, obj)):
 352     if num in (6, 7):
 353         return len(obj[1])
 354     assert isinstance(obj, str)
 355     return len(obj)
 356
 357
 358 class PackData(object):
 359     """The data contained in a packfile.
 360
 361     Pack files can be accessed both sequentially for exploding a pack, and
 362     directly with the help of an index to retrieve a specific object.
 363
 364     The objects within are either complete or a delta aginst another.
 365
 366     The header is variable length. If the MSB of each byte is set then it
 367     indicates that the subsequent byte is still part of the header.
 368     For the first byte the next MS bits are the type, which tells you the type
 369     of object, and whether it is a delta. The LS byte is the lowest bits of the
 370     size. For each subsequent byte the LS 7 bits are the next MS bits of the
 371     size, i.e. the last byte of the header contains the MS bits of the size.
 372
 373     For the complete objects the data is stored as zlib deflated data.
 374     The size in the header is the uncompressed object size, so to uncompress
 375     you need to just keep feeding data to zlib until you get an object back,
 376     or it errors on bad data. This is done here by just giving the complete
 377     buffer from the start of the deflated object on. This is bad, but until I
 378     get mmap sorted out it will have to do.
 379
 380     Currently there are no integrity checks done. Also no attempt is made to try
 381     and detect the delta case, or a request for an object at the wrong position.
 382     It will all just throw a zlib or KeyError.
 383     """
 384
 385     def __init__(self, filename):
 386         """Create a PackData object that represents the pack in the given filename.
 387
 388         The file must exist and stay readable until the object is disposed of. It
 389         must also stay the same size. It will be mapped whenever needed.
 390
 391         Currently there is a restriction on the size of the pack as the python
 392         mmap implementation is flawed.
 393         """
 394         self._filename = filename
 395         assert os.path.exists(filename), "%s is not a packfile" % filename
 396         self._size = os.path.getsize(filename)
 397         self._header_size = 12
 398         assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (filename, self._size, self._header_size)
 399         self._read_header()
 400         self._offset_cache = LRUSizeCache(1024*1024*100,
 401             compute_size=compute_object_size)
 402
 403     def _read_header(self):
 404         f = open(self._filename, 'rb')
 405         try:
 406             (version, self._num_objects) = \
 407                     read_pack_header(f)
 408             f.seek(self._size-20)
 409             (self._stored_checksum,) = read_pack_tail(f)
 410         finally:
 411             f.close()
 412
 413     def __len__(self):
 414         """Returns the number of objects in this pack."""
 415         return self._num_objects
 416
 417     def calculate_checksum(self):
 418         """Calculate the checksum for this pack."""
 419         f = open(self._filename, 'rb')
 420         try:
 421             map = simple_mmap(f, 0, self._size)
 422             return make_sha(map[:-20]).digest()
 423         finally:
 424             f.close()
 425
 426     def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
 427         """Resolve an object, possibly resolving deltas when necessary.
 428
 429         :return: Tuple with object type and contents.
 430         """
 431         if not type in (6, 7): # Not a delta
 432             return type, obj
 433
 434         if get_offset is None:
 435             get_offset = self.get_object_at
 436
 437         if type == 6: # offset delta
 438             (delta_offset, delta) = obj
 439             assert isinstance(delta_offset, int)
 440             assert isinstance(delta, str)
 441             base_offset = offset-delta_offset
 442             type, base_obj = get_offset(base_offset)
 443             assert isinstance(type, int)
 444         elif type == 7: # ref delta
 445             (basename, delta) = obj
 446             assert isinstance(basename, str) and len(basename) == 20
 447             assert isinstance(delta, str)
 448             type, base_obj = get_ref(basename)
 449             assert isinstance(type, int)
 450             # Can't be a ofs delta, as we wouldn't know the base offset
 451             assert type != 6
 452             base_offset = None
 453         type, base_text = self.resolve_object(base_offset, type, base_obj, get_ref)
 454         if base_offset is not None:
 455             self._offset_cache[base_offset] = type, base_text
 456         ret = (type, apply_delta(base_text, delta))
 457         return ret
 458
 459     def iterobjects(self):
 460         offset = self._header_size
 461         f = open(self._filename, 'rb')
 462         for i in range(len(self)):
 463             map = simple_mmap(f, offset, self._size-offset)
 464             (type, obj, total_size) = unpack_object(map)
 465             crc32 = zlib.crc32(map[:total_size]) & 0xffffffff
 466             yield offset, type, obj, crc32
 467             offset += total_size
 468         f.close()
 469
 470     def iterentries(self, ext_resolve_ref=None):
 471         found = {}
 472         postponed = defaultdict(list)
 473         class Postpone(Exception):
 474             """Raised to postpone delta resolving."""
 475
 476         def get_ref_text(sha):
 477             if sha in found:
 478                 return found[sha]
 479             if ext_resolve_ref:
 480                 try:
 481                     return ext_resolve_ref(sha)
 482                 except KeyError:
 483                     pass
 484             raise Postpone, (sha, )
 485         todo = list(self.iterobjects())
 486         while todo:
 487             (offset, type, obj, crc32) = todo.pop(0)
 488             assert isinstance(offset, int)
 489             assert isinstance(type, int)
 490             assert isinstance(obj, tuple) or isinstance(obj, str)
 491             try:
 492                 type, obj = self.resolve_object(offset, type, obj, get_ref_text)
 493             except Postpone, (sha, ):
 494                 postponed[sha].append((offset, type, obj))
 495             else:
 496                 shafile = ShaFile.from_raw_string(type, obj)
 497                 sha = shafile.sha().digest()
 498                 found[sha] = (type, obj)
 499                 yield sha, offset, crc32
 500                 todo += postponed.get(sha, [])
 501         if postponed:
 502             raise KeyError([sha_to_hex(h) for h in postponed.keys()])
 503
 504     def sorted_entries(self, resolve_ext_ref=None):
 505         ret = list(self.iterentries(resolve_ext_ref))
 506         ret.sort()
 507         return ret
 508
 509     def create_index_v1(self, filename, resolve_ext_ref=None):
 510         entries = self.sorted_entries(resolve_ext_ref)
 511         write_pack_index_v1(filename, entries, self.calculate_checksum())
 512
 513     def create_index_v2(self, filename, resolve_ext_ref=None):
 514         entries = self.sorted_entries(resolve_ext_ref)
 515         write_pack_index_v2(filename, entries, self.calculate_checksum())
 516
 517     def get_stored_checksum(self):
 518         return self._stored_checksum
 519
 520     def check(self):
 521         return (self.calculate_checksum() == self.get_stored_checksum())
 522
 523     def get_object_at(self, offset):
 524         """Given an offset in to the packfile return the object that is there.
 525
 526         Using the associated index the location of an object can be looked up, and
 527         then the packfile can be asked directly for that object using this
 528         function.
 529         """
 530         if offset in self._offset_cache:
 531             return self._offset_cache[offset]
 532         assert isinstance(offset, long) or isinstance(offset, int),\
 533                 "offset was %r" % offset
 534         assert offset >= self._header_size
 535         size = os.path.getsize(self._filename)
 536         assert size == self._size, "Pack data %s has changed size, I don't " \
 537              "like that" % self._filename
 538         f = open(self._filename, 'rb')
 539         try:
 540             map = simple_mmap(f, offset, size-offset)
 541             ret = unpack_object(map)[:2]
 542             return ret
 543         finally:
 544             f.close()
 545
 546
 547 class SHA1Writer(object):
 548
 549     def __init__(self, f):
 550         self.f = f
 551         self.sha1 = make_sha("")
 552
 553     def write(self, data):
 554         self.sha1.update(data)
 555         self.f.write(data)
 556
 557     def write_sha(self):
 558         sha = self.sha1.digest()
 559         assert len(sha) == 20
 560         self.f.write(sha)
 561         return sha
 562
 563     def close(self):
 564         sha = self.write_sha()
 565         self.f.close()
 566         return sha
 567
 568     def tell(self):
 569         return self.f.tell()
 570
 571
 572 def write_pack_object(f, type, object):
 573     """Write pack object to a file.
 574
 575     :param f: File to write to
 576     :param o: Object to write
 577     :return: Tuple with offset at which the object was written, and crc32
 578     """
 579     ret = f.tell()
 580     packed_data_hdr = ""
 581     if type == 6: # ref delta
 582         (delta_base_offset, object) = object
 583     elif type == 7: # offset delta
 584         (basename, object) = object
 585     size = len(object)
 586     c = (type << 4) | (size & 15)
 587     size >>= 4
 588     while size:
 589         packed_data_hdr += (chr(c | 0x80))
 590         c = size & 0x7f
 591         size >>= 7
 592     packed_data_hdr += chr(c)
 593     if type == 6: # offset delta
 594         ret = [delta_base_offset & 0x7f]
 595         delta_base_offset >>= 7
 596         while delta_base_offset:
 597             delta_base_offset -= 1
 598             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
 599             delta_base_offset >>= 7
 600         packed_data_hdr += "".join([chr(x) for x in ret])
 601     elif type == 7: # ref delta
 602         assert len(basename) == 20
 603         packed_data_hdr += basename
 604     packed_data = packed_data_hdr + zlib.compress(object)
 605     f.write(packed_data)
 606     return (f.tell(), (zlib.crc32(packed_data) & 0xffffffff))
 607
 608
 609 def write_pack(filename, objects, num_objects):
 610     f = open(filename + ".pack", 'w')
 611     try:
 612         entries, data_sum = write_pack_data(f, objects, num_objects)
 613     finally:
 614         f.close()
 615     entries.sort()
 616     write_pack_index_v2(filename + ".idx", entries, data_sum)
 617
 618
 619 def write_pack_data(f, objects, num_objects, window=10):
 620     """Write a new pack file.
 621
 622     :param filename: The filename of the new pack file.
 623     :param objects: List of objects to write (tuples with object and path)
 624     :return: List with (name, offset, crc32 checksum) entries, pack checksum
 625     """
 626     recency = list(objects)
 627     # FIXME: Somehow limit delta depth
 628     # FIXME: Make thin-pack optional (its not used when cloning a pack)
 629     # Build a list of objects ordered by the magic Linus heuristic
 630     # This helps us find good objects to diff against us
 631     magic = []
 632     for obj, path in recency:
 633         magic.append( (obj.type, path, 1, -len(obj.as_raw_string()[1]), obj) )
 634     magic.sort()
 635     # Build a map of objects and their index in magic - so we can find preceeding objects
 636     # to diff against
 637     offs = {}
 638     for i in range(len(magic)):
 639         offs[magic[i][4]] = i
 640     # Write the pack
 641     entries = []
 642     f = SHA1Writer(f)
 643     f.write("PACK")               # Pack header
 644     f.write(struct.pack(">L", 2)) # Pack version
 645     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
 646     for o, path in recency:
 647         sha1 = o.sha().digest()
 648         orig_t, raw = o.as_raw_string()
 649         winner = raw
 650         t = orig_t
 651         #for i in range(offs[o]-window, window):
 652         #    if i < 0 or i >= len(offs): continue
 653         #    b = magic[i][4]
 654         #    if b.type != orig_t: continue
 655         #    _, base = b.as_raw_string()
 656         #    delta = create_delta(base, raw)
 657         #    if len(delta) < len(winner):
 658         #        winner = delta
 659         #        t = 6 if magic[i][2] == 1 else 7
 660         offset, crc32 = write_pack_object(f, t, winner)
 661         entries.append((sha1, offset, crc32))
 662     return entries, f.write_sha()
 663
 664
 665 def write_pack_index_v1(filename, entries, pack_checksum):
 666     """Write a new pack index file.
 667
 668     :param filename: The filename of the new pack index file.
 669     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 670             crc32_checksum.
 671     :param pack_checksum: Checksum of the pack file.
 672     """
 673     f = open(filename, 'w')
 674     f = SHA1Writer(f)
 675     fan_out_table = defaultdict(lambda: 0)
 676     for (name, offset, entry_checksum) in entries:
 677         fan_out_table[ord(name[0])] += 1
 678     # Fan-out table
 679     for i in range(0x100):
 680         f.write(struct.pack(">L", fan_out_table[i]))
 681         fan_out_table[i+1] += fan_out_table[i]
 682     for (name, offset, entry_checksum) in entries:
 683         f.write(struct.pack(">L20s", offset, name))
 684     assert len(pack_checksum) == 20
 685     f.write(pack_checksum)
 686     f.close()
 687
 688
 689 def create_delta(base_buf, target_buf):
 690     """Use python difflib to work out how to transform base_buf to target_buf"""
 691     assert isinstance(base_buf, str)
 692     assert isinstance(target_buf, str)
 693     out_buf = ""
 694     # write delta header
 695     def encode_size(size):
 696         ret = ""
 697         c = size & 0x7f
 698         size >>= 7
 699         while size:
 700             ret += chr(c | 0x80)
 701             c = size & 0x7f
 702             size >>= 7
 703         ret += chr(c)
 704         return ret
 705     out_buf += encode_size(len(base_buf))
 706     out_buf += encode_size(len(target_buf))
 707     # write out delta opcodes
 708     seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
 709     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
 710         # Git patch opcodes don't care about deletes!
 711         #if opcode == "replace" or opcode == "delete":
 712         #    pass
 713         if opcode == "equal":
 714             # If they are equal, unpacker will use data from base_buf
 715             # Write out an opcode that says what range to use
 716             scratch = ""
 717             op = 0x80
 718             o = i1
 719             for i in range(4):
 720                 if o & 0xff << i*8:
 721                     scratch += chr(o >> i)
 722                     op |= 1 << i
 723             s = i2 - i1
 724             for i in range(2):
 725                 if s & 0xff << i*8:
 726                     scratch += chr(s >> i)
 727                     op |= 1 << (4+i)
 728             out_buf += chr(op)
 729             out_buf += scratch
 730         if opcode == "replace" or opcode == "insert":
 731             # If we are replacing a range or adding one, then we just
 732             # output it to the stream (prefixed by its size)
 733             s = j2 - j1
 734             o = j1
 735             while s > 127:
 736                 out_buf += chr(127)
 737                 out_buf += target_buf[o:o+127]
 738                 s -= 127
 739                 o += 127
 740             out_buf += chr(s)
 741             out_buf += target_buf[o:o+s]
 742     return out_buf
 743
 744
 745 def apply_delta(src_buf, delta):
 746     """Based on the similar function in git's patch-delta.c."""
 747     assert isinstance(src_buf, str), "was %r" % (src_buf,)
 748     assert isinstance(delta, str)
 749     out = []
 750     index = 0
 751     delta_length = len(delta)
 752     def pop(delta, index):
 753         ret = delta[index]
 754         index += 1
 755         return ord(ret), index
 756     def get_delta_header_size(delta, index):
 757         size = 0
 758         i = 0
 759         while delta:
 760             cmd, index = pop(delta, index)
 761             size |= (cmd & ~0x80) << i
 762             i += 7
 763             if not cmd & 0x80:
 764                 break
 765         return size, index
 766     src_size, index = get_delta_header_size(delta, index)
 767     dest_size, index = get_delta_header_size(delta, index)
 768     assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
 769     while index < delta_length:
 770         cmd, index = pop(delta, index)
 771         if cmd & 0x80:
 772             cp_off = 0
 773             for i in range(4):
 774                 if cmd & (1 << i):
 775                     x, index = pop(delta, index)
 776                     cp_off |= x << (i * 8)
 777             cp_size = 0
 778             for i in range(3):
 779                 if cmd & (1 << (4+i)):
 780                     x, index = pop(delta, index)
 781                     cp_size |= x << (i * 8)
 782             if cp_size == 0:
 783                 cp_size = 0x10000
 784             if (cp_off + cp_size < cp_size or
 785                 cp_off + cp_size > src_size or
 786                 cp_size > dest_size):
 787                 break
 788             out.append(src_buf[cp_off:cp_off+cp_size])
 789         elif cmd != 0:
 790             out.append(delta[index:index+cmd])
 791             index += cmd
 792         else:
 793             raise ApplyDeltaError("Invalid opcode 0")
 794
 795     if index != delta_length:
 796         raise ApplyDeltaError("delta not empty: %r" % delta[index:])
 797
 798     out = ''.join(out)
 799     if dest_size != len(out):
 800         raise ApplyDeltaError("dest size incorrect")
 801
 802     return out
 803
 804
 805 def write_pack_index_v2(filename, entries, pack_checksum):
 806     """Write a new pack index file.
 807
 808     :param filename: The filename of the new pack index file.
 809     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 810             crc32_checksum.
 811     :param pack_checksum: Checksum of the pack file.
 812     """
 813     f = open(filename, 'w')
 814     f = SHA1Writer(f)
 815     f.write('\377tOc') # Magic!
 816     f.write(struct.pack(">L", 2))
 817     fan_out_table = defaultdict(lambda: 0)
 818     for (name, offset, entry_checksum) in entries:
 819         fan_out_table[ord(name[0])] += 1
 820     # Fan-out table
 821     for i in range(0x100):
 822         f.write(struct.pack(">L", fan_out_table[i]))
 823         fan_out_table[i+1] += fan_out_table[i]
 824     for (name, offset, entry_checksum) in entries:
 825         f.write(name)
 826     for (name, offset, entry_checksum) in entries:
 827         f.write(struct.pack(">L", entry_checksum))
 828     for (name, offset, entry_checksum) in entries:
 829         # FIXME: handle if MSBit is set in offset
 830         f.write(struct.pack(">L", offset))
 831     # FIXME: handle table for pack files > 8 Gb
 832     assert len(pack_checksum) == 20
 833     f.write(pack_checksum)
 834     f.close()
 835
 836
 837 class Pack(object):
 838
 839     def __init__(self, basename):
 840         self._basename = basename
 841         self._data_path = self._basename + ".pack"
 842         self._idx_path = self._basename + ".idx"
 843         self._data = None
 844         self._idx = None
 845
 846     def name(self):
 847         """The SHA over the SHAs of the objects in this pack."""
 848         return self.idx.objects_sha1()
 849
 850     @property
 851     def data(self):
 852         if self._data is None:
 853             self._data = PackData(self._data_path)
 854             assert len(self.idx) == len(self._data)
 855             idx_stored_checksum = self.idx.get_pack_checksum()
 856             data_stored_checksum = self._data.get_stored_checksum()
 857             if idx_stored_checksum != data_stored_checksum:
 858                 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
 859                                        sha_to_hex(data_stored_checksum))
 860         return self._data
 861
 862     @property
 863     def idx(self):
 864         if self._idx is None:
 865             self._idx = PackIndex(self._idx_path)
 866         return self._idx
 867
 868     def close(self):
 869         if self._data is not None:
 870             self._data.close()
 871         self.idx.close()
 872
 873     def __eq__(self, other):
 874         return type(self) == type(other) and self.idx == other.idx
 875
 876     def __len__(self):
 877         """Number of entries in this pack."""
 878         return len(self.idx)
 879
 880     def __repr__(self):
 881         return "Pack(%r)" % self._basename
 882
 883     def __iter__(self):
 884         """Iterate over all the sha1s of the objects in this pack."""
 885         return iter(self.idx)
 886
 887     def check(self):
 888         if not self.idx.check():
 889             return False
 890         if not self.data.check():
 891             return False
 892         return True
 893
 894     def get_stored_checksum(self):
 895         return self.data.get_stored_checksum()
 896
 897     def __contains__(self, sha1):
 898         """Check whether this pack contains a particular SHA1."""
 899         return (self.idx.object_index(sha1) is not None)
 900
 901     def get_raw(self, sha1, resolve_ref=None):
 902         offset = self.idx.object_index(sha1)
 903         if offset is None:
 904             raise KeyError(sha1)
 905
 906         type, obj = self.data.get_object_at(offset)
 907         if isinstance(offset, long):
 908           offset = int(offset)
 909         assert isinstance(offset, int)
 910         return self.data.resolve_object(offset, type, obj, resolve_ref)
 911
 912     def __getitem__(self, sha1):
 913         """Retrieve the specified SHA1."""
 914         type, uncomp = self.get_raw(sha1)
 915         return ShaFile.from_raw_string(type, uncomp)
 916
 917     def iterobjects(self, get_raw=None):
 918         if get_raw is None:
 919             def get_raw(x):
 920                 raise KeyError(x)
 921         for offset, type, obj, crc32 in self.data.iterobjects():
 922             assert isinstance(offset, int)
 923             yield ShaFile.from_raw_string(
 924                     *self.data.resolve_object(offset, type, obj, get_raw))
 925
 926
 927 def load_packs(path):
 928     if not os.path.exists(path):
 929         return
 930     for name in os.listdir(path):
 931         if name.startswith("pack-") and name.endswith(".pack"):
 932             yield Pack(os.path.join(path, name[:-len(".pack")]))
 933