dulwich/pack.py

   1 # pack.py -- For dealing wih packed git objects.
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copryight (C) 2008 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20 """Classes for dealing with packed git objects.
  21
  22 A pack is a compact representation of a bunch of objects, stored
  23 using deltas where possible.
  24
  25 They have two parts, the pack file, which stores the data, and an index
  26 that tells you where the data is.
  27
  28 To find an object you look in all of the index files 'til you find a
  29 match for the object name. You then use the pointer got from this as
  30 a pointer in to the corresponding packfile.
  31 """
  32
  33 try:
  34     from collections import defaultdict
  35 except ImportError:
  36     from misc import defaultdict
  37
  38 from itertools import imap, izip
  39 import mmap
  40 import os
  41 import struct
  42 try:
  43     from struct import unpack_from
  44 except ImportError:
  45     from dulwich.misc import unpack_from
  46 import sys
  47 import zlib
  48 import difflib
  49
  50 from dulwich.errors import (
  51     ApplyDeltaError,
  52     ChecksumMismatch,
  53     )
  54 from dulwich.lru_cache import (
  55     LRUSizeCache,
  56     )
  57 from dulwich.objects import (
  58     ShaFile,
  59     hex_to_sha,
  60     sha_to_hex,
  61     )
  62 from dulwich.misc import make_sha
  63
  64 supports_mmap_offset = (sys.version_info[0] >= 3 or
  65         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
  66
  67
  68 def take_msb_bytes(map, offset):
  69     ret = []
  70     while len(ret) == 0 or ret[-1] & 0x80:
  71         ret.append(ord(map[offset]))
  72         offset += 1
  73     return ret
  74
  75
  76 def read_zlib(data, offset, dec_size):
  77     obj = zlib.decompressobj()
  78     x = ""
  79     fed = 0
  80     while obj.unused_data == "":
  81         base = offset+fed
  82         add = data[base:base+1024]
  83         if len(add) < 1024:
  84             add += "Z"
  85         fed += len(add)
  86         x += obj.decompress(add)
  87     assert len(x) == dec_size
  88     comp_len = fed-len(obj.unused_data)
  89     return x, comp_len
  90
  91
  92 def iter_sha1(iter):
  93     sha1 = make_sha()
  94     for name in iter:
  95         sha1.update(name)
  96     return sha1.hexdigest()
  97
  98
  99 MAX_MMAP_SIZE = 1024 * 1024 * 1024
 100
 101 def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
 102     """Simple wrapper for mmap() which always supports the offset parameter.
 103
 104     :param f: File object.
 105     :param offset: Offset in the file, from the beginning of the file.
 106     :param size: Size of the mmap'ed area
 107     :param access: Access mechanism.
 108     :return: MMAP'd area.
 109     """
 110     if offset+size > MAX_MMAP_SIZE and not supports_mmap_offset:
 111         raise AssertionError("%s is larger than 256 meg, and this version "
 112             "of Python does not support the offset argument to mmap().")
 113     if supports_mmap_offset:
 114         return mmap.mmap(f.fileno(), size, access=access, offset=offset), 0
 115     else:
 116         mem = mmap.mmap(f.fileno(), size+offset, access=access)
 117         return mem, offset
 118
 119
 120 class PackIndex(object):
 121     """An index in to a packfile.
 122
 123     Given a sha id of an object a pack index can tell you the location in the
 124     packfile of that object if it has it.
 125
 126     To do the loop it opens the file, and indexes first 256 4 byte groups
 127     with the first byte of the sha id. The value in the four byte group indexed
 128     is the end of the group that shares the same starting byte. Subtract one
 129     from the starting byte and index again to find the start of the group.
 130     The values are sorted by sha id within the group, so do the math to find
 131     the start and end offset and then bisect in to find if the value is present.
 132     """
 133
 134     def __init__(self, filename):
 135         """Create a pack index object.
 136
 137         Provide it with the name of the index file to consider, and it will map
 138         it whenever required.
 139         """
 140         self._filename = filename
 141         # Take the size now, so it can be checked each time we map the file to
 142         # ensure that it hasn't changed.
 143         self._size = os.path.getsize(filename)
 144         self._file = open(filename, 'r')
 145         self._contents, map_offset = simple_mmap(self._file, 0, self._size)
 146         assert map_offset == 0
 147         if self._contents[:4] != '\377tOc':
 148             self.version = 1
 149             self._fan_out_table = self._read_fan_out_table(0)
 150         else:
 151             (self.version, ) = unpack_from(">L", self._contents, 4)
 152             assert self.version in (2,), "Version was %d" % self.version
 153             self._fan_out_table = self._read_fan_out_table(8)
 154             self._name_table_offset = 8 + 0x100 * 4
 155             self._crc32_table_offset = self._name_table_offset + 20 * len(self)
 156             self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
 157
 158     def __eq__(self, other):
 159         if type(self) != type(other):
 160             return False
 161
 162         if self._fan_out_table != other._fan_out_table:
 163             return False
 164
 165         for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
 166             if name1 != name2:
 167                 return False
 168         return True
 169
 170     def close(self):
 171         self._file.close()
 172
 173     def __len__(self):
 174         """Return the number of entries in this pack index."""
 175         return self._fan_out_table[-1]
 176
 177     def _unpack_entry(self, i):
 178         """Unpack the i-th entry in the index file.
 179
 180         :return: Tuple with object name (SHA), offset in pack file and
 181               CRC32 checksum (if known)."""
 182         if self.version == 1:
 183             (offset, name) = unpack_from(">L20s", self._contents,
 184                 (0x100 * 4) + (i * 24))
 185             return (name, offset, None)
 186         else:
 187             return (self._unpack_name(i), self._unpack_offset(i),
 188                     self._unpack_crc32_checksum(i))
 189
 190     def _unpack_name(self, i):
 191         if self.version == 1:
 192             offset = (0x100 * 4) + (i * 24) + 4
 193         else:
 194             offset = self._name_table_offset + i * 20
 195         return self._contents[offset:offset+20]
 196
 197     def _unpack_offset(self, i):
 198         if self.version == 1:
 199             offset = (0x100 * 4) + (i * 24)
 200         else:
 201             offset = self._pack_offset_table_offset + i * 4
 202         return unpack_from(">L", self._contents, offset)[0]
 203
 204     def _unpack_crc32_checksum(self, i):
 205         if self.version == 1:
 206             return None
 207         else:
 208             return unpack_from(">L", self._contents,
 209                                       self._crc32_table_offset + i * 4)[0]
 210
 211     def __iter__(self):
 212         return imap(sha_to_hex, self._itersha())
 213
 214     def _itersha(self):
 215         for i in range(len(self)):
 216             yield self._unpack_name(i)
 217
 218     def objects_sha1(self):
 219         return iter_sha1(self._itersha())
 220
 221     def iterentries(self):
 222         """Iterate over the entries in this pack index.
 223
 224         Will yield tuples with object name, offset in packfile and crc32 checksum.
 225         """
 226         for i in range(len(self)):
 227             yield self._unpack_entry(i)
 228
 229     def _read_fan_out_table(self, start_offset):
 230         ret = []
 231         for i in range(0x100):
 232             ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
 233         return ret
 234
 235     def check(self):
 236         """Check that the stored checksum matches the actual checksum."""
 237         return self.calculate_checksum() == self.get_stored_checksum()
 238
 239     def calculate_checksum(self):
 240         f = open(self._filename, 'r')
 241         try:
 242             return make_sha(self._contents[:-20]).digest()
 243         finally:
 244             f.close()
 245
 246     def get_pack_checksum(self):
 247         """Return the SHA1 checksum stored for the corresponding packfile."""
 248         return str(self._contents[-40:-20])
 249
 250     def get_stored_checksum(self):
 251         """Return the SHA1 checksum stored for this index."""
 252         return str(self._contents[-20:])
 253
 254     def object_index(self, sha):
 255         """Return the index in to the corresponding packfile for the object.
 256
 257         Given the name of an object it will return the offset that object lives
 258         at within the corresponding pack file. If the pack file doesn't have the
 259         object then None will be returned.
 260         """
 261         if len(sha) == 40:
 262             sha = hex_to_sha(sha)
 263         return self._object_index(sha)
 264
 265     def _object_index(self, sha):
 266         """See object_index.
 267
 268         :param sha: A *binary* SHA string. (20 characters long)_
 269         """
 270         assert len(sha) == 20
 271         idx = ord(sha[0])
 272         if idx == 0:
 273             start = 0
 274         else:
 275             start = self._fan_out_table[idx-1]
 276         end = self._fan_out_table[idx]
 277         assert start <= end
 278         while start <= end:
 279             i = (start + end)/2
 280             file_sha = self._unpack_name(i)
 281             if file_sha < sha:
 282                 start = i + 1
 283             elif file_sha > sha:
 284                 end = i - 1
 285             else:
 286                 return self._unpack_offset(i)
 287         return None
 288
 289
 290 def read_pack_header(f):
 291     header = f.read(12)
 292     assert header[:4] == "PACK"
 293     (version,) = unpack_from(">L", header, 4)
 294     assert version in (2, 3), "Version was %d" % version
 295     (num_objects,) = unpack_from(">L", header, 8)
 296     return (version, num_objects)
 297
 298
 299 def read_pack_tail(f):
 300     return (f.read(20),)
 301
 302
 303 def unpack_object(map, offset=0):
 304     bytes = take_msb_bytes(map, offset)
 305     type = (bytes[0] >> 4) & 0x07
 306     size = bytes[0] & 0x0f
 307     for i, byte in enumerate(bytes[1:]):
 308         size += (byte & 0x7f) << ((i * 7) + 4)
 309     raw_base = len(bytes)
 310     if type == 6: # offset delta
 311         bytes = take_msb_bytes(map, raw_base + offset)
 312         assert not (bytes[-1] & 0x80)
 313         delta_base_offset = bytes[0] & 0x7f
 314         for byte in bytes[1:]:
 315             delta_base_offset += 1
 316             delta_base_offset <<= 7
 317             delta_base_offset += (byte & 0x7f)
 318         raw_base+=len(bytes)
 319         uncomp, comp_len = read_zlib(map, offset + raw_base, size)
 320         assert size == len(uncomp)
 321         return type, (delta_base_offset, uncomp), comp_len+raw_base
 322     elif type == 7: # ref delta
 323         basename = map[offset+raw_base:offset+raw_base+20]
 324         uncomp, comp_len = read_zlib(map, offset+raw_base+20, size)
 325         assert size == len(uncomp)
 326         return type, (basename, uncomp), comp_len+raw_base+20
 327     else:
 328         uncomp, comp_len = read_zlib(map, offset+raw_base, size)
 329         assert len(uncomp) == size
 330         return type, uncomp, comp_len+raw_base
 331
 332
 333 def compute_object_size((num, obj)):
 334     if num in (6, 7):
 335         return len(obj[1])
 336     assert isinstance(obj, str)
 337     return len(obj)
 338
 339
 340 class PackData(object):
 341     """The data contained in a packfile.
 342
 343     Pack files can be accessed both sequentially for exploding a pack, and
 344     directly with the help of an index to retrieve a specific object.
 345
 346     The objects within are either complete or a delta aginst another.
 347
 348     The header is variable length. If the MSB of each byte is set then it
 349     indicates that the subsequent byte is still part of the header.
 350     For the first byte the next MS bits are the type, which tells you the type
 351     of object, and whether it is a delta. The LS byte is the lowest bits of the
 352     size. For each subsequent byte the LS 7 bits are the next MS bits of the
 353     size, i.e. the last byte of the header contains the MS bits of the size.
 354
 355     For the complete objects the data is stored as zlib deflated data.
 356     The size in the header is the uncompressed object size, so to uncompress
 357     you need to just keep feeding data to zlib until you get an object back,
 358     or it errors on bad data. This is done here by just giving the complete
 359     buffer from the start of the deflated object on. This is bad, but until I
 360     get mmap sorted out it will have to do.
 361
 362     Currently there are no integrity checks done. Also no attempt is made to try
 363     and detect the delta case, or a request for an object at the wrong position.
 364     It will all just throw a zlib or KeyError.
 365     """
 366
 367     def __init__(self, filename):
 368         """Create a PackData object that represents the pack in the given filename.
 369
 370         The file must exist and stay readable until the object is disposed of. It
 371         must also stay the same size. It will be mapped whenever needed.
 372
 373         Currently there is a restriction on the size of the pack as the python
 374         mmap implementation is flawed.
 375         """
 376         self._filename = filename
 377         assert os.path.exists(filename), "%s is not a packfile" % filename
 378         self._size = os.path.getsize(filename)
 379         self._header_size = 12
 380         assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (filename, self._size, self._header_size)
 381         self._read_header()
 382         self._offset_cache = LRUSizeCache(1024*1024*100,
 383             compute_size=compute_object_size)
 384
 385     def _read_header(self):
 386         f = open(self._filename, 'rb')
 387         try:
 388             (version, self._num_objects) = \
 389                     read_pack_header(f)
 390             f.seek(self._size-20)
 391             (self._stored_checksum,) = read_pack_tail(f)
 392         finally:
 393             f.close()
 394
 395     def __len__(self):
 396         """Returns the number of objects in this pack."""
 397         return self._num_objects
 398
 399     def calculate_checksum(self):
 400         """Calculate the checksum for this pack."""
 401         f = open(self._filename, 'rb')
 402         try:
 403             map, map_offset = simple_mmap(f, 0, self._size - 20)
 404             return make_sha(map[map_offset:self._size-20]).digest()
 405         finally:
 406             f.close()
 407
 408     def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
 409         """Resolve an object, possibly resolving deltas when necessary.
 410
 411         :return: Tuple with object type and contents.
 412         """
 413         if type not in (6, 7): # Not a delta
 414             return type, obj
 415
 416         if get_offset is None:
 417             get_offset = self.get_object_at
 418
 419         if type == 6: # offset delta
 420             (delta_offset, delta) = obj
 421             assert isinstance(delta_offset, int)
 422             assert isinstance(delta, str)
 423             base_offset = offset-delta_offset
 424             type, base_obj = get_offset(base_offset)
 425             assert isinstance(type, int)
 426         elif type == 7: # ref delta
 427             (basename, delta) = obj
 428             assert isinstance(basename, str) and len(basename) == 20
 429             assert isinstance(delta, str)
 430             type, base_obj = get_ref(basename)
 431             assert isinstance(type, int)
 432             # Can't be a ofs delta, as we wouldn't know the base offset
 433             assert type != 6
 434             base_offset = None
 435         type, base_text = self.resolve_object(base_offset, type, base_obj, get_ref)
 436         if base_offset is not None:
 437             self._offset_cache[base_offset] = type, base_text
 438         ret = (type, apply_delta(base_text, delta))
 439         return ret
 440
 441     def iterobjects(self):
 442         offset = self._header_size
 443         f = open(self._filename, 'rb')
 444         num = len(self)
 445         map, _ = simple_mmap(f, 0, self._size)
 446         for i in range(num):
 447             (type, obj, total_size) = unpack_object(map, offset)
 448             crc32 = zlib.crc32(map[offset:offset+total_size]) & 0xffffffff
 449             yield offset, type, obj, crc32
 450             offset += total_size
 451         f.close()
 452
 453     def iterentries(self, ext_resolve_ref=None):
 454         found = {}
 455         postponed = defaultdict(list)
 456         class Postpone(Exception):
 457             """Raised to postpone delta resolving."""
 458
 459         def get_ref_text(sha):
 460             if sha in found:
 461                 return found[sha]
 462             if ext_resolve_ref:
 463                 try:
 464                     return ext_resolve_ref(sha)
 465                 except KeyError:
 466                     pass
 467             raise Postpone, (sha, )
 468         todo = list(self.iterobjects())
 469         while todo:
 470             (offset, type, obj, crc32) = todo.pop(0)
 471             assert isinstance(offset, int)
 472             assert isinstance(type, int)
 473             assert isinstance(obj, tuple) or isinstance(obj, str)
 474             try:
 475                 type, obj = self.resolve_object(offset, type, obj, get_ref_text)
 476             except Postpone, (sha, ):
 477                 postponed[sha].append((offset, type, obj))
 478             else:
 479                 shafile = ShaFile.from_raw_string(type, obj)
 480                 sha = shafile.sha().digest()
 481                 found[sha] = (type, obj)
 482                 yield sha, offset, crc32
 483                 todo += postponed.get(sha, [])
 484         if postponed:
 485             raise KeyError([sha_to_hex(h) for h in postponed.keys()])
 486
 487     def sorted_entries(self, resolve_ext_ref=None):
 488         ret = list(self.iterentries(resolve_ext_ref))
 489         ret.sort()
 490         return ret
 491
 492     def create_index_v1(self, filename, resolve_ext_ref=None):
 493         entries = self.sorted_entries(resolve_ext_ref)
 494         write_pack_index_v1(filename, entries, self.calculate_checksum())
 495
 496     def create_index_v2(self, filename, resolve_ext_ref=None):
 497         entries = self.sorted_entries(resolve_ext_ref)
 498         write_pack_index_v2(filename, entries, self.calculate_checksum())
 499
 500     def get_stored_checksum(self):
 501         return self._stored_checksum
 502
 503     def check(self):
 504         return (self.calculate_checksum() == self.get_stored_checksum())
 505
 506     def get_object_at(self, offset):
 507         """Given an offset in to the packfile return the object that is there.
 508
 509         Using the associated index the location of an object can be looked up, and
 510         then the packfile can be asked directly for that object using this
 511         function.
 512         """
 513         if offset in self._offset_cache:
 514             return self._offset_cache[offset]
 515         assert isinstance(offset, long) or isinstance(offset, int),\
 516                 "offset was %r" % offset
 517         assert offset >= self._header_size
 518         f = open(self._filename, 'rb')
 519         try:
 520             map, map_offset = simple_mmap(f, offset, self._size-offset)
 521             ret = unpack_object(map, map_offset)[:2]
 522             return ret
 523         finally:
 524             f.close()
 525
 526
 527 class SHA1Writer(object):
 528
 529     def __init__(self, f):
 530         self.f = f
 531         self.sha1 = make_sha("")
 532
 533     def write(self, data):
 534         self.sha1.update(data)
 535         self.f.write(data)
 536
 537     def write_sha(self):
 538         sha = self.sha1.digest()
 539         assert len(sha) == 20
 540         self.f.write(sha)
 541         return sha
 542
 543     def close(self):
 544         sha = self.write_sha()
 545         self.f.close()
 546         return sha
 547
 548     def tell(self):
 549         return self.f.tell()
 550
 551
 552 def write_pack_object(f, type, object):
 553     """Write pack object to a file.
 554
 555     :param f: File to write to
 556     :param o: Object to write
 557     :return: Tuple with offset at which the object was written, and crc32
 558     """
 559     ret = f.tell()
 560     packed_data_hdr = ""
 561     if type == 6: # ref delta
 562         (delta_base_offset, object) = object
 563     elif type == 7: # offset delta
 564         (basename, object) = object
 565     size = len(object)
 566     c = (type << 4) | (size & 15)
 567     size >>= 4
 568     while size:
 569         packed_data_hdr += (chr(c | 0x80))
 570         c = size & 0x7f
 571         size >>= 7
 572     packed_data_hdr += chr(c)
 573     if type == 6: # offset delta
 574         ret = [delta_base_offset & 0x7f]
 575         delta_base_offset >>= 7
 576         while delta_base_offset:
 577             delta_base_offset -= 1
 578             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
 579             delta_base_offset >>= 7
 580         packed_data_hdr += "".join([chr(x) for x in ret])
 581     elif type == 7: # ref delta
 582         assert len(basename) == 20
 583         packed_data_hdr += basename
 584     packed_data = packed_data_hdr + zlib.compress(object)
 585     f.write(packed_data)
 586     return (f.tell(), (zlib.crc32(packed_data) & 0xffffffff))
 587
 588
 589 def write_pack(filename, objects, num_objects):
 590     f = open(filename + ".pack", 'w')
 591     try:
 592         entries, data_sum = write_pack_data(f, objects, num_objects)
 593     finally:
 594         f.close()
 595     entries.sort()
 596     write_pack_index_v2(filename + ".idx", entries, data_sum)
 597
 598
 599 def write_pack_data(f, objects, num_objects, window=10):
 600     """Write a new pack file.
 601
 602     :param filename: The filename of the new pack file.
 603     :param objects: List of objects to write (tuples with object and path)
 604     :return: List with (name, offset, crc32 checksum) entries, pack checksum
 605     """
 606     recency = list(objects)
 607     # FIXME: Somehow limit delta depth
 608     # FIXME: Make thin-pack optional (its not used when cloning a pack)
 609     # Build a list of objects ordered by the magic Linus heuristic
 610     # This helps us find good objects to diff against us
 611     magic = []
 612     for obj, path in recency:
 613         magic.append( (obj.type, path, 1, -len(obj.as_raw_string()[1]), obj) )
 614     magic.sort()
 615     # Build a map of objects and their index in magic - so we can find preceeding objects
 616     # to diff against
 617     offs = {}
 618     for i in range(len(magic)):
 619         offs[magic[i][4]] = i
 620     # Write the pack
 621     entries = []
 622     f = SHA1Writer(f)
 623     f.write("PACK")               # Pack header
 624     f.write(struct.pack(">L", 2)) # Pack version
 625     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
 626     for o, path in recency:
 627         sha1 = o.sha().digest()
 628         orig_t, raw = o.as_raw_string()
 629         winner = raw
 630         t = orig_t
 631         #for i in range(offs[o]-window, window):
 632         #    if i < 0 or i >= len(offs): continue
 633         #    b = magic[i][4]
 634         #    if b.type != orig_t: continue
 635         #    _, base = b.as_raw_string()
 636         #    delta = create_delta(base, raw)
 637         #    if len(delta) < len(winner):
 638         #        winner = delta
 639         #        t = 6 if magic[i][2] == 1 else 7
 640         offset, crc32 = write_pack_object(f, t, winner)
 641         entries.append((sha1, offset, crc32))
 642     return entries, f.write_sha()
 643
 644
 645 def write_pack_index_v1(filename, entries, pack_checksum):
 646     """Write a new pack index file.
 647
 648     :param filename: The filename of the new pack index file.
 649     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 650             crc32_checksum.
 651     :param pack_checksum: Checksum of the pack file.
 652     """
 653     f = open(filename, 'w')
 654     f = SHA1Writer(f)
 655     fan_out_table = defaultdict(lambda: 0)
 656     for (name, offset, entry_checksum) in entries:
 657         fan_out_table[ord(name[0])] += 1
 658     # Fan-out table
 659     for i in range(0x100):
 660         f.write(struct.pack(">L", fan_out_table[i]))
 661         fan_out_table[i+1] += fan_out_table[i]
 662     for (name, offset, entry_checksum) in entries:
 663         f.write(struct.pack(">L20s", offset, name))
 664     assert len(pack_checksum) == 20
 665     f.write(pack_checksum)
 666     f.close()
 667
 668
 669 def create_delta(base_buf, target_buf):
 670     """Use python difflib to work out how to transform base_buf to target_buf"""
 671     assert isinstance(base_buf, str)
 672     assert isinstance(target_buf, str)
 673     out_buf = ""
 674     # write delta header
 675     def encode_size(size):
 676         ret = ""
 677         c = size & 0x7f
 678         size >>= 7
 679         while size:
 680             ret += chr(c | 0x80)
 681             c = size & 0x7f
 682             size >>= 7
 683         ret += chr(c)
 684         return ret
 685     out_buf += encode_size(len(base_buf))
 686     out_buf += encode_size(len(target_buf))
 687     # write out delta opcodes
 688     seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
 689     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
 690         # Git patch opcodes don't care about deletes!
 691         #if opcode == "replace" or opcode == "delete":
 692         #    pass
 693         if opcode == "equal":
 694             # If they are equal, unpacker will use data from base_buf
 695             # Write out an opcode that says what range to use
 696             scratch = ""
 697             op = 0x80
 698             o = i1
 699             for i in range(4):
 700                 if o & 0xff << i*8:
 701                     scratch += chr(o >> i)
 702                     op |= 1 << i
 703             s = i2 - i1
 704             for i in range(2):
 705                 if s & 0xff << i*8:
 706                     scratch += chr(s >> i)
 707                     op |= 1 << (4+i)
 708             out_buf += chr(op)
 709             out_buf += scratch
 710         if opcode == "replace" or opcode == "insert":
 711             # If we are replacing a range or adding one, then we just
 712             # output it to the stream (prefixed by its size)
 713             s = j2 - j1
 714             o = j1
 715             while s > 127:
 716                 out_buf += chr(127)
 717                 out_buf += target_buf[o:o+127]
 718                 s -= 127
 719                 o += 127
 720             out_buf += chr(s)
 721             out_buf += target_buf[o:o+s]
 722     return out_buf
 723
 724
 725 def apply_delta(src_buf, delta):
 726     """Based on the similar function in git's patch-delta.c.
 727
 728     :param src_buf: Source buffer
 729     :param delta: Delta instructions
 730     """
 731     assert isinstance(src_buf, str), "was %r" % (src_buf,)
 732     assert isinstance(delta, str)
 733     out = []
 734     index = 0
 735     delta_length = len(delta)
 736     def get_delta_header_size(delta, index):
 737         size = 0
 738         i = 0
 739         while delta:
 740             cmd = ord(delta[index])
 741             index += 1
 742             size |= (cmd & ~0x80) << i
 743             i += 7
 744             if not cmd & 0x80:
 745                 break
 746         return size, index
 747     src_size, index = get_delta_header_size(delta, index)
 748     dest_size, index = get_delta_header_size(delta, index)
 749     assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
 750     while index < delta_length:
 751         cmd = ord(delta[index])
 752         index += 1
 753         if cmd & 0x80:
 754             cp_off = 0
 755             for i in range(4):
 756                 if cmd & (1 << i):
 757                     x = ord(delta[index])
 758                     index += 1
 759                     cp_off |= x << (i * 8)
 760             cp_size = 0
 761             for i in range(3):
 762                 if cmd & (1 << (4+i)):
 763                     x = ord(delta[index])
 764                     index += 1
 765                     cp_size |= x << (i * 8)
 766             if cp_size == 0:
 767                 cp_size = 0x10000
 768             if (cp_off + cp_size < cp_size or
 769                 cp_off + cp_size > src_size or
 770                 cp_size > dest_size):
 771                 break
 772             out.append(src_buf[cp_off:cp_off+cp_size])
 773         elif cmd != 0:
 774             out.append(delta[index:index+cmd])
 775             index += cmd
 776         else:
 777             raise ApplyDeltaError("Invalid opcode 0")
 778
 779     if index != delta_length:
 780         raise ApplyDeltaError("delta not empty: %r" % delta[index:])
 781
 782     out = ''.join(out)
 783     if dest_size != len(out):
 784         raise ApplyDeltaError("dest size incorrect")
 785
 786     return out
 787
 788
 789 def write_pack_index_v2(filename, entries, pack_checksum):
 790     """Write a new pack index file.
 791
 792     :param filename: The filename of the new pack index file.
 793     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 794             crc32_checksum.
 795     :param pack_checksum: Checksum of the pack file.
 796     """
 797     f = open(filename, 'w')
 798     f = SHA1Writer(f)
 799     f.write('\377tOc') # Magic!
 800     f.write(struct.pack(">L", 2))
 801     fan_out_table = defaultdict(lambda: 0)
 802     for (name, offset, entry_checksum) in entries:
 803         fan_out_table[ord(name[0])] += 1
 804     # Fan-out table
 805     for i in range(0x100):
 806         f.write(struct.pack(">L", fan_out_table[i]))
 807         fan_out_table[i+1] += fan_out_table[i]
 808     for (name, offset, entry_checksum) in entries:
 809         f.write(name)
 810     for (name, offset, entry_checksum) in entries:
 811         f.write(struct.pack(">L", entry_checksum))
 812     for (name, offset, entry_checksum) in entries:
 813         # FIXME: handle if MSBit is set in offset
 814         f.write(struct.pack(">L", offset))
 815     # FIXME: handle table for pack files > 8 Gb
 816     assert len(pack_checksum) == 20
 817     f.write(pack_checksum)
 818     f.close()
 819
 820
 821 class Pack(object):
 822
 823     def __init__(self, basename):
 824         self._basename = basename
 825         self._data_path = self._basename + ".pack"
 826         self._idx_path = self._basename + ".idx"
 827         self._data = None
 828         self._idx = None
 829
 830     def name(self):
 831         """The SHA over the SHAs of the objects in this pack."""
 832         return self.idx.objects_sha1()
 833
 834     @property
 835     def data(self):
 836         if self._data is None:
 837             self._data = PackData(self._data_path)
 838             assert len(self.idx) == len(self._data)
 839             idx_stored_checksum = self.idx.get_pack_checksum()
 840             data_stored_checksum = self._data.get_stored_checksum()
 841             if idx_stored_checksum != data_stored_checksum:
 842                 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
 843                                        sha_to_hex(data_stored_checksum))
 844         return self._data
 845
 846     @property
 847     def idx(self):
 848         if self._idx is None:
 849             self._idx = PackIndex(self._idx_path)
 850         return self._idx
 851
 852     def close(self):
 853         if self._data is not None:
 854             self._data.close()
 855         self.idx.close()
 856
 857     def __eq__(self, other):
 858         return type(self) == type(other) and self.idx == other.idx
 859
 860     def __len__(self):
 861         """Number of entries in this pack."""
 862         return len(self.idx)
 863
 864     def __repr__(self):
 865         return "Pack(%r)" % self._basename
 866
 867     def __iter__(self):
 868         """Iterate over all the sha1s of the objects in this pack."""
 869         return iter(self.idx)
 870
 871     def check(self):
 872         if not self.idx.check():
 873             return False
 874         if not self.data.check():
 875             return False
 876         return True
 877
 878     def get_stored_checksum(self):
 879         return self.data.get_stored_checksum()
 880
 881     def __contains__(self, sha1):
 882         """Check whether this pack contains a particular SHA1."""
 883         return (self.idx.object_index(sha1) is not None)
 884
 885     def get_raw(self, sha1, resolve_ref=None):
 886         offset = self.idx.object_index(sha1)
 887         if offset is None:
 888             raise KeyError(sha1)
 889         type, obj = self.data.get_object_at(offset)
 890         if isinstance(offset, long):
 891           offset = int(offset)
 892         if resolve_ref is None:
 893             resolve_ref = self.get_raw
 894         assert isinstance(offset, int)
 895         return self.data.resolve_object(offset, type, obj, resolve_ref)
 896
 897     def __getitem__(self, sha1):
 898         """Retrieve the specified SHA1."""
 899         type, uncomp = self.get_raw(sha1)
 900         return ShaFile.from_raw_string(type, uncomp)
 901
 902     def iterobjects(self, get_raw=None):
 903         if get_raw is None:
 904             get_raw = self.get_raw
 905         for offset, type, obj, crc32 in self.data.iterobjects():
 906             assert isinstance(offset, int)
 907             yield ShaFile.from_raw_string(
 908                     *self.data.resolve_object(offset, type, obj, get_raw))
 909
 910
 911 def load_packs(path):
 912     if not os.path.exists(path):
 913         return
 914     for name in os.listdir(path):
 915         if name.startswith("pack-") and name.endswith(".pack"):
 916             yield Pack(os.path.join(path, name[:-len(".pack")]))
 917
 918
 919 try:
 920     from dulwich._pack import apply_delta
 921 except ImportError:
 922     pass