dulwich/pack.py

   1 # pack.py -- For dealing with packed git objects.
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20 """Classes for dealing with packed git objects.
  21
  22 A pack is a compact representation of a bunch of objects, stored
  23 using deltas where possible.
  24
  25 They have two parts, the pack file, which stores the data, and an index
  26 that tells you where the data is.
  27
  28 To find an object you look in all of the index files 'til you find a
  29 match for the object name. You then use the pointer got from this as
  30 a pointer in to the corresponding packfile.
  31 """
  32
  33 try:
  34     from collections import defaultdict
  35 except ImportError:
  36     from dulwich._compat import defaultdict
  37
  38 import binascii
  39 from cStringIO import (
  40     StringIO,
  41     )
  42 from collections import (
  43     deque,
  44     )
  45 import difflib
  46 from itertools import (
  47     chain,
  48     imap,
  49     izip,
  50     )
  51 try:
  52     import mmap
  53 except ImportError:
  54     has_mmap = False
  55 else:
  56     has_mmap = True
  57 import os
  58 import struct
  59 try:
  60     from struct import unpack_from
  61 except ImportError:
  62     from dulwich._compat import unpack_from
  63 import sys
  64 import warnings
  65 import zlib
  66
  67 from dulwich.errors import (
  68     ApplyDeltaError,
  69     ChecksumMismatch,
  70     )
  71 from dulwich.file import GitFile
  72 from dulwich.lru_cache import (
  73     LRUSizeCache,
  74     )
  75 from dulwich._compat import (
  76     make_sha,
  77     SEEK_CUR,
  78     SEEK_END,
  79     )
  80 from dulwich.objects import (
  81     ShaFile,
  82     hex_to_sha,
  83     sha_to_hex,
  84     object_header,
  85     )
  86
  87 supports_mmap_offset = (sys.version_info[0] >= 3 or
  88         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
  89
  90
  91 OFS_DELTA = 6
  92 REF_DELTA = 7
  93
  94 DELTA_TYPES = (OFS_DELTA, REF_DELTA)
  95
  96
  97 def take_msb_bytes(read, crc32=None):
  98     """Read bytes marked with most significant bit.
  99
 100     :param read: Read function
 101     """
 102     ret = []
 103     while len(ret) == 0 or ret[-1] & 0x80:
 104         b = read(1)
 105         if crc32 is not None:
 106             crc32 = binascii.crc32(b, crc32)
 107         ret.append(ord(b))
 108     return ret, crc32
 109
 110
 111 class UnpackedObject(object):
 112     """Class encapsulating an object unpacked from a pack file.
 113
 114     These objects should only be created from within unpack_object. Most
 115     members start out as empty and are filled in at various points by
 116     read_zlib_chunks, unpack_object, DeltaChainIterator, etc.
 117
 118     End users of this object should take care that the function they're getting
 119     this object from is guaranteed to set the members they need.
 120     """
 121
 122     __slots__ = [
 123       'offset',         # Offset in its pack.
 124       '_sha',           # Cached binary SHA.
 125       'obj_type_num',   # Type of this object.
 126       'obj_chunks',     # Decompressed and delta-resolved chunks.
 127       'pack_type_num',  # Type of this object in the pack (may be a delta).
 128       'delta_base',     # Delta base offset or SHA.
 129       'comp_chunks',    # Compressed object chunks.
 130       'decomp_chunks',  # Decompressed object chunks.
 131       'decomp_len',     # Decompressed length of this object.
 132       'crc32',          # CRC32.
 133       ]
 134
 135     # TODO(dborowitz): read_zlib_chunks and unpack_object could very well be
 136     # methods of this object.
 137     def __init__(self, pack_type_num, delta_base, decomp_len, crc32):
 138         self.offset = None
 139         self._sha = None
 140         self.pack_type_num = pack_type_num
 141         self.delta_base = delta_base
 142         self.comp_chunks = None
 143         self.decomp_chunks = []
 144         self.decomp_len = decomp_len
 145         self.crc32 = crc32
 146
 147         if pack_type_num in DELTA_TYPES:
 148             self.obj_type_num = None
 149             self.obj_chunks = None
 150         else:
 151             self.obj_type_num = pack_type_num
 152             self.obj_chunks = self.decomp_chunks
 153             self.delta_base = delta_base
 154
 155     def sha(self):
 156         """Return the binary SHA of this object."""
 157         if self._sha is None:
 158             self._sha = obj_sha(self.obj_type_num, self.obj_chunks)
 159         return self._sha
 160
 161     def sha_file(self):
 162         """Return a ShaFile from this object."""
 163         return ShaFile.from_raw_chunks(self.obj_type_num, self.obj_chunks)
 164
 165     # Only provided for backwards compatibility with code that expects either
 166     # chunks or a delta tuple.
 167     def _obj(self):
 168         """Return the decompressed chunks, or (delta base, delta chunks)."""
 169         if self.pack_type_num in DELTA_TYPES:
 170             return (self.delta_base, self.decomp_chunks)
 171         else:
 172             return self.decomp_chunks
 173
 174     def __eq__(self, other):
 175         if not isinstance(other, UnpackedObject):
 176             return False
 177         for slot in self.__slots__:
 178             if getattr(self, slot) != getattr(other, slot):
 179                 return False
 180         return True
 181
 182     def __ne__(self, other):
 183         return not (self == other)
 184
 185     def __repr__(self):
 186         data = ['%s=%r' % (s, getattr(self, s)) for s in self.__slots__]
 187         return '%s(%s)' % (self.__class__.__name__, ', '.join(data))
 188
 189
 190 _ZLIB_BUFSIZE = 4096
 191
 192
 193 def read_zlib_chunks(read_some, unpacked, include_comp=False,
 194                      buffer_size=_ZLIB_BUFSIZE):
 195     """Read zlib data from a buffer.
 196
 197     This function requires that the buffer have additional data following the
 198     compressed data, which is guaranteed to be the case for git pack files.
 199
 200     :param read_some: Read function that returns at least one byte, but may
 201         return less than the requested size.
 202     :param unpacked: An UnpackedObject to write result data to. If its crc32
 203         attr is not None, the CRC32 of the compressed bytes will be computed
 204         using this starting CRC32.
 205         After this function, will have the following attrs set:
 206         * comp_chunks    (if include_comp is True)
 207         * decomp_chunks
 208         * decomp_len
 209         * crc32
 210     :param include_comp: If True, include compressed data in the result.
 211     :param buffer_size: Size of the read buffer.
 212     :return: Leftover unused data from the decompression.
 213     :raise zlib.error: if a decompression error occurred.
 214     """
 215     if unpacked.decomp_len <= -1:
 216         raise ValueError('non-negative zlib data stream size expected')
 217     decomp_obj = zlib.decompressobj()
 218
 219     comp_chunks = []
 220     decomp_chunks = unpacked.decomp_chunks
 221     decomp_len = 0
 222     crc32 = unpacked.crc32
 223
 224     while True:
 225         add = read_some(buffer_size)
 226         if not add:
 227             raise zlib.error('EOF before end of zlib stream')
 228         comp_chunks.append(add)
 229         decomp = decomp_obj.decompress(add)
 230         decomp_len += len(decomp)
 231         decomp_chunks.append(decomp)
 232         unused = decomp_obj.unused_data
 233         if unused:
 234             left = len(unused)
 235             if crc32 is not None:
 236                 crc32 = binascii.crc32(add[:-left], crc32)
 237             if include_comp:
 238                 comp_chunks[-1] = add[:-left]
 239             break
 240         elif crc32 is not None:
 241             crc32 = binascii.crc32(add, crc32)
 242     if crc32 is not None:
 243         crc32 &= 0xffffffff
 244
 245     if decomp_len != unpacked.decomp_len:
 246         raise zlib.error('decompressed data does not match expected size')
 247
 248     unpacked.crc32 = crc32
 249     if include_comp:
 250         unpacked.comp_chunks = comp_chunks
 251     return unused
 252
 253
 254 def iter_sha1(iter):
 255     """Return the hexdigest of the SHA1 over a set of names.
 256
 257     :param iter: Iterator over string objects
 258     :return: 40-byte hex sha1 digest
 259     """
 260     sha1 = make_sha()
 261     for name in iter:
 262         sha1.update(name)
 263     return sha1.hexdigest()
 264
 265
 266 def load_pack_index(path):
 267     """Load an index file by path.
 268
 269     :param filename: Path to the index file
 270     :return: A PackIndex loaded from the given path
 271     """
 272     f = GitFile(path, 'rb')
 273     try:
 274         return load_pack_index_file(path, f)
 275     finally:
 276         f.close()
 277
 278
 279 def _load_file_contents(f, size=None):
 280     fileno = getattr(f, 'fileno', None)
 281     # Attempt to use mmap if possible
 282     if fileno is not None:
 283         fd = f.fileno()
 284         if size is None:
 285             size = os.fstat(fd).st_size
 286         if has_mmap:
 287             try:
 288                 contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
 289             except mmap.error:
 290                 # Perhaps a socket?
 291                 pass
 292             else:
 293                 return contents, size
 294     contents = f.read()
 295     size = len(contents)
 296     return contents, size
 297
 298
 299 def load_pack_index_file(path, f):
 300     """Load an index file from a file-like object.
 301
 302     :param path: Path for the index file
 303     :param f: File-like object
 304     :return: A PackIndex loaded from the given file
 305     """
 306     contents, size = _load_file_contents(f)
 307     if contents[:4] == '\377tOc':
 308         version = struct.unpack('>L', contents[4:8])[0]
 309         if version == 2:
 310             return PackIndex2(path, file=f, contents=contents,
 311                 size=size)
 312         else:
 313             raise KeyError('Unknown pack index format %d' % version)
 314     else:
 315         return PackIndex1(path, file=f, contents=contents, size=size)
 316
 317
 318 def bisect_find_sha(start, end, sha, unpack_name):
 319     """Find a SHA in a data blob with sorted SHAs.
 320
 321     :param start: Start index of range to search
 322     :param end: End index of range to search
 323     :param sha: Sha to find
 324     :param unpack_name: Callback to retrieve SHA by index
 325     :return: Index of the SHA, or None if it wasn't found
 326     """
 327     assert start <= end
 328     while start <= end:
 329         i = (start + end)/2
 330         file_sha = unpack_name(i)
 331         x = cmp(file_sha, sha)
 332         if x < 0:
 333             start = i + 1
 334         elif x > 0:
 335             end = i - 1
 336         else:
 337             return i
 338     return None
 339
 340
 341 class PackIndex(object):
 342     """An index in to a packfile.
 343
 344     Given a sha id of an object a pack index can tell you the location in the
 345     packfile of that object if it has it.
 346     """
 347
 348     def __eq__(self, other):
 349         if not isinstance(other, PackIndex):
 350             return False
 351
 352         for (name1, _, _), (name2, _, _) in izip(self.iterentries(),
 353                                                  other.iterentries()):
 354             if name1 != name2:
 355                 return False
 356         return True
 357
 358     def __ne__(self, other):
 359         return not self.__eq__(other)
 360
 361     def __len__(self):
 362         """Return the number of entries in this pack index."""
 363         raise NotImplementedError(self.__len__)
 364
 365     def __iter__(self):
 366         """Iterate over the SHAs in this pack."""
 367         return imap(sha_to_hex, self._itersha())
 368
 369     def iterentries(self):
 370         """Iterate over the entries in this pack index.
 371
 372         :return: iterator over tuples with object name, offset in packfile and
 373             crc32 checksum.
 374         """
 375         raise NotImplementedError(self.iterentries)
 376
 377     def get_pack_checksum(self):
 378         """Return the SHA1 checksum stored for the corresponding packfile.
 379
 380         :return: 20-byte binary digest
 381         """
 382         raise NotImplementedError(self.get_pack_checksum)
 383
 384     def object_index(self, sha):
 385         """Return the index in to the corresponding packfile for the object.
 386
 387         Given the name of an object it will return the offset that object
 388         lives at within the corresponding pack file. If the pack file doesn't
 389         have the object then None will be returned.
 390         """
 391         if len(sha) == 40:
 392             sha = hex_to_sha(sha)
 393         return self._object_index(sha)
 394
 395     def _object_index(self, sha):
 396         """See object_index.
 397
 398         :param sha: A *binary* SHA string. (20 characters long)_
 399         """
 400         raise NotImplementedError(self._object_index)
 401
 402     def objects_sha1(self):
 403         """Return the hex SHA1 over all the shas of all objects in this pack.
 404
 405         :note: This is used for the filename of the pack.
 406         """
 407         return iter_sha1(self._itersha())
 408
 409     def _itersha(self):
 410         """Yield all the SHA1's of the objects in the index, sorted."""
 411         raise NotImplementedError(self._itersha)
 412
 413
 414 class MemoryPackIndex(PackIndex):
 415     """Pack index that is stored entirely in memory."""
 416
 417     def __init__(self, entries, pack_checksum=None):
 418         """Create a new MemoryPackIndex.
 419
 420         :param entries: Sequence of name, idx, crc32 (sorted)
 421         :param pack_checksum: Optional pack checksum
 422         """
 423         self._by_sha = {}
 424         for name, idx, crc32 in entries:
 425             self._by_sha[name] = idx
 426         self._entries = entries
 427         self._pack_checksum = pack_checksum
 428
 429     def get_pack_checksum(self):
 430         return self._pack_checksum
 431
 432     def __len__(self):
 433         return len(self._entries)
 434
 435     def _object_index(self, sha):
 436         return self._by_sha[sha][0]
 437
 438     def _itersha(self):
 439         return iter(self._by_sha)
 440
 441     def iterentries(self):
 442         return iter(self._entries)
 443
 444
 445 class FilePackIndex(PackIndex):
 446     """Pack index that is based on a file.
 447
 448     To do the loop it opens the file, and indexes first 256 4 byte groups
 449     with the first byte of the sha id. The value in the four byte group indexed
 450     is the end of the group that shares the same starting byte. Subtract one
 451     from the starting byte and index again to find the start of the group.
 452     The values are sorted by sha id within the group, so do the math to find
 453     the start and end offset and then bisect in to find if the value is present.
 454     """
 455
 456     def __init__(self, filename, file=None, contents=None, size=None):
 457         """Create a pack index object.
 458
 459         Provide it with the name of the index file to consider, and it will map
 460         it whenever required.
 461         """
 462         self._filename = filename
 463         # Take the size now, so it can be checked each time we map the file to
 464         # ensure that it hasn't changed.
 465         if file is None:
 466             self._file = GitFile(filename, 'rb')
 467         else:
 468             self._file = file
 469         if contents is None:
 470             self._contents, self._size = _load_file_contents(self._file, size)
 471         else:
 472             self._contents, self._size = (contents, size)
 473
 474     def __eq__(self, other):
 475         # Quick optimization:
 476         if (isinstance(other, FilePackIndex) and
 477             self._fan_out_table != other._fan_out_table):
 478             return False
 479
 480         return super(FilePackIndex, self).__eq__(other)
 481
 482     def close(self):
 483         self._file.close()
 484         if getattr(self._contents, "close", None) is not None:
 485             self._contents.close()
 486
 487     def __len__(self):
 488         """Return the number of entries in this pack index."""
 489         return self._fan_out_table[-1]
 490
 491     def _unpack_entry(self, i):
 492         """Unpack the i-th entry in the index file.
 493
 494         :return: Tuple with object name (SHA), offset in pack file and CRC32
 495             checksum (if known).
 496         """
 497         raise NotImplementedError(self._unpack_entry)
 498
 499     def _unpack_name(self, i):
 500         """Unpack the i-th name from the index file."""
 501         raise NotImplementedError(self._unpack_name)
 502
 503     def _unpack_offset(self, i):
 504         """Unpack the i-th object offset from the index file."""
 505         raise NotImplementedError(self._unpack_offset)
 506
 507     def _unpack_crc32_checksum(self, i):
 508         """Unpack the crc32 checksum for the i-th object from the index file."""
 509         raise NotImplementedError(self._unpack_crc32_checksum)
 510
 511     def _itersha(self):
 512         for i in range(len(self)):
 513             yield self._unpack_name(i)
 514
 515     def iterentries(self):
 516         """Iterate over the entries in this pack index.
 517
 518         :return: iterator over tuples with object name, offset in packfile and
 519             crc32 checksum.
 520         """
 521         for i in range(len(self)):
 522             yield self._unpack_entry(i)
 523
 524     def _read_fan_out_table(self, start_offset):
 525         ret = []
 526         for i in range(0x100):
 527             fanout_entry = self._contents[start_offset+i*4:start_offset+(i+1)*4]
 528             ret.append(struct.unpack('>L', fanout_entry)[0])
 529         return ret
 530
 531     def check(self):
 532         """Check that the stored checksum matches the actual checksum."""
 533         actual = self.calculate_checksum()
 534         stored = self.get_stored_checksum()
 535         if actual != stored:
 536             raise ChecksumMismatch(stored, actual)
 537
 538     def calculate_checksum(self):
 539         """Calculate the SHA1 checksum over this pack index.
 540
 541         :return: This is a 20-byte binary digest
 542         """
 543         return make_sha(self._contents[:-20]).digest()
 544
 545     def get_pack_checksum(self):
 546         """Return the SHA1 checksum stored for the corresponding packfile.
 547
 548         :return: 20-byte binary digest
 549         """
 550         return str(self._contents[-40:-20])
 551
 552     def get_stored_checksum(self):
 553         """Return the SHA1 checksum stored for this index.
 554
 555         :return: 20-byte binary digest
 556         """
 557         return str(self._contents[-20:])
 558
 559     def _object_index(self, sha):
 560         """See object_index.
 561
 562         :param sha: A *binary* SHA string. (20 characters long)_
 563         """
 564         assert len(sha) == 20
 565         idx = ord(sha[0])
 566         if idx == 0:
 567             start = 0
 568         else:
 569             start = self._fan_out_table[idx-1]
 570         end = self._fan_out_table[idx]
 571         i = bisect_find_sha(start, end, sha, self._unpack_name)
 572         if i is None:
 573             raise KeyError(sha)
 574         return self._unpack_offset(i)
 575
 576
 577 class PackIndex1(FilePackIndex):
 578     """Version 1 Pack Index file."""
 579
 580     def __init__(self, filename, file=None, contents=None, size=None):
 581         super(PackIndex1, self).__init__(filename, file, contents, size)
 582         self.version = 1
 583         self._fan_out_table = self._read_fan_out_table(0)
 584
 585     def _unpack_entry(self, i):
 586         (offset, name) = unpack_from('>L20s', self._contents,
 587                                      (0x100 * 4) + (i * 24))
 588         return (name, offset, None)
 589
 590     def _unpack_name(self, i):
 591         offset = (0x100 * 4) + (i * 24) + 4
 592         return self._contents[offset:offset+20]
 593
 594     def _unpack_offset(self, i):
 595         offset = (0x100 * 4) + (i * 24)
 596         return unpack_from('>L', self._contents, offset)[0]
 597
 598     def _unpack_crc32_checksum(self, i):
 599         # Not stored in v1 index files
 600         return None
 601
 602
 603 class PackIndex2(FilePackIndex):
 604     """Version 2 Pack Index file."""
 605
 606     def __init__(self, filename, file=None, contents=None, size=None):
 607         super(PackIndex2, self).__init__(filename, file, contents, size)
 608         if self._contents[:4] != '\377tOc':
 609             raise AssertionError('Not a v2 pack index file')
 610         (self.version, ) = unpack_from('>L', self._contents, 4)
 611         if self.version != 2:
 612             raise AssertionError('Version was %d' % self.version)
 613         self._fan_out_table = self._read_fan_out_table(8)
 614         self._name_table_offset = 8 + 0x100 * 4
 615         self._crc32_table_offset = self._name_table_offset + 20 * len(self)
 616         self._pack_offset_table_offset = (self._crc32_table_offset +
 617                                           4 * len(self))
 618         self._pack_offset_largetable_offset = (self._pack_offset_table_offset +
 619                                           4 * len(self))
 620
 621     def _unpack_entry(self, i):
 622         return (self._unpack_name(i), self._unpack_offset(i),
 623                 self._unpack_crc32_checksum(i))
 624
 625     def _unpack_name(self, i):
 626         offset = self._name_table_offset + i * 20
 627         return self._contents[offset:offset+20]
 628
 629     def _unpack_offset(self, i):
 630         offset = self._pack_offset_table_offset + i * 4
 631         offset = unpack_from('>L', self._contents, offset)[0]
 632         if offset & (2**31):
 633             offset = self._pack_offset_largetable_offset + (offset&(2**31-1)) * 8L
 634             offset = unpack_from('>Q', self._contents, offset)[0]
 635         return offset
 636
 637     def _unpack_crc32_checksum(self, i):
 638         return unpack_from('>L', self._contents,
 639                           self._crc32_table_offset + i * 4)[0]
 640
 641
 642 def read_pack_header(read):
 643     """Read the header of a pack file.
 644
 645     :param read: Read function
 646     :return: Tuple of (pack version, number of objects). If no data is available
 647         to read, returns (None, None).
 648     """
 649     header = read(12)
 650     if not header:
 651         return None, None
 652     if header[:4] != 'PACK':
 653         raise AssertionError('Invalid pack header %r' % header)
 654     (version,) = unpack_from('>L', header, 4)
 655     if version not in (2, 3):
 656         raise AssertionError('Version was %d' % version)
 657     (num_objects,) = unpack_from('>L', header, 8)
 658     return (version, num_objects)
 659
 660
 661 def chunks_length(chunks):
 662     return sum(imap(len, chunks))
 663
 664
 665 def unpack_object(read_all, read_some=None, compute_crc32=False,
 666                   include_comp=False, zlib_bufsize=_ZLIB_BUFSIZE):
 667     """Unpack a Git object.
 668
 669     :param read_all: Read function that blocks until the number of requested
 670         bytes are read.
 671     :param read_some: Read function that returns at least one byte, but may not
 672         return the number of bytes requested.
 673     :param compute_crc32: If True, compute the CRC32 of the compressed data. If
 674         False, the returned CRC32 will be None.
 675     :param include_comp: If True, include compressed data in the result.
 676     :param zlib_bufsize: An optional buffer size for zlib operations.
 677     :return: A tuple of (unpacked, unused), where unused is the unused data
 678         leftover from decompression, and unpacked in an UnpackedObject with
 679         the following attrs set:
 680
 681         * obj_chunks     (for non-delta types)
 682         * pack_type_num
 683         * delta_base     (for delta types)
 684         * comp_chunks    (if include_comp is True)
 685         * decomp_chunks
 686         * decomp_len
 687         * crc32          (if compute_crc32 is True)
 688     """
 689     if read_some is None:
 690         read_some = read_all
 691     if compute_crc32:
 692         crc32 = 0
 693     else:
 694         crc32 = None
 695
 696     bytes, crc32 = take_msb_bytes(read_all, crc32=crc32)
 697     type_num = (bytes[0] >> 4) & 0x07
 698     size = bytes[0] & 0x0f
 699     for i, byte in enumerate(bytes[1:]):
 700         size += (byte & 0x7f) << ((i * 7) + 4)
 701
 702     raw_base = len(bytes)
 703     if type_num == OFS_DELTA:
 704         bytes, crc32 = take_msb_bytes(read_all, crc32=crc32)
 705         raw_base += len(bytes)
 706         if bytes[-1] & 0x80:
 707             raise AssertionError
 708         delta_base_offset = bytes[0] & 0x7f
 709         for byte in bytes[1:]:
 710             delta_base_offset += 1
 711             delta_base_offset <<= 7
 712             delta_base_offset += (byte & 0x7f)
 713         delta_base = delta_base_offset
 714     elif type_num == REF_DELTA:
 715         delta_base = read_all(20)
 716         if compute_crc32:
 717             crc32 = binascii.crc32(delta_base, crc32)
 718         raw_base += 20
 719     else:
 720         delta_base = None
 721
 722     unpacked = UnpackedObject(type_num, delta_base, size, crc32)
 723     unused = read_zlib_chunks(read_some, unpacked, buffer_size=zlib_bufsize,
 724                               include_comp=include_comp)
 725     return unpacked, unused
 726
 727
 728 def _compute_object_size((num, obj)):
 729     """Compute the size of a unresolved object for use with LRUSizeCache."""
 730     if num in DELTA_TYPES:
 731         return chunks_length(obj[1])
 732     return chunks_length(obj)
 733
 734
 735 class PackStreamReader(object):
 736     """Class to read a pack stream.
 737
 738     The pack is read from a ReceivableProtocol using read() or recv() as
 739     appropriate.
 740     """
 741
 742     def __init__(self, read_all, read_some=None, zlib_bufsize=_ZLIB_BUFSIZE):
 743         self.read_all = read_all
 744         if read_some is None:
 745             self.read_some = read_all
 746         else:
 747             self.read_some = read_some
 748         self.sha = make_sha()
 749         self._offset = 0
 750         self._rbuf = StringIO()
 751         # trailer is a deque to avoid memory allocation on small reads
 752         self._trailer = deque()
 753         self._zlib_bufsize = zlib_bufsize
 754
 755     def _read(self, read, size):
 756         """Read up to size bytes using the given callback.
 757
 758         As a side effect, update the verifier's hash (excluding the last 20
 759         bytes read).
 760
 761         :param read: The read callback to read from.
 762         :param size: The maximum number of bytes to read; the particular
 763             behavior is callback-specific.
 764         """
 765         data = read(size)
 766
 767         # maintain a trailer of the last 20 bytes we've read
 768         n = len(data)
 769         self._offset += n
 770         tn = len(self._trailer)
 771         if n >= 20:
 772             to_pop = tn
 773             to_add = 20
 774         else:
 775             to_pop = max(n + tn - 20, 0)
 776             to_add = n
 777         for _ in xrange(to_pop):
 778             self.sha.update(self._trailer.popleft())
 779         self._trailer.extend(data[-to_add:])
 780
 781         # hash everything but the trailer
 782         self.sha.update(data[:-to_add])
 783         return data
 784
 785     def _buf_len(self):
 786         buf = self._rbuf
 787         start = buf.tell()
 788         buf.seek(0, SEEK_END)
 789         end = buf.tell()
 790         buf.seek(start)
 791         return end - start
 792
 793     @property
 794     def offset(self):
 795         return self._offset - self._buf_len()
 796
 797     def read(self, size):
 798         """Read, blocking until size bytes are read."""
 799         buf_len = self._buf_len()
 800         if buf_len >= size:
 801             return self._rbuf.read(size)
 802         buf_data = self._rbuf.read()
 803         self._rbuf = StringIO()
 804         return buf_data + self._read(self.read_all, size - buf_len)
 805
 806     def recv(self, size):
 807         """Read up to size bytes, blocking until one byte is read."""
 808         buf_len = self._buf_len()
 809         if buf_len:
 810             data = self._rbuf.read(size)
 811             if size >= buf_len:
 812                 self._rbuf = StringIO()
 813             return data
 814         return self._read(self.read_some, size)
 815
 816     def __len__(self):
 817         return self._num_objects
 818
 819     def read_objects(self, compute_crc32=False):
 820         """Read the objects in this pack file.
 821
 822         :param compute_crc32: If True, compute the CRC32 of the compressed
 823             data. If False, the returned CRC32 will be None.
 824         :return: Iterator over UnpackedObjects with the following members set:
 825             offset
 826             obj_type_num
 827             obj_chunks (for non-delta types)
 828             delta_base (for delta types)
 829             decomp_chunks
 830             decomp_len
 831             crc32 (if compute_crc32 is True)
 832         :raise ChecksumMismatch: if the checksum of the pack contents does not
 833             match the checksum in the pack trailer.
 834         :raise zlib.error: if an error occurred during zlib decompression.
 835         :raise IOError: if an error occurred writing to the output file.
 836         """
 837         pack_version, self._num_objects = read_pack_header(self.read)
 838         if pack_version is None:
 839             return
 840
 841         for i in xrange(self._num_objects):
 842             offset = self.offset
 843             unpacked, unused = unpack_object(
 844               self.read, read_some=self.recv, compute_crc32=compute_crc32,
 845               zlib_bufsize=self._zlib_bufsize)
 846             unpacked.offset = offset
 847
 848             # prepend any unused data to current read buffer
 849             buf = StringIO()
 850             buf.write(unused)
 851             buf.write(self._rbuf.read())
 852             buf.seek(0)
 853             self._rbuf = buf
 854
 855             yield unpacked
 856
 857         if self._buf_len() < 20:
 858             # If the read buffer is full, then the last read() got the whole
 859             # trailer off the wire. If not, it means there is still some of the
 860             # trailer to read. We need to read() all 20 bytes; N come from the
 861             # read buffer and (20 - N) come from the wire.
 862             self.read(20)
 863
 864         pack_sha = ''.join(self._trailer)
 865         if pack_sha != self.sha.digest():
 866             raise ChecksumMismatch(sha_to_hex(pack_sha), self.sha.hexdigest())
 867
 868
 869 class PackStreamCopier(PackStreamReader):
 870     """Class to verify a pack stream as it is being read.
 871
 872     The pack is read from a ReceivableProtocol using read() or recv() as
 873     appropriate and written out to the given file-like object.
 874     """
 875
 876     def __init__(self, read_all, read_some, outfile, delta_iter=None):
 877         """Initialize the copier.
 878
 879         :param read_all: Read function that blocks until the number of requested
 880             bytes are read.
 881         :param read_some: Read function that returns at least one byte, but may
 882             not return the number of bytes requested.
 883         :param outfile: File-like object to write output through.
 884         :param delta_iter: Optional DeltaChainIterator to record deltas as we
 885             read them.
 886         """
 887         super(PackStreamCopier, self).__init__(read_all, read_some=read_some)
 888         self.outfile = outfile
 889         self._delta_iter = delta_iter
 890
 891     def _read(self, read, size):
 892         """Read data from the read callback and write it to the file."""
 893         data = super(PackStreamCopier, self)._read(read, size)
 894         self.outfile.write(data)
 895         return data
 896
 897     def verify(self):
 898         """Verify a pack stream and write it to the output file.
 899
 900         See PackStreamReader.iterobjects for a list of exceptions this may
 901         throw.
 902         """
 903         if self._delta_iter:
 904             for unpacked in self.read_objects():
 905                 self._delta_iter.record(unpacked)
 906         else:
 907             for _ in self.read_objects():
 908                 pass
 909
 910
 911 def obj_sha(type, chunks):
 912     """Compute the SHA for a numeric type and object chunks."""
 913     sha = make_sha()
 914     sha.update(object_header(type, chunks_length(chunks)))
 915     for chunk in chunks:
 916         sha.update(chunk)
 917     return sha.digest()
 918
 919
 920 def compute_file_sha(f, start_ofs=0, end_ofs=0, buffer_size=1<<16):
 921     """Hash a portion of a file into a new SHA.
 922
 923     :param f: A file-like object to read from that supports seek().
 924     :param start_ofs: The offset in the file to start reading at.
 925     :param end_ofs: The offset in the file to end reading at, relative to the
 926         end of the file.
 927     :param buffer_size: A buffer size for reading.
 928     :return: A new SHA object updated with data read from the file.
 929     """
 930     sha = make_sha()
 931     f.seek(0, SEEK_END)
 932     todo = f.tell() + end_ofs - start_ofs
 933     f.seek(start_ofs)
 934     while todo:
 935         data = f.read(min(todo, buffer_size))
 936         sha.update(data)
 937         todo -= len(data)
 938     return sha
 939
 940
 941 class PackData(object):
 942     """The data contained in a packfile.
 943
 944     Pack files can be accessed both sequentially for exploding a pack, and
 945     directly with the help of an index to retrieve a specific object.
 946
 947     The objects within are either complete or a delta aginst another.
 948
 949     The header is variable length. If the MSB of each byte is set then it
 950     indicates that the subsequent byte is still part of the header.
 951     For the first byte the next MS bits are the type, which tells you the type
 952     of object, and whether it is a delta. The LS byte is the lowest bits of the
 953     size. For each subsequent byte the LS 7 bits are the next MS bits of the
 954     size, i.e. the last byte of the header contains the MS bits of the size.
 955
 956     For the complete objects the data is stored as zlib deflated data.
 957     The size in the header is the uncompressed object size, so to uncompress
 958     you need to just keep feeding data to zlib until you get an object back,
 959     or it errors on bad data. This is done here by just giving the complete
 960     buffer from the start of the deflated object on. This is bad, but until I
 961     get mmap sorted out it will have to do.
 962
 963     Currently there are no integrity checks done. Also no attempt is made to
 964     try and detect the delta case, or a request for an object at the wrong
 965     position.  It will all just throw a zlib or KeyError.
 966     """
 967
 968     def __init__(self, filename, file=None, size=None):
 969         """Create a PackData object representing the pack in the given filename.
 970
 971         The file must exist and stay readable until the object is disposed of. It
 972         must also stay the same size. It will be mapped whenever needed.
 973
 974         Currently there is a restriction on the size of the pack as the python
 975         mmap implementation is flawed.
 976         """
 977         self._filename = filename
 978         self._size = size
 979         self._header_size = 12
 980         if file is None:
 981             self._file = GitFile(self._filename, 'rb')
 982         else:
 983             self._file = file
 984         (version, self._num_objects) = read_pack_header(self._file.read)
 985         self._offset_cache = LRUSizeCache(1024*1024*20,
 986             compute_size=_compute_object_size)
 987         self.pack = None
 988
 989     @classmethod
 990     def from_file(cls, file, size):
 991         return cls(str(file), file=file, size=size)
 992
 993     @classmethod
 994     def from_path(cls, path):
 995         return cls(filename=path)
 996
 997     def close(self):
 998         self._file.close()
 999
1000     def _get_size(self):
1001         if self._size is not None:
1002             return self._size
1003         self._size = os.path.getsize(self._filename)
1004         if self._size < self._header_size:
1005             errmsg = ('%s is too small for a packfile (%d < %d)' %
1006                       (self._filename, self._size, self._header_size))
1007             raise AssertionError(errmsg)
1008         return self._size
1009
1010     def __len__(self):
1011         """Returns the number of objects in this pack."""
1012         return self._num_objects
1013
1014     def calculate_checksum(self):
1015         """Calculate the checksum for this pack.
1016
1017         :return: 20-byte binary SHA1 digest
1018         """
1019         return compute_file_sha(self._file, end_ofs=-20).digest()
1020
1021     def get_ref(self, sha):
1022         """Get the object for a ref SHA, only looking in this pack."""
1023         # TODO: cache these results
1024         if self.pack is None:
1025             raise KeyError(sha)
1026         try:
1027             offset = self.pack.index.object_index(sha)
1028         except KeyError:
1029             offset = None
1030         if offset:
1031             type, obj = self.get_object_at(offset)
1032         elif self.pack is not None and self.pack.resolve_ext_ref:
1033             type, obj = self.pack.resolve_ext_ref(sha)
1034         else:
1035             raise KeyError(sha)
1036         return offset, type, obj
1037
1038     def resolve_object(self, offset, type, obj, get_ref=None):
1039         """Resolve an object, possibly resolving deltas when necessary.
1040
1041         :return: Tuple with object type and contents.
1042         """
1043         if type not in DELTA_TYPES:
1044             return type, obj
1045
1046         if get_ref is None:
1047             get_ref = self.get_ref
1048         if type == OFS_DELTA:
1049             (delta_offset, delta) = obj
1050             # TODO: clean up asserts and replace with nicer error messages
1051             assert isinstance(offset, int) or isinstance(offset, long)
1052             assert isinstance(delta_offset, int) or isinstance(offset, long)
1053             base_offset = offset-delta_offset
1054             type, base_obj = self.get_object_at(base_offset)
1055             assert isinstance(type, int)
1056         elif type == REF_DELTA:
1057             (basename, delta) = obj
1058             assert isinstance(basename, str) and len(basename) == 20
1059             base_offset, type, base_obj = get_ref(basename)
1060             assert isinstance(type, int)
1061         type, base_chunks = self.resolve_object(base_offset, type, base_obj)
1062         chunks = apply_delta(base_chunks, delta)
1063         # TODO(dborowitz): This can result in poor performance if large base
1064         # objects are separated from deltas in the pack. We should reorganize
1065         # so that we apply deltas to all objects in a chain one after the other
1066         # to optimize cache performance.
1067         if offset is not None:
1068             self._offset_cache[offset] = type, chunks
1069         return type, chunks
1070
1071     def iterobjects(self, progress=None, compute_crc32=True):
1072         self._file.seek(self._header_size)
1073         for i in xrange(1, self._num_objects + 1):
1074             offset = self._file.tell()
1075             unpacked, unused = unpack_object(
1076               self._file.read, compute_crc32=compute_crc32)
1077             if progress is not None:
1078                 progress(i, self._num_objects)
1079             yield (offset, unpacked.pack_type_num, unpacked._obj(),
1080                    unpacked.crc32)
1081             self._file.seek(-len(unused), SEEK_CUR)  # Back up over unused data.
1082
1083     def _iter_unpacked(self):
1084         # TODO(dborowitz): Merge this with iterobjects, if we can change its
1085         # return type.
1086         self._file.seek(self._header_size)
1087         for _ in xrange(self._num_objects):
1088             offset = self._file.tell()
1089             unpacked, unused = unpack_object(
1090               self._file.read, compute_crc32=False)
1091             unpacked.offset = offset
1092             yield unpacked
1093             self._file.seek(-len(unused), SEEK_CUR)  # Back up over unused data.
1094
1095     def iterentries(self, progress=None):
1096         """Yield entries summarizing the contents of this pack.
1097
1098         :param progress: Progress function, called with current and total
1099             object count.
1100         :return: iterator of tuples with (sha, offset, crc32)
1101         """
1102         num_objects = self._num_objects
1103         resolve_ext_ref = (
1104             self.pack.resolve_ext_ref if self.pack is not None else None)
1105         indexer = PackIndexer.for_pack_data(
1106             self, resolve_ext_ref=resolve_ext_ref)
1107         for i, result in enumerate(indexer):
1108             if progress is not None:
1109                 progress(i, num_objects)
1110             yield result
1111
1112     def sorted_entries(self, progress=None):
1113         """Return entries in this pack, sorted by SHA.
1114
1115         :param progress: Progress function, called with current and total
1116             object count
1117         :return: List of tuples with (sha, offset, crc32)
1118         """
1119         ret = list(self.iterentries(progress=progress))
1120         ret.sort()
1121         return ret
1122
1123     def create_index_v1(self, filename, progress=None):
1124         """Create a version 1 file for this data file.
1125
1126         :param filename: Index filename.
1127         :param progress: Progress report function
1128         :return: Checksum of index file
1129         """
1130         entries = self.sorted_entries(progress=progress)
1131         f = GitFile(filename, 'wb')
1132         try:
1133             return write_pack_index_v1(f, entries, self.calculate_checksum())
1134         finally:
1135             f.close()
1136
1137     def create_index_v2(self, filename, progress=None):
1138         """Create a version 2 index file for this data file.
1139
1140         :param filename: Index filename.
1141         :param progress: Progress report function
1142         :return: Checksum of index file
1143         """
1144         entries = self.sorted_entries(progress=progress)
1145         f = GitFile(filename, 'wb')
1146         try:
1147             return write_pack_index_v2(f, entries, self.calculate_checksum())
1148         finally:
1149             f.close()
1150
1151     def create_index(self, filename, progress=None,
1152                      version=2):
1153         """Create an  index file for this data file.
1154
1155         :param filename: Index filename.
1156         :param progress: Progress report function
1157         :return: Checksum of index file
1158         """
1159         if version == 1:
1160             return self.create_index_v1(filename, progress)
1161         elif version == 2:
1162             return self.create_index_v2(filename, progress)
1163         else:
1164             raise ValueError('unknown index format %d' % version)
1165
1166     def get_stored_checksum(self):
1167         """Return the expected checksum stored in this pack."""
1168         self._file.seek(-20, SEEK_END)
1169         return self._file.read(20)
1170
1171     def check(self):
1172         """Check the consistency of this pack."""
1173         actual = self.calculate_checksum()
1174         stored = self.get_stored_checksum()
1175         if actual != stored:
1176             raise ChecksumMismatch(stored, actual)
1177
1178     def get_object_at(self, offset):
1179         """Given an offset in to the packfile return the object that is there.
1180
1181         Using the associated index the location of an object can be looked up,
1182         and then the packfile can be asked directly for that object using this
1183         function.
1184         """
1185         if offset in self._offset_cache:
1186             return self._offset_cache[offset]
1187         assert isinstance(offset, long) or isinstance(offset, int),\
1188                 'offset was %r' % offset
1189         assert offset >= self._header_size
1190         self._file.seek(offset)
1191         unpacked, _ = unpack_object(self._file.read)
1192         return (unpacked.pack_type_num, unpacked._obj())
1193
1194
1195 class DeltaChainIterator(object):
1196     """Abstract iterator over pack data based on delta chains.
1197
1198     Each object in the pack is guaranteed to be inflated exactly once,
1199     regardless of how many objects reference it as a delta base. As a result,
1200     memory usage is proportional to the length of the longest delta chain.
1201
1202     Subclasses can override _result to define the result type of the iterator.
1203     By default, results are UnpackedObjects with the following members set:
1204
1205     * offset
1206     * obj_type_num
1207     * obj_chunks
1208     * pack_type_num
1209     * delta_base     (for delta types)
1210     * comp_chunks    (if _include_comp is True)
1211     * decomp_chunks
1212     * decomp_len
1213     * crc32          (if _compute_crc32 is True)
1214     """
1215
1216     _compute_crc32 = False
1217     _include_comp = False
1218
1219     def __init__(self, file_obj, resolve_ext_ref=None):
1220         self._file = file_obj
1221         self._resolve_ext_ref = resolve_ext_ref
1222         self._pending_ofs = defaultdict(list)
1223         self._pending_ref = defaultdict(list)
1224         self._full_ofs = []
1225         self._shas = {}
1226         self._ext_refs = []
1227
1228     @classmethod
1229     def for_pack_data(cls, pack_data, resolve_ext_ref=None):
1230         walker = cls(None, resolve_ext_ref=resolve_ext_ref)
1231         walker.set_pack_data(pack_data)
1232         for unpacked in pack_data._iter_unpacked():
1233             walker.record(unpacked)
1234         return walker
1235
1236     def record(self, unpacked):
1237         type_num = unpacked.pack_type_num
1238         offset = unpacked.offset
1239         if type_num == OFS_DELTA:
1240             base_offset = offset - unpacked.delta_base
1241             self._pending_ofs[base_offset].append(offset)
1242         elif type_num == REF_DELTA:
1243             self._pending_ref[unpacked.delta_base].append(offset)
1244         else:
1245             self._full_ofs.append((offset, type_num))
1246
1247     def set_pack_data(self, pack_data):
1248         self._file = pack_data._file
1249
1250     def _walk_all_chains(self):
1251         for offset, type_num in self._full_ofs:
1252             for result in self._follow_chain(offset, type_num, None):
1253                 yield result
1254         for result in self._walk_ref_chains():
1255             yield result
1256         assert not self._pending_ofs
1257
1258     def _ensure_no_pending(self):
1259         if self._pending_ref:
1260             raise KeyError([sha_to_hex(s) for s in self._pending_ref])
1261
1262     def _walk_ref_chains(self):
1263         if not self._resolve_ext_ref:
1264             self._ensure_no_pending()
1265             return
1266
1267         for base_sha, pending in sorted(self._pending_ref.iteritems()):
1268             try:
1269                 type_num, chunks = self._resolve_ext_ref(base_sha)
1270             except KeyError:
1271                 # Not an external ref, but may depend on one. Either it will get
1272                 # popped via a _follow_chain call, or we will raise an error
1273                 # below.
1274                 continue
1275             self._ext_refs.append(base_sha)
1276             self._pending_ref.pop(base_sha)
1277             for new_offset in pending:
1278                 for result in self._follow_chain(new_offset, type_num, chunks):
1279                     yield result
1280
1281         self._ensure_no_pending()
1282
1283     def _result(self, unpacked):
1284         return unpacked
1285
1286     def _resolve_object(self, offset, obj_type_num, base_chunks):
1287         self._file.seek(offset)
1288         unpacked, _ = unpack_object(
1289           self._file.read, include_comp=self._include_comp,
1290           compute_crc32=self._compute_crc32)
1291         unpacked.offset = offset
1292         if base_chunks is None:
1293             assert unpacked.pack_type_num == obj_type_num
1294         else:
1295             assert unpacked.pack_type_num in DELTA_TYPES
1296             unpacked.obj_type_num = obj_type_num
1297             unpacked.obj_chunks = apply_delta(base_chunks,
1298                                               unpacked.decomp_chunks)
1299         return unpacked
1300
1301     def _follow_chain(self, offset, obj_type_num, base_chunks):
1302         # Unlike PackData.get_object_at, there is no need to cache offsets as
1303         # this approach by design inflates each object exactly once.
1304         unpacked = self._resolve_object(offset, obj_type_num, base_chunks)
1305         yield self._result(unpacked)
1306
1307         pending = chain(self._pending_ofs.pop(unpacked.offset, []),
1308                         self._pending_ref.pop(unpacked.sha(), []))
1309         for new_offset in pending:
1310             for new_result in self._follow_chain(
1311               new_offset, unpacked.obj_type_num, unpacked.obj_chunks):
1312                 yield new_result
1313
1314     def __iter__(self):
1315         return self._walk_all_chains()
1316
1317     def ext_refs(self):
1318         return self._ext_refs
1319
1320
1321 class PackIndexer(DeltaChainIterator):
1322     """Delta chain iterator that yields index entries."""
1323
1324     _compute_crc32 = True
1325
1326     def _result(self, unpacked):
1327         return unpacked.sha(), unpacked.offset, unpacked.crc32
1328
1329
1330 class PackInflater(DeltaChainIterator):
1331     """Delta chain iterator that yields ShaFile objects."""
1332
1333     def _result(self, unpacked):
1334         return unpacked.sha_file()
1335
1336
1337 class SHA1Reader(object):
1338     """Wrapper around a file-like object that remembers the SHA1 of its data."""
1339
1340     def __init__(self, f):
1341         self.f = f
1342         self.sha1 = make_sha('')
1343
1344     def read(self, num=None):
1345         data = self.f.read(num)
1346         self.sha1.update(data)
1347         return data
1348
1349     def check_sha(self):
1350         stored = self.f.read(20)
1351         if stored != self.sha1.digest():
1352             raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
1353
1354     def close(self):
1355         return self.f.close()
1356
1357     def tell(self):
1358         return self.f.tell()
1359
1360
1361 class SHA1Writer(object):
1362     """Wrapper around a file-like object that remembers the SHA1 of its data."""
1363
1364     def __init__(self, f):
1365         self.f = f
1366         self.length = 0
1367         self.sha1 = make_sha('')
1368
1369     def write(self, data):
1370         self.sha1.update(data)
1371         self.f.write(data)
1372         self.length += len(data)
1373
1374     def write_sha(self):
1375         sha = self.sha1.digest()
1376         assert len(sha) == 20
1377         self.f.write(sha)
1378         self.length += len(sha)
1379         return sha
1380
1381     def close(self):
1382         sha = self.write_sha()
1383         self.f.close()
1384         return sha
1385
1386     def offset(self):
1387         return self.length
1388
1389     def tell(self):
1390         return self.f.tell()
1391
1392
1393 def pack_object_header(type_num, delta_base, size):
1394     """Create a pack object header for the given object info.
1395
1396     :param type_num: Numeric type of the object.
1397     :param delta_base: Delta base offset or ref, or None for whole objects.
1398     :param size: Uncompressed object size.
1399     :return: A header for a packed object.
1400     """
1401     header = ''
1402     c = (type_num << 4) | (size & 15)
1403     size >>= 4
1404     while size:
1405         header += (chr(c | 0x80))
1406         c = size & 0x7f
1407         size >>= 7
1408     header += chr(c)
1409     if type_num == OFS_DELTA:
1410         ret = [delta_base & 0x7f]
1411         delta_base >>= 7
1412         while delta_base:
1413             delta_base -= 1
1414             ret.insert(0, 0x80 | (delta_base & 0x7f))
1415             delta_base >>= 7
1416         header += ''.join([chr(x) for x in ret])
1417     elif type_num == REF_DELTA:
1418         assert len(delta_base) == 20
1419         header += delta_base
1420     return header
1421
1422
1423 def write_pack_object(f, type, object, sha=None):
1424     """Write pack object to a file.
1425
1426     :param f: File to write to
1427     :param type: Numeric type of the object
1428     :param object: Object to write
1429     :return: Tuple with offset at which the object was written, and crc32
1430     """
1431     if type in DELTA_TYPES:
1432         delta_base, object = object
1433     else:
1434         delta_base = None
1435     header = pack_object_header(type, delta_base, len(object))
1436     comp_data = zlib.compress(object)
1437     crc32 = 0
1438     for data in (header, comp_data):
1439         f.write(data)
1440         if sha is not None:
1441             sha.update(data)
1442         crc32 = binascii.crc32(data, crc32)
1443     return crc32 & 0xffffffff
1444
1445
1446 def write_pack(filename, objects, num_objects=None):
1447     """Write a new pack data file.
1448
1449     :param filename: Path to the new pack file (without .pack extension)
1450     :param objects: Iterable of (object, path) tuples to write.
1451         Should provide __len__
1452     :return: Tuple with checksum of pack file and index file
1453     """
1454     if num_objects is not None:
1455         warnings.warn('num_objects argument to write_pack is deprecated',
1456                       DeprecationWarning)
1457     f = GitFile(filename + '.pack', 'wb')
1458     try:
1459         entries, data_sum = write_pack_objects(f, objects,
1460             num_objects=num_objects)
1461     finally:
1462         f.close()
1463     entries = [(k, v[0], v[1]) for (k, v) in entries.iteritems()]
1464     entries.sort()
1465     f = GitFile(filename + '.idx', 'wb')
1466     try:
1467         return data_sum, write_pack_index_v2(f, entries, data_sum)
1468     finally:
1469         f.close()
1470
1471
1472 def write_pack_header(f, num_objects):
1473     """Write a pack header for the given number of objects."""
1474     f.write('PACK')                          # Pack header
1475     f.write(struct.pack('>L', 2))            # Pack version
1476     f.write(struct.pack('>L', num_objects))  # Number of objects in pack
1477
1478
1479 def deltify_pack_objects(objects, window=10):
1480     """Generate deltas for pack objects.
1481
1482     :param objects: Objects to deltify
1483     :param window: Window size
1484     :return: Iterator over type_num, object id, delta_base, content
1485         delta_base is None for full text entries
1486     """
1487     # Build a list of objects ordered by the magic Linus heuristic
1488     # This helps us find good objects to diff against us
1489     magic = []
1490     for obj, path in objects:
1491         magic.append((obj.type_num, path, -obj.raw_length(), obj))
1492     magic.sort()
1493
1494     possible_bases = deque()
1495
1496     for type_num, path, neg_length, o in magic:
1497         raw = o.as_raw_string()
1498         winner = raw
1499         winner_base = None
1500         for base in possible_bases:
1501             if base.type_num != type_num:
1502                 continue
1503             delta = create_delta(base.as_raw_string(), raw)
1504             if len(delta) < len(winner):
1505                 winner_base = base.sha().digest()
1506                 winner = delta
1507         yield type_num, o.sha().digest(), winner_base, winner
1508         possible_bases.appendleft(o)
1509         while len(possible_bases) > window:
1510             possible_bases.pop()
1511
1512
1513 def write_pack_objects(f, objects, window=10, num_objects=None):
1514     """Write a new pack data file.
1515
1516     :param f: File to write to
1517     :param objects: Iterable of (object, path) tuples to write.
1518         Should provide __len__
1519     :param window: Sliding window size for searching for deltas; currently
1520                    unimplemented
1521     :param num_objects: Number of objects (do not use, deprecated)
1522     :return: Dict mapping id -> (offset, crc32 checksum), pack checksum
1523     """
1524     if num_objects is None:
1525         num_objects = len(objects)
1526     # FIXME: pack_contents = deltify_pack_objects(objects, window)
1527     pack_contents = (
1528         (o.type_num, o.sha().digest(), None, o.as_raw_string())
1529         for (o, path) in objects)
1530     return write_pack_data(f, num_objects, pack_contents)
1531
1532
1533 def write_pack_data(f, num_records, records):
1534     """Write a new pack data file.
1535
1536     :param f: File to write to
1537     :param num_records: Number of records
1538     :param records: Iterator over type_num, object_id, delta_base, raw
1539     :return: Dict mapping id -> (offset, crc32 checksum), pack checksum
1540     """
1541     # Write the pack
1542     entries = {}
1543     f = SHA1Writer(f)
1544     write_pack_header(f, num_records)
1545     for type_num, object_id, delta_base, raw in records:
1546         if delta_base is not None:
1547             try:
1548                 base_offset, base_crc32 = entries[delta_base]
1549             except KeyError:
1550                 type_num = REF_DELTA
1551                 raw = (delta_base, raw)
1552             else:
1553                 type_num = OFS_DELTA
1554                 raw = (base_offset, raw)
1555         offset = f.offset()
1556         crc32 = write_pack_object(f, type_num, raw)
1557         entries[object_id] = (offset, crc32)
1558     return entries, f.write_sha()
1559
1560
1561 def write_pack_index_v1(f, entries, pack_checksum):
1562     """Write a new pack index file.
1563
1564     :param f: A file-like object to write to
1565     :param entries: List of tuples with object name (sha), offset_in_pack,
1566         and crc32_checksum.
1567     :param pack_checksum: Checksum of the pack file.
1568     :return: The SHA of the written index file
1569     """
1570     f = SHA1Writer(f)
1571     fan_out_table = defaultdict(lambda: 0)
1572     for (name, offset, entry_checksum) in entries:
1573         fan_out_table[ord(name[0])] += 1
1574     # Fan-out table
1575     for i in range(0x100):
1576         f.write(struct.pack('>L', fan_out_table[i]))
1577         fan_out_table[i+1] += fan_out_table[i]
1578     for (name, offset, entry_checksum) in entries:
1579         if not (offset <= 0xffffffff):
1580             raise TypeError("pack format 1 only supports offsets < 2Gb")
1581         f.write(struct.pack('>L20s', offset, name))
1582     assert len(pack_checksum) == 20
1583     f.write(pack_checksum)
1584     return f.write_sha()
1585
1586
1587 def create_delta(base_buf, target_buf):
1588     """Use python difflib to work out how to transform base_buf to target_buf.
1589
1590     :param base_buf: Base buffer
1591     :param target_buf: Target buffer
1592     """
1593     assert isinstance(base_buf, str)
1594     assert isinstance(target_buf, str)
1595     out_buf = ''
1596     # write delta header
1597     def encode_size(size):
1598         ret = ''
1599         c = size & 0x7f
1600         size >>= 7
1601         while size:
1602             ret += chr(c | 0x80)
1603             c = size & 0x7f
1604             size >>= 7
1605         ret += chr(c)
1606         return ret
1607     out_buf += encode_size(len(base_buf))
1608     out_buf += encode_size(len(target_buf))
1609     # write out delta opcodes
1610     seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
1611     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
1612         # Git patch opcodes don't care about deletes!
1613         #if opcode == 'replace' or opcode == 'delete':
1614         #    pass
1615         if opcode == 'equal':
1616             # If they are equal, unpacker will use data from base_buf
1617             # Write out an opcode that says what range to use
1618             scratch = ''
1619             op = 0x80
1620             o = i1
1621             for i in range(4):
1622                 if o & 0xff << i*8:
1623                     scratch += chr((o >> i*8) & 0xff)
1624                     op |= 1 << i
1625             s = i2 - i1
1626             for i in range(2):
1627                 if s & 0xff << i*8:
1628                     scratch += chr((s >> i*8) & 0xff)
1629                     op |= 1 << (4+i)
1630             out_buf += chr(op)
1631             out_buf += scratch
1632         if opcode == 'replace' or opcode == 'insert':
1633             # If we are replacing a range or adding one, then we just
1634             # output it to the stream (prefixed by its size)
1635             s = j2 - j1
1636             o = j1
1637             while s > 127:
1638                 out_buf += chr(127)
1639                 out_buf += target_buf[o:o+127]
1640                 s -= 127
1641                 o += 127
1642             out_buf += chr(s)
1643             out_buf += target_buf[o:o+s]
1644     return out_buf
1645
1646
1647 def apply_delta(src_buf, delta):
1648     """Based on the similar function in git's patch-delta.c.
1649
1650     :param src_buf: Source buffer
1651     :param delta: Delta instructions
1652     """
1653     if type(src_buf) != str:
1654         src_buf = ''.join(src_buf)
1655     if type(delta) != str:
1656         delta = ''.join(delta)
1657     out = []
1658     index = 0
1659     delta_length = len(delta)
1660     def get_delta_header_size(delta, index):
1661         size = 0
1662         i = 0
1663         while delta:
1664             cmd = ord(delta[index])
1665             index += 1
1666             size |= (cmd & ~0x80) << i
1667             i += 7
1668             if not cmd & 0x80:
1669                 break
1670         return size, index
1671     src_size, index = get_delta_header_size(delta, index)
1672     dest_size, index = get_delta_header_size(delta, index)
1673     assert src_size == len(src_buf), '%d vs %d' % (src_size, len(src_buf))
1674     while index < delta_length:
1675         cmd = ord(delta[index])
1676         index += 1
1677         if cmd & 0x80:
1678             cp_off = 0
1679             for i in range(4):
1680                 if cmd & (1 << i):
1681                     x = ord(delta[index])
1682                     index += 1
1683                     cp_off |= x << (i * 8)
1684             cp_size = 0
1685             for i in range(3):
1686                 if cmd & (1 << (4+i)):
1687                     x = ord(delta[index])
1688                     index += 1
1689                     cp_size |= x << (i * 8)
1690             if cp_size == 0:
1691                 cp_size = 0x10000
1692             if (cp_off + cp_size < cp_size or
1693                 cp_off + cp_size > src_size or
1694                 cp_size > dest_size):
1695                 break
1696             out.append(src_buf[cp_off:cp_off+cp_size])
1697         elif cmd != 0:
1698             out.append(delta[index:index+cmd])
1699             index += cmd
1700         else:
1701             raise ApplyDeltaError('Invalid opcode 0')
1702
1703     if index != delta_length:
1704         raise ApplyDeltaError('delta not empty: %r' % delta[index:])
1705
1706     if dest_size != chunks_length(out):
1707         raise ApplyDeltaError('dest size incorrect')
1708
1709     return out
1710
1711
1712 def write_pack_index_v2(f, entries, pack_checksum):
1713     """Write a new pack index file.
1714
1715     :param f: File-like object to write to
1716     :param entries: List of tuples with object name (sha), offset_in_pack, and
1717         crc32_checksum.
1718     :param pack_checksum: Checksum of the pack file.
1719     :return: The SHA of the index file written
1720     """
1721     f = SHA1Writer(f)
1722     f.write('\377tOc') # Magic!
1723     f.write(struct.pack('>L', 2))
1724     fan_out_table = defaultdict(lambda: 0)
1725     for (name, offset, entry_checksum) in entries:
1726         fan_out_table[ord(name[0])] += 1
1727     # Fan-out table
1728     largetable = []
1729     for i in range(0x100):
1730         f.write(struct.pack('>L', fan_out_table[i]))
1731         fan_out_table[i+1] += fan_out_table[i]
1732     for (name, offset, entry_checksum) in entries:
1733         f.write(name)
1734     for (name, offset, entry_checksum) in entries:
1735         f.write(struct.pack('>L', entry_checksum))
1736     for (name, offset, entry_checksum) in entries:
1737         if offset < 2**31:
1738             f.write(struct.pack('>L', offset))
1739         else:
1740             f.write(struct.pack('>L', 2**31 + len(largetable)))
1741             largetable.append(offset)
1742     for offset in largetable:
1743         f.write(struct.pack('>Q', offset))
1744     assert len(pack_checksum) == 20
1745     f.write(pack_checksum)
1746     return f.write_sha()
1747
1748
1749 class Pack(object):
1750     """A Git pack object."""
1751
1752     def __init__(self, basename, resolve_ext_ref=None):
1753         self._basename = basename
1754         self._data = None
1755         self._idx = None
1756         self._idx_path = self._basename + '.idx'
1757         self._data_path = self._basename + '.pack'
1758         self._data_load = lambda: PackData(self._data_path)
1759         self._idx_load = lambda: load_pack_index(self._idx_path)
1760         self.resolve_ext_ref = resolve_ext_ref
1761
1762     @classmethod
1763     def from_lazy_objects(self, data_fn, idx_fn):
1764         """Create a new pack object from callables to load pack data and
1765         index objects."""
1766         ret = Pack('')
1767         ret._data_load = data_fn
1768         ret._idx_load = idx_fn
1769         return ret
1770
1771     @classmethod
1772     def from_objects(self, data, idx):
1773         """Create a new pack object from pack data and index objects."""
1774         ret = Pack('')
1775         ret._data_load = lambda: data
1776         ret._idx_load = lambda: idx
1777         return ret
1778
1779     def name(self):
1780         """The SHA over the SHAs of the objects in this pack."""
1781         return self.index.objects_sha1()
1782
1783     @property
1784     def data(self):
1785         """The pack data object being used."""
1786         if self._data is None:
1787             self._data = self._data_load()
1788             self._data.pack = self
1789             self.check_length_and_checksum()
1790         return self._data
1791
1792     @property
1793     def index(self):
1794         """The index being used.
1795
1796         :note: This may be an in-memory index
1797         """
1798         if self._idx is None:
1799             self._idx = self._idx_load()
1800         return self._idx
1801
1802     def close(self):
1803         if self._data is not None:
1804             self._data.close()
1805         self.index.close()
1806
1807     def __eq__(self, other):
1808         return type(self) == type(other) and self.index == other.index
1809
1810     def __len__(self):
1811         """Number of entries in this pack."""
1812         return len(self.index)
1813
1814     def __repr__(self):
1815         return '%s(%r)' % (self.__class__.__name__, self._basename)
1816
1817     def __iter__(self):
1818         """Iterate over all the sha1s of the objects in this pack."""
1819         return iter(self.index)
1820
1821     def check_length_and_checksum(self):
1822         """Sanity check the length and checksum of the pack index and data."""
1823         assert len(self.index) == len(self.data)
1824         idx_stored_checksum = self.index.get_pack_checksum()
1825         data_stored_checksum = self.data.get_stored_checksum()
1826         if idx_stored_checksum != data_stored_checksum:
1827             raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
1828                                    sha_to_hex(data_stored_checksum))
1829
1830     def check(self):
1831         """Check the integrity of this pack.
1832
1833         :raise ChecksumMismatch: if a checksum for the index or data is wrong
1834         """
1835         self.index.check()
1836         self.data.check()
1837         for obj in self.iterobjects():
1838             obj.check()
1839         # TODO: object connectivity checks
1840
1841     def get_stored_checksum(self):
1842         return self.data.get_stored_checksum()
1843
1844     def __contains__(self, sha1):
1845         """Check whether this pack contains a particular SHA1."""
1846         try:
1847             self.index.object_index(sha1)
1848             return True
1849         except KeyError:
1850             return False
1851
1852     def get_raw(self, sha1):
1853         offset = self.index.object_index(sha1)
1854         obj_type, obj = self.data.get_object_at(offset)
1855         type_num, chunks = self.data.resolve_object(offset, obj_type, obj)
1856         return type_num, ''.join(chunks)
1857
1858     def __getitem__(self, sha1):
1859         """Retrieve the specified SHA1."""
1860         type, uncomp = self.get_raw(sha1)
1861         return ShaFile.from_raw_string(type, uncomp)
1862
1863     def iterobjects(self):
1864         """Iterate over the objects in this pack."""
1865         return iter(PackInflater.for_pack_data(
1866             self.data, resolve_ext_ref=self.resolve_ext_ref))
1867
1868     def pack_tuples(self):
1869         """Provide an iterable for use with write_pack_objects.
1870
1871         :return: Object that can iterate over (object, path) tuples
1872             and provides __len__
1873         """
1874         class PackTupleIterable(object):
1875
1876             def __init__(self, pack):
1877                 self.pack = pack
1878
1879             def __len__(self):
1880                 return len(self.pack)
1881
1882             def __iter__(self):
1883                 return ((o, None) for o in self.pack.iterobjects())
1884
1885         return PackTupleIterable(self)
1886
1887     def keep(self, msg=None):
1888         """Add a .keep file for the pack, preventing git from garbage collecting it.
1889
1890         :param msg: A message written inside the .keep file; can be used later to
1891                     determine whether or not a .keep file is obsolete.
1892         :return: The path of the .keep file, as a string.
1893         """
1894         keepfile_name = '%s.keep' % self._basename
1895         keepfile = GitFile(keepfile_name, 'wb')
1896         try:
1897             if msg:
1898                 keepfile.write(msg)
1899                 keepfile.write('\n')
1900         finally:
1901             keepfile.close()
1902         return keepfile_name
1903
1904
1905 try:
1906     from dulwich._pack import apply_delta, bisect_find_sha
1907 except ImportError:
1908     pass