dulwich/pack.py

   1 # pack.py -- For dealing wih packed git objects.
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copryight (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20 """Classes for dealing with packed git objects.
  21
  22 A pack is a compact representation of a bunch of objects, stored
  23 using deltas where possible.
  24
  25 They have two parts, the pack file, which stores the data, and an index
  26 that tells you where the data is.
  27
  28 To find an object you look in all of the index files 'til you find a
  29 match for the object name. You then use the pointer got from this as
  30 a pointer in to the corresponding packfile.
  31 """
  32
  33 try:
  34     from collections import defaultdict
  35 except ImportError:
  36     from misc import defaultdict
  37
  38 from cStringIO import (
  39     StringIO,
  40     )
  41 from collections import (
  42     deque,
  43     )
  44 import difflib
  45 from itertools import (
  46     chain,
  47     imap,
  48     izip,
  49     )
  50 import mmap
  51 import os
  52 import struct
  53 try:
  54     from struct import unpack_from
  55 except ImportError:
  56     from dulwich.misc import unpack_from
  57 import sys
  58 import zlib
  59
  60 from dulwich.errors import (
  61     ApplyDeltaError,
  62     ChecksumMismatch,
  63     )
  64 from dulwich.file import GitFile
  65 from dulwich.lru_cache import (
  66     LRUSizeCache,
  67     )
  68 from dulwich.objects import (
  69     ShaFile,
  70     hex_to_sha,
  71     sha_to_hex,
  72     )
  73 from dulwich.misc import (
  74     make_sha,
  75     )
  76
  77 supports_mmap_offset = (sys.version_info[0] >= 3 or
  78         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
  79
  80
  81 def take_msb_bytes(read):
  82     """Read bytes marked with most significant bit.
  83
  84     :param read: Read function
  85     """
  86     ret = []
  87     while len(ret) == 0 or ret[-1] & 0x80:
  88         ret.append(ord(read(1)))
  89     return ret
  90
  91
  92 def read_zlib_chunks(read_some, dec_size, buffer_size=4096):
  93     """Read zlib data from a buffer.
  94
  95     This function requires that the buffer have additional data following the
  96     compressed data, which is guaranteed to be the case for git pack files.
  97
  98     :param read_some: Read function that returns at least one byte, but may
  99         return less than the requested size
 100     :param dec_size: Expected size of the decompressed buffer
 101     :param buffer_size: Size of the read buffer
 102     :return: Tuple with list of chunks, length of compressed data length and
 103         and unused read data.
 104     :raise zlib.error: if a decompression error occurred.
 105     """
 106     if dec_size <= -1:
 107         raise ValueError("non-negative zlib data stream size expected")
 108     obj = zlib.decompressobj()
 109     ret = []
 110     fed = 0
 111     size = 0
 112     while obj.unused_data == "":
 113         add = read_some(buffer_size)
 114         if not add:
 115             raise zlib.error("EOF before end of zlib stream")
 116         fed += len(add)
 117         decomp = obj.decompress(add)
 118         size += len(decomp)
 119         ret.append(decomp)
 120     if size != dec_size:
 121         raise zlib.error("decompressed data does not match expected size")
 122     comp_len = fed - len(obj.unused_data)
 123     return ret, comp_len, obj.unused_data
 124
 125 def iter_sha1(iter):
 126     """Return the hexdigest of the SHA1 over a set of names.
 127
 128     :param iter: Iterator over string objects
 129     :return: 40-byte hex sha1 digest
 130     """
 131     sha1 = make_sha()
 132     for name in iter:
 133         sha1.update(name)
 134     return sha1.hexdigest()
 135
 136
 137 def load_pack_index(path):
 138     """Load an index file by path.
 139
 140     :param filename: Path to the index file
 141     :return: A PackIndex loaded from the given path
 142     """
 143     f = GitFile(path, 'rb')
 144     try:
 145         return load_pack_index_file(path, f)
 146     finally:
 147         f.close()
 148
 149
 150 def _load_file_contents(f, size=None):
 151     fileno = getattr(f, 'fileno', None)
 152     # Attempt to use mmap if possible
 153     if fileno is not None:
 154         fd = f.fileno()
 155         if size is None:
 156             size = os.fstat(fd).st_size
 157         try:
 158             contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
 159         except mmap.error:
 160             # Perhaps a socket?
 161             pass
 162         else:
 163             return contents, size
 164     contents = f.read()
 165     size = len(contents)
 166     return contents, size
 167
 168
 169 def load_pack_index_file(path, f):
 170     """Load an index file from a file-like object.
 171
 172     :param path: Path for the index file
 173     :param f: File-like object
 174     :return: A PackIndex loaded from the given file
 175     """
 176     contents, size = _load_file_contents(f)
 177     if contents[:4] == '\377tOc':
 178         version = struct.unpack(">L", contents[4:8])[0]
 179         if version == 2:
 180             return PackIndex2(path, file=f, contents=contents,
 181                 size=size)
 182         else:
 183             raise KeyError("Unknown pack index format %d" % version)
 184     else:
 185         return PackIndex1(path, file=f, contents=contents, size=size)
 186
 187
 188 def bisect_find_sha(start, end, sha, unpack_name):
 189     """Find a SHA in a data blob with sorted SHAs.
 190
 191     :param start: Start index of range to search
 192     :param end: End index of range to search
 193     :param sha: Sha to find
 194     :param unpack_name: Callback to retrieve SHA by index
 195     :return: Index of the SHA, or None if it wasn't found
 196     """
 197     assert start <= end
 198     while start <= end:
 199         i = (start + end)/2
 200         file_sha = unpack_name(i)
 201         x = cmp(file_sha, sha)
 202         if x < 0:
 203             start = i + 1
 204         elif x > 0:
 205             end = i - 1
 206         else:
 207             return i
 208     return None
 209
 210
 211 class PackIndex(object):
 212     """An index in to a packfile.
 213
 214     Given a sha id of an object a pack index can tell you the location in the
 215     packfile of that object if it has it.
 216
 217     To do the loop it opens the file, and indexes first 256 4 byte groups
 218     with the first byte of the sha id. The value in the four byte group indexed
 219     is the end of the group that shares the same starting byte. Subtract one
 220     from the starting byte and index again to find the start of the group.
 221     The values are sorted by sha id within the group, so do the math to find
 222     the start and end offset and then bisect in to find if the value is present.
 223     """
 224
 225     def __init__(self, filename, file=None, contents=None, size=None):
 226         """Create a pack index object.
 227
 228         Provide it with the name of the index file to consider, and it will map
 229         it whenever required.
 230         """
 231         self._filename = filename
 232         # Take the size now, so it can be checked each time we map the file to
 233         # ensure that it hasn't changed.
 234         if file is None:
 235             self._file = GitFile(filename, 'rb')
 236         else:
 237             self._file = file
 238         if contents is None:
 239             self._contents, self._size = _load_file_contents(file, size)
 240         else:
 241             self._contents, self._size = (contents, size)
 242
 243     def __eq__(self, other):
 244         if not isinstance(other, PackIndex):
 245             return False
 246
 247         if self._fan_out_table != other._fan_out_table:
 248             return False
 249
 250         for (name1, _, _), (name2, _, _) in izip(self.iterentries(),
 251                                                  other.iterentries()):
 252             if name1 != name2:
 253                 return False
 254         return True
 255
 256     def __ne__(self, other):
 257         return not self.__eq__(other)
 258
 259     def close(self):
 260         self._file.close()
 261
 262     def __len__(self):
 263         """Return the number of entries in this pack index."""
 264         return self._fan_out_table[-1]
 265
 266     def _unpack_entry(self, i):
 267         """Unpack the i-th entry in the index file.
 268
 269         :return: Tuple with object name (SHA), offset in pack file and CRC32
 270             checksum (if known)."""
 271         raise NotImplementedError(self._unpack_entry)
 272
 273     def _unpack_name(self, i):
 274         """Unpack the i-th name from the index file."""
 275         raise NotImplementedError(self._unpack_name)
 276
 277     def _unpack_offset(self, i):
 278         """Unpack the i-th object offset from the index file."""
 279         raise NotImplementedError(self._unpack_offset)
 280
 281     def _unpack_crc32_checksum(self, i):
 282         """Unpack the crc32 checksum for the i-th object from the index file."""
 283         raise NotImplementedError(self._unpack_crc32_checksum)
 284
 285     def __iter__(self):
 286         """Iterate over the SHAs in this pack."""
 287         return imap(sha_to_hex, self._itersha())
 288
 289     def _itersha(self):
 290         for i in range(len(self)):
 291             yield self._unpack_name(i)
 292
 293     def objects_sha1(self):
 294         """Return the hex SHA1 over all the shas of all objects in this pack.
 295
 296         :note: This is used for the filename of the pack.
 297         """
 298         return iter_sha1(self._itersha())
 299
 300     def iterentries(self):
 301         """Iterate over the entries in this pack index.
 302
 303         Will yield tuples with object name, offset in packfile and crc32
 304         checksum.
 305         """
 306         for i in range(len(self)):
 307             yield self._unpack_entry(i)
 308
 309     def _read_fan_out_table(self, start_offset):
 310         ret = []
 311         for i in range(0x100):
 312             fanout_entry = self._contents[start_offset+i*4:start_offset+(i+1)*4]
 313             ret.append(struct.unpack(">L", fanout_entry)[0])
 314         return ret
 315
 316     def check(self):
 317         """Check that the stored checksum matches the actual checksum."""
 318         # TODO: Check pack contents, too
 319         return self.calculate_checksum() == self.get_stored_checksum()
 320
 321     def calculate_checksum(self):
 322         """Calculate the SHA1 checksum over this pack index.
 323
 324         :return: This is a 20-byte binary digest
 325         """
 326         return make_sha(self._contents[:-20]).digest()
 327
 328     def get_pack_checksum(self):
 329         """Return the SHA1 checksum stored for the corresponding packfile.
 330
 331         :return: 20-byte binary digest
 332         """
 333         return str(self._contents[-40:-20])
 334
 335     def get_stored_checksum(self):
 336         """Return the SHA1 checksum stored for this index.
 337
 338         :return: 20-byte binary digest
 339         """
 340         return str(self._contents[-20:])
 341
 342     def object_index(self, sha):
 343         """Return the index in to the corresponding packfile for the object.
 344
 345         Given the name of an object it will return the offset that object
 346         lives at within the corresponding pack file. If the pack file doesn't
 347         have the object then None will be returned.
 348         """
 349         if len(sha) == 40:
 350             sha = hex_to_sha(sha)
 351         return self._object_index(sha)
 352
 353     def _object_index(self, sha):
 354         """See object_index.
 355
 356         :param sha: A *binary* SHA string. (20 characters long)_
 357         """
 358         assert len(sha) == 20
 359         idx = ord(sha[0])
 360         if idx == 0:
 361             start = 0
 362         else:
 363             start = self._fan_out_table[idx-1]
 364         end = self._fan_out_table[idx]
 365         i = bisect_find_sha(start, end, sha, self._unpack_name)
 366         if i is None:
 367             raise KeyError(sha)
 368         return self._unpack_offset(i)
 369
 370
 371 class PackIndex1(PackIndex):
 372     """Version 1 Pack Index."""
 373
 374     def __init__(self, filename, file=None, contents=None, size=None):
 375         PackIndex.__init__(self, filename, file, contents, size)
 376         self.version = 1
 377         self._fan_out_table = self._read_fan_out_table(0)
 378
 379     def _unpack_entry(self, i):
 380         (offset, name) = unpack_from(">L20s", self._contents,
 381                                      (0x100 * 4) + (i * 24))
 382         return (name, offset, None)
 383
 384     def _unpack_name(self, i):
 385         offset = (0x100 * 4) + (i * 24) + 4
 386         return self._contents[offset:offset+20]
 387
 388     def _unpack_offset(self, i):
 389         offset = (0x100 * 4) + (i * 24)
 390         return unpack_from(">L", self._contents, offset)[0]
 391
 392     def _unpack_crc32_checksum(self, i):
 393         # Not stored in v1 index files
 394         return None
 395
 396
 397 class PackIndex2(PackIndex):
 398     """Version 2 Pack Index."""
 399
 400     def __init__(self, filename, file=None, contents=None, size=None):
 401         PackIndex.__init__(self, filename, file, contents, size)
 402         assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
 403         (self.version, ) = unpack_from(">L", self._contents, 4)
 404         assert self.version == 2, "Version was %d" % self.version
 405         self._fan_out_table = self._read_fan_out_table(8)
 406         self._name_table_offset = 8 + 0x100 * 4
 407         self._crc32_table_offset = self._name_table_offset + 20 * len(self)
 408         self._pack_offset_table_offset = (self._crc32_table_offset +
 409                                           4 * len(self))
 410
 411     def _unpack_entry(self, i):
 412         return (self._unpack_name(i), self._unpack_offset(i),
 413                 self._unpack_crc32_checksum(i))
 414
 415     def _unpack_name(self, i):
 416         offset = self._name_table_offset + i * 20
 417         return self._contents[offset:offset+20]
 418
 419     def _unpack_offset(self, i):
 420         offset = self._pack_offset_table_offset + i * 4
 421         return unpack_from(">L", self._contents, offset)[0]
 422
 423     def _unpack_crc32_checksum(self, i):
 424         return unpack_from(">L", self._contents,
 425                           self._crc32_table_offset + i * 4)[0]
 426
 427
 428 def read_pack_header(read):
 429     """Read the header of a pack file.
 430
 431     :param read: Read function
 432     :return: Tuple with pack version and number of objects
 433     """
 434     header = read(12)
 435     assert header[:4] == "PACK"
 436     (version,) = unpack_from(">L", header, 4)
 437     assert version in (2, 3), "Version was %d" % version
 438     (num_objects,) = unpack_from(">L", header, 8)
 439     return (version, num_objects)
 440
 441
 442 def chunks_length(chunks):
 443     return sum(imap(len, chunks))
 444
 445
 446 def unpack_object(read_all, read_some=None):
 447     """Unpack a Git object.
 448
 449     :param read_all: Read function that blocks until the number of requested
 450         bytes are read.
 451     :param read_some: Read function that returns at least one byte, but may not
 452         return the number of bytes requested.
 453     :return: tuple with type, uncompressed data, compressed size and tail data.
 454     """
 455     if read_some is None:
 456         read_some = read_all
 457     bytes = take_msb_bytes(read_all)
 458     type = (bytes[0] >> 4) & 0x07
 459     size = bytes[0] & 0x0f
 460     for i, byte in enumerate(bytes[1:]):
 461         size += (byte & 0x7f) << ((i * 7) + 4)
 462     raw_base = len(bytes)
 463     if type == 6: # offset delta
 464         bytes = take_msb_bytes(read_all)
 465         raw_base += len(bytes)
 466         assert not (bytes[-1] & 0x80)
 467         delta_base_offset = bytes[0] & 0x7f
 468         for byte in bytes[1:]:
 469             delta_base_offset += 1
 470             delta_base_offset <<= 7
 471             delta_base_offset += (byte & 0x7f)
 472         uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
 473         assert size == chunks_length(uncomp)
 474         return type, (delta_base_offset, uncomp), comp_len+raw_base, unused
 475     elif type == 7: # ref delta
 476         basename = read_all(20)
 477         raw_base += 20
 478         uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
 479         assert size == chunks_length(uncomp)
 480         return type, (basename, uncomp), comp_len+raw_base, unused
 481     else:
 482         uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
 483         assert chunks_length(uncomp) == size
 484         return type, uncomp, comp_len+raw_base, unused
 485
 486
 487 def _compute_object_size((num, obj)):
 488     """Compute the size of a unresolved object for use with LRUSizeCache."""
 489     if num in (6, 7):
 490         return chunks_length(obj[1])
 491     return chunks_length(obj)
 492
 493
 494 class PackStreamReader(object):
 495     """Class to read a pack stream.
 496
 497     The pack is read from a ReceivableProtocol using read() or recv() as
 498     appropriate.
 499     """
 500
 501     def __init__(self, read_all, read_some=None):
 502         self.read_all = read_all
 503         if read_some is None:
 504             self.read_some = read_all
 505         else:
 506             self.read_some = read_some
 507         self.sha = make_sha()
 508         self._rbuf = StringIO()
 509         # trailer is a deque to avoid memory allocation on small reads
 510         self._trailer = deque()
 511
 512     def _read(self, read, size):
 513         """Read up to size bytes using the given callback.
 514
 515         As a side effect, update the verifier's hash (excluding the last 20
 516         bytes read) and write through to the output file.
 517
 518         :param read: The read callback to read from.
 519         :param size: The maximum number of bytes to read; the particular
 520             behavior is callback-specific.
 521         """
 522         data = read(size)
 523
 524         # maintain a trailer of the last 20 bytes we've read
 525         n = len(data)
 526         tn = len(self._trailer)
 527         if n >= 20:
 528             to_pop = tn
 529             to_add = 20
 530         else:
 531             to_pop = max(n + tn - 20, 0)
 532             to_add = n
 533         for _ in xrange(to_pop):
 534             self.sha.update(self._trailer.popleft())
 535         self._trailer.extend(data[-to_add:])
 536
 537         # hash everything but the trailer
 538         self.sha.update(data[:-to_add])
 539         return data
 540
 541     def _buf_len(self):
 542         buf = self._rbuf
 543         start = buf.tell()
 544         buf.seek(0, os.SEEK_END)
 545         end = buf.tell()
 546         buf.seek(start)
 547         return end - start
 548
 549     def read(self, size):
 550         """Read, blocking until size bytes are read."""
 551         buf_len = self._buf_len()
 552         if buf_len >= size:
 553             return self._rbuf.read(size)
 554         buf_data = self._rbuf.read()
 555         self._rbuf = StringIO()
 556         return buf_data + self._read(self.read_all, size - buf_len)
 557
 558     def recv(self, size):
 559         """Read up to size bytes, blocking until one byte is read."""
 560         buf_len = self._buf_len()
 561         if buf_len:
 562             data = self._rbuf.read(size)
 563             if size >= buf_len:
 564                 self._rbuf = StringIO()
 565             return data
 566         return self._read(self.read_some, size)
 567
 568     def read_objects(self):
 569         """Read the objects in this pack file.
 570
 571         :raise AssertionError: if there is an error in the pack format.
 572         :raise ChecksumMismatch: if the checksum of the pack contents does not
 573             match the checksum in the pack trailer.
 574         :raise zlib.error: if an error occurred during zlib decompression.
 575         :raise IOError: if an error occurred writing to the output file.
 576         """
 577         pack_version, num_objects = read_pack_header(self.read)
 578         for i in xrange(num_objects):
 579             type, uncomp, comp_len, unused = unpack_object(self.read, self.recv)
 580             yield type, uncomp, comp_len
 581
 582             # prepend any unused data to current read buffer
 583             buf = StringIO()
 584             buf.write(unused)
 585             buf.write(self._rbuf.read())
 586             buf.seek(0)
 587             self._rbuf = buf
 588
 589         pack_sha = sha_to_hex(''.join([c for c in self._trailer]))
 590         calculated_sha = self.sha.hexdigest()
 591         if pack_sha != calculated_sha:
 592             raise ChecksumMismatch(pack_sha, calculated_sha)
 593
 594
 595 class PackData(object):
 596     """The data contained in a packfile.
 597
 598     Pack files can be accessed both sequentially for exploding a pack, and
 599     directly with the help of an index to retrieve a specific object.
 600
 601     The objects within are either complete or a delta aginst another.
 602
 603     The header is variable length. If the MSB of each byte is set then it
 604     indicates that the subsequent byte is still part of the header.
 605     For the first byte the next MS bits are the type, which tells you the type
 606     of object, and whether it is a delta. The LS byte is the lowest bits of the
 607     size. For each subsequent byte the LS 7 bits are the next MS bits of the
 608     size, i.e. the last byte of the header contains the MS bits of the size.
 609
 610     For the complete objects the data is stored as zlib deflated data.
 611     The size in the header is the uncompressed object size, so to uncompress
 612     you need to just keep feeding data to zlib until you get an object back,
 613     or it errors on bad data. This is done here by just giving the complete
 614     buffer from the start of the deflated object on. This is bad, but until I
 615     get mmap sorted out it will have to do.
 616
 617     Currently there are no integrity checks done. Also no attempt is made to
 618     try and detect the delta case, or a request for an object at the wrong
 619     position.  It will all just throw a zlib or KeyError.
 620     """
 621
 622     def __init__(self, filename, file=None, size=None):
 623         """Create a PackData object representing the pack in the given filename.
 624
 625         The file must exist and stay readable until the object is disposed of.
 626         It must also stay the same size. It will be mapped whenever needed.
 627
 628         Currently there is a restriction on the size of the pack as the python
 629         mmap implementation is flawed.
 630         """
 631         self._filename = filename
 632         self._size = size
 633         self._header_size = 12
 634         if file is None:
 635             self._file = GitFile(self._filename, 'rb')
 636         else:
 637             self._file = file
 638         (version, self._num_objects) = read_pack_header(self._file.read)
 639         self._offset_cache = LRUSizeCache(1024*1024*20,
 640             compute_size=_compute_object_size)
 641
 642     @classmethod
 643     def from_file(cls, file, size):
 644         return cls(str(file), file=file, size=size)
 645
 646     @classmethod
 647     def from_path(cls, path):
 648         return cls(filename=path)
 649
 650     def close(self):
 651         self._file.close()
 652
 653     def __del__(self):
 654         self.close()
 655
 656     def _get_size(self):
 657         if self._size is not None:
 658             return self._size
 659         self._size = os.path.getsize(self._filename)
 660         if self._size < self._header_size:
 661             errmsg = ("%s is too small for a packfile (%d < %d)" %
 662                       (self._filename, self._size, self._header_size))
 663             raise AssertionError(errmsg)
 664         return self._size
 665
 666     def __len__(self):
 667         """Returns the number of objects in this pack."""
 668         return self._num_objects
 669
 670     def calculate_checksum(self):
 671         """Calculate the checksum for this pack.
 672
 673         :return: 20-byte binary SHA1 digest
 674         """
 675         s = make_sha()
 676         self._file.seek(0)
 677         todo = self._get_size() - 20
 678         while todo > 0:
 679             x = self._file.read(min(todo, 1<<16))
 680             s.update(x)
 681             todo -= len(x)
 682         return s.digest()
 683
 684     def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
 685         """Resolve an object, possibly resolving deltas when necessary.
 686
 687         :return: Tuple with object type and contents.
 688         """
 689         if type not in (6, 7): # Not a delta
 690             return type, obj
 691
 692         if get_offset is None:
 693             get_offset = self.get_object_at
 694
 695         if type == 6: # offset delta
 696             (delta_offset, delta) = obj
 697             assert isinstance(delta_offset, int)
 698             base_offset = offset-delta_offset
 699             type, base_obj = get_offset(base_offset)
 700             assert isinstance(type, int)
 701         elif type == 7: # ref delta
 702             (basename, delta) = obj
 703             assert isinstance(basename, str) and len(basename) == 20
 704             type, base_obj = get_ref(basename)
 705             assert isinstance(type, int)
 706             # Can't be a ofs delta, as we wouldn't know the base offset
 707             assert type != 6
 708             base_offset = None
 709         type, base_chunks = self.resolve_object(base_offset, type, base_obj,
 710                                                 get_ref)
 711         if base_offset is not None:
 712             self._offset_cache[base_offset] = type, base_chunks
 713         return (type, apply_delta(base_chunks, delta))
 714
 715     def iterobjects(self, progress=None):
 716
 717         class ObjectIterator(object):
 718
 719             def __init__(self, pack):
 720                 self.i = 0
 721                 self.offset = pack._header_size
 722                 self.num = len(pack)
 723                 self.map = pack._file
 724
 725             def __iter__(self):
 726                 return self
 727
 728             def __len__(self):
 729                 return self.num
 730
 731             def next(self):
 732                 if self.i == self.num:
 733                     raise StopIteration
 734                 self.map.seek(self.offset)
 735                 (type, obj, total_size, unused) = unpack_object(self.map.read)
 736                 self.map.seek(self.offset)
 737                 crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff
 738                 ret = (self.offset, type, obj, crc32)
 739                 self.offset += total_size
 740                 if progress:
 741                     progress(self.i, self.num)
 742                 self.i+=1
 743                 return ret
 744
 745         return ObjectIterator(self)
 746
 747     def iterentries(self, ext_resolve_ref=None, progress=None):
 748         """Yield entries summarizing the contents of this pack.
 749
 750         :param ext_resolve_ref: Optional function to resolve base
 751             objects (in case this is a thin pack)
 752         :param progress: Progress function, called with current and
 753             total object count.
 754
 755         This will yield tuples with (sha, offset, crc32)
 756         """
 757         found = {}
 758         postponed = defaultdict(list)
 759         class Postpone(Exception):
 760             """Raised to postpone delta resolving."""
 761
 762         def get_ref_text(sha):
 763             assert len(sha) == 20
 764             if sha in found:
 765                 return self.get_object_at(found[sha])
 766             if ext_resolve_ref:
 767                 try:
 768                     return ext_resolve_ref(sha)
 769                 except KeyError:
 770                     pass
 771             raise Postpone, (sha, )
 772         extra = []
 773         todo = chain(self.iterobjects(progress=progress), extra)
 774         for (offset, type, obj, crc32) in todo:
 775             assert isinstance(offset, int)
 776             assert isinstance(type, int)
 777             try:
 778                 type, obj = self.resolve_object(offset, type, obj,
 779                     get_ref_text)
 780             except Postpone, (sha, ):
 781                 postponed[sha].append((offset, type, obj))
 782             else:
 783                 shafile = ShaFile.from_raw_chunks(type, obj)
 784                 sha = shafile.sha().digest()
 785                 found[sha] = offset
 786                 yield sha, offset, crc32
 787                 extra.extend(postponed.get(sha, []))
 788         if postponed:
 789             raise KeyError([sha_to_hex(h) for h in postponed.keys()])
 790
 791     def sorted_entries(self, resolve_ext_ref=None, progress=None):
 792         """Return entries in this pack, sorted by SHA.
 793
 794         :param resolve_ext_ref: Optional function to resolve base
 795             objects (in case this is a thin pack)
 796         :param progress: Progress function, called with current and
 797             total object count
 798         :return: List of tuples with (sha, offset, crc32)
 799         """
 800         ret = list(self.iterentries(resolve_ext_ref, progress=progress))
 801         ret.sort()
 802         return ret
 803
 804     def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
 805         """Create a version 1 file for this data file.
 806
 807         :param filename: Index filename.
 808         :param resolve_ext_ref: Function to use for resolving externally
 809             referenced SHA1s (for thin packs)
 810         :param progress: Progress report function
 811         """
 812         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 813         write_pack_index_v1(filename, entries, self.calculate_checksum())
 814
 815     def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
 816         """Create a version 2 index file for this data file.
 817
 818         :param filename: Index filename.
 819         :param resolve_ext_ref: Function to use for resolving externally
 820             referenced SHA1s (for thin packs)
 821         :param progress: Progress report function
 822         """
 823         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 824         write_pack_index_v2(filename, entries, self.calculate_checksum())
 825
 826     def create_index(self, filename, resolve_ext_ref=None, progress=None,
 827                      version=2):
 828         """Create an  index file for this data file.
 829
 830         :param filename: Index filename.
 831         :param resolve_ext_ref: Function to use for resolving externally
 832             referenced SHA1s (for thin packs)
 833         :param progress: Progress report function
 834         """
 835         if version == 1:
 836             self.create_index_v1(filename, resolve_ext_ref, progress)
 837         elif version == 2:
 838             self.create_index_v2(filename, resolve_ext_ref, progress)
 839         else:
 840             raise ValueError("unknown index format %d" % version)
 841
 842     def get_stored_checksum(self):
 843         """Return the expected checksum stored in this pack."""
 844         self._file.seek(self._get_size()-20)
 845         return self._file.read(20)
 846
 847     def check(self):
 848         """Check the consistency of this pack."""
 849         return (self.calculate_checksum() == self.get_stored_checksum())
 850
 851     def get_object_at(self, offset):
 852         """Given an offset in to the packfile return the object that is there.
 853
 854         Using the associated index the location of an object can be looked up,
 855         and then the packfile can be asked directly for that object using this
 856         function.
 857         """
 858         if offset in self._offset_cache:
 859             return self._offset_cache[offset]
 860         assert isinstance(offset, long) or isinstance(offset, int),\
 861                 "offset was %r" % offset
 862         assert offset >= self._header_size
 863         self._file.seek(offset)
 864         return unpack_object(self._file.read)[:2]
 865
 866
 867 class SHA1Reader(object):
 868     """Wrapper around a file-like object that remembers the SHA1 of its data."""
 869
 870     def __init__(self, f):
 871         self.f = f
 872         self.sha1 = make_sha("")
 873
 874     def read(self, num=None):
 875         data = self.f.read(num)
 876         self.sha1.update(data)
 877         return data
 878
 879     def check_sha(self):
 880         stored = self.f.read(20)
 881         if stored != self.sha1.digest():
 882             raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
 883
 884     def close(self):
 885         return self.f.close()
 886
 887     def tell(self):
 888         return self.f.tell()
 889
 890
 891 class SHA1Writer(object):
 892     """Wrapper around a file-like object that remembers the SHA1 of its data."""
 893
 894     def __init__(self, f):
 895         self.f = f
 896         self.sha1 = make_sha("")
 897
 898     def write(self, data):
 899         self.sha1.update(data)
 900         self.f.write(data)
 901
 902     def write_sha(self):
 903         sha = self.sha1.digest()
 904         assert len(sha) == 20
 905         self.f.write(sha)
 906         return sha
 907
 908     def close(self):
 909         sha = self.write_sha()
 910         self.f.close()
 911         return sha
 912
 913     def tell(self):
 914         return self.f.tell()
 915
 916
 917 def write_pack_object(f, type, object):
 918     """Write pack object to a file.
 919
 920     :param f: File to write to
 921     :param type: Numeric type of the object
 922     :param object: Object to write
 923     :return: Tuple with offset at which the object was written, and crc32
 924     """
 925     offset = f.tell()
 926     packed_data_hdr = ""
 927     if type == 6: # offset delta
 928         (delta_base_offset, object) = object
 929     elif type == 7: # ref delta
 930         (basename, object) = object
 931     size = len(object)
 932     c = (type << 4) | (size & 15)
 933     size >>= 4
 934     while size:
 935         packed_data_hdr += (chr(c | 0x80))
 936         c = size & 0x7f
 937         size >>= 7
 938     packed_data_hdr += chr(c)
 939     if type == 6: # offset delta
 940         ret = [delta_base_offset & 0x7f]
 941         delta_base_offset >>= 7
 942         while delta_base_offset:
 943             delta_base_offset -= 1
 944             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
 945             delta_base_offset >>= 7
 946         packed_data_hdr += "".join([chr(x) for x in ret])
 947     elif type == 7: # ref delta
 948         assert len(basename) == 20
 949         packed_data_hdr += basename
 950     packed_data = packed_data_hdr + zlib.compress(object)
 951     f.write(packed_data)
 952     return (offset, (zlib.crc32(packed_data) & 0xffffffff))
 953
 954
 955 def write_pack(filename, objects, num_objects):
 956     """Write a new pack data file.
 957
 958     :param filename: Path to the new pack file (without .pack extension)
 959     :param objects: Iterable over (object, path) tuples to write
 960     :param num_objects: Number of objects to write
 961     """
 962     f = GitFile(filename + ".pack", 'wb')
 963     try:
 964         entries, data_sum = write_pack_data(f, objects, num_objects)
 965     finally:
 966         f.close()
 967     entries.sort()
 968     write_pack_index_v2(filename + ".idx", entries, data_sum)
 969
 970
 971 def write_pack_data(f, objects, num_objects, window=10):
 972     """Write a new pack file.
 973
 974     :param filename: The filename of the new pack file.
 975     :param objects: List of objects to write (tuples with object and path)
 976     :return: List with (name, offset, crc32 checksum) entries, pack checksum
 977     """
 978     recency = list(objects)
 979     # FIXME: Somehow limit delta depth
 980     # FIXME: Make thin-pack optional (its not used when cloning a pack)
 981     # Build a list of objects ordered by the magic Linus heuristic
 982     # This helps us find good objects to diff against us
 983     magic = []
 984     for obj, path in recency:
 985         magic.append( (obj.type_num, path, 1, -obj.raw_length(), obj) )
 986     magic.sort()
 987     # Build a map of objects and their index in magic - so we can find
 988     # preceeding objects to diff against
 989     offs = {}
 990     for i in range(len(magic)):
 991         offs[magic[i][4]] = i
 992     # Write the pack
 993     entries = []
 994     f = SHA1Writer(f)
 995     f.write("PACK")               # Pack header
 996     f.write(struct.pack(">L", 2)) # Pack version
 997     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
 998     for o, path in recency:
 999         sha1 = o.sha().digest()
1000         orig_t = o.type_num
1001         raw = o.as_raw_string()
1002         winner = raw
1003         t = orig_t
1004         #for i in range(offs[o]-window, window):
1005         #    if i < 0 or i >= len(offs): continue
1006         #    b = magic[i][4]
1007         #    if b.type_num != orig_t: continue
1008         #    base = b.as_raw_string()
1009         #    delta = create_delta(base, raw)
1010         #    if len(delta) < len(winner):
1011         #        winner = delta
1012         #        t = 6 if magic[i][2] == 1 else 7
1013         offset, crc32 = write_pack_object(f, t, winner)
1014         entries.append((sha1, offset, crc32))
1015     return entries, f.write_sha()
1016
1017
1018 def write_pack_index_v1(filename, entries, pack_checksum):
1019     """Write a new pack index file.
1020
1021     :param filename: The filename of the new pack index file.
1022     :param entries: List of tuples with object name (sha), offset_in_pack,
1023         and crc32_checksum.
1024     :param pack_checksum: Checksum of the pack file.
1025     """
1026     f = GitFile(filename, 'wb')
1027     try:
1028         f = SHA1Writer(f)
1029         fan_out_table = defaultdict(lambda: 0)
1030         for (name, offset, entry_checksum) in entries:
1031             fan_out_table[ord(name[0])] += 1
1032         # Fan-out table
1033         for i in range(0x100):
1034             f.write(struct.pack(">L", fan_out_table[i]))
1035             fan_out_table[i+1] += fan_out_table[i]
1036         for (name, offset, entry_checksum) in entries:
1037             f.write(struct.pack(">L20s", offset, name))
1038         assert len(pack_checksum) == 20
1039         f.write(pack_checksum)
1040     finally:
1041         f.close()
1042
1043
1044 def create_delta(base_buf, target_buf):
1045     """Use python difflib to work out how to transform base_buf to target_buf.
1046
1047     :param base_buf: Base buffer
1048     :param target_buf: Target buffer
1049     """
1050     assert isinstance(base_buf, str)
1051     assert isinstance(target_buf, str)
1052     out_buf = ""
1053     # write delta header
1054     def encode_size(size):
1055         ret = ""
1056         c = size & 0x7f
1057         size >>= 7
1058         while size:
1059             ret += chr(c | 0x80)
1060             c = size & 0x7f
1061             size >>= 7
1062         ret += chr(c)
1063         return ret
1064     out_buf += encode_size(len(base_buf))
1065     out_buf += encode_size(len(target_buf))
1066     # write out delta opcodes
1067     seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
1068     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
1069         # Git patch opcodes don't care about deletes!
1070         #if opcode == "replace" or opcode == "delete":
1071         #    pass
1072         if opcode == "equal":
1073             # If they are equal, unpacker will use data from base_buf
1074             # Write out an opcode that says what range to use
1075             scratch = ""
1076             op = 0x80
1077             o = i1
1078             for i in range(4):
1079                 if o & 0xff << i*8:
1080                     scratch += chr((o >> i*8) & 0xff)
1081                     op |= 1 << i
1082             s = i2 - i1
1083             for i in range(2):
1084                 if s & 0xff << i*8:
1085                     scratch += chr((s >> i*8) & 0xff)
1086                     op |= 1 << (4+i)
1087             out_buf += chr(op)
1088             out_buf += scratch
1089         if opcode == "replace" or opcode == "insert":
1090             # If we are replacing a range or adding one, then we just
1091             # output it to the stream (prefixed by its size)
1092             s = j2 - j1
1093             o = j1
1094             while s > 127:
1095                 out_buf += chr(127)
1096                 out_buf += target_buf[o:o+127]
1097                 s -= 127
1098                 o += 127
1099             out_buf += chr(s)
1100             out_buf += target_buf[o:o+s]
1101     return out_buf
1102
1103
1104 def apply_delta(src_buf, delta):
1105     """Based on the similar function in git's patch-delta.c.
1106
1107     :param src_buf: Source buffer
1108     :param delta: Delta instructions
1109     """
1110     if type(src_buf) != str:
1111         src_buf = "".join(src_buf)
1112     if type(delta) != str:
1113         delta = "".join(delta)
1114     out = []
1115     index = 0
1116     delta_length = len(delta)
1117     def get_delta_header_size(delta, index):
1118         size = 0
1119         i = 0
1120         while delta:
1121             cmd = ord(delta[index])
1122             index += 1
1123             size |= (cmd & ~0x80) << i
1124             i += 7
1125             if not cmd & 0x80:
1126                 break
1127         return size, index
1128     src_size, index = get_delta_header_size(delta, index)
1129     dest_size, index = get_delta_header_size(delta, index)
1130     assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
1131     while index < delta_length:
1132         cmd = ord(delta[index])
1133         index += 1
1134         if cmd & 0x80:
1135             cp_off = 0
1136             for i in range(4):
1137                 if cmd & (1 << i):
1138                     x = ord(delta[index])
1139                     index += 1
1140                     cp_off |= x << (i * 8)
1141             cp_size = 0
1142             for i in range(3):
1143                 if cmd & (1 << (4+i)):
1144                     x = ord(delta[index])
1145                     index += 1
1146                     cp_size |= x << (i * 8)
1147             if cp_size == 0:
1148                 cp_size = 0x10000
1149             if (cp_off + cp_size < cp_size or
1150                 cp_off + cp_size > src_size or
1151                 cp_size > dest_size):
1152                 break
1153             out.append(src_buf[cp_off:cp_off+cp_size])
1154         elif cmd != 0:
1155             out.append(delta[index:index+cmd])
1156             index += cmd
1157         else:
1158             raise ApplyDeltaError("Invalid opcode 0")
1159
1160     if index != delta_length:
1161         raise ApplyDeltaError("delta not empty: %r" % delta[index:])
1162
1163     if dest_size != chunks_length(out):
1164         raise ApplyDeltaError("dest size incorrect")
1165
1166     return out
1167
1168
1169 def write_pack_index_v2(filename, entries, pack_checksum):
1170     """Write a new pack index file.
1171
1172     :param filename: The filename of the new pack index file.
1173     :param entries: List of tuples with object name (sha), offset_in_pack, and
1174         crc32_checksum.
1175     :param pack_checksum: Checksum of the pack file.
1176     """
1177     f = GitFile(filename, 'wb')
1178     try:
1179         f = SHA1Writer(f)
1180         f.write('\377tOc') # Magic!
1181         f.write(struct.pack(">L", 2))
1182         fan_out_table = defaultdict(lambda: 0)
1183         for (name, offset, entry_checksum) in entries:
1184             fan_out_table[ord(name[0])] += 1
1185         # Fan-out table
1186         for i in range(0x100):
1187             f.write(struct.pack(">L", fan_out_table[i]))
1188             fan_out_table[i+1] += fan_out_table[i]
1189         for (name, offset, entry_checksum) in entries:
1190             f.write(name)
1191         for (name, offset, entry_checksum) in entries:
1192             f.write(struct.pack(">L", entry_checksum))
1193         for (name, offset, entry_checksum) in entries:
1194             # FIXME: handle if MSBit is set in offset
1195             f.write(struct.pack(">L", offset))
1196         # FIXME: handle table for pack files > 8 Gb
1197         assert len(pack_checksum) == 20
1198         f.write(pack_checksum)
1199     finally:
1200         f.close()
1201
1202
1203 class Pack(object):
1204     """A Git pack object."""
1205
1206     def __init__(self, basename):
1207         self._basename = basename
1208         self._data_path = self._basename + ".pack"
1209         self._idx_path = self._basename + ".idx"
1210         self._data = None
1211         self._idx = None
1212
1213     @classmethod
1214     def from_objects(self, data, idx):
1215         """Create a new pack object from pack data and index objects."""
1216         ret = Pack("")
1217         ret._data = data
1218         ret._idx = idx
1219         return ret
1220
1221     def name(self):
1222         """The SHA over the SHAs of the objects in this pack."""
1223         return self.index.objects_sha1()
1224
1225     @property
1226     def data(self):
1227         """The pack data object being used."""
1228         if self._data is None:
1229             self._data = PackData(self._data_path)
1230             assert len(self.index) == len(self._data)
1231             idx_stored_checksum = self.index.get_pack_checksum()
1232             data_stored_checksum = self._data.get_stored_checksum()
1233             if idx_stored_checksum != data_stored_checksum:
1234                 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
1235                                        sha_to_hex(data_stored_checksum))
1236         return self._data
1237
1238     @property
1239     def index(self):
1240         """The index being used.
1241
1242         :note: This may be an in-memory index
1243         """
1244         if self._idx is None:
1245             self._idx = load_pack_index(self._idx_path)
1246         return self._idx
1247
1248     def close(self):
1249         if self._data is not None:
1250             self._data.close()
1251         self.index.close()
1252
1253     def __eq__(self, other):
1254         return type(self) == type(other) and self.index == other.index
1255
1256     def __len__(self):
1257         """Number of entries in this pack."""
1258         return len(self.index)
1259
1260     def __repr__(self):
1261         return "%s(%r)" % (self.__class__.__name__, self._basename)
1262
1263     def __iter__(self):
1264         """Iterate over all the sha1s of the objects in this pack."""
1265         return iter(self.index)
1266
1267     def check(self):
1268         """Check the integrity of this pack."""
1269         if not self.index.check():
1270             return False
1271         if not self.data.check():
1272             return False
1273         return True
1274
1275     def get_stored_checksum(self):
1276         return self.data.get_stored_checksum()
1277
1278     def __contains__(self, sha1):
1279         """Check whether this pack contains a particular SHA1."""
1280         try:
1281             self.index.object_index(sha1)
1282             return True
1283         except KeyError:
1284             return False
1285
1286     def get_raw(self, sha1, resolve_ref=None):
1287         offset = self.index.object_index(sha1)
1288         obj_type, obj = self.data.get_object_at(offset)
1289         if type(offset) is long:
1290           offset = int(offset)
1291         if resolve_ref is None:
1292             resolve_ref = self.get_raw
1293         kind, chunks = self.data.resolve_object(offset, obj_type, obj,
1294             resolve_ref)
1295         return kind, "".join(chunks)
1296
1297     def __getitem__(self, sha1):
1298         """Retrieve the specified SHA1."""
1299         type, uncomp = self.get_raw(sha1)
1300         return ShaFile.from_raw_string(type, uncomp)
1301
1302     def iterobjects(self, get_raw=None):
1303         """Iterate over the objects in this pack."""
1304         if get_raw is None:
1305             get_raw = self.get_raw
1306         for offset, type, obj, crc32 in self.data.iterobjects():
1307             assert isinstance(offset, int)
1308             type, obj = self.data.resolve_object(offset, type, obj, get_raw)
1309             yield ShaFile.from_raw_chunks(type, obj)
1310
1311
1312 try:
1313     from dulwich._pack import apply_delta, bisect_find_sha
1314 except ImportError:
1315     pass