dulwich/pack.py

   1 # pack.py -- For dealing with packed git objects.
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20 """Classes for dealing with packed git objects.
  21
  22 A pack is a compact representation of a bunch of objects, stored
  23 using deltas where possible.
  24
  25 They have two parts, the pack file, which stores the data, and an index
  26 that tells you where the data is.
  27
  28 To find an object you look in all of the index files 'til you find a
  29 match for the object name. You then use the pointer got from this as
  30 a pointer in to the corresponding packfile.
  31 """
  32
  33 try:
  34     from collections import defaultdict
  35 except ImportError:
  36     from misc import defaultdict
  37
  38 from cStringIO import (
  39     StringIO,
  40     )
  41 from collections import (
  42     deque,
  43     )
  44 import difflib
  45 from itertools import (
  46     chain,
  47     imap,
  48     izip,
  49     )
  50 import mmap
  51 import os
  52 import struct
  53 try:
  54     from struct import unpack_from
  55 except ImportError:
  56     from dulwich.misc import unpack_from
  57 import sys
  58 import zlib
  59
  60 from dulwich.errors import (
  61     ApplyDeltaError,
  62     ChecksumMismatch,
  63     )
  64 from dulwich.file import GitFile
  65 from dulwich.lru_cache import (
  66     LRUSizeCache,
  67     )
  68 from dulwich.misc import (
  69     make_sha,
  70     SEEK_END,
  71     )
  72 from dulwich.objects import (
  73     ShaFile,
  74     hex_to_sha,
  75     sha_to_hex,
  76     object_header,
  77     )
  78
  79 supports_mmap_offset = (sys.version_info[0] >= 3 or
  80         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
  81
  82
  83 OFS_DELTA = 6
  84 REF_DELTA = 7
  85
  86 DELTA_TYPES = (OFS_DELTA, REF_DELTA)
  87
  88
  89 def take_msb_bytes(read):
  90     """Read bytes marked with most significant bit.
  91
  92     :param read: Read function
  93     """
  94     ret = []
  95     while len(ret) == 0 or ret[-1] & 0x80:
  96         ret.append(ord(read(1)))
  97     return ret
  98
  99
 100 def read_zlib_chunks(read_some, dec_size, buffer_size=4096):
 101     """Read zlib data from a buffer.
 102
 103     This function requires that the buffer have additional data following the
 104     compressed data, which is guaranteed to be the case for git pack files.
 105
 106     :param read_some: Read function that returns at least one byte, but may
 107         return less than the requested size
 108     :param dec_size: Expected size of the decompressed buffer
 109     :param buffer_size: Size of the read buffer
 110     :return: Tuple with list of chunks, length of compressed data length and
 111         and unused read data.
 112     :raise zlib.error: if a decompression error occurred.
 113     """
 114     if dec_size <= -1:
 115         raise ValueError("non-negative zlib data stream size expected")
 116     obj = zlib.decompressobj()
 117     ret = []
 118     fed = 0
 119     size = 0
 120     while obj.unused_data == "":
 121         add = read_some(buffer_size)
 122         if not add:
 123             raise zlib.error("EOF before end of zlib stream")
 124         fed += len(add)
 125         decomp = obj.decompress(add)
 126         size += len(decomp)
 127         ret.append(decomp)
 128     if size != dec_size:
 129         raise zlib.error("decompressed data does not match expected size")
 130     comp_len = fed - len(obj.unused_data)
 131     return ret, comp_len, obj.unused_data
 132
 133
 134 def iter_sha1(iter):
 135     """Return the hexdigest of the SHA1 over a set of names.
 136
 137     :param iter: Iterator over string objects
 138     :return: 40-byte hex sha1 digest
 139     """
 140     sha1 = make_sha()
 141     for name in iter:
 142         sha1.update(name)
 143     return sha1.hexdigest()
 144
 145
 146 def load_pack_index(path):
 147     """Load an index file by path.
 148
 149     :param filename: Path to the index file
 150     :return: A PackIndex loaded from the given path
 151     """
 152     f = GitFile(path, 'rb')
 153     try:
 154         return load_pack_index_file(path, f)
 155     finally:
 156         f.close()
 157
 158
 159 def _load_file_contents(f, size=None):
 160     fileno = getattr(f, 'fileno', None)
 161     # Attempt to use mmap if possible
 162     if fileno is not None:
 163         fd = f.fileno()
 164         if size is None:
 165             size = os.fstat(fd).st_size
 166         try:
 167             contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
 168         except mmap.error:
 169             # Perhaps a socket?
 170             pass
 171         else:
 172             return contents, size
 173     contents = f.read()
 174     size = len(contents)
 175     return contents, size
 176
 177
 178 def load_pack_index_file(path, f):
 179     """Load an index file from a file-like object.
 180
 181     :param path: Path for the index file
 182     :param f: File-like object
 183     :return: A PackIndex loaded from the given file
 184     """
 185     contents, size = _load_file_contents(f)
 186     if contents[:4] == '\377tOc':
 187         version = struct.unpack(">L", contents[4:8])[0]
 188         if version == 2:
 189             return PackIndex2(path, file=f, contents=contents,
 190                 size=size)
 191         else:
 192             raise KeyError("Unknown pack index format %d" % version)
 193     else:
 194         return PackIndex1(path, file=f, contents=contents, size=size)
 195
 196
 197 def bisect_find_sha(start, end, sha, unpack_name):
 198     """Find a SHA in a data blob with sorted SHAs.
 199
 200     :param start: Start index of range to search
 201     :param end: End index of range to search
 202     :param sha: Sha to find
 203     :param unpack_name: Callback to retrieve SHA by index
 204     :return: Index of the SHA, or None if it wasn't found
 205     """
 206     assert start <= end
 207     while start <= end:
 208         i = (start + end)/2
 209         file_sha = unpack_name(i)
 210         x = cmp(file_sha, sha)
 211         if x < 0:
 212             start = i + 1
 213         elif x > 0:
 214             end = i - 1
 215         else:
 216             return i
 217     return None
 218
 219
 220 class PackIndex(object):
 221     """An index in to a packfile.
 222
 223     Given a sha id of an object a pack index can tell you the location in the
 224     packfile of that object if it has it.
 225
 226     To do the loop it opens the file, and indexes first 256 4 byte groups
 227     with the first byte of the sha id. The value in the four byte group indexed
 228     is the end of the group that shares the same starting byte. Subtract one
 229     from the starting byte and index again to find the start of the group.
 230     The values are sorted by sha id within the group, so do the math to find
 231     the start and end offset and then bisect in to find if the value is present.
 232     """
 233
 234     def __init__(self, filename, file=None, contents=None, size=None):
 235         """Create a pack index object.
 236
 237         Provide it with the name of the index file to consider, and it will map
 238         it whenever required.
 239         """
 240         self._filename = filename
 241         # Take the size now, so it can be checked each time we map the file to
 242         # ensure that it hasn't changed.
 243         if file is None:
 244             self._file = GitFile(filename, 'rb')
 245         else:
 246             self._file = file
 247         if contents is None:
 248             self._contents, self._size = _load_file_contents(file, size)
 249         else:
 250             self._contents, self._size = (contents, size)
 251
 252     def __eq__(self, other):
 253         if not isinstance(other, PackIndex):
 254             return False
 255
 256         if self._fan_out_table != other._fan_out_table:
 257             return False
 258
 259         for (name1, _, _), (name2, _, _) in izip(self.iterentries(),
 260                                                  other.iterentries()):
 261             if name1 != name2:
 262                 return False
 263         return True
 264
 265     def __ne__(self, other):
 266         return not self.__eq__(other)
 267
 268     def close(self):
 269         self._file.close()
 270
 271     def __len__(self):
 272         """Return the number of entries in this pack index."""
 273         return self._fan_out_table[-1]
 274
 275     def _unpack_entry(self, i):
 276         """Unpack the i-th entry in the index file.
 277
 278         :return: Tuple with object name (SHA), offset in pack file and CRC32
 279             checksum (if known).
 280         """
 281         raise NotImplementedError(self._unpack_entry)
 282
 283     def _unpack_name(self, i):
 284         """Unpack the i-th name from the index file."""
 285         raise NotImplementedError(self._unpack_name)
 286
 287     def _unpack_offset(self, i):
 288         """Unpack the i-th object offset from the index file."""
 289         raise NotImplementedError(self._unpack_offset)
 290
 291     def _unpack_crc32_checksum(self, i):
 292         """Unpack the crc32 checksum for the i-th object from the index file."""
 293         raise NotImplementedError(self._unpack_crc32_checksum)
 294
 295     def __iter__(self):
 296         """Iterate over the SHAs in this pack."""
 297         return imap(sha_to_hex, self._itersha())
 298
 299     def _itersha(self):
 300         for i in range(len(self)):
 301             yield self._unpack_name(i)
 302
 303     def objects_sha1(self):
 304         """Return the hex SHA1 over all the shas of all objects in this pack.
 305
 306         :note: This is used for the filename of the pack.
 307         """
 308         return iter_sha1(self._itersha())
 309
 310     def iterentries(self):
 311         """Iterate over the entries in this pack index.
 312
 313         :yields: tuples with object name, offset in packfile and crc32 checksum.
 314         """
 315         for i in range(len(self)):
 316             yield self._unpack_entry(i)
 317
 318     def _read_fan_out_table(self, start_offset):
 319         ret = []
 320         for i in range(0x100):
 321             fanout_entry = self._contents[start_offset+i*4:start_offset+(i+1)*4]
 322             ret.append(struct.unpack(">L", fanout_entry)[0])
 323         return ret
 324
 325     def check(self):
 326         """Check that the stored checksum matches the actual checksum."""
 327         actual = self.calculate_checksum()
 328         stored = self.get_stored_checksum()
 329         if actual != stored:
 330             raise ChecksumMismatch(stored, actual)
 331
 332     def calculate_checksum(self):
 333         """Calculate the SHA1 checksum over this pack index.
 334
 335         :return: This is a 20-byte binary digest
 336         """
 337         return make_sha(self._contents[:-20]).digest()
 338
 339     def get_pack_checksum(self):
 340         """Return the SHA1 checksum stored for the corresponding packfile.
 341
 342         :return: 20-byte binary digest
 343         """
 344         return str(self._contents[-40:-20])
 345
 346     def get_stored_checksum(self):
 347         """Return the SHA1 checksum stored for this index.
 348
 349         :return: 20-byte binary digest
 350         """
 351         return str(self._contents[-20:])
 352
 353     def object_index(self, sha):
 354         """Return the index in to the corresponding packfile for the object.
 355
 356         Given the name of an object it will return the offset that object
 357         lives at within the corresponding pack file. If the pack file doesn't
 358         have the object then None will be returned.
 359         """
 360         if len(sha) == 40:
 361             sha = hex_to_sha(sha)
 362         return self._object_index(sha)
 363
 364     def _object_index(self, sha):
 365         """See object_index.
 366
 367         :param sha: A *binary* SHA string. (20 characters long)_
 368         """
 369         assert len(sha) == 20
 370         idx = ord(sha[0])
 371         if idx == 0:
 372             start = 0
 373         else:
 374             start = self._fan_out_table[idx-1]
 375         end = self._fan_out_table[idx]
 376         i = bisect_find_sha(start, end, sha, self._unpack_name)
 377         if i is None:
 378             raise KeyError(sha)
 379         return self._unpack_offset(i)
 380
 381
 382 class PackIndex1(PackIndex):
 383     """Version 1 Pack Index."""
 384
 385     def __init__(self, filename, file=None, contents=None, size=None):
 386         PackIndex.__init__(self, filename, file, contents, size)
 387         self.version = 1
 388         self._fan_out_table = self._read_fan_out_table(0)
 389
 390     def _unpack_entry(self, i):
 391         (offset, name) = unpack_from(">L20s", self._contents,
 392                                      (0x100 * 4) + (i * 24))
 393         return (name, offset, None)
 394
 395     def _unpack_name(self, i):
 396         offset = (0x100 * 4) + (i * 24) + 4
 397         return self._contents[offset:offset+20]
 398
 399     def _unpack_offset(self, i):
 400         offset = (0x100 * 4) + (i * 24)
 401         return unpack_from(">L", self._contents, offset)[0]
 402
 403     def _unpack_crc32_checksum(self, i):
 404         # Not stored in v1 index files
 405         return None
 406
 407
 408 class PackIndex2(PackIndex):
 409     """Version 2 Pack Index."""
 410
 411     def __init__(self, filename, file=None, contents=None, size=None):
 412         PackIndex.__init__(self, filename, file, contents, size)
 413         assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
 414         (self.version, ) = unpack_from(">L", self._contents, 4)
 415         assert self.version == 2, "Version was %d" % self.version
 416         self._fan_out_table = self._read_fan_out_table(8)
 417         self._name_table_offset = 8 + 0x100 * 4
 418         self._crc32_table_offset = self._name_table_offset + 20 * len(self)
 419         self._pack_offset_table_offset = (self._crc32_table_offset +
 420                                           4 * len(self))
 421
 422     def _unpack_entry(self, i):
 423         return (self._unpack_name(i), self._unpack_offset(i),
 424                 self._unpack_crc32_checksum(i))
 425
 426     def _unpack_name(self, i):
 427         offset = self._name_table_offset + i * 20
 428         return self._contents[offset:offset+20]
 429
 430     def _unpack_offset(self, i):
 431         offset = self._pack_offset_table_offset + i * 4
 432         return unpack_from(">L", self._contents, offset)[0]
 433
 434     def _unpack_crc32_checksum(self, i):
 435         return unpack_from(">L", self._contents,
 436                           self._crc32_table_offset + i * 4)[0]
 437
 438
 439 def read_pack_header(read):
 440     """Read the header of a pack file.
 441
 442     :param read: Read function
 443     :return: Tuple with pack version and number of objects
 444     """
 445     header = read(12)
 446     assert header[:4] == "PACK"
 447     (version,) = unpack_from(">L", header, 4)
 448     assert version in (2, 3), "Version was %d" % version
 449     (num_objects,) = unpack_from(">L", header, 8)
 450     return (version, num_objects)
 451
 452
 453 def chunks_length(chunks):
 454     return sum(imap(len, chunks))
 455
 456
 457 def unpack_object(read_all, read_some=None):
 458     """Unpack a Git object.
 459
 460     :param read_all: Read function that blocks until the number of requested
 461         bytes are read.
 462     :param read_some: Read function that returns at least one byte, but may not
 463         return the number of bytes requested.
 464     :return: tuple with type, uncompressed data, compressed size and tail data.
 465     """
 466     if read_some is None:
 467         read_some = read_all
 468     bytes = take_msb_bytes(read_all)
 469     type = (bytes[0] >> 4) & 0x07
 470     size = bytes[0] & 0x0f
 471     for i, byte in enumerate(bytes[1:]):
 472         size += (byte & 0x7f) << ((i * 7) + 4)
 473     raw_base = len(bytes)
 474     if type == OFS_DELTA:
 475         bytes = take_msb_bytes(read_all)
 476         raw_base += len(bytes)
 477         assert not (bytes[-1] & 0x80)
 478         delta_base_offset = bytes[0] & 0x7f
 479         for byte in bytes[1:]:
 480             delta_base_offset += 1
 481             delta_base_offset <<= 7
 482             delta_base_offset += (byte & 0x7f)
 483         uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
 484         assert size == chunks_length(uncomp)
 485         return type, (delta_base_offset, uncomp), comp_len+raw_base, unused
 486     elif type == REF_DELTA:
 487         basename = read_all(20)
 488         raw_base += 20
 489         uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
 490         assert size == chunks_length(uncomp)
 491         return type, (basename, uncomp), comp_len+raw_base, unused
 492     else:
 493         uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
 494         assert chunks_length(uncomp) == size
 495         return type, uncomp, comp_len+raw_base, unused
 496
 497
 498 def _compute_object_size((num, obj)):
 499     """Compute the size of a unresolved object for use with LRUSizeCache."""
 500     if num in DELTA_TYPES:
 501         return chunks_length(obj[1])
 502     return chunks_length(obj)
 503
 504
 505 class PackStreamReader(object):
 506     """Class to read a pack stream.
 507
 508     The pack is read from a ReceivableProtocol using read() or recv() as
 509     appropriate.
 510     """
 511
 512     def __init__(self, read_all, read_some=None):
 513         self.read_all = read_all
 514         if read_some is None:
 515             self.read_some = read_all
 516         else:
 517             self.read_some = read_some
 518         self.sha = make_sha()
 519         self._offset = 0
 520         self._rbuf = StringIO()
 521         # trailer is a deque to avoid memory allocation on small reads
 522         self._trailer = deque()
 523
 524     def _read(self, read, size):
 525         """Read up to size bytes using the given callback.
 526
 527         As a side effect, update the verifier's hash (excluding the last 20
 528         bytes read) and write through to the output file.
 529
 530         :param read: The read callback to read from.
 531         :param size: The maximum number of bytes to read; the particular
 532             behavior is callback-specific.
 533         """
 534         data = read(size)
 535
 536         # maintain a trailer of the last 20 bytes we've read
 537         n = len(data)
 538         self._offset += n
 539         tn = len(self._trailer)
 540         if n >= 20:
 541             to_pop = tn
 542             to_add = 20
 543         else:
 544             to_pop = max(n + tn - 20, 0)
 545             to_add = n
 546         for _ in xrange(to_pop):
 547             self.sha.update(self._trailer.popleft())
 548         self._trailer.extend(data[-to_add:])
 549
 550         # hash everything but the trailer
 551         self.sha.update(data[:-to_add])
 552         return data
 553
 554     def _buf_len(self):
 555         buf = self._rbuf
 556         start = buf.tell()
 557         buf.seek(0, SEEK_END)
 558         end = buf.tell()
 559         buf.seek(start)
 560         return end - start
 561
 562     @property
 563     def offset(self):
 564         return self._offset - self._buf_len()
 565
 566     def read(self, size):
 567         """Read, blocking until size bytes are read."""
 568         buf_len = self._buf_len()
 569         if buf_len >= size:
 570             return self._rbuf.read(size)
 571         buf_data = self._rbuf.read()
 572         self._rbuf = StringIO()
 573         return buf_data + self._read(self.read_all, size - buf_len)
 574
 575     def recv(self, size):
 576         """Read up to size bytes, blocking until one byte is read."""
 577         buf_len = self._buf_len()
 578         if buf_len:
 579             data = self._rbuf.read(size)
 580             if size >= buf_len:
 581                 self._rbuf = StringIO()
 582             return data
 583         return self._read(self.read_some, size)
 584
 585     def __len__(self):
 586         return self._num_objects
 587
 588     def read_objects(self):
 589         """Read the objects in this pack file.
 590
 591         :raise AssertionError: if there is an error in the pack format.
 592         :raise ChecksumMismatch: if the checksum of the pack contents does not
 593             match the checksum in the pack trailer.
 594         :raise zlib.error: if an error occurred during zlib decompression.
 595         :raise IOError: if an error occurred writing to the output file.
 596         """
 597         pack_version, self._num_objects = read_pack_header(self.read)
 598         for i in xrange(self._num_objects):
 599             type, uncomp, comp_len, unused = unpack_object(self.read, self.recv)
 600             yield type, uncomp, comp_len
 601
 602             # prepend any unused data to current read buffer
 603             buf = StringIO()
 604             buf.write(unused)
 605             buf.write(self._rbuf.read())
 606             buf.seek(0)
 607             self._rbuf = buf
 608
 609         pack_sha = sha_to_hex(''.join([c for c in self._trailer]))
 610         calculated_sha = self.sha.hexdigest()
 611         if pack_sha != calculated_sha:
 612             raise ChecksumMismatch(pack_sha, calculated_sha)
 613
 614
 615 class PackObjectIterator(object):
 616
 617     def __init__(self, pack, progress=None):
 618         self.i = 0
 619         self.offset = pack._header_size
 620         self.num = len(pack)
 621         self.map = pack._file
 622         self._progress = progress
 623
 624     def __iter__(self):
 625         return self
 626
 627     def __len__(self):
 628         return self.num
 629
 630     def next(self):
 631         if self.i == self.num:
 632             raise StopIteration
 633         self.map.seek(self.offset)
 634         (type, obj, total_size, unused) = unpack_object(self.map.read)
 635         self.map.seek(self.offset)
 636         crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff
 637         ret = (self.offset, type, obj, crc32)
 638         self.offset += total_size
 639         if self._progress is not None:
 640             self._progress(self.i, self.num)
 641         self.i+=1
 642         return ret
 643
 644 def obj_sha(type, chunks):
 645     """Compute the SHA for a numeric type and object chunks."""
 646     sha = make_sha()
 647     sha.update(object_header(type, chunks_length(chunks)))
 648     for chunk in chunks:
 649         sha.update(chunk)
 650     return sha.digest()
 651
 652
 653 class PackData(object):
 654     """The data contained in a packfile.
 655
 656     Pack files can be accessed both sequentially for exploding a pack, and
 657     directly with the help of an index to retrieve a specific object.
 658
 659     The objects within are either complete or a delta aginst another.
 660
 661     The header is variable length. If the MSB of each byte is set then it
 662     indicates that the subsequent byte is still part of the header.
 663     For the first byte the next MS bits are the type, which tells you the type
 664     of object, and whether it is a delta. The LS byte is the lowest bits of the
 665     size. For each subsequent byte the LS 7 bits are the next MS bits of the
 666     size, i.e. the last byte of the header contains the MS bits of the size.
 667
 668     For the complete objects the data is stored as zlib deflated data.
 669     The size in the header is the uncompressed object size, so to uncompress
 670     you need to just keep feeding data to zlib until you get an object back,
 671     or it errors on bad data. This is done here by just giving the complete
 672     buffer from the start of the deflated object on. This is bad, but until I
 673     get mmap sorted out it will have to do.
 674
 675     Currently there are no integrity checks done. Also no attempt is made to
 676     try and detect the delta case, or a request for an object at the wrong
 677     position.  It will all just throw a zlib or KeyError.
 678     """
 679
 680     def __init__(self, filename, file=None, size=None):
 681         """Create a PackData object representing the pack in the given filename.
 682
 683         The file must exist and stay readable until the object is disposed of. It
 684         must also stay the same size. It will be mapped whenever needed.
 685
 686         Currently there is a restriction on the size of the pack as the python
 687         mmap implementation is flawed.
 688         """
 689         self._filename = filename
 690         self._size = size
 691         self._header_size = 12
 692         if file is None:
 693             self._file = GitFile(self._filename, 'rb')
 694         else:
 695             self._file = file
 696         (version, self._num_objects) = read_pack_header(self._file.read)
 697         self._offset_cache = LRUSizeCache(1024*1024*20,
 698             compute_size=_compute_object_size)
 699         self.pack = None
 700
 701     @classmethod
 702     def from_file(cls, file, size):
 703         return cls(str(file), file=file, size=size)
 704
 705     @classmethod
 706     def from_path(cls, path):
 707         return cls(filename=path)
 708
 709     def close(self):
 710         self._file.close()
 711
 712     def _get_size(self):
 713         if self._size is not None:
 714             return self._size
 715         self._size = os.path.getsize(self._filename)
 716         if self._size < self._header_size:
 717             errmsg = ("%s is too small for a packfile (%d < %d)" %
 718                       (self._filename, self._size, self._header_size))
 719             raise AssertionError(errmsg)
 720         return self._size
 721
 722     def __len__(self):
 723         """Returns the number of objects in this pack."""
 724         return self._num_objects
 725
 726     def calculate_checksum(self):
 727         """Calculate the checksum for this pack.
 728
 729         :return: 20-byte binary SHA1 digest
 730         """
 731         s = make_sha()
 732         self._file.seek(0)
 733         todo = self._get_size() - 20
 734         while todo > 0:
 735             x = self._file.read(min(todo, 1<<16))
 736             s.update(x)
 737             todo -= len(x)
 738         return s.digest()
 739
 740     def get_ref(self, sha):
 741         """Get the object for a ref SHA, only looking in this pack."""
 742         # TODO: cache these results
 743         if self.pack is None:
 744             raise KeyError(sha)
 745         offset = self.pack.index.object_index(sha)
 746         if not offset:
 747             raise KeyError(sha)
 748         type, obj = self.get_object_at(offset)
 749         return offset, type, obj
 750
 751     def resolve_object(self, offset, type, obj, get_ref=None):
 752         """Resolve an object, possibly resolving deltas when necessary.
 753
 754         :return: Tuple with object type and contents.
 755         """
 756         if type not in DELTA_TYPES:
 757             return type, obj
 758
 759         if get_ref is None:
 760             get_ref = self.get_ref
 761         if type == OFS_DELTA:
 762             (delta_offset, delta) = obj
 763             # TODO: clean up asserts and replace with nicer error messages
 764             assert isinstance(offset, int)
 765             assert isinstance(delta_offset, int)
 766             base_offset = offset-delta_offset
 767             type, base_obj = self.get_object_at(base_offset)
 768             assert isinstance(type, int)
 769         elif type == REF_DELTA:
 770             (basename, delta) = obj
 771             assert isinstance(basename, str) and len(basename) == 20
 772             base_offset, type, base_obj = get_ref(basename)
 773             assert isinstance(type, int)
 774         type, base_chunks = self.resolve_object(base_offset, type, base_obj)
 775         chunks = apply_delta(base_chunks, delta)
 776         # TODO(dborowitz): This can result in poor performance if large base
 777         # objects are separated from deltas in the pack. We should reorganize
 778         # so that we apply deltas to all objects in a chain one after the other
 779         # to optimize cache performance.
 780         if offset is not None:
 781             self._offset_cache[offset] = type, chunks
 782         return type, chunks
 783
 784     def iterobjects(self, progress=None):
 785         return PackObjectIterator(self, progress)
 786
 787     def iterentries(self, progress=None):
 788         """Yield entries summarizing the contents of this pack.
 789
 790         :param progress: Progress function, called with current and total object
 791             count.
 792         :yields: tuples with (sha, offset, crc32)
 793         """
 794         for offset, type, obj, crc32 in self.iterobjects(progress=progress):
 795             assert isinstance(offset, int)
 796             assert isinstance(type, int)
 797             assert isinstance(obj, list) or isinstance(obj, tuple)
 798             type, obj = self.resolve_object(offset, type, obj)
 799             yield obj_sha(type, obj), offset, crc32
 800
 801     def sorted_entries(self, progress=None):
 802         """Return entries in this pack, sorted by SHA.
 803
 804         :param progress: Progress function, called with current and total object
 805             count
 806         :return: List of tuples with (sha, offset, crc32)
 807         """
 808         ret = list(self.iterentries(progress=progress))
 809         ret.sort()
 810         return ret
 811
 812     def create_index_v1(self, filename, progress=None):
 813         """Create a version 1 file for this data file.
 814
 815         :param filename: Index filename.
 816         :param progress: Progress report function
 817         :return: Checksum of index file
 818         """
 819         entries = self.sorted_entries(progress=progress)
 820         f = GitFile(filename, 'wb')
 821         try:
 822             return write_pack_index_v1(f, entries, self.calculate_checksum())
 823         finally:
 824             f.close()
 825
 826     def create_index_v2(self, filename, progress=None):
 827         """Create a version 2 index file for this data file.
 828
 829         :param filename: Index filename.
 830         :param progress: Progress report function
 831         :return: Checksum of index file
 832         """
 833         entries = self.sorted_entries(progress=progress)
 834         f = GitFile(filename, 'wb')
 835         try:
 836             return write_pack_index_v2(f, entries, self.calculate_checksum())
 837         finally:
 838             f.close()
 839
 840     def create_index(self, filename, progress=None,
 841                      version=2):
 842         """Create an  index file for this data file.
 843
 844         :param filename: Index filename.
 845         :param progress: Progress report function
 846         :return: Checksum of index file
 847         """
 848         if version == 1:
 849             return self.create_index_v1(filename, progress)
 850         elif version == 2:
 851             return self.create_index_v2(filename, progress)
 852         else:
 853             raise ValueError("unknown index format %d" % version)
 854
 855     def get_stored_checksum(self):
 856         """Return the expected checksum stored in this pack."""
 857         self._file.seek(self._get_size()-20)
 858         return self._file.read(20)
 859
 860     def check(self):
 861         """Check the consistency of this pack."""
 862         actual = self.calculate_checksum()
 863         stored = self.get_stored_checksum()
 864         if actual != stored:
 865             raise ChecksumMismatch(stored, actual)
 866
 867     def get_object_at(self, offset):
 868         """Given an offset in to the packfile return the object that is there.
 869
 870         Using the associated index the location of an object can be looked up,
 871         and then the packfile can be asked directly for that object using this
 872         function.
 873         """
 874         if offset in self._offset_cache:
 875             return self._offset_cache[offset]
 876         assert isinstance(offset, long) or isinstance(offset, int),\
 877                 "offset was %r" % offset
 878         assert offset >= self._header_size
 879         self._file.seek(offset)
 880         return unpack_object(self._file.read)[:2]
 881
 882
 883 class ThinPackData(PackData):
 884     """PackData for thin packs, which require an ObjectStore for resolving."""
 885
 886     def __init__(self, resolve_ext_ref, *args, **kwargs):
 887         super(ThinPackData, self).__init__(*args, **kwargs)
 888         self.resolve_ext_ref = resolve_ext_ref
 889
 890     def get_ref(self, sha):
 891         """Resolve a reference looking in both this pack and the store."""
 892         try:
 893             # As part of completing a pack we create a Pack object with a
 894             # ThinPackData and a full PackIndex, so check in the index first if
 895             # possible.
 896             # TODO(dborowitz): reevaluate this when the pack completion code is
 897             # rewritten.
 898             return super(ThinPackData, self).get_ref(sha)
 899         except KeyError:
 900             type, obj = self.resolve_ext_ref(sha)
 901             return None, type, obj
 902
 903     def iterentries(self, progress=None):
 904         """Yield entries summarizing the contents of this pack.
 905
 906         :param progress: Progress function, called with current and
 907             total object count.
 908
 909         This will yield tuples with (sha, offset, crc32)
 910         """
 911         found = {}
 912         postponed = defaultdict(list)
 913
 914         class Postpone(Exception):
 915             """Raised to postpone delta resolving."""
 916
 917             def __init__(self, sha):
 918                 self.sha = sha
 919
 920         def get_ref_text(sha):
 921             assert len(sha) == 20
 922             if sha in found:
 923                 offset = found[sha]
 924                 type, obj = self.get_object_at(offset)
 925                 return offset, type, obj
 926             try:
 927                 return self.get_ref(sha)
 928             except KeyError:
 929                 raise Postpone(sha)
 930
 931         extra = []
 932         todo = chain(self.iterobjects(progress=progress), extra)
 933         for (offset, type, obj, crc32) in todo:
 934             assert isinstance(offset, int)
 935             if obj is None:
 936                 # Inflate postponed delta
 937                 obj, type = self.get_object_at(offset)
 938             assert isinstance(type, int)
 939             assert isinstance(obj, list) or isinstance(obj, tuple)
 940             try:
 941                 type, obj = self.resolve_object(offset, type, obj, get_ref_text)
 942             except Postpone, e:
 943                 # Save memory by not storing the inflated obj in postponed
 944                 postponed[e.sha].append((offset, type, None, crc32))
 945             else:
 946                 sha = obj_sha(type, obj)
 947                 found[sha] = offset
 948                 yield sha, offset, crc32
 949                 extra.extend(postponed.pop(sha, []))
 950         if postponed:
 951             raise KeyError([sha_to_hex(h) for h in postponed.keys()])
 952
 953
 954 class SHA1Reader(object):
 955     """Wrapper around a file-like object that remembers the SHA1 of its data."""
 956
 957     def __init__(self, f):
 958         self.f = f
 959         self.sha1 = make_sha("")
 960
 961     def read(self, num=None):
 962         data = self.f.read(num)
 963         self.sha1.update(data)
 964         return data
 965
 966     def check_sha(self):
 967         stored = self.f.read(20)
 968         if stored != self.sha1.digest():
 969             raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
 970
 971     def close(self):
 972         return self.f.close()
 973
 974     def tell(self):
 975         return self.f.tell()
 976
 977
 978 class SHA1Writer(object):
 979     """Wrapper around a file-like object that remembers the SHA1 of its data."""
 980
 981     def __init__(self, f):
 982         self.f = f
 983         self.sha1 = make_sha("")
 984
 985     def write(self, data):
 986         self.sha1.update(data)
 987         self.f.write(data)
 988
 989     def write_sha(self):
 990         sha = self.sha1.digest()
 991         assert len(sha) == 20
 992         self.f.write(sha)
 993         return sha
 994
 995     def close(self):
 996         sha = self.write_sha()
 997         self.f.close()
 998         return sha
 999
1000     def tell(self):
1001         return self.f.tell()
1002
1003
1004 def write_pack_object(f, type, object):
1005     """Write pack object to a file.
1006
1007     :param f: File to write to
1008     :param type: Numeric type of the object
1009     :param object: Object to write
1010     :return: Tuple with offset at which the object was written, and crc32
1011     """
1012     offset = f.tell()
1013     packed_data_hdr = ""
1014     if type == OFS_DELTA:
1015         (delta_base_offset, object) = object
1016     elif type == REF_DELTA:
1017         (basename, object) = object
1018     size = len(object)
1019     c = (type << 4) | (size & 15)
1020     size >>= 4
1021     while size:
1022         packed_data_hdr += (chr(c | 0x80))
1023         c = size & 0x7f
1024         size >>= 7
1025     packed_data_hdr += chr(c)
1026     if type == OFS_DELTA:
1027         ret = [delta_base_offset & 0x7f]
1028         delta_base_offset >>= 7
1029         while delta_base_offset:
1030             delta_base_offset -= 1
1031             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
1032             delta_base_offset >>= 7
1033         packed_data_hdr += "".join([chr(x) for x in ret])
1034     elif type == REF_DELTA:
1035         assert len(basename) == 20
1036         packed_data_hdr += basename
1037     packed_data = packed_data_hdr + zlib.compress(object)
1038     f.write(packed_data)
1039     return (offset, (zlib.crc32(packed_data) & 0xffffffff))
1040
1041
1042 def write_pack(filename, objects, num_objects):
1043     """Write a new pack data file.
1044
1045     :param filename: Path to the new pack file (without .pack extension)
1046     :param objects: Iterable over (object, path) tuples to write
1047     :param num_objects: Number of objects to write
1048     :return: Tuple with checksum of pack file and index file
1049     """
1050     f = GitFile(filename + ".pack", 'wb')
1051     try:
1052         entries, data_sum = write_pack_data(f, objects, num_objects)
1053     finally:
1054         f.close()
1055     entries.sort()
1056     f = GitFile(filename + ".idx", 'wb')
1057     try:
1058         return data_sum, write_pack_index_v2(f, entries, data_sum)
1059     finally:
1060         f.close()
1061
1062
1063 def write_pack_data(f, objects, num_objects, window=10):
1064     """Write a new pack file.
1065
1066     :param filename: The filename of the new pack file.
1067     :param objects: List of objects to write (tuples with object and path)
1068     :return: List with (name, offset, crc32 checksum) entries, pack checksum
1069     """
1070     recency = list(objects)
1071     # FIXME: Somehow limit delta depth
1072     # FIXME: Make thin-pack optional (its not used when cloning a pack)
1073     # Build a list of objects ordered by the magic Linus heuristic
1074     # This helps us find good objects to diff against us
1075     magic = []
1076     for obj, path in recency:
1077         magic.append( (obj.type_num, path, 1, -obj.raw_length(), obj) )
1078     magic.sort()
1079     # Build a map of objects and their index in magic - so we can find
1080     # preceeding objects to diff against
1081     offs = {}
1082     for i in range(len(magic)):
1083         offs[magic[i][4]] = i
1084     # Write the pack
1085     entries = []
1086     f = SHA1Writer(f)
1087     f.write("PACK")               # Pack header
1088     f.write(struct.pack(">L", 2)) # Pack version
1089     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
1090     for o, path in recency:
1091         sha1 = o.sha().digest()
1092         orig_t = o.type_num
1093         raw = o.as_raw_string()
1094         winner = raw
1095         t = orig_t
1096         #for i in range(offs[o]-window, window):
1097         #    if i < 0 or i >= len(offs): continue
1098         #    b = magic[i][4]
1099         #    if b.type_num != orig_t: continue
1100         #    base = b.as_raw_string()
1101         #    delta = create_delta(base, raw)
1102         #    if len(delta) < len(winner):
1103         #        winner = delta
1104         #        t = 6 if magic[i][2] == 1 else 7
1105         offset, crc32 = write_pack_object(f, t, winner)
1106         entries.append((sha1, offset, crc32))
1107     return entries, f.write_sha()
1108
1109
1110 def write_pack_index_v1(f, entries, pack_checksum):
1111     """Write a new pack index file.
1112
1113     :param f: A file-like object to write to
1114     :param entries: List of tuples with object name (sha), offset_in_pack,
1115         and crc32_checksum.
1116     :param pack_checksum: Checksum of the pack file.
1117     :return: The SHA of the written index file
1118     """
1119     f = SHA1Writer(f)
1120     fan_out_table = defaultdict(lambda: 0)
1121     for (name, offset, entry_checksum) in entries:
1122         fan_out_table[ord(name[0])] += 1
1123     # Fan-out table
1124     for i in range(0x100):
1125         f.write(struct.pack(">L", fan_out_table[i]))
1126         fan_out_table[i+1] += fan_out_table[i]
1127     for (name, offset, entry_checksum) in entries:
1128         f.write(struct.pack(">L20s", offset, name))
1129     assert len(pack_checksum) == 20
1130     f.write(pack_checksum)
1131     return f.write_sha()
1132
1133
1134 def create_delta(base_buf, target_buf):
1135     """Use python difflib to work out how to transform base_buf to target_buf.
1136
1137     :param base_buf: Base buffer
1138     :param target_buf: Target buffer
1139     """
1140     assert isinstance(base_buf, str)
1141     assert isinstance(target_buf, str)
1142     out_buf = ""
1143     # write delta header
1144     def encode_size(size):
1145         ret = ""
1146         c = size & 0x7f
1147         size >>= 7
1148         while size:
1149             ret += chr(c | 0x80)
1150             c = size & 0x7f
1151             size >>= 7
1152         ret += chr(c)
1153         return ret
1154     out_buf += encode_size(len(base_buf))
1155     out_buf += encode_size(len(target_buf))
1156     # write out delta opcodes
1157     seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
1158     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
1159         # Git patch opcodes don't care about deletes!
1160         #if opcode == "replace" or opcode == "delete":
1161         #    pass
1162         if opcode == "equal":
1163             # If they are equal, unpacker will use data from base_buf
1164             # Write out an opcode that says what range to use
1165             scratch = ""
1166             op = 0x80
1167             o = i1
1168             for i in range(4):
1169                 if o & 0xff << i*8:
1170                     scratch += chr((o >> i*8) & 0xff)
1171                     op |= 1 << i
1172             s = i2 - i1
1173             for i in range(2):
1174                 if s & 0xff << i*8:
1175                     scratch += chr((s >> i*8) & 0xff)
1176                     op |= 1 << (4+i)
1177             out_buf += chr(op)
1178             out_buf += scratch
1179         if opcode == "replace" or opcode == "insert":
1180             # If we are replacing a range or adding one, then we just
1181             # output it to the stream (prefixed by its size)
1182             s = j2 - j1
1183             o = j1
1184             while s > 127:
1185                 out_buf += chr(127)
1186                 out_buf += target_buf[o:o+127]
1187                 s -= 127
1188                 o += 127
1189             out_buf += chr(s)
1190             out_buf += target_buf[o:o+s]
1191     return out_buf
1192
1193
1194 def apply_delta(src_buf, delta):
1195     """Based on the similar function in git's patch-delta.c.
1196
1197     :param src_buf: Source buffer
1198     :param delta: Delta instructions
1199     """
1200     if type(src_buf) != str:
1201         src_buf = "".join(src_buf)
1202     if type(delta) != str:
1203         delta = "".join(delta)
1204     out = []
1205     index = 0
1206     delta_length = len(delta)
1207     def get_delta_header_size(delta, index):
1208         size = 0
1209         i = 0
1210         while delta:
1211             cmd = ord(delta[index])
1212             index += 1
1213             size |= (cmd & ~0x80) << i
1214             i += 7
1215             if not cmd & 0x80:
1216                 break
1217         return size, index
1218     src_size, index = get_delta_header_size(delta, index)
1219     dest_size, index = get_delta_header_size(delta, index)
1220     assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
1221     while index < delta_length:
1222         cmd = ord(delta[index])
1223         index += 1
1224         if cmd & 0x80:
1225             cp_off = 0
1226             for i in range(4):
1227                 if cmd & (1 << i):
1228                     x = ord(delta[index])
1229                     index += 1
1230                     cp_off |= x << (i * 8)
1231             cp_size = 0
1232             for i in range(3):
1233                 if cmd & (1 << (4+i)):
1234                     x = ord(delta[index])
1235                     index += 1
1236                     cp_size |= x << (i * 8)
1237             if cp_size == 0:
1238                 cp_size = 0x10000
1239             if (cp_off + cp_size < cp_size or
1240                 cp_off + cp_size > src_size or
1241                 cp_size > dest_size):
1242                 break
1243             out.append(src_buf[cp_off:cp_off+cp_size])
1244         elif cmd != 0:
1245             out.append(delta[index:index+cmd])
1246             index += cmd
1247         else:
1248             raise ApplyDeltaError("Invalid opcode 0")
1249
1250     if index != delta_length:
1251         raise ApplyDeltaError("delta not empty: %r" % delta[index:])
1252
1253     if dest_size != chunks_length(out):
1254         raise ApplyDeltaError("dest size incorrect")
1255
1256     return out
1257
1258
1259 def write_pack_index_v2(f, entries, pack_checksum):
1260     """Write a new pack index file.
1261
1262     :param f: File-like object to write to
1263     :param entries: List of tuples with object name (sha), offset_in_pack, and
1264         crc32_checksum.
1265     :param pack_checksum: Checksum of the pack file.
1266     :return: The SHA of the index file written
1267     """
1268     f = SHA1Writer(f)
1269     f.write('\377tOc') # Magic!
1270     f.write(struct.pack(">L", 2))
1271     fan_out_table = defaultdict(lambda: 0)
1272     for (name, offset, entry_checksum) in entries:
1273         fan_out_table[ord(name[0])] += 1
1274     # Fan-out table
1275     for i in range(0x100):
1276         f.write(struct.pack(">L", fan_out_table[i]))
1277         fan_out_table[i+1] += fan_out_table[i]
1278     for (name, offset, entry_checksum) in entries:
1279         f.write(name)
1280     for (name, offset, entry_checksum) in entries:
1281         f.write(struct.pack(">L", entry_checksum))
1282     for (name, offset, entry_checksum) in entries:
1283         # FIXME: handle if MSBit is set in offset
1284         f.write(struct.pack(">L", offset))
1285     # FIXME: handle table for pack files > 8 Gb
1286     assert len(pack_checksum) == 20
1287     f.write(pack_checksum)
1288     return f.write_sha()
1289
1290
1291 class Pack(object):
1292     """A Git pack object."""
1293
1294     def __init__(self, basename):
1295         self._basename = basename
1296         self._data = None
1297         self._idx = None
1298         self._idx_path = self._basename + ".idx"
1299         self._data_path = self._basename + ".pack"
1300         self._data_load = lambda: PackData(self._data_path)
1301         self._idx_load = lambda: load_pack_index(self._idx_path)
1302
1303     @classmethod
1304     def from_lazy_objects(self, data_fn, idx_fn):
1305         """Create a new pack object from callables to load pack data and
1306         index objects."""
1307         ret = Pack("")
1308         ret._data_load = data_fn
1309         ret._idx_load = idx_fn
1310         return ret
1311
1312     @classmethod
1313     def from_objects(self, data, idx):
1314         """Create a new pack object from pack data and index objects."""
1315         ret = Pack("")
1316         ret._data_load = lambda: data
1317         ret._idx_load = lambda: idx
1318         return ret
1319
1320     def name(self):
1321         """The SHA over the SHAs of the objects in this pack."""
1322         return self.index.objects_sha1()
1323
1324     @property
1325     def data(self):
1326         """The pack data object being used."""
1327         if self._data is None:
1328             self._data = self._data_load()
1329             self._data.pack = self
1330             assert len(self.index) == len(self._data)
1331             idx_stored_checksum = self.index.get_pack_checksum()
1332             data_stored_checksum = self._data.get_stored_checksum()
1333             if idx_stored_checksum != data_stored_checksum:
1334                 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
1335                                        sha_to_hex(data_stored_checksum))
1336         return self._data
1337
1338     @property
1339     def index(self):
1340         """The index being used.
1341
1342         :note: This may be an in-memory index
1343         """
1344         if self._idx is None:
1345             self._idx = self._idx_load()
1346         return self._idx
1347
1348     def close(self):
1349         if self._data is not None:
1350             self._data.close()
1351         self.index.close()
1352
1353     def __eq__(self, other):
1354         return type(self) == type(other) and self.index == other.index
1355
1356     def __len__(self):
1357         """Number of entries in this pack."""
1358         return len(self.index)
1359
1360     def __repr__(self):
1361         return "%s(%r)" % (self.__class__.__name__, self._basename)
1362
1363     def __iter__(self):
1364         """Iterate over all the sha1s of the objects in this pack."""
1365         return iter(self.index)
1366
1367     def check(self):
1368         """Check the integrity of this pack.
1369
1370         :raise ChecksumMismatch: if a checksum for the index or data is wrong
1371         """
1372         self.index.check()
1373         self.data.check()
1374         for obj in self.iterobjects():
1375             obj.check()
1376         # TODO: object connectivity checks
1377
1378     def get_stored_checksum(self):
1379         return self.data.get_stored_checksum()
1380
1381     def __contains__(self, sha1):
1382         """Check whether this pack contains a particular SHA1."""
1383         try:
1384             self.index.object_index(sha1)
1385             return True
1386         except KeyError:
1387             return False
1388
1389     def get_raw(self, sha1):
1390         offset = self.index.object_index(sha1)
1391         obj_type, obj = self.data.get_object_at(offset)
1392         if type(offset) is long:
1393           offset = int(offset)
1394         type_num, chunks = self.data.resolve_object(offset, obj_type, obj)
1395         return type_num, "".join(chunks)
1396
1397     def __getitem__(self, sha1):
1398         """Retrieve the specified SHA1."""
1399         type, uncomp = self.get_raw(sha1)
1400         return ShaFile.from_raw_string(type, uncomp)
1401
1402     def iterobjects(self):
1403         """Iterate over the objects in this pack."""
1404         for offset, type, obj, crc32 in self.data.iterobjects():
1405             assert isinstance(offset, int)
1406             yield ShaFile.from_raw_chunks(
1407               *self.data.resolve_object(offset, type, obj))
1408
1409
1410 try:
1411     from dulwich._pack import apply_delta, bisect_find_sha
1412 except ImportError:
1413     pass