dulwich/pack.py

   1 # pack.py -- For dealing wih packed git objects.
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copryight (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20 """Classes for dealing with packed git objects.
  21
  22 A pack is a compact representation of a bunch of objects, stored
  23 using deltas where possible.
  24
  25 They have two parts, the pack file, which stores the data, and an index
  26 that tells you where the data is.
  27
  28 To find an object you look in all of the index files 'til you find a
  29 match for the object name. You then use the pointer got from this as
  30 a pointer in to the corresponding packfile.
  31 """
  32
  33 try:
  34     from collections import defaultdict
  35 except ImportError:
  36     from misc import defaultdict
  37
  38 import difflib
  39 import errno
  40 from itertools import (
  41     chain,
  42     imap,
  43     izip,
  44     )
  45 import mmap
  46 import os
  47 import struct
  48 try:
  49     from struct import unpack_from
  50 except ImportError:
  51     from dulwich.misc import unpack_from
  52 import sys
  53 import zlib
  54
  55 from dulwich.errors import (
  56     ApplyDeltaError,
  57     ChecksumMismatch,
  58     )
  59 from dulwich.file import GitFile
  60 from dulwich.lru_cache import (
  61     LRUSizeCache,
  62     )
  63 from dulwich.objects import (
  64     ShaFile,
  65     hex_to_sha,
  66     sha_to_hex,
  67     )
  68 from dulwich.misc import (
  69     make_sha,
  70     )
  71
  72 supports_mmap_offset = (sys.version_info[0] >= 3 or
  73         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
  74
  75
  76 def take_msb_bytes(read):
  77     """Read bytes marked with most significant bit.
  78
  79     :param read: Read function
  80     """
  81     ret = []
  82     while len(ret) == 0 or ret[-1] & 0x80:
  83         ret.append(ord(read(1)))
  84     return ret
  85
  86
  87 def read_zlib_chunks(read_some, dec_size, buffer_size=4096):
  88     """Read zlib data from a buffer.
  89
  90     This function requires that the buffer have additional data following the
  91     compressed data, which is guaranteed to be the case for git pack files.
  92
  93     :param read_some: Read function that returns at least one byte, but may
  94         return less than the requested size
  95     :param dec_size: Expected size of the decompressed buffer
  96     :param buffer_size: Size of the read buffer
  97     :return: Tuple with list of chunks, length of compressed data length and
  98         and unused read data.
  99     :raise zlib.error: if a decompression error occurred.
 100     """
 101     if dec_size <= -1:
 102         raise ValueError("non-negative zlib data stream size expected")
 103     obj = zlib.decompressobj()
 104     ret = []
 105     fed = 0
 106     size = 0
 107     while obj.unused_data == "":
 108         add = read_some(buffer_size)
 109         if not add:
 110             raise zlib.error("EOF before end of zlib stream")
 111         fed += len(add)
 112         decomp = obj.decompress(add)
 113         size += len(decomp)
 114         ret.append(decomp)
 115     if size != dec_size:
 116         raise zlib.error("decompressed data does not match expected size")
 117     comp_len = fed - len(obj.unused_data)
 118     return ret, comp_len, obj.unused_data
 119
 120 def iter_sha1(iter):
 121     """Return the hexdigest of the SHA1 over a set of names.
 122
 123     :param iter: Iterator over string objects
 124     :return: 40-byte hex sha1 digest
 125     """
 126     sha1 = make_sha()
 127     for name in iter:
 128         sha1.update(name)
 129     return sha1.hexdigest()
 130
 131
 132 def load_pack_index(path):
 133     """Load an index file by path.
 134
 135     :param filename: Path to the index file
 136     :return: A PackIndex loaded from the given path
 137     """
 138     f = GitFile(path, 'rb')
 139     return load_pack_index_file(path, f)
 140
 141
 142 def _load_file_contents(f, size=None):
 143     fileno = getattr(f, 'fileno', None)
 144     # Attempt to use mmap if possible
 145     if fileno is not None:
 146         fd = f.fileno()
 147         if size is None:
 148             size = os.fstat(fd).st_size
 149         try:
 150             contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
 151         except mmap.error:
 152             # Perhaps a socket?
 153             pass
 154         else:
 155             return contents, size
 156     contents = f.read()
 157     size = len(contents)
 158     return contents, size
 159
 160
 161 def load_pack_index_file(path, f):
 162     """Load an index file from a file-like object.
 163
 164     :param path: Path for the index file
 165     :param f: File-like object
 166     :return: A PackIndex loaded from the given file
 167     """
 168     contents, size = _load_file_contents(f)
 169     if contents[:4] == '\377tOc':
 170         version = struct.unpack(">L", contents[4:8])[0]
 171         if version == 2:
 172             return PackIndex2(path, file=f, contents=contents,
 173                 size=size)
 174         else:
 175             raise KeyError("Unknown pack index format %d" % version)
 176     else:
 177         return PackIndex1(path, file=f, contents=contents, size=size)
 178
 179
 180 def bisect_find_sha(start, end, sha, unpack_name):
 181     """Find a SHA in a data blob with sorted SHAs.
 182
 183     :param start: Start index of range to search
 184     :param end: End index of range to search
 185     :param sha: Sha to find
 186     :param unpack_name: Callback to retrieve SHA by index
 187     :return: Index of the SHA, or None if it wasn't found
 188     """
 189     assert start <= end
 190     while start <= end:
 191         i = (start + end)/2
 192         file_sha = unpack_name(i)
 193         x = cmp(file_sha, sha)
 194         if x < 0:
 195             start = i + 1
 196         elif x > 0:
 197             end = i - 1
 198         else:
 199             return i
 200     return None
 201
 202
 203 class PackIndex(object):
 204     """An index in to a packfile.
 205
 206     Given a sha id of an object a pack index can tell you the location in the
 207     packfile of that object if it has it.
 208
 209     To do the loop it opens the file, and indexes first 256 4 byte groups
 210     with the first byte of the sha id. The value in the four byte group indexed
 211     is the end of the group that shares the same starting byte. Subtract one
 212     from the starting byte and index again to find the start of the group.
 213     The values are sorted by sha id within the group, so do the math to find
 214     the start and end offset and then bisect in to find if the value is present.
 215     """
 216
 217     def __init__(self, filename, file=None, contents=None, size=None):
 218         """Create a pack index object.
 219
 220         Provide it with the name of the index file to consider, and it will map
 221         it whenever required.
 222         """
 223         self._filename = filename
 224         # Take the size now, so it can be checked each time we map the file to
 225         # ensure that it hasn't changed.
 226         if file is None:
 227             self._file = GitFile(filename, 'rb')
 228         else:
 229             self._file = file
 230         if contents is None:
 231             self._contents, self._size = _load_file_contents(file, size)
 232         else:
 233             self._contents, self._size = (contents, size)
 234
 235     def __eq__(self, other):
 236         if not isinstance(other, PackIndex):
 237             return False
 238
 239         if self._fan_out_table != other._fan_out_table:
 240             return False
 241
 242         for (name1, _, _), (name2, _, _) in izip(self.iterentries(),
 243                                                  other.iterentries()):
 244             if name1 != name2:
 245                 return False
 246         return True
 247
 248     def __ne__(self, other):
 249         return not self.__eq__(other)
 250
 251     def close(self):
 252         self._file.close()
 253
 254     def __len__(self):
 255         """Return the number of entries in this pack index."""
 256         return self._fan_out_table[-1]
 257
 258     def _unpack_entry(self, i):
 259         """Unpack the i-th entry in the index file.
 260
 261         :return: Tuple with object name (SHA), offset in pack file and CRC32
 262             checksum (if known)."""
 263         raise NotImplementedError(self._unpack_entry)
 264
 265     def _unpack_name(self, i):
 266         """Unpack the i-th name from the index file."""
 267         raise NotImplementedError(self._unpack_name)
 268
 269     def _unpack_offset(self, i):
 270         """Unpack the i-th object offset from the index file."""
 271         raise NotImplementedError(self._unpack_offset)
 272
 273     def _unpack_crc32_checksum(self, i):
 274         """Unpack the crc32 checksum for the i-th object from the index file."""
 275         raise NotImplementedError(self._unpack_crc32_checksum)
 276
 277     def __iter__(self):
 278         """Iterate over the SHAs in this pack."""
 279         return imap(sha_to_hex, self._itersha())
 280
 281     def _itersha(self):
 282         for i in range(len(self)):
 283             yield self._unpack_name(i)
 284
 285     def objects_sha1(self):
 286         """Return the hex SHA1 over all the shas of all objects in this pack.
 287
 288         :note: This is used for the filename of the pack.
 289         """
 290         return iter_sha1(self._itersha())
 291
 292     def iterentries(self):
 293         """Iterate over the entries in this pack index.
 294
 295         Will yield tuples with object name, offset in packfile and crc32
 296         checksum.
 297         """
 298         for i in range(len(self)):
 299             yield self._unpack_entry(i)
 300
 301     def _read_fan_out_table(self, start_offset):
 302         ret = []
 303         for i in range(0x100):
 304             fanout_entry = self._contents[start_offset+i*4:start_offset+(i+1)*4]
 305             ret.append(struct.unpack(">L", fanout_entry)[0])
 306         return ret
 307
 308     def check(self):
 309         """Check that the stored checksum matches the actual checksum."""
 310         # TODO: Check pack contents, too
 311         return self.calculate_checksum() == self.get_stored_checksum()
 312
 313     def calculate_checksum(self):
 314         """Calculate the SHA1 checksum over this pack index.
 315
 316         :return: This is a 20-byte binary digest
 317         """
 318         return make_sha(self._contents[:-20]).digest()
 319
 320     def get_pack_checksum(self):
 321         """Return the SHA1 checksum stored for the corresponding packfile.
 322
 323         :return: 20-byte binary digest
 324         """
 325         return str(self._contents[-40:-20])
 326
 327     def get_stored_checksum(self):
 328         """Return the SHA1 checksum stored for this index.
 329
 330         :return: 20-byte binary digest
 331         """
 332         return str(self._contents[-20:])
 333
 334     def object_index(self, sha):
 335         """Return the index in to the corresponding packfile for the object.
 336
 337         Given the name of an object it will return the offset that object
 338         lives at within the corresponding pack file. If the pack file doesn't
 339         have the object then None will be returned.
 340         """
 341         if len(sha) == 40:
 342             sha = hex_to_sha(sha)
 343         return self._object_index(sha)
 344
 345     def _object_index(self, sha):
 346         """See object_index.
 347
 348         :param sha: A *binary* SHA string. (20 characters long)_
 349         """
 350         assert len(sha) == 20
 351         idx = ord(sha[0])
 352         if idx == 0:
 353             start = 0
 354         else:
 355             start = self._fan_out_table[idx-1]
 356         end = self._fan_out_table[idx]
 357         i = bisect_find_sha(start, end, sha, self._unpack_name)
 358         if i is None:
 359             raise KeyError(sha)
 360         return self._unpack_offset(i)
 361
 362
 363 class PackIndex1(PackIndex):
 364     """Version 1 Pack Index."""
 365
 366     def __init__(self, filename, file=None, contents=None, size=None):
 367         PackIndex.__init__(self, filename, file, contents, size)
 368         self.version = 1
 369         self._fan_out_table = self._read_fan_out_table(0)
 370
 371     def _unpack_entry(self, i):
 372         (offset, name) = unpack_from(">L20s", self._contents,
 373                                      (0x100 * 4) + (i * 24))
 374         return (name, offset, None)
 375
 376     def _unpack_name(self, i):
 377         offset = (0x100 * 4) + (i * 24) + 4
 378         return self._contents[offset:offset+20]
 379
 380     def _unpack_offset(self, i):
 381         offset = (0x100 * 4) + (i * 24)
 382         return unpack_from(">L", self._contents, offset)[0]
 383
 384     def _unpack_crc32_checksum(self, i):
 385         # Not stored in v1 index files
 386         return None
 387
 388
 389 class PackIndex2(PackIndex):
 390     """Version 2 Pack Index."""
 391
 392     def __init__(self, filename, file=None, contents=None, size=None):
 393         PackIndex.__init__(self, filename, file, contents, size)
 394         assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
 395         (self.version, ) = unpack_from(">L", self._contents, 4)
 396         assert self.version == 2, "Version was %d" % self.version
 397         self._fan_out_table = self._read_fan_out_table(8)
 398         self._name_table_offset = 8 + 0x100 * 4
 399         self._crc32_table_offset = self._name_table_offset + 20 * len(self)
 400         self._pack_offset_table_offset = (self._crc32_table_offset +
 401                                           4 * len(self))
 402
 403     def _unpack_entry(self, i):
 404         return (self._unpack_name(i), self._unpack_offset(i),
 405                 self._unpack_crc32_checksum(i))
 406
 407     def _unpack_name(self, i):
 408         offset = self._name_table_offset + i * 20
 409         return self._contents[offset:offset+20]
 410
 411     def _unpack_offset(self, i):
 412         offset = self._pack_offset_table_offset + i * 4
 413         return unpack_from(">L", self._contents, offset)[0]
 414
 415     def _unpack_crc32_checksum(self, i):
 416         return unpack_from(">L", self._contents,
 417                           self._crc32_table_offset + i * 4)[0]
 418
 419
 420 def read_pack_header(read):
 421     """Read the header of a pack file.
 422
 423     :param read: Read function
 424     """
 425     header = read(12)
 426     assert header[:4] == "PACK"
 427     (version,) = unpack_from(">L", header, 4)
 428     assert version in (2, 3), "Version was %d" % version
 429     (num_objects,) = unpack_from(">L", header, 8)
 430     return (version, num_objects)
 431
 432
 433 def chunks_length(chunks):
 434     return sum(imap(len, chunks))
 435
 436
 437 def unpack_object(read_all, read_some=None):
 438     """Unpack a Git object.
 439
 440     :param read_all: Read function that blocks until the number of requested
 441         bytes are read.
 442     :param read_some: Read function that returns at least one byte, but may not
 443         return the number of bytes requested.
 444     :return: tuple with type, uncompressed data, compressed size and tail data.
 445     """
 446     if read_some is None:
 447         read_some = read_all
 448     bytes = take_msb_bytes(read_all)
 449     type = (bytes[0] >> 4) & 0x07
 450     size = bytes[0] & 0x0f
 451     for i, byte in enumerate(bytes[1:]):
 452         size += (byte & 0x7f) << ((i * 7) + 4)
 453     raw_base = len(bytes)
 454     if type == 6: # offset delta
 455         bytes = take_msb_bytes(read_all)
 456         raw_base += len(bytes)
 457         assert not (bytes[-1] & 0x80)
 458         delta_base_offset = bytes[0] & 0x7f
 459         for byte in bytes[1:]:
 460             delta_base_offset += 1
 461             delta_base_offset <<= 7
 462             delta_base_offset += (byte & 0x7f)
 463         uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
 464         assert size == chunks_length(uncomp)
 465         return type, (delta_base_offset, uncomp), comp_len+raw_base, unused
 466     elif type == 7: # ref delta
 467         basename = read_all(20)
 468         raw_base += 20
 469         uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
 470         assert size == chunks_length(uncomp)
 471         return type, (basename, uncomp), comp_len+raw_base, unused
 472     else:
 473         uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
 474         assert chunks_length(uncomp) == size
 475         return type, uncomp, comp_len+raw_base, unused
 476
 477
 478 def _compute_object_size((num, obj)):
 479     """Compute the size of a unresolved object for use with LRUSizeCache."""
 480     if num in (6, 7):
 481         return chunks_length(obj[1])
 482     return chunks_length(obj)
 483
 484
 485 class PackData(object):
 486     """The data contained in a packfile.
 487
 488     Pack files can be accessed both sequentially for exploding a pack, and
 489     directly with the help of an index to retrieve a specific object.
 490
 491     The objects within are either complete or a delta aginst another.
 492
 493     The header is variable length. If the MSB of each byte is set then it
 494     indicates that the subsequent byte is still part of the header.
 495     For the first byte the next MS bits are the type, which tells you the type
 496     of object, and whether it is a delta. The LS byte is the lowest bits of the
 497     size. For each subsequent byte the LS 7 bits are the next MS bits of the
 498     size, i.e. the last byte of the header contains the MS bits of the size.
 499
 500     For the complete objects the data is stored as zlib deflated data.
 501     The size in the header is the uncompressed object size, so to uncompress
 502     you need to just keep feeding data to zlib until you get an object back,
 503     or it errors on bad data. This is done here by just giving the complete
 504     buffer from the start of the deflated object on. This is bad, but until I
 505     get mmap sorted out it will have to do.
 506
 507     Currently there are no integrity checks done. Also no attempt is made to
 508     try and detect the delta case, or a request for an object at the wrong
 509     position.  It will all just throw a zlib or KeyError.
 510     """
 511
 512     def __init__(self, filename, file=None, size=None):
 513         """Create a PackData object representing the pack in the given filename.
 514
 515         The file must exist and stay readable until the object is disposed of.
 516         It must also stay the same size. It will be mapped whenever needed.
 517
 518         Currently there is a restriction on the size of the pack as the python
 519         mmap implementation is flawed.
 520         """
 521         self._filename = filename
 522         self._size = size
 523         self._header_size = 12
 524         if file is None:
 525             self._file = GitFile(self._filename, 'rb')
 526         else:
 527             self._file = file
 528         (version, self._num_objects) = read_pack_header(self._file.read)
 529         self._offset_cache = LRUSizeCache(1024*1024*20,
 530             compute_size=_compute_object_size)
 531
 532     @classmethod
 533     def from_file(cls, file, size):
 534         return cls(str(file), file=file, size=size)
 535
 536     @classmethod
 537     def from_path(cls, path):
 538         return cls(filename=path)
 539
 540     def close(self):
 541         self._file.close()
 542
 543     def _get_size(self):
 544         if self._size is not None:
 545             return self._size
 546         self._size = os.path.getsize(self._filename)
 547         if self._size < self._header_size:
 548             errmsg = ("%s is too small for a packfile (%d < %d)" %
 549                       (self._filename, self._size, self._header_size))
 550             raise AssertionError(errmsg)
 551         return self._size
 552
 553     def __len__(self):
 554         """Returns the number of objects in this pack."""
 555         return self._num_objects
 556
 557     def calculate_checksum(self):
 558         """Calculate the checksum for this pack.
 559
 560         :return: 20-byte binary SHA1 digest
 561         """
 562         s = make_sha()
 563         self._file.seek(0)
 564         todo = self._get_size() - 20
 565         while todo > 0:
 566             x = self._file.read(min(todo, 1<<16))
 567             s.update(x)
 568             todo -= len(x)
 569         return s.digest()
 570
 571     def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
 572         """Resolve an object, possibly resolving deltas when necessary.
 573
 574         :return: Tuple with object type and contents.
 575         """
 576         if type not in (6, 7): # Not a delta
 577             return type, obj
 578
 579         if get_offset is None:
 580             get_offset = self.get_object_at
 581
 582         if type == 6: # offset delta
 583             (delta_offset, delta) = obj
 584             assert isinstance(delta_offset, int)
 585             base_offset = offset-delta_offset
 586             type, base_obj = get_offset(base_offset)
 587             assert isinstance(type, int)
 588         elif type == 7: # ref delta
 589             (basename, delta) = obj
 590             assert isinstance(basename, str) and len(basename) == 20
 591             type, base_obj = get_ref(basename)
 592             assert isinstance(type, int)
 593             # Can't be a ofs delta, as we wouldn't know the base offset
 594             assert type != 6
 595             base_offset = None
 596         type, base_chunks = self.resolve_object(base_offset, type, base_obj,
 597                                                 get_ref)
 598         if base_offset is not None:
 599             self._offset_cache[base_offset] = type, base_chunks
 600         return (type, apply_delta(base_chunks, delta))
 601
 602     def iterobjects(self, progress=None):
 603
 604         class ObjectIterator(object):
 605
 606             def __init__(self, pack):
 607                 self.i = 0
 608                 self.offset = pack._header_size
 609                 self.num = len(pack)
 610                 self.map = pack._file
 611
 612             def __iter__(self):
 613                 return self
 614
 615             def __len__(self):
 616                 return self.num
 617
 618             def next(self):
 619                 if self.i == self.num:
 620                     raise StopIteration
 621                 self.map.seek(self.offset)
 622                 (type, obj, total_size, unused) = unpack_object(self.map.read)
 623                 self.map.seek(self.offset)
 624                 crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff
 625                 ret = (self.offset, type, obj, crc32)
 626                 self.offset += total_size
 627                 if progress:
 628                     progress(self.i, self.num)
 629                 self.i+=1
 630                 return ret
 631         return ObjectIterator(self)
 632
 633     def iterentries(self, ext_resolve_ref=None, progress=None):
 634         """Yield entries summarizing the contents of this pack.
 635
 636         :param ext_resolve_ref: Optional function to resolve base
 637             objects (in case this is a thin pack)
 638         :param progress: Progress function, called with current and
 639             total object count.
 640
 641         This will yield tuples with (sha, offset, crc32)
 642         """
 643         found = {}
 644         postponed = defaultdict(list)
 645         class Postpone(Exception):
 646             """Raised to postpone delta resolving."""
 647
 648         def get_ref_text(sha):
 649             assert len(sha) == 20
 650             if sha in found:
 651                 return self.get_object_at(found[sha])
 652             if ext_resolve_ref:
 653                 try:
 654                     return ext_resolve_ref(sha)
 655                 except KeyError:
 656                     pass
 657             raise Postpone, (sha, )
 658         extra = []
 659         todo = chain(self.iterobjects(progress=progress), extra)
 660         for (offset, type, obj, crc32) in todo:
 661             assert isinstance(offset, int)
 662             assert isinstance(type, int)
 663             try:
 664                 type, obj = self.resolve_object(offset, type, obj,
 665                     get_ref_text)
 666             except Postpone, (sha, ):
 667                 postponed[sha].append((offset, type, obj))
 668             else:
 669                 shafile = ShaFile.from_raw_chunks(type, obj)
 670                 sha = shafile.sha().digest()
 671                 found[sha] = offset
 672                 yield sha, offset, crc32
 673                 extra.extend(postponed.get(sha, []))
 674         if postponed:
 675             raise KeyError([sha_to_hex(h) for h in postponed.keys()])
 676
 677     def sorted_entries(self, resolve_ext_ref=None, progress=None):
 678         """Return entries in this pack, sorted by SHA.
 679
 680         :param resolve_ext_ref: Optional function to resolve base
 681             objects (in case this is a thin pack)
 682         :param progress: Progress function, called with current and
 683             total object count
 684         :return: List of tuples with (sha, offset, crc32)
 685         """
 686         ret = list(self.iterentries(resolve_ext_ref, progress=progress))
 687         ret.sort()
 688         return ret
 689
 690     def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
 691         """Create a version 1 file for this data file.
 692
 693         :param filename: Index filename.
 694         :param resolve_ext_ref: Function to use for resolving externally
 695             referenced SHA1s (for thin packs)
 696         :param progress: Progress report function
 697         """
 698         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 699         write_pack_index_v1(filename, entries, self.calculate_checksum())
 700
 701     def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
 702         """Create a version 2 index file for this data file.
 703
 704         :param filename: Index filename.
 705         :param resolve_ext_ref: Function to use for resolving externally
 706             referenced SHA1s (for thin packs)
 707         :param progress: Progress report function
 708         """
 709         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 710         write_pack_index_v2(filename, entries, self.calculate_checksum())
 711
 712     def create_index(self, filename, resolve_ext_ref=None, progress=None,
 713                      version=2):
 714         """Create an  index file for this data file.
 715
 716         :param filename: Index filename.
 717         :param resolve_ext_ref: Function to use for resolving externally
 718             referenced SHA1s (for thin packs)
 719         :param progress: Progress report function
 720         """
 721         if version == 1:
 722             self.create_index_v1(filename, resolve_ext_ref, progress)
 723         elif version == 2:
 724             self.create_index_v2(filename, resolve_ext_ref, progress)
 725         else:
 726             raise ValueError("unknown index format %d" % version)
 727
 728     def get_stored_checksum(self):
 729         """Return the expected checksum stored in this pack."""
 730         self._file.seek(self._get_size()-20)
 731         return self._file.read(20)
 732
 733     def check(self):
 734         """Check the consistency of this pack."""
 735         return (self.calculate_checksum() == self.get_stored_checksum())
 736
 737     def get_object_at(self, offset):
 738         """Given an offset in to the packfile return the object that is there.
 739
 740         Using the associated index the location of an object can be looked up,
 741         and then the packfile can be asked directly for that object using this
 742         function.
 743         """
 744         if offset in self._offset_cache:
 745             return self._offset_cache[offset]
 746         assert isinstance(offset, long) or isinstance(offset, int),\
 747                 "offset was %r" % offset
 748         assert offset >= self._header_size
 749         self._file.seek(offset)
 750         return unpack_object(self._file.read)[:2]
 751
 752
 753 class SHA1Reader(object):
 754     """Wrapper around a file-like object that remembers the SHA1 of its data."""
 755
 756     def __init__(self, f):
 757         self.f = f
 758         self.sha1 = make_sha("")
 759
 760     def read(self, num=None):
 761         data = self.f.read(num)
 762         self.sha1.update(data)
 763         return data
 764
 765     def check_sha(self):
 766         stored = self.f.read(20)
 767         if stored != self.sha1.digest():
 768             raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
 769
 770     def close(self):
 771         return self.f.close()
 772
 773     def tell(self):
 774         return self.f.tell()
 775
 776
 777 class SHA1Writer(object):
 778     """Wrapper around a file-like object that remembers the SHA1 of its data."""
 779
 780     def __init__(self, f):
 781         self.f = f
 782         self.sha1 = make_sha("")
 783
 784     def write(self, data):
 785         self.sha1.update(data)
 786         self.f.write(data)
 787
 788     def write_sha(self):
 789         sha = self.sha1.digest()
 790         assert len(sha) == 20
 791         self.f.write(sha)
 792         return sha
 793
 794     def close(self):
 795         sha = self.write_sha()
 796         self.f.close()
 797         return sha
 798
 799     def tell(self):
 800         return self.f.tell()
 801
 802
 803 def write_pack_object(f, type, object):
 804     """Write pack object to a file.
 805
 806     :param f: File to write to
 807     :param type: Numeric type of the object
 808     :param object: Object to write
 809     :return: Tuple with offset at which the object was written, and crc32
 810     """
 811     offset = f.tell()
 812     packed_data_hdr = ""
 813     if type == 6: # offset delta
 814         (delta_base_offset, object) = object
 815     elif type == 7: # ref delta
 816         (basename, object) = object
 817     size = len(object)
 818     c = (type << 4) | (size & 15)
 819     size >>= 4
 820     while size:
 821         packed_data_hdr += (chr(c | 0x80))
 822         c = size & 0x7f
 823         size >>= 7
 824     packed_data_hdr += chr(c)
 825     if type == 6: # offset delta
 826         ret = [delta_base_offset & 0x7f]
 827         delta_base_offset >>= 7
 828         while delta_base_offset:
 829             delta_base_offset -= 1
 830             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
 831             delta_base_offset >>= 7
 832         packed_data_hdr += "".join([chr(x) for x in ret])
 833     elif type == 7: # ref delta
 834         assert len(basename) == 20
 835         packed_data_hdr += basename
 836     packed_data = packed_data_hdr + zlib.compress(object)
 837     f.write(packed_data)
 838     return (offset, (zlib.crc32(packed_data) & 0xffffffff))
 839
 840
 841 def write_pack(filename, objects, num_objects):
 842     """Write a new pack data file.
 843
 844     :param filename: Path to the new pack file (without .pack extension)
 845     :param objects: Iterable over (object, path) tuples to write
 846     :param num_objects: Number of objects to write
 847     """
 848     f = GitFile(filename + ".pack", 'wb')
 849     try:
 850         entries, data_sum = write_pack_data(f, objects, num_objects)
 851     finally:
 852         f.close()
 853     entries.sort()
 854     write_pack_index_v2(filename + ".idx", entries, data_sum)
 855
 856
 857 def write_pack_data(f, objects, num_objects, window=10):
 858     """Write a new pack file.
 859
 860     :param filename: The filename of the new pack file.
 861     :param objects: List of objects to write (tuples with object and path)
 862     :return: List with (name, offset, crc32 checksum) entries, pack checksum
 863     """
 864     recency = list(objects)
 865     # FIXME: Somehow limit delta depth
 866     # FIXME: Make thin-pack optional (its not used when cloning a pack)
 867     # Build a list of objects ordered by the magic Linus heuristic
 868     # This helps us find good objects to diff against us
 869     magic = []
 870     for obj, path in recency:
 871         magic.append( (obj.type_num, path, 1, -obj.raw_length(), obj) )
 872     magic.sort()
 873     # Build a map of objects and their index in magic - so we can find
 874     # preceeding objects to diff against
 875     offs = {}
 876     for i in range(len(magic)):
 877         offs[magic[i][4]] = i
 878     # Write the pack
 879     entries = []
 880     f = SHA1Writer(f)
 881     f.write("PACK")               # Pack header
 882     f.write(struct.pack(">L", 2)) # Pack version
 883     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
 884     for o, path in recency:
 885         sha1 = o.sha().digest()
 886         orig_t = o.type_num
 887         raw = o.as_raw_string()
 888         winner = raw
 889         t = orig_t
 890         #for i in range(offs[o]-window, window):
 891         #    if i < 0 or i >= len(offs): continue
 892         #    b = magic[i][4]
 893         #    if b.type_num != orig_t: continue
 894         #    base = b.as_raw_string()
 895         #    delta = create_delta(base, raw)
 896         #    if len(delta) < len(winner):
 897         #        winner = delta
 898         #        t = 6 if magic[i][2] == 1 else 7
 899         offset, crc32 = write_pack_object(f, t, winner)
 900         entries.append((sha1, offset, crc32))
 901     return entries, f.write_sha()
 902
 903
 904 def write_pack_index_v1(filename, entries, pack_checksum):
 905     """Write a new pack index file.
 906
 907     :param filename: The filename of the new pack index file.
 908     :param entries: List of tuples with object name (sha), offset_in_pack,
 909         and crc32_checksum.
 910     :param pack_checksum: Checksum of the pack file.
 911     """
 912     f = GitFile(filename, 'wb')
 913     f = SHA1Writer(f)
 914     fan_out_table = defaultdict(lambda: 0)
 915     for (name, offset, entry_checksum) in entries:
 916         fan_out_table[ord(name[0])] += 1
 917     # Fan-out table
 918     for i in range(0x100):
 919         f.write(struct.pack(">L", fan_out_table[i]))
 920         fan_out_table[i+1] += fan_out_table[i]
 921     for (name, offset, entry_checksum) in entries:
 922         f.write(struct.pack(">L20s", offset, name))
 923     assert len(pack_checksum) == 20
 924     f.write(pack_checksum)
 925     f.close()
 926
 927
 928 def create_delta(base_buf, target_buf):
 929     """Use python difflib to work out how to transform base_buf to target_buf.
 930
 931     :param base_buf: Base buffer
 932     :param target_buf: Target buffer
 933     """
 934     assert isinstance(base_buf, str)
 935     assert isinstance(target_buf, str)
 936     out_buf = ""
 937     # write delta header
 938     def encode_size(size):
 939         ret = ""
 940         c = size & 0x7f
 941         size >>= 7
 942         while size:
 943             ret += chr(c | 0x80)
 944             c = size & 0x7f
 945             size >>= 7
 946         ret += chr(c)
 947         return ret
 948     out_buf += encode_size(len(base_buf))
 949     out_buf += encode_size(len(target_buf))
 950     # write out delta opcodes
 951     seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
 952     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
 953         # Git patch opcodes don't care about deletes!
 954         #if opcode == "replace" or opcode == "delete":
 955         #    pass
 956         if opcode == "equal":
 957             # If they are equal, unpacker will use data from base_buf
 958             # Write out an opcode that says what range to use
 959             scratch = ""
 960             op = 0x80
 961             o = i1
 962             for i in range(4):
 963                 if o & 0xff << i*8:
 964                     scratch += chr((o >> i*8) & 0xff)
 965                     op |= 1 << i
 966             s = i2 - i1
 967             for i in range(2):
 968                 if s & 0xff << i*8:
 969                     scratch += chr((s >> i*8) & 0xff)
 970                     op |= 1 << (4+i)
 971             out_buf += chr(op)
 972             out_buf += scratch
 973         if opcode == "replace" or opcode == "insert":
 974             # If we are replacing a range or adding one, then we just
 975             # output it to the stream (prefixed by its size)
 976             s = j2 - j1
 977             o = j1
 978             while s > 127:
 979                 out_buf += chr(127)
 980                 out_buf += target_buf[o:o+127]
 981                 s -= 127
 982                 o += 127
 983             out_buf += chr(s)
 984             out_buf += target_buf[o:o+s]
 985     return out_buf
 986
 987
 988 def apply_delta(src_buf, delta):
 989     """Based on the similar function in git's patch-delta.c.
 990
 991     :param src_buf: Source buffer
 992     :param delta: Delta instructions
 993     """
 994     if type(src_buf) != str:
 995         src_buf = "".join(src_buf)
 996     if type(delta) != str:
 997         delta = "".join(delta)
 998     out = []
 999     index = 0
1000     delta_length = len(delta)
1001     def get_delta_header_size(delta, index):
1002         size = 0
1003         i = 0
1004         while delta:
1005             cmd = ord(delta[index])
1006             index += 1
1007             size |= (cmd & ~0x80) << i
1008             i += 7
1009             if not cmd & 0x80:
1010                 break
1011         return size, index
1012     src_size, index = get_delta_header_size(delta, index)
1013     dest_size, index = get_delta_header_size(delta, index)
1014     assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
1015     while index < delta_length:
1016         cmd = ord(delta[index])
1017         index += 1
1018         if cmd & 0x80:
1019             cp_off = 0
1020             for i in range(4):
1021                 if cmd & (1 << i):
1022                     x = ord(delta[index])
1023                     index += 1
1024                     cp_off |= x << (i * 8)
1025             cp_size = 0
1026             for i in range(3):
1027                 if cmd & (1 << (4+i)):
1028                     x = ord(delta[index])
1029                     index += 1
1030                     cp_size |= x << (i * 8)
1031             if cp_size == 0:
1032                 cp_size = 0x10000
1033             if (cp_off + cp_size < cp_size or
1034                 cp_off + cp_size > src_size or
1035                 cp_size > dest_size):
1036                 break
1037             out.append(src_buf[cp_off:cp_off+cp_size])
1038         elif cmd != 0:
1039             out.append(delta[index:index+cmd])
1040             index += cmd
1041         else:
1042             raise ApplyDeltaError("Invalid opcode 0")
1043
1044     if index != delta_length:
1045         raise ApplyDeltaError("delta not empty: %r" % delta[index:])
1046
1047     if dest_size != chunks_length(out):
1048         raise ApplyDeltaError("dest size incorrect")
1049
1050     return out
1051
1052
1053 def write_pack_index_v2(filename, entries, pack_checksum):
1054     """Write a new pack index file.
1055
1056     :param filename: The filename of the new pack index file.
1057     :param entries: List of tuples with object name (sha), offset_in_pack, and
1058         crc32_checksum.
1059     :param pack_checksum: Checksum of the pack file.
1060     """
1061     f = GitFile(filename, 'wb')
1062     f = SHA1Writer(f)
1063     f.write('\377tOc') # Magic!
1064     f.write(struct.pack(">L", 2))
1065     fan_out_table = defaultdict(lambda: 0)
1066     for (name, offset, entry_checksum) in entries:
1067         fan_out_table[ord(name[0])] += 1
1068     # Fan-out table
1069     for i in range(0x100):
1070         f.write(struct.pack(">L", fan_out_table[i]))
1071         fan_out_table[i+1] += fan_out_table[i]
1072     for (name, offset, entry_checksum) in entries:
1073         f.write(name)
1074     for (name, offset, entry_checksum) in entries:
1075         f.write(struct.pack(">L", entry_checksum))
1076     for (name, offset, entry_checksum) in entries:
1077         # FIXME: handle if MSBit is set in offset
1078         f.write(struct.pack(">L", offset))
1079     # FIXME: handle table for pack files > 8 Gb
1080     assert len(pack_checksum) == 20
1081     f.write(pack_checksum)
1082     f.close()
1083
1084
1085 class Pack(object):
1086     """A Git pack object."""
1087
1088     def __init__(self, basename):
1089         self._basename = basename
1090         self._data_path = self._basename + ".pack"
1091         self._idx_path = self._basename + ".idx"
1092         self._data = None
1093         self._idx = None
1094
1095     @classmethod
1096     def from_objects(self, data, idx):
1097         """Create a new pack object from pack data and index objects."""
1098         ret = Pack("")
1099         ret._data = data
1100         ret._idx = idx
1101         return ret
1102
1103     def name(self):
1104         """The SHA over the SHAs of the objects in this pack."""
1105         return self.index.objects_sha1()
1106
1107     @property
1108     def data(self):
1109         """The pack data object being used."""
1110         if self._data is None:
1111             self._data = PackData(self._data_path)
1112             assert len(self.index) == len(self._data)
1113             idx_stored_checksum = self.index.get_pack_checksum()
1114             data_stored_checksum = self._data.get_stored_checksum()
1115             if idx_stored_checksum != data_stored_checksum:
1116                 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
1117                                        sha_to_hex(data_stored_checksum))
1118         return self._data
1119
1120     @property
1121     def index(self):
1122         """The index being used.
1123
1124         :note: This may be an in-memory index
1125         """
1126         if self._idx is None:
1127             self._idx = load_pack_index(self._idx_path)
1128         return self._idx
1129
1130     def close(self):
1131         if self._data is not None:
1132             self._data.close()
1133         self.index.close()
1134
1135     def __eq__(self, other):
1136         return type(self) == type(other) and self.index == other.index
1137
1138     def __len__(self):
1139         """Number of entries in this pack."""
1140         return len(self.index)
1141
1142     def __repr__(self):
1143         return "%s(%r)" % (self.__class__.__name__, self._basename)
1144
1145     def __iter__(self):
1146         """Iterate over all the sha1s of the objects in this pack."""
1147         return iter(self.index)
1148
1149     def check(self):
1150         """Check the integrity of this pack."""
1151         if not self.index.check():
1152             return False
1153         if not self.data.check():
1154             return False
1155         return True
1156
1157     def get_stored_checksum(self):
1158         return self.data.get_stored_checksum()
1159
1160     def __contains__(self, sha1):
1161         """Check whether this pack contains a particular SHA1."""
1162         try:
1163             self.index.object_index(sha1)
1164             return True
1165         except KeyError:
1166             return False
1167
1168     def get_raw(self, sha1, resolve_ref=None):
1169         offset = self.index.object_index(sha1)
1170         obj_type, obj = self.data.get_object_at(offset)
1171         if type(offset) is long:
1172           offset = int(offset)
1173         if resolve_ref is None:
1174             resolve_ref = self.get_raw
1175         kind, chunks = self.data.resolve_object(offset, obj_type, obj,
1176             resolve_ref)
1177         return kind, "".join(chunks)
1178
1179     def __getitem__(self, sha1):
1180         """Retrieve the specified SHA1."""
1181         type, uncomp = self.get_raw(sha1)
1182         return ShaFile.from_raw_string(type, uncomp)
1183
1184     def iterobjects(self, get_raw=None):
1185         """Iterate over the objects in this pack."""
1186         if get_raw is None:
1187             get_raw = self.get_raw
1188         for offset, type, obj, crc32 in self.data.iterobjects():
1189             assert isinstance(offset, int)
1190             type, obj = self.data.resolve_object(offset, type, obj, get_raw)
1191             yield ShaFile.from_raw_chunks(type, obj)
1192
1193
1194 try:
1195     from dulwich._pack import apply_delta, bisect_find_sha
1196 except ImportError:
1197     pass