dulwich/pack.py

   1 # pack.py -- For dealing wih packed git objects.
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copryight (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20 """Classes for dealing with packed git objects.
  21
  22 A pack is a compact representation of a bunch of objects, stored
  23 using deltas where possible.
  24
  25 They have two parts, the pack file, which stores the data, and an index
  26 that tells you where the data is.
  27
  28 To find an object you look in all of the index files 'til you find a
  29 match for the object name. You then use the pointer got from this as
  30 a pointer in to the corresponding packfile.
  31 """
  32
  33 try:
  34     from collections import defaultdict
  35 except ImportError:
  36     from misc import defaultdict
  37
  38 import difflib
  39 import errno
  40 from itertools import (
  41     chain,
  42     imap,
  43     izip,
  44     )
  45 import mmap
  46 import os
  47 import struct
  48 try:
  49     from struct import unpack_from
  50 except ImportError:
  51     from dulwich.misc import unpack_from
  52 import sys
  53 import zlib
  54
  55 from dulwich.errors import (
  56     ApplyDeltaError,
  57     ChecksumMismatch,
  58     )
  59 from dulwich.file import GitFile
  60 from dulwich.lru_cache import (
  61     LRUSizeCache,
  62     )
  63 from dulwich.objects import (
  64     ShaFile,
  65     hex_to_sha,
  66     sha_to_hex,
  67     )
  68 from dulwich.misc import (
  69     make_sha,
  70     )
  71
  72 supports_mmap_offset = (sys.version_info[0] >= 3 or
  73         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
  74
  75
  76 def take_msb_bytes(read):
  77     """Read bytes marked with most significant bit.
  78
  79     :param read: Read function
  80     """
  81     ret = []
  82     while len(ret) == 0 or ret[-1] & 0x80:
  83         ret.append(ord(read(1)))
  84     return ret
  85
  86
  87 def read_zlib_chunks(read, buffer_size=4096):
  88     """Read chunks of zlib data from a buffer.
  89
  90     :param read: Read function
  91     :return: Tuple with list of chunks, length of
  92         compressed data length and unused read data
  93     """
  94     obj = zlib.decompressobj()
  95     ret = []
  96     fed = 0
  97     while obj.unused_data == "":
  98         add = read(buffer_size)
  99         if len(add) < buffer_size:
 100             add += "Z"
 101         fed += len(add)
 102         ret.append(obj.decompress(add))
 103     comp_len = fed-len(obj.unused_data)
 104     return ret, comp_len, obj.unused_data
 105
 106
 107 def iter_sha1(iter):
 108     """Return the hexdigest of the SHA1 over a set of names.
 109
 110     :param iter: Iterator over string objects
 111     :return: 40-byte hex sha1 digest
 112     """
 113     sha1 = make_sha()
 114     for name in iter:
 115         sha1.update(name)
 116     return sha1.hexdigest()
 117
 118
 119 def load_pack_index(path):
 120     """Load an index file by path.
 121
 122     :param filename: Path to the index file
 123     """
 124     f = GitFile(path, 'rb')
 125     return load_pack_index_file(path, f)
 126
 127
 128 def _load_file_contents(f, size=None):
 129     fileno = getattr(f, 'fileno', None)
 130     # Attempt to use mmap if possible
 131     if fileno is not None:
 132         fd = f.fileno()
 133         if size is None:
 134             size = os.fstat(fd).st_size
 135         try:
 136             contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
 137         except mmap.error:
 138             # Perhaps a socket?
 139             pass
 140         else:
 141             return contents, size
 142     contents = f.read()
 143     size = len(contents)
 144     return contents, size
 145
 146
 147 def load_pack_index_file(path, f):
 148     """Load an index file from a file-like object.
 149
 150     :param path: Path for the index file
 151     :param f: File-like object
 152     """
 153     contents, size = _load_file_contents(f)
 154     if contents[:4] == '\377tOc':
 155         version = struct.unpack(">L", contents[4:8])[0]
 156         if version == 2:
 157             return PackIndex2(path, file=f, contents=contents,
 158                 size=size)
 159         else:
 160             raise KeyError("Unknown pack index format %d" % version)
 161     else:
 162         return PackIndex1(path, file=f, contents=contents, size=size)
 163
 164
 165 def bisect_find_sha(start, end, sha, unpack_name):
 166     """Find a SHA in a data blob with sorted SHAs.
 167
 168     :param start: Start index of range to search
 169     :param end: End index of range to search
 170     :param sha: Sha to find
 171     :param unpack_name: Callback to retrieve SHA by index
 172     :return: Index of the SHA, or None if it wasn't found
 173     """
 174     assert start <= end
 175     while start <= end:
 176         i = (start + end)/2
 177         file_sha = unpack_name(i)
 178         x = cmp(file_sha, sha)
 179         if x < 0:
 180             start = i + 1
 181         elif x > 0:
 182             end = i - 1
 183         else:
 184             return i
 185     return None
 186
 187
 188 class PackIndex(object):
 189     """An index in to a packfile.
 190
 191     Given a sha id of an object a pack index can tell you the location in the
 192     packfile of that object if it has it.
 193
 194     To do the loop it opens the file, and indexes first 256 4 byte groups
 195     with the first byte of the sha id. The value in the four byte group indexed
 196     is the end of the group that shares the same starting byte. Subtract one
 197     from the starting byte and index again to find the start of the group.
 198     The values are sorted by sha id within the group, so do the math to find
 199     the start and end offset and then bisect in to find if the value is present.
 200     """
 201
 202     def __init__(self, filename, file=None, contents=None, size=None):
 203         """Create a pack index object.
 204
 205         Provide it with the name of the index file to consider, and it will map
 206         it whenever required.
 207         """
 208         self._filename = filename
 209         # Take the size now, so it can be checked each time we map the file to
 210         # ensure that it hasn't changed.
 211         if file is None:
 212             self._file = GitFile(filename, 'rb')
 213         else:
 214             self._file = file
 215         if contents is None:
 216             self._contents, self._size = _load_file_contents(file, size)
 217         else:
 218             self._contents, self._size = (contents, size)
 219
 220     def __eq__(self, other):
 221         if not isinstance(other, PackIndex):
 222             return False
 223
 224         if self._fan_out_table != other._fan_out_table:
 225             return False
 226
 227         for (name1, _, _), (name2, _, _) in izip(self.iterentries(),
 228                                                  other.iterentries()):
 229             if name1 != name2:
 230                 return False
 231         return True
 232
 233     def __ne__(self, other):
 234         return not self.__eq__(other)
 235
 236     def close(self):
 237         self._file.close()
 238
 239     def __len__(self):
 240         """Return the number of entries in this pack index."""
 241         return self._fan_out_table[-1]
 242
 243     def _unpack_entry(self, i):
 244         """Unpack the i-th entry in the index file.
 245
 246         :return: Tuple with object name (SHA), offset in pack file and
 247               CRC32 checksum (if known)."""
 248         raise NotImplementedError(self._unpack_entry)
 249
 250     def _unpack_name(self, i):
 251         """Unpack the i-th name from the index file."""
 252         raise NotImplementedError(self._unpack_name)
 253
 254     def _unpack_offset(self, i):
 255         """Unpack the i-th object offset from the index file."""
 256         raise NotImplementedError(self._unpack_offset)
 257
 258     def _unpack_crc32_checksum(self, i):
 259         """Unpack the crc32 checksum for the i-th object from the index file."""
 260         raise NotImplementedError(self._unpack_crc32_checksum)
 261
 262     def __iter__(self):
 263         """Iterate over the SHAs in this pack."""
 264         return imap(sha_to_hex, self._itersha())
 265
 266     def _itersha(self):
 267         for i in range(len(self)):
 268             yield self._unpack_name(i)
 269
 270     def objects_sha1(self):
 271         """Return the hex SHA1 over all the shas of all objects in this pack.
 272
 273         :note: This is used for the filename of the pack.
 274         """
 275         return iter_sha1(self._itersha())
 276
 277     def iterentries(self):
 278         """Iterate over the entries in this pack index.
 279
 280         Will yield tuples with object name, offset in packfile and crc32
 281         checksum.
 282         """
 283         for i in range(len(self)):
 284             yield self._unpack_entry(i)
 285
 286     def _read_fan_out_table(self, start_offset):
 287         ret = []
 288         for i in range(0x100):
 289             ret.append(struct.unpack(">L",
 290                 self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
 291         return ret
 292
 293     def check(self):
 294         """Check that the stored checksum matches the actual checksum."""
 295         # TODO: Check pack contents, too
 296         return self.calculate_checksum() == self.get_stored_checksum()
 297
 298     def calculate_checksum(self):
 299         """Calculate the SHA1 checksum over this pack index.
 300
 301         :return: This is a 20-byte binary digest
 302         """
 303         return make_sha(self._contents[:-20]).digest()
 304
 305     def get_pack_checksum(self):
 306         """Return the SHA1 checksum stored for the corresponding packfile.
 307
 308         :return: 20-byte binary digest
 309         """
 310         return str(self._contents[-40:-20])
 311
 312     def get_stored_checksum(self):
 313         """Return the SHA1 checksum stored for this index.
 314
 315         :return: 20-byte binary digest
 316         """
 317         return str(self._contents[-20:])
 318
 319     def object_index(self, sha):
 320         """Return the index in to the corresponding packfile for the object.
 321
 322         Given the name of an object it will return the offset that object
 323         lives at within the corresponding pack file. If the pack file doesn't
 324         have the object then None will be returned.
 325         """
 326         if len(sha) == 40:
 327             sha = hex_to_sha(sha)
 328         return self._object_index(sha)
 329
 330     def _object_index(self, sha):
 331         """See object_index.
 332
 333         :param sha: A *binary* SHA string. (20 characters long)_
 334         """
 335         assert len(sha) == 20
 336         idx = ord(sha[0])
 337         if idx == 0:
 338             start = 0
 339         else:
 340             start = self._fan_out_table[idx-1]
 341         end = self._fan_out_table[idx]
 342         i = bisect_find_sha(start, end, sha, self._unpack_name)
 343         if i is None:
 344             raise KeyError(sha)
 345         return self._unpack_offset(i)
 346
 347
 348
 349 class PackIndex1(PackIndex):
 350     """Version 1 Pack Index."""
 351
 352     def __init__(self, filename, file=None, contents=None, size=None):
 353         PackIndex.__init__(self, filename, file, contents, size)
 354         self.version = 1
 355         self._fan_out_table = self._read_fan_out_table(0)
 356
 357     def _unpack_entry(self, i):
 358         (offset, name) = unpack_from(">L20s", self._contents,
 359             (0x100 * 4) + (i * 24))
 360         return (name, offset, None)
 361
 362     def _unpack_name(self, i):
 363         offset = (0x100 * 4) + (i * 24) + 4
 364         return self._contents[offset:offset+20]
 365
 366     def _unpack_offset(self, i):
 367         offset = (0x100 * 4) + (i * 24)
 368         return unpack_from(">L", self._contents, offset)[0]
 369
 370     def _unpack_crc32_checksum(self, i):
 371         # Not stored in v1 index files
 372         return None
 373
 374
 375 class PackIndex2(PackIndex):
 376     """Version 2 Pack Index."""
 377
 378     def __init__(self, filename, file=None, contents=None, size=None):
 379         PackIndex.__init__(self, filename, file, contents, size)
 380         assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
 381         (self.version, ) = unpack_from(">L", self._contents, 4)
 382         assert self.version == 2, "Version was %d" % self.version
 383         self._fan_out_table = self._read_fan_out_table(8)
 384         self._name_table_offset = 8 + 0x100 * 4
 385         self._crc32_table_offset = self._name_table_offset + 20 * len(self)
 386         self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
 387
 388     def _unpack_entry(self, i):
 389         return (self._unpack_name(i), self._unpack_offset(i),
 390                 self._unpack_crc32_checksum(i))
 391
 392     def _unpack_name(self, i):
 393         offset = self._name_table_offset + i * 20
 394         return self._contents[offset:offset+20]
 395
 396     def _unpack_offset(self, i):
 397         offset = self._pack_offset_table_offset + i * 4
 398         return unpack_from(">L", self._contents, offset)[0]
 399
 400     def _unpack_crc32_checksum(self, i):
 401         return unpack_from(">L", self._contents,
 402                           self._crc32_table_offset + i * 4)[0]
 403
 404
 405
 406 def read_pack_header(f):
 407     """Read the header of a pack file.
 408
 409     :param f: File-like object to read from
 410     """
 411     header = f.read(12)
 412     assert header[:4] == "PACK"
 413     (version,) = unpack_from(">L", header, 4)
 414     assert version in (2, 3), "Version was %d" % version
 415     (num_objects,) = unpack_from(">L", header, 8)
 416     return (version, num_objects)
 417
 418
 419 def chunks_length(chunks):
 420     return sum(imap(len, chunks))
 421
 422
 423 def unpack_object(read):
 424     """Unpack a Git object.
 425
 426     :return: tuple with type, uncompressed data as chunks, compressed size and
 427         tail data
 428     """
 429     bytes = take_msb_bytes(read)
 430     type = (bytes[0] >> 4) & 0x07
 431     size = bytes[0] & 0x0f
 432     for i, byte in enumerate(bytes[1:]):
 433         size += (byte & 0x7f) << ((i * 7) + 4)
 434     raw_base = len(bytes)
 435     if type == 6: # offset delta
 436         bytes = take_msb_bytes(read)
 437         raw_base += len(bytes)
 438         assert not (bytes[-1] & 0x80)
 439         delta_base_offset = bytes[0] & 0x7f
 440         for byte in bytes[1:]:
 441             delta_base_offset += 1
 442             delta_base_offset <<= 7
 443             delta_base_offset += (byte & 0x7f)
 444         uncomp, comp_len, unused = read_zlib_chunks(read)
 445         assert size == chunks_length(uncomp)
 446         return type, (delta_base_offset, uncomp), comp_len+raw_base, unused
 447     elif type == 7: # ref delta
 448         basename = read(20)
 449         raw_base += 20
 450         uncomp, comp_len, unused = read_zlib_chunks(read)
 451         assert size == chunks_length(uncomp)
 452         return type, (basename, uncomp), comp_len+raw_base, unused
 453     else:
 454         uncomp, comp_len, unused = read_zlib_chunks(read)
 455         assert chunks_length(uncomp) == size
 456         return type, uncomp, comp_len+raw_base, unused
 457
 458
 459 def _compute_object_size((num, obj)):
 460     """Compute the size of a unresolved object for use with LRUSizeCache.
 461     """
 462     if num in (6, 7):
 463         return chunks_length(obj[1])
 464     return chunks_length(obj)
 465
 466
 467 class PackData(object):
 468     """The data contained in a packfile.
 469
 470     Pack files can be accessed both sequentially for exploding a pack, and
 471     directly with the help of an index to retrieve a specific object.
 472
 473     The objects within are either complete or a delta aginst another.
 474
 475     The header is variable length. If the MSB of each byte is set then it
 476     indicates that the subsequent byte is still part of the header.
 477     For the first byte the next MS bits are the type, which tells you the type
 478     of object, and whether it is a delta. The LS byte is the lowest bits of the
 479     size. For each subsequent byte the LS 7 bits are the next MS bits of the
 480     size, i.e. the last byte of the header contains the MS bits of the size.
 481
 482     For the complete objects the data is stored as zlib deflated data.
 483     The size in the header is the uncompressed object size, so to uncompress
 484     you need to just keep feeding data to zlib until you get an object back,
 485     or it errors on bad data. This is done here by just giving the complete
 486     buffer from the start of the deflated object on. This is bad, but until I
 487     get mmap sorted out it will have to do.
 488
 489     Currently there are no integrity checks done. Also no attempt is made to
 490     try and detect the delta case, or a request for an object at the wrong
 491     position.  It will all just throw a zlib or KeyError.
 492     """
 493
 494     def __init__(self, filename, file=None, size=None):
 495         """Create a PackData object that represents the pack in the given
 496         filename.
 497
 498         The file must exist and stay readable until the object is disposed of.
 499         It must also stay the same size. It will be mapped whenever needed.
 500
 501         Currently there is a restriction on the size of the pack as the python
 502         mmap implementation is flawed.
 503         """
 504         self._filename = filename
 505         self._size = size
 506         self._header_size = 12
 507         if file is None:
 508             self._file = GitFile(self._filename, 'rb')
 509         else:
 510             self._file = file
 511         (version, self._num_objects) = read_pack_header(self._file)
 512         self._offset_cache = LRUSizeCache(1024*1024*20,
 513             compute_size=_compute_object_size)
 514
 515     @classmethod
 516     def from_file(cls, file, size):
 517         return cls(str(file), file=file, size=size)
 518
 519     @classmethod
 520     def from_path(cls, path):
 521         return cls(filename=path)
 522
 523     def close(self):
 524         self._file.close()
 525
 526     def _get_size(self):
 527         if self._size is not None:
 528             return self._size
 529         self._size = os.path.getsize(self._filename)
 530         assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (self._filename, self._size, self._header_size)
 531         return self._size
 532
 533     def __len__(self):
 534         """Returns the number of objects in this pack."""
 535         return self._num_objects
 536
 537     def calculate_checksum(self):
 538         """Calculate the checksum for this pack.
 539
 540         :return: 20-byte binary SHA1 digest
 541         """
 542         s = make_sha()
 543         self._file.seek(0)
 544         todo = self._get_size() - 20
 545         while todo > 0:
 546             x = self._file.read(min(todo, 1<<16))
 547             s.update(x)
 548             todo -= len(x)
 549         return s.digest()
 550
 551     def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
 552         """Resolve an object, possibly resolving deltas when necessary.
 553
 554         :return: Tuple with object type and contents.
 555         """
 556         if type not in (6, 7): # Not a delta
 557             return type, obj
 558
 559         if get_offset is None:
 560             get_offset = self.get_object_at
 561
 562         if type == 6: # offset delta
 563             (delta_offset, delta) = obj
 564             assert isinstance(delta_offset, int)
 565             base_offset = offset-delta_offset
 566             type, base_obj = get_offset(base_offset)
 567             assert isinstance(type, int)
 568         elif type == 7: # ref delta
 569             (basename, delta) = obj
 570             assert isinstance(basename, str) and len(basename) == 20
 571             type, base_obj = get_ref(basename)
 572             assert isinstance(type, int)
 573             # Can't be a ofs delta, as we wouldn't know the base offset
 574             assert type != 6
 575             base_offset = None
 576         type, base_chunks = self.resolve_object(base_offset, type, base_obj,
 577             get_ref)
 578         if base_offset is not None:
 579             self._offset_cache[base_offset] = type, base_chunks
 580         return (type, apply_delta(base_chunks, delta))
 581
 582     def iterobjects(self, progress=None):
 583
 584         class ObjectIterator(object):
 585
 586             def __init__(self, pack):
 587                 self.i = 0
 588                 self.offset = pack._header_size
 589                 self.num = len(pack)
 590                 self.map = pack._file
 591
 592             def __iter__(self):
 593                 return self
 594
 595             def __len__(self):
 596                 return self.num
 597
 598             def next(self):
 599                 if self.i == self.num:
 600                     raise StopIteration
 601                 self.map.seek(self.offset)
 602                 (type, obj, total_size, unused) = unpack_object(self.map.read)
 603                 self.map.seek(self.offset)
 604                 crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff
 605                 ret = (self.offset, type, obj, crc32)
 606                 self.offset += total_size
 607                 if progress:
 608                     progress(self.i, self.num)
 609                 self.i+=1
 610                 return ret
 611         return ObjectIterator(self)
 612
 613     def iterentries(self, ext_resolve_ref=None, progress=None):
 614         """Yield entries summarizing the contents of this pack.
 615
 616         :param ext_resolve_ref: Optional function to resolve base
 617             objects (in case this is a thin pack)
 618         :param progress: Progress function, called with current and
 619             total object count.
 620
 621         This will yield tuples with (sha, offset, crc32)
 622         """
 623         found = {}
 624         postponed = defaultdict(list)
 625         class Postpone(Exception):
 626             """Raised to postpone delta resolving."""
 627
 628         def get_ref_text(sha):
 629             assert len(sha) == 20
 630             if sha in found:
 631                 return self.get_object_at(found[sha])
 632             if ext_resolve_ref:
 633                 try:
 634                     return ext_resolve_ref(sha)
 635                 except KeyError:
 636                     pass
 637             raise Postpone, (sha, )
 638         extra = []
 639         todo = chain(self.iterobjects(progress=progress), extra)
 640         for (offset, type, obj, crc32) in todo:
 641             assert isinstance(offset, int)
 642             assert isinstance(type, int)
 643             try:
 644                 type, obj = self.resolve_object(offset, type, obj,
 645                     get_ref_text)
 646             except Postpone, (sha, ):
 647                 postponed[sha].append((offset, type, obj))
 648             else:
 649                 shafile = ShaFile.from_raw_chunks(type, obj)
 650                 sha = shafile.sha().digest()
 651                 found[sha] = offset
 652                 yield sha, offset, crc32
 653                 extra.extend(postponed.get(sha, []))
 654         if postponed:
 655             raise KeyError([sha_to_hex(h) for h in postponed.keys()])
 656
 657     def sorted_entries(self, resolve_ext_ref=None, progress=None):
 658         """Return entries in this pack, sorted by SHA.
 659
 660         :param ext_resolve_ref: Optional function to resolve base
 661             objects (in case this is a thin pack)
 662         :param progress: Progress function, called with current and
 663             total object count.
 664         :return: List of tuples with (sha, offset, crc32)
 665         """
 666         ret = list(self.iterentries(resolve_ext_ref, progress=progress))
 667         ret.sort()
 668         return ret
 669
 670     def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
 671         """Create a version 1 file for this data file.
 672
 673         :param filename: Index filename.
 674         :param resolve_ext_ref: Function to use for resolving externally
 675             referenced SHA1s (for thin packs)
 676         :param progress: Progress report function
 677         """
 678         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 679         write_pack_index_v1(filename, entries, self.calculate_checksum())
 680
 681     def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
 682         """Create a version 2 index file for this data file.
 683
 684         :param filename: Index filename.
 685         :param resolve_ext_ref: Function to use for resolving externally
 686             referenced SHA1s (for thin packs)
 687         :param progress: Progress report function
 688         """
 689         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 690         write_pack_index_v2(filename, entries, self.calculate_checksum())
 691
 692     def create_index(self, filename, resolve_ext_ref=None, progress=None,
 693                      version=2):
 694         """Create an  index file for this data file.
 695
 696         :param filename: Index filename.
 697         :param resolve_ext_ref: Function to use for resolving externally
 698             referenced SHA1s (for thin packs)
 699         :param progress: Progress report function
 700         """
 701         if version == 1:
 702             self.create_index_v1(filename, resolve_ext_ref, progress)
 703         elif version == 2:
 704             self.create_index_v2(filename, resolve_ext_ref, progress)
 705         else:
 706             raise ValueError("unknown index format %d" % version)
 707
 708     def get_stored_checksum(self):
 709         """Return the expected checksum stored in this pack."""
 710         self._file.seek(self._get_size()-20)
 711         return self._file.read(20)
 712
 713     def check(self):
 714         """Check the consistency of this pack."""
 715         return (self.calculate_checksum() == self.get_stored_checksum())
 716
 717     def get_object_at(self, offset):
 718         """Given an offset in to the packfile return the object that is there.
 719
 720         Using the associated index the location of an object can be looked up,
 721         and then the packfile can be asked directly for that object using this
 722         function.
 723         """
 724         if offset in self._offset_cache:
 725             return self._offset_cache[offset]
 726         assert isinstance(offset, long) or isinstance(offset, int),\
 727                 "offset was %r" % offset
 728         assert offset >= self._header_size
 729         self._file.seek(offset)
 730         return unpack_object(self._file.read)[:2]
 731
 732
 733 class SHA1Reader(object):
 734     """Wrapper around a file-like object that remembers the SHA1 of
 735     the data read from it."""
 736
 737     def __init__(self, f):
 738         self.f = f
 739         self.sha1 = make_sha("")
 740
 741     def read(self, num=None):
 742         data = self.f.read(num)
 743         self.sha1.update(data)
 744         return data
 745
 746     def check_sha(self):
 747         stored = self.f.read(20)
 748         if stored != self.sha1.digest():
 749             raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
 750
 751     def close(self):
 752         return self.f.close()
 753
 754     def tell(self):
 755         return self.f.tell()
 756
 757
 758 class SHA1Writer(object):
 759     """Wrapper around a file-like object that remembers the SHA1 of
 760     the data written to it."""
 761
 762     def __init__(self, f):
 763         self.f = f
 764         self.sha1 = make_sha("")
 765
 766     def write(self, data):
 767         self.sha1.update(data)
 768         self.f.write(data)
 769
 770     def write_sha(self):
 771         sha = self.sha1.digest()
 772         assert len(sha) == 20
 773         self.f.write(sha)
 774         return sha
 775
 776     def close(self):
 777         sha = self.write_sha()
 778         self.f.close()
 779         return sha
 780
 781     def tell(self):
 782         return self.f.tell()
 783
 784
 785 def write_pack_object(f, type, object):
 786     """Write pack object to a file.
 787
 788     :param f: File to write to
 789     :param o: Object to write
 790     :return: Tuple with offset at which the object was written, and crc32
 791     """
 792     offset = f.tell()
 793     packed_data_hdr = ""
 794     if type == 6: # offset delta
 795         (delta_base_offset, object) = object
 796     elif type == 7: # ref delta
 797         (basename, object) = object
 798     size = len(object)
 799     c = (type << 4) | (size & 15)
 800     size >>= 4
 801     while size:
 802         packed_data_hdr += (chr(c | 0x80))
 803         c = size & 0x7f
 804         size >>= 7
 805     packed_data_hdr += chr(c)
 806     if type == 6: # offset delta
 807         ret = [delta_base_offset & 0x7f]
 808         delta_base_offset >>= 7
 809         while delta_base_offset:
 810             delta_base_offset -= 1
 811             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
 812             delta_base_offset >>= 7
 813         packed_data_hdr += "".join([chr(x) for x in ret])
 814     elif type == 7: # ref delta
 815         assert len(basename) == 20
 816         packed_data_hdr += basename
 817     packed_data = packed_data_hdr + zlib.compress(object)
 818     f.write(packed_data)
 819     return (offset, (zlib.crc32(packed_data) & 0xffffffff))
 820
 821
 822 def write_pack(filename, objects, num_objects):
 823     """Write a new pack data file.
 824
 825     :param filename: Path to the new pack file (without .pack extension)
 826     :param objects: Iterable over (object, path) tuples to write
 827     :param num_objects: Number of objects to write
 828     """
 829     f = GitFile(filename + ".pack", 'wb')
 830     try:
 831         entries, data_sum = write_pack_data(f, objects, num_objects)
 832     finally:
 833         f.close()
 834     entries.sort()
 835     write_pack_index_v2(filename + ".idx", entries, data_sum)
 836
 837
 838 def write_pack_data(f, objects, num_objects, window=10):
 839     """Write a new pack file.
 840
 841     :param filename: The filename of the new pack file.
 842     :param objects: List of objects to write (tuples with object and path)
 843     :return: List with (name, offset, crc32 checksum) entries, pack checksum
 844     """
 845     recency = list(objects)
 846     # FIXME: Somehow limit delta depth
 847     # FIXME: Make thin-pack optional (its not used when cloning a pack)
 848     # Build a list of objects ordered by the magic Linus heuristic
 849     # This helps us find good objects to diff against us
 850     magic = []
 851     for obj, path in recency:
 852         magic.append( (obj.type_num, path, 1, -obj.raw_length(), obj) )
 853     magic.sort()
 854     # Build a map of objects and their index in magic - so we can find preceeding objects
 855     # to diff against
 856     offs = {}
 857     for i in range(len(magic)):
 858         offs[magic[i][4]] = i
 859     # Write the pack
 860     entries = []
 861     f = SHA1Writer(f)
 862     f.write("PACK")               # Pack header
 863     f.write(struct.pack(">L", 2)) # Pack version
 864     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
 865     for o, path in recency:
 866         sha1 = o.sha().digest()
 867         orig_t = o.type_num
 868         raw = o.as_raw_string()
 869         winner = raw
 870         t = orig_t
 871         #for i in range(offs[o]-window, window):
 872         #    if i < 0 or i >= len(offs): continue
 873         #    b = magic[i][4]
 874         #    if b.type_num != orig_t: continue
 875         #    base = b.as_raw_string()
 876         #    delta = create_delta(base, raw)
 877         #    if len(delta) < len(winner):
 878         #        winner = delta
 879         #        t = 6 if magic[i][2] == 1 else 7
 880         offset, crc32 = write_pack_object(f, t, winner)
 881         entries.append((sha1, offset, crc32))
 882     return entries, f.write_sha()
 883
 884
 885 def write_pack_index_v1(filename, entries, pack_checksum):
 886     """Write a new pack index file.
 887
 888     :param filename: The filename of the new pack index file.
 889     :param entries: List of tuples with object name (sha), offset_in_pack,
 890         and crc32_checksum.
 891     :param pack_checksum: Checksum of the pack file.
 892     """
 893     f = GitFile(filename, 'wb')
 894     f = SHA1Writer(f)
 895     fan_out_table = defaultdict(lambda: 0)
 896     for (name, offset, entry_checksum) in entries:
 897         fan_out_table[ord(name[0])] += 1
 898     # Fan-out table
 899     for i in range(0x100):
 900         f.write(struct.pack(">L", fan_out_table[i]))
 901         fan_out_table[i+1] += fan_out_table[i]
 902     for (name, offset, entry_checksum) in entries:
 903         f.write(struct.pack(">L20s", offset, name))
 904     assert len(pack_checksum) == 20
 905     f.write(pack_checksum)
 906     f.close()
 907
 908
 909 def create_delta(base_buf, target_buf):
 910     """Use python difflib to work out how to transform base_buf to target_buf.
 911
 912     :param base_buf: Base buffer
 913     :param target_buf: Target buffer
 914     """
 915     assert isinstance(base_buf, str)
 916     assert isinstance(target_buf, str)
 917     out_buf = ""
 918     # write delta header
 919     def encode_size(size):
 920         ret = ""
 921         c = size & 0x7f
 922         size >>= 7
 923         while size:
 924             ret += chr(c | 0x80)
 925             c = size & 0x7f
 926             size >>= 7
 927         ret += chr(c)
 928         return ret
 929     out_buf += encode_size(len(base_buf))
 930     out_buf += encode_size(len(target_buf))
 931     # write out delta opcodes
 932     seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
 933     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
 934         # Git patch opcodes don't care about deletes!
 935         #if opcode == "replace" or opcode == "delete":
 936         #    pass
 937         if opcode == "equal":
 938             # If they are equal, unpacker will use data from base_buf
 939             # Write out an opcode that says what range to use
 940             scratch = ""
 941             op = 0x80
 942             o = i1
 943             for i in range(4):
 944                 if o & 0xff << i*8:
 945                     scratch += chr((o >> i*8) & 0xff)
 946                     op |= 1 << i
 947             s = i2 - i1
 948             for i in range(2):
 949                 if s & 0xff << i*8:
 950                     scratch += chr((s >> i*8) & 0xff)
 951                     op |= 1 << (4+i)
 952             out_buf += chr(op)
 953             out_buf += scratch
 954         if opcode == "replace" or opcode == "insert":
 955             # If we are replacing a range or adding one, then we just
 956             # output it to the stream (prefixed by its size)
 957             s = j2 - j1
 958             o = j1
 959             while s > 127:
 960                 out_buf += chr(127)
 961                 out_buf += target_buf[o:o+127]
 962                 s -= 127
 963                 o += 127
 964             out_buf += chr(s)
 965             out_buf += target_buf[o:o+s]
 966     return out_buf
 967
 968
 969 def apply_delta(src_buf, delta):
 970     """Based on the similar function in git's patch-delta.c.
 971
 972     :param src_buf: Source buffer
 973     :param delta: Delta instructions
 974     """
 975     if type(src_buf) != str:
 976         src_buf = "".join(src_buf)
 977     if type(delta) != str:
 978         delta = "".join(delta)
 979     out = []
 980     index = 0
 981     delta_length = len(delta)
 982     def get_delta_header_size(delta, index):
 983         size = 0
 984         i = 0
 985         while delta:
 986             cmd = ord(delta[index])
 987             index += 1
 988             size |= (cmd & ~0x80) << i
 989             i += 7
 990             if not cmd & 0x80:
 991                 break
 992         return size, index
 993     src_size, index = get_delta_header_size(delta, index)
 994     dest_size, index = get_delta_header_size(delta, index)
 995     assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
 996     while index < delta_length:
 997         cmd = ord(delta[index])
 998         index += 1
 999         if cmd & 0x80:
1000             cp_off = 0
1001             for i in range(4):
1002                 if cmd & (1 << i):
1003                     x = ord(delta[index])
1004                     index += 1
1005                     cp_off |= x << (i * 8)
1006             cp_size = 0
1007             for i in range(3):
1008                 if cmd & (1 << (4+i)):
1009                     x = ord(delta[index])
1010                     index += 1
1011                     cp_size |= x << (i * 8)
1012             if cp_size == 0:
1013                 cp_size = 0x10000
1014             if (cp_off + cp_size < cp_size or
1015                 cp_off + cp_size > src_size or
1016                 cp_size > dest_size):
1017                 break
1018             out.append(src_buf[cp_off:cp_off+cp_size])
1019         elif cmd != 0:
1020             out.append(delta[index:index+cmd])
1021             index += cmd
1022         else:
1023             raise ApplyDeltaError("Invalid opcode 0")
1024
1025     if index != delta_length:
1026         raise ApplyDeltaError("delta not empty: %r" % delta[index:])
1027
1028     if dest_size != chunks_length(out):
1029         raise ApplyDeltaError("dest size incorrect")
1030
1031     return out
1032
1033
1034 def write_pack_index_v2(filename, entries, pack_checksum):
1035     """Write a new pack index file.
1036
1037     :param filename: The filename of the new pack index file.
1038     :param entries: List of tuples with object name (sha), offset_in_pack, and
1039         crc32_checksum.
1040     :param pack_checksum: Checksum of the pack file.
1041     """
1042     f = GitFile(filename, 'wb')
1043     f = SHA1Writer(f)
1044     f.write('\377tOc') # Magic!
1045     f.write(struct.pack(">L", 2))
1046     fan_out_table = defaultdict(lambda: 0)
1047     for (name, offset, entry_checksum) in entries:
1048         fan_out_table[ord(name[0])] += 1
1049     # Fan-out table
1050     for i in range(0x100):
1051         f.write(struct.pack(">L", fan_out_table[i]))
1052         fan_out_table[i+1] += fan_out_table[i]
1053     for (name, offset, entry_checksum) in entries:
1054         f.write(name)
1055     for (name, offset, entry_checksum) in entries:
1056         f.write(struct.pack(">L", entry_checksum))
1057     for (name, offset, entry_checksum) in entries:
1058         # FIXME: handle if MSBit is set in offset
1059         f.write(struct.pack(">L", offset))
1060     # FIXME: handle table for pack files > 8 Gb
1061     assert len(pack_checksum) == 20
1062     f.write(pack_checksum)
1063     f.close()
1064
1065
1066 class Pack(object):
1067     """A Git pack object."""
1068
1069     def __init__(self, basename):
1070         self._basename = basename
1071         self._data_path = self._basename + ".pack"
1072         self._idx_path = self._basename + ".idx"
1073         self._data = None
1074         self._idx = None
1075
1076     @classmethod
1077     def from_objects(self, data, idx):
1078         """Create a new pack object from pack data and index objects."""
1079         ret = Pack("")
1080         ret._data = data
1081         ret._idx = idx
1082         return ret
1083
1084     def name(self):
1085         """The SHA over the SHAs of the objects in this pack."""
1086         return self.index.objects_sha1()
1087
1088     @property
1089     def data(self):
1090         """The pack data object being used."""
1091         if self._data is None:
1092             self._data = PackData(self._data_path)
1093             assert len(self.index) == len(self._data)
1094             idx_stored_checksum = self.index.get_pack_checksum()
1095             data_stored_checksum = self._data.get_stored_checksum()
1096             if idx_stored_checksum != data_stored_checksum:
1097                 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
1098                                        sha_to_hex(data_stored_checksum))
1099         return self._data
1100
1101     @property
1102     def index(self):
1103         """The index being used.
1104
1105         :note: This may be an in-memory index
1106         """
1107         if self._idx is None:
1108             self._idx = load_pack_index(self._idx_path)
1109         return self._idx
1110
1111     def close(self):
1112         if self._data is not None:
1113             self._data.close()
1114         self.index.close()
1115
1116     def __eq__(self, other):
1117         return type(self) == type(other) and self.index == other.index
1118
1119     def __len__(self):
1120         """Number of entries in this pack."""
1121         return len(self.index)
1122
1123     def __repr__(self):
1124         return "%s(%r)" % (self.__class__.__name__, self._basename)
1125
1126     def __iter__(self):
1127         """Iterate over all the sha1s of the objects in this pack."""
1128         return iter(self.index)
1129
1130     def check(self):
1131         """Check the integrity of this pack."""
1132         if not self.index.check():
1133             return False
1134         if not self.data.check():
1135             return False
1136         return True
1137
1138     def get_stored_checksum(self):
1139         return self.data.get_stored_checksum()
1140
1141     def __contains__(self, sha1):
1142         """Check whether this pack contains a particular SHA1."""
1143         try:
1144             self.index.object_index(sha1)
1145             return True
1146         except KeyError:
1147             return False
1148
1149     def get_raw(self, sha1, resolve_ref=None):
1150         offset = self.index.object_index(sha1)
1151         obj_type, obj = self.data.get_object_at(offset)
1152         if type(offset) is long:
1153           offset = int(offset)
1154         if resolve_ref is None:
1155             resolve_ref = self.get_raw
1156         kind, chunks = self.data.resolve_object(offset, obj_type, obj,
1157             resolve_ref)
1158         return kind, "".join(chunks)
1159
1160     def __getitem__(self, sha1):
1161         """Retrieve the specified SHA1."""
1162         type, uncomp = self.get_raw(sha1)
1163         return ShaFile.from_raw_string(type, uncomp)
1164
1165     def iterobjects(self, get_raw=None):
1166         """Iterate over the objects in this pack."""
1167         if get_raw is None:
1168             get_raw = self.get_raw
1169         for offset, type, obj, crc32 in self.data.iterobjects():
1170             assert isinstance(offset, int)
1171             type, obj = self.data.resolve_object(offset, type, obj, get_raw)
1172             yield ShaFile.from_raw_chunks(type, obj)
1173
1174
1175 try:
1176     from dulwich._pack import apply_delta, bisect_find_sha
1177 except ImportError:
1178     pass