dulwich/pack.py

   1 # pack.py -- For dealing wih packed git objects.
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copryight (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20 """Classes for dealing with packed git objects.
  21
  22 A pack is a compact representation of a bunch of objects, stored
  23 using deltas where possible.
  24
  25 They have two parts, the pack file, which stores the data, and an index
  26 that tells you where the data is.
  27
  28 To find an object you look in all of the index files 'til you find a
  29 match for the object name. You then use the pointer got from this as
  30 a pointer in to the corresponding packfile.
  31 """
  32
  33 try:
  34     from collections import defaultdict
  35 except ImportError:
  36     from misc import defaultdict
  37
  38 import difflib
  39 from itertools import (
  40     chain,
  41     imap,
  42     izip,
  43     )
  44 import mmap
  45 import os
  46 import struct
  47 try:
  48     from struct import unpack_from
  49 except ImportError:
  50     from dulwich.misc import unpack_from
  51 import sys
  52 import zlib
  53
  54 from dulwich.errors import (
  55     ApplyDeltaError,
  56     ChecksumMismatch,
  57     )
  58 from dulwich.file import GitFile
  59 from dulwich.lru_cache import (
  60     LRUSizeCache,
  61     )
  62 from dulwich.objects import (
  63     ShaFile,
  64     hex_to_sha,
  65     sha_to_hex,
  66     )
  67 from dulwich.misc import (
  68     make_sha,
  69     )
  70
  71 supports_mmap_offset = (sys.version_info[0] >= 3 or
  72         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
  73
  74
  75 def take_msb_bytes(read):
  76     """Read bytes marked with most significant bit.
  77
  78     :param read: Read function
  79     """
  80     ret = []
  81     while len(ret) == 0 or ret[-1] & 0x80:
  82         ret.append(ord(read(1)))
  83     return ret
  84
  85
  86 def read_zlib_chunks(read, buffer_size=4096):
  87     """Read chunks of zlib data from a buffer.
  88
  89     :param read: Read function
  90     :return: Tuple with list of chunks, length of
  91         compressed data length and unused read data
  92     """
  93     obj = zlib.decompressobj()
  94     ret = []
  95     fed = 0
  96     while obj.unused_data == "":
  97         add = read(buffer_size)
  98         if len(add) < buffer_size:
  99             add += "Z"
 100         fed += len(add)
 101         ret.append(obj.decompress(add))
 102     comp_len = fed-len(obj.unused_data)
 103     return ret, comp_len, obj.unused_data
 104
 105
 106 def read_zlib(read, dec_size):
 107     """Read zlib-compressed data from a buffer.
 108
 109     :param read: Read function
 110     :param dec_size: Size of the decompressed buffer
 111     :return: Uncompressed buffer, compressed buffer length and unused read
 112         data.
 113     """
 114     ret, comp_len, unused = read_zlib_chunks(read)
 115     x = "".join(ret)
 116     assert len(x) == dec_size
 117     return x, comp_len, unused
 118
 119
 120
 121 def iter_sha1(iter):
 122     """Return the hexdigest of the SHA1 over a set of names.
 123
 124     :param iter: Iterator over string objects
 125     :return: 40-byte hex sha1 digest
 126     """
 127     sha1 = make_sha()
 128     for name in iter:
 129         sha1.update(name)
 130     return sha1.hexdigest()
 131
 132
 133 def load_pack_index(path):
 134     """Load an index file by path.
 135
 136     :param filename: Path to the index file
 137     """
 138     f = GitFile(path, 'rb')
 139     return load_pack_index_file(path, f)
 140
 141
 142 def load_pack_index_file(path, f):
 143     """Load an index file from a file-like object.
 144
 145     :param path: Path for the index file
 146     :param f: File-like object
 147     """
 148     if f.read(4) == '\377tOc':
 149         version = struct.unpack(">L", f.read(4))[0]
 150         if version == 2:
 151             f.seek(0)
 152             return PackIndex2(path, file=f)
 153         else:
 154             raise KeyError("Unknown pack index format %d" % version)
 155     else:
 156         f.seek(0)
 157         return PackIndex1(path, file=f)
 158
 159
 160 def bisect_find_sha(start, end, sha, unpack_name):
 161     """Find a SHA in a data blob with sorted SHAs.
 162
 163     :param start: Start index of range to search
 164     :param end: End index of range to search
 165     :param sha: Sha to find
 166     :param unpack_name: Callback to retrieve SHA by index
 167     :return: Index of the SHA, or None if it wasn't found
 168     """
 169     assert start <= end
 170     while start <= end:
 171         i = (start + end)/2
 172         file_sha = unpack_name(i)
 173         x = cmp(file_sha, sha)
 174         if x < 0:
 175             start = i + 1
 176         elif x > 0:
 177             end = i - 1
 178         else:
 179             return i
 180     return None
 181
 182
 183 class PackIndex(object):
 184     """An index in to a packfile.
 185
 186     Given a sha id of an object a pack index can tell you the location in the
 187     packfile of that object if it has it.
 188
 189     To do the loop it opens the file, and indexes first 256 4 byte groups
 190     with the first byte of the sha id. The value in the four byte group indexed
 191     is the end of the group that shares the same starting byte. Subtract one
 192     from the starting byte and index again to find the start of the group.
 193     The values are sorted by sha id within the group, so do the math to find
 194     the start and end offset and then bisect in to find if the value is present.
 195     """
 196
 197     def __init__(self, filename, file=None, size=None):
 198         """Create a pack index object.
 199
 200         Provide it with the name of the index file to consider, and it will map
 201         it whenever required.
 202         """
 203         self._filename = filename
 204         # Take the size now, so it can be checked each time we map the file to
 205         # ensure that it hasn't changed.
 206         if file is None:
 207             self._file = GitFile(filename, 'rb')
 208         else:
 209             self._file = file
 210         fileno = getattr(self._file, 'fileno', None)
 211         if fileno is not None:
 212             fd = self._file.fileno()
 213             if size is None:
 214                 self._size = os.fstat(fd).st_size
 215             else:
 216                 self._size = size
 217             self._contents = mmap.mmap(fd, self._size,
 218                 access=mmap.ACCESS_READ)
 219         else:
 220             self._file.seek(0)
 221             self._contents = self._file.read()
 222             self._size = len(self._contents)
 223
 224     def __eq__(self, other):
 225         if not isinstance(other, PackIndex):
 226             return False
 227
 228         if self._fan_out_table != other._fan_out_table:
 229             return False
 230
 231         for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
 232             if name1 != name2:
 233                 return False
 234         return True
 235
 236     def __ne__(self, other):
 237         return not self.__eq__(other)
 238
 239     def close(self):
 240         self._file.close()
 241
 242     def __len__(self):
 243         """Return the number of entries in this pack index."""
 244         return self._fan_out_table[-1]
 245
 246     def _unpack_entry(self, i):
 247         """Unpack the i-th entry in the index file.
 248
 249         :return: Tuple with object name (SHA), offset in pack file and
 250               CRC32 checksum (if known)."""
 251         raise NotImplementedError(self._unpack_entry)
 252
 253     def _unpack_name(self, i):
 254         """Unpack the i-th name from the index file."""
 255         raise NotImplementedError(self._unpack_name)
 256
 257     def _unpack_offset(self, i):
 258         """Unpack the i-th object offset from the index file."""
 259         raise NotImplementedError(self._unpack_offset)
 260
 261     def _unpack_crc32_checksum(self, i):
 262         """Unpack the crc32 checksum for the i-th object from the index file."""
 263         raise NotImplementedError(self._unpack_crc32_checksum)
 264
 265     def __iter__(self):
 266         """Iterate over the SHAs in this pack."""
 267         return imap(sha_to_hex, self._itersha())
 268
 269     def _itersha(self):
 270         for i in range(len(self)):
 271             yield self._unpack_name(i)
 272
 273     def objects_sha1(self):
 274         """Return the hex SHA1 over all the shas of all objects in this pack.
 275
 276         :note: This is used for the filename of the pack.
 277         """
 278         return iter_sha1(self._itersha())
 279
 280     def iterentries(self):
 281         """Iterate over the entries in this pack index.
 282
 283         Will yield tuples with object name, offset in packfile and crc32 checksum.
 284         """
 285         for i in range(len(self)):
 286             yield self._unpack_entry(i)
 287
 288     def _read_fan_out_table(self, start_offset):
 289         ret = []
 290         for i in range(0x100):
 291             ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
 292         return ret
 293
 294     def check(self):
 295         """Check that the stored checksum matches the actual checksum."""
 296         # TODO: Check pack contents, too
 297         return self.calculate_checksum() == self.get_stored_checksum()
 298
 299     def calculate_checksum(self):
 300         """Calculate the SHA1 checksum over this pack index.
 301
 302         :return: This is a 20-byte binary digest
 303         """
 304         return make_sha(self._contents[:-20]).digest()
 305
 306     def get_pack_checksum(self):
 307         """Return the SHA1 checksum stored for the corresponding packfile.
 308
 309         :return: 20-byte binary digest
 310         """
 311         return str(self._contents[-40:-20])
 312
 313     def get_stored_checksum(self):
 314         """Return the SHA1 checksum stored for this index.
 315
 316         :return: 20-byte binary digest
 317         """
 318         return str(self._contents[-20:])
 319
 320     def object_index(self, sha):
 321         """Return the index in to the corresponding packfile for the object.
 322
 323         Given the name of an object it will return the offset that object lives
 324         at within the corresponding pack file. If the pack file doesn't have the
 325         object then None will be returned.
 326         """
 327         if len(sha) == 40:
 328             sha = hex_to_sha(sha)
 329         return self._object_index(sha)
 330
 331     def _object_index(self, sha):
 332         """See object_index.
 333
 334         :param sha: A *binary* SHA string. (20 characters long)_
 335         """
 336         assert len(sha) == 20
 337         idx = ord(sha[0])
 338         if idx == 0:
 339             start = 0
 340         else:
 341             start = self._fan_out_table[idx-1]
 342         end = self._fan_out_table[idx]
 343         i = bisect_find_sha(start, end, sha, self._unpack_name)
 344         if i is None:
 345             raise KeyError(sha)
 346         return self._unpack_offset(i)
 347
 348
 349
 350 class PackIndex1(PackIndex):
 351     """Version 1 Pack Index."""
 352
 353     def __init__(self, filename, file=None, size=None):
 354         PackIndex.__init__(self, filename, file, size)
 355         self.version = 1
 356         self._fan_out_table = self._read_fan_out_table(0)
 357
 358     def _unpack_entry(self, i):
 359         (offset, name) = unpack_from(">L20s", self._contents,
 360             (0x100 * 4) + (i * 24))
 361         return (name, offset, None)
 362
 363     def _unpack_name(self, i):
 364         offset = (0x100 * 4) + (i * 24) + 4
 365         return self._contents[offset:offset+20]
 366
 367     def _unpack_offset(self, i):
 368         offset = (0x100 * 4) + (i * 24)
 369         return unpack_from(">L", self._contents, offset)[0]
 370
 371     def _unpack_crc32_checksum(self, i):
 372         # Not stored in v1 index files
 373         return None
 374
 375
 376 class PackIndex2(PackIndex):
 377     """Version 2 Pack Index."""
 378
 379     def __init__(self, filename, file=None, size=None):
 380         PackIndex.__init__(self, filename, file, size)
 381         assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
 382         (self.version, ) = unpack_from(">L", self._contents, 4)
 383         assert self.version == 2, "Version was %d" % self.version
 384         self._fan_out_table = self._read_fan_out_table(8)
 385         self._name_table_offset = 8 + 0x100 * 4
 386         self._crc32_table_offset = self._name_table_offset + 20 * len(self)
 387         self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
 388
 389     def _unpack_entry(self, i):
 390         return (self._unpack_name(i), self._unpack_offset(i),
 391                 self._unpack_crc32_checksum(i))
 392
 393     def _unpack_name(self, i):
 394         offset = self._name_table_offset + i * 20
 395         return self._contents[offset:offset+20]
 396
 397     def _unpack_offset(self, i):
 398         offset = self._pack_offset_table_offset + i * 4
 399         return unpack_from(">L", self._contents, offset)[0]
 400
 401     def _unpack_crc32_checksum(self, i):
 402         return unpack_from(">L", self._contents,
 403                           self._crc32_table_offset + i * 4)[0]
 404
 405
 406
 407 def read_pack_header(f):
 408     """Read the header of a pack file.
 409
 410     :param f: File-like object to read from
 411     """
 412     header = f.read(12)
 413     assert header[:4] == "PACK"
 414     (version,) = unpack_from(">L", header, 4)
 415     assert version in (2, 3), "Version was %d" % version
 416     (num_objects,) = unpack_from(">L", header, 8)
 417     return (version, num_objects)
 418
 419
 420 def unpack_object(read):
 421     """Unpack a Git object.
 422
 423     :return: tuple with type, uncompressed data, compressed size and
 424         tail data
 425     """
 426     bytes = take_msb_bytes(read)
 427     type = (bytes[0] >> 4) & 0x07
 428     size = bytes[0] & 0x0f
 429     for i, byte in enumerate(bytes[1:]):
 430         size += (byte & 0x7f) << ((i * 7) + 4)
 431     raw_base = len(bytes)
 432     if type == 6: # offset delta
 433         bytes = take_msb_bytes(read)
 434         raw_base += len(bytes)
 435         assert not (bytes[-1] & 0x80)
 436         delta_base_offset = bytes[0] & 0x7f
 437         for byte in bytes[1:]:
 438             delta_base_offset += 1
 439             delta_base_offset <<= 7
 440             delta_base_offset += (byte & 0x7f)
 441         uncomp, comp_len, unused = read_zlib(read, size)
 442         assert size == len(uncomp)
 443         return type, (delta_base_offset, uncomp), comp_len+raw_base, unused
 444     elif type == 7: # ref delta
 445         basename = read(20)
 446         raw_base += 20
 447         uncomp, comp_len, unused = read_zlib(read, size)
 448         assert size == len(uncomp)
 449         return type, (basename, uncomp), comp_len+raw_base, unused
 450     else:
 451         uncomp, comp_len, unused = read_zlib(read, size)
 452         assert len(uncomp) == size
 453         return type, uncomp, comp_len+raw_base, unused
 454
 455
 456 def _compute_object_size((num, obj)):
 457     """Compute the size of a unresolved object for use with LRUSizeCache.
 458     """
 459     if num in (6, 7):
 460         return len(obj[1])
 461     assert isinstance(obj, str)
 462     return len(obj)
 463
 464
 465 class PackData(object):
 466     """The data contained in a packfile.
 467
 468     Pack files can be accessed both sequentially for exploding a pack, and
 469     directly with the help of an index to retrieve a specific object.
 470
 471     The objects within are either complete or a delta aginst another.
 472
 473     The header is variable length. If the MSB of each byte is set then it
 474     indicates that the subsequent byte is still part of the header.
 475     For the first byte the next MS bits are the type, which tells you the type
 476     of object, and whether it is a delta. The LS byte is the lowest bits of the
 477     size. For each subsequent byte the LS 7 bits are the next MS bits of the
 478     size, i.e. the last byte of the header contains the MS bits of the size.
 479
 480     For the complete objects the data is stored as zlib deflated data.
 481     The size in the header is the uncompressed object size, so to uncompress
 482     you need to just keep feeding data to zlib until you get an object back,
 483     or it errors on bad data. This is done here by just giving the complete
 484     buffer from the start of the deflated object on. This is bad, but until I
 485     get mmap sorted out it will have to do.
 486
 487     Currently there are no integrity checks done. Also no attempt is made to try
 488     and detect the delta case, or a request for an object at the wrong position.
 489     It will all just throw a zlib or KeyError.
 490     """
 491
 492     def __init__(self, filename, file=None, size=None):
 493         """Create a PackData object that represents the pack in the given filename.
 494
 495         The file must exist and stay readable until the object is disposed of. It
 496         must also stay the same size. It will be mapped whenever needed.
 497
 498         Currently there is a restriction on the size of the pack as the python
 499         mmap implementation is flawed.
 500         """
 501         self._filename = filename
 502         self._size = size
 503         self._header_size = 12
 504         if file is None:
 505             self._file = GitFile(self._filename, 'rb')
 506         else:
 507             self._file = file
 508         (version, self._num_objects) = read_pack_header(self._file)
 509         self._offset_cache = LRUSizeCache(1024*1024*20,
 510             compute_size=_compute_object_size)
 511
 512     @classmethod
 513     def from_file(cls, file, size):
 514         return cls(str(file), file=file, size=size)
 515
 516     @classmethod
 517     def from_path(cls, path):
 518         return cls(filename=path)
 519
 520     def close(self):
 521         self._file.close()
 522
 523     def _get_size(self):
 524         if self._size is not None:
 525             return self._size
 526         self._size = os.path.getsize(self._filename)
 527         assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (self._filename, self._size, self._header_size)
 528         return self._size
 529
 530     def __len__(self):
 531         """Returns the number of objects in this pack."""
 532         return self._num_objects
 533
 534     def calculate_checksum(self):
 535         """Calculate the checksum for this pack.
 536
 537         :return: 20-byte binary SHA1 digest
 538         """
 539         s = make_sha()
 540         self._file.seek(0)
 541         todo = self._get_size() - 20
 542         while todo > 0:
 543             x = self._file.read(min(todo, 1<<16))
 544             s.update(x)
 545             todo -= len(x)
 546         return s.digest()
 547
 548     def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
 549         """Resolve an object, possibly resolving deltas when necessary.
 550
 551         :return: Tuple with object type and contents.
 552         """
 553         if type not in (6, 7): # Not a delta
 554             return type, obj
 555
 556         if get_offset is None:
 557             get_offset = self.get_object_at
 558
 559         if type == 6: # offset delta
 560             (delta_offset, delta) = obj
 561             assert isinstance(delta_offset, int)
 562             assert isinstance(delta, str)
 563             base_offset = offset-delta_offset
 564             type, base_obj = get_offset(base_offset)
 565             assert isinstance(type, int)
 566         elif type == 7: # ref delta
 567             (basename, delta) = obj
 568             assert isinstance(basename, str) and len(basename) == 20
 569             assert isinstance(delta, str)
 570             type, base_obj = get_ref(basename)
 571             assert isinstance(type, int)
 572             # Can't be a ofs delta, as we wouldn't know the base offset
 573             assert type != 6
 574             base_offset = None
 575         type, base_text = self.resolve_object(base_offset, type, base_obj, get_ref)
 576         if base_offset is not None:
 577             self._offset_cache[base_offset] = type, base_text
 578         ret = (type, apply_delta(base_text, delta))
 579         return ret
 580
 581     def iterobjects(self, progress=None):
 582
 583         class ObjectIterator(object):
 584
 585             def __init__(self, pack):
 586                 self.i = 0
 587                 self.offset = pack._header_size
 588                 self.num = len(pack)
 589                 self.map = pack._file
 590
 591             def __iter__(self):
 592                 return self
 593
 594             def __len__(self):
 595                 return self.num
 596
 597             def next(self):
 598                 if self.i == self.num:
 599                     raise StopIteration
 600                 self.map.seek(self.offset)
 601                 (type, obj, total_size, unused) = unpack_object(self.map.read)
 602                 self.map.seek(self.offset)
 603                 crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff
 604                 ret = (self.offset, type, obj, crc32)
 605                 self.offset += total_size
 606                 if progress:
 607                     progress(self.i, self.num)
 608                 self.i+=1
 609                 return ret
 610         return ObjectIterator(self)
 611
 612     def iterentries(self, ext_resolve_ref=None, progress=None):
 613         """Yield entries summarizing the contents of this pack.
 614
 615         :param ext_resolve_ref: Optional function to resolve base
 616             objects (in case this is a thin pack)
 617         :param progress: Progress function, called with current and
 618             total object count.
 619
 620         This will yield tuples with (sha, offset, crc32)
 621         """
 622         found = {}
 623         postponed = defaultdict(list)
 624         class Postpone(Exception):
 625             """Raised to postpone delta resolving."""
 626
 627         def get_ref_text(sha):
 628             assert len(sha) == 20
 629             if sha in found:
 630                 return self.get_object_at(found[sha])
 631             if ext_resolve_ref:
 632                 try:
 633                     return ext_resolve_ref(sha)
 634                 except KeyError:
 635                     pass
 636             raise Postpone, (sha, )
 637         extra = []
 638         todo = chain(self.iterobjects(progress=progress), extra)
 639         for (offset, type, obj, crc32) in todo:
 640             assert isinstance(offset, int)
 641             assert isinstance(type, int)
 642             assert isinstance(obj, tuple) or isinstance(obj, str)
 643             try:
 644                 type, obj = self.resolve_object(offset, type, obj, get_ref_text)
 645             except Postpone, (sha, ):
 646                 postponed[sha].append((offset, type, obj))
 647             else:
 648                 shafile = ShaFile.from_raw_string(type, obj)
 649                 sha = shafile.sha().digest()
 650                 found[sha] = offset
 651                 yield sha, offset, crc32
 652                 extra.extend(postponed.get(sha, []))
 653         if postponed:
 654             raise KeyError([sha_to_hex(h) for h in postponed.keys()])
 655
 656     def sorted_entries(self, resolve_ext_ref=None, progress=None):
 657         """Return entries in this pack, sorted by SHA.
 658
 659         :param ext_resolve_ref: Optional function to resolve base
 660             objects (in case this is a thin pack)
 661         :param progress: Progress function, called with current and
 662             total object count.
 663         :return: List of tuples with (sha, offset, crc32)
 664         """
 665         ret = list(self.iterentries(resolve_ext_ref, progress=progress))
 666         ret.sort()
 667         return ret
 668
 669     def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
 670         """Create a version 1 file for this data file.
 671
 672         :param filename: Index filename.
 673         :param resolve_ext_ref: Function to use for resolving externally referenced
 674             SHA1s (for thin packs)
 675         :param progress: Progress report function
 676         """
 677         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 678         write_pack_index_v1(filename, entries, self.calculate_checksum())
 679
 680     def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
 681         """Create a version 2 index file for this data file.
 682
 683         :param filename: Index filename.
 684         :param resolve_ext_ref: Function to use for resolving externally referenced
 685             SHA1s (for thin packs)
 686         :param progress: Progress report function
 687         """
 688         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 689         write_pack_index_v2(filename, entries, self.calculate_checksum())
 690
 691     def create_index(self, filename, resolve_ext_ref=None, progress=None,
 692                      version=2):
 693         """Create an  index file for this data file.
 694
 695         :param filename: Index filename.
 696         :param resolve_ext_ref: Function to use for resolving externally referenced
 697             SHA1s (for thin packs)
 698         :param progress: Progress report function
 699         """
 700         if version == 1:
 701             self.create_index_v1(filename, resolve_ext_ref, progress)
 702         elif version == 2:
 703             self.create_index_v2(filename, resolve_ext_ref, progress)
 704         else:
 705             raise ValueError("unknown index format %d" % version)
 706
 707     def get_stored_checksum(self):
 708         """Return the expected checksum stored in this pack."""
 709         self._file.seek(self._get_size()-20)
 710         return self._file.read(20)
 711
 712     def check(self):
 713         """Check the consistency of this pack."""
 714         return (self.calculate_checksum() == self.get_stored_checksum())
 715
 716     def get_object_at(self, offset):
 717         """Given an offset in to the packfile return the object that is there.
 718
 719         Using the associated index the location of an object can be looked up, and
 720         then the packfile can be asked directly for that object using this
 721         function.
 722         """
 723         if offset in self._offset_cache:
 724             return self._offset_cache[offset]
 725         assert isinstance(offset, long) or isinstance(offset, int),\
 726                 "offset was %r" % offset
 727         assert offset >= self._header_size
 728         self._file.seek(offset)
 729         return unpack_object(self._file.read)[:2]
 730
 731
 732 class SHA1Reader(object):
 733     """Wrapper around a file-like object that remembers the SHA1 of
 734     the data read from it."""
 735
 736     def __init__(self, f):
 737         self.f = f
 738         self.sha1 = make_sha("")
 739
 740     def read(self, num=None):
 741         data = self.f.read(num)
 742         self.sha1.update(data)
 743         return data
 744
 745     def check_sha(self):
 746         stored = self.f.read(20)
 747         if stored != self.sha1.digest():
 748             raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
 749
 750     def close(self):
 751         return self.f.close()
 752
 753     def tell(self):
 754         return self.f.tell()
 755
 756
 757 class SHA1Writer(object):
 758     """Wrapper around a file-like object that remembers the SHA1 of
 759     the data written to it."""
 760
 761     def __init__(self, f):
 762         self.f = f
 763         self.sha1 = make_sha("")
 764
 765     def write(self, data):
 766         self.sha1.update(data)
 767         self.f.write(data)
 768
 769     def write_sha(self):
 770         sha = self.sha1.digest()
 771         assert len(sha) == 20
 772         self.f.write(sha)
 773         return sha
 774
 775     def close(self):
 776         sha = self.write_sha()
 777         self.f.close()
 778         return sha
 779
 780     def tell(self):
 781         return self.f.tell()
 782
 783
 784 def write_pack_object(f, type, object):
 785     """Write pack object to a file.
 786
 787     :param f: File to write to
 788     :param o: Object to write
 789     :return: Tuple with offset at which the object was written, and crc32
 790     """
 791     offset = f.tell()
 792     packed_data_hdr = ""
 793     if type == 6: # offset delta
 794         (delta_base_offset, object) = object
 795     elif type == 7: # ref delta
 796         (basename, object) = object
 797     size = len(object)
 798     c = (type << 4) | (size & 15)
 799     size >>= 4
 800     while size:
 801         packed_data_hdr += (chr(c | 0x80))
 802         c = size & 0x7f
 803         size >>= 7
 804     packed_data_hdr += chr(c)
 805     if type == 6: # offset delta
 806         ret = [delta_base_offset & 0x7f]
 807         delta_base_offset >>= 7
 808         while delta_base_offset:
 809             delta_base_offset -= 1
 810             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
 811             delta_base_offset >>= 7
 812         packed_data_hdr += "".join([chr(x) for x in ret])
 813     elif type == 7: # ref delta
 814         assert len(basename) == 20
 815         packed_data_hdr += basename
 816     packed_data = packed_data_hdr + zlib.compress(object)
 817     f.write(packed_data)
 818     return (offset, (zlib.crc32(packed_data) & 0xffffffff))
 819
 820
 821 def write_pack(filename, objects, num_objects):
 822     """Write a new pack data file.
 823
 824     :param filename: Path to the new pack file (without .pack extension)
 825     :param objects: Iterable over (object, path) tuples to write
 826     :param num_objects: Number of objects to write
 827     """
 828     f = GitFile(filename + ".pack", 'wb')
 829     try:
 830         entries, data_sum = write_pack_data(f, objects, num_objects)
 831     finally:
 832         f.close()
 833     entries.sort()
 834     write_pack_index_v2(filename + ".idx", entries, data_sum)
 835
 836
 837 def write_pack_data(f, objects, num_objects, window=10):
 838     """Write a new pack file.
 839
 840     :param filename: The filename of the new pack file.
 841     :param objects: List of objects to write (tuples with object and path)
 842     :return: List with (name, offset, crc32 checksum) entries, pack checksum
 843     """
 844     recency = list(objects)
 845     # FIXME: Somehow limit delta depth
 846     # FIXME: Make thin-pack optional (its not used when cloning a pack)
 847     # Build a list of objects ordered by the magic Linus heuristic
 848     # This helps us find good objects to diff against us
 849     magic = []
 850     for obj, path in recency:
 851         magic.append( (obj.type, path, 1, -len(obj.as_raw_string()), obj) )
 852     magic.sort()
 853     # Build a map of objects and their index in magic - so we can find preceeding objects
 854     # to diff against
 855     offs = {}
 856     for i in range(len(magic)):
 857         offs[magic[i][4]] = i
 858     # Write the pack
 859     entries = []
 860     f = SHA1Writer(f)
 861     f.write("PACK")               # Pack header
 862     f.write(struct.pack(">L", 2)) # Pack version
 863     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
 864     for o, path in recency:
 865         sha1 = o.sha().digest()
 866         orig_t = o.type
 867         raw = o.as_raw_string()
 868         winner = raw
 869         t = orig_t
 870         #for i in range(offs[o]-window, window):
 871         #    if i < 0 or i >= len(offs): continue
 872         #    b = magic[i][4]
 873         #    if b.type != orig_t: continue
 874         #    base = b.as_raw_string()
 875         #    delta = create_delta(base, raw)
 876         #    if len(delta) < len(winner):
 877         #        winner = delta
 878         #        t = 6 if magic[i][2] == 1 else 7
 879         offset, crc32 = write_pack_object(f, t, winner)
 880         entries.append((sha1, offset, crc32))
 881     return entries, f.write_sha()
 882
 883
 884 def write_pack_index_v1(filename, entries, pack_checksum):
 885     """Write a new pack index file.
 886
 887     :param filename: The filename of the new pack index file.
 888     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 889             crc32_checksum.
 890     :param pack_checksum: Checksum of the pack file.
 891     """
 892     f = GitFile(filename, 'wb')
 893     f = SHA1Writer(f)
 894     fan_out_table = defaultdict(lambda: 0)
 895     for (name, offset, entry_checksum) in entries:
 896         fan_out_table[ord(name[0])] += 1
 897     # Fan-out table
 898     for i in range(0x100):
 899         f.write(struct.pack(">L", fan_out_table[i]))
 900         fan_out_table[i+1] += fan_out_table[i]
 901     for (name, offset, entry_checksum) in entries:
 902         f.write(struct.pack(">L20s", offset, name))
 903     assert len(pack_checksum) == 20
 904     f.write(pack_checksum)
 905     f.close()
 906
 907
 908 def create_delta(base_buf, target_buf):
 909     """Use python difflib to work out how to transform base_buf to target_buf.
 910
 911     :param base_buf: Base buffer
 912     :param target_buf: Target buffer
 913     """
 914     assert isinstance(base_buf, str)
 915     assert isinstance(target_buf, str)
 916     out_buf = ""
 917     # write delta header
 918     def encode_size(size):
 919         ret = ""
 920         c = size & 0x7f
 921         size >>= 7
 922         while size:
 923             ret += chr(c | 0x80)
 924             c = size & 0x7f
 925             size >>= 7
 926         ret += chr(c)
 927         return ret
 928     out_buf += encode_size(len(base_buf))
 929     out_buf += encode_size(len(target_buf))
 930     # write out delta opcodes
 931     seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
 932     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
 933         # Git patch opcodes don't care about deletes!
 934         #if opcode == "replace" or opcode == "delete":
 935         #    pass
 936         if opcode == "equal":
 937             # If they are equal, unpacker will use data from base_buf
 938             # Write out an opcode that says what range to use
 939             scratch = ""
 940             op = 0x80
 941             o = i1
 942             for i in range(4):
 943                 if o & 0xff << i*8:
 944                     scratch += chr((o >> i*8) & 0xff)
 945                     op |= 1 << i
 946             s = i2 - i1
 947             for i in range(2):
 948                 if s & 0xff << i*8:
 949                     scratch += chr((s >> i*8) & 0xff)
 950                     op |= 1 << (4+i)
 951             out_buf += chr(op)
 952             out_buf += scratch
 953         if opcode == "replace" or opcode == "insert":
 954             # If we are replacing a range or adding one, then we just
 955             # output it to the stream (prefixed by its size)
 956             s = j2 - j1
 957             o = j1
 958             while s > 127:
 959                 out_buf += chr(127)
 960                 out_buf += target_buf[o:o+127]
 961                 s -= 127
 962                 o += 127
 963             out_buf += chr(s)
 964             out_buf += target_buf[o:o+s]
 965     return out_buf
 966
 967
 968 def apply_delta(src_buf, delta):
 969     """Based on the similar function in git's patch-delta.c.
 970
 971     :param src_buf: Source buffer
 972     :param delta: Delta instructions
 973     """
 974     assert isinstance(src_buf, str), "was %r" % (src_buf,)
 975     assert isinstance(delta, str)
 976     out = []
 977     index = 0
 978     delta_length = len(delta)
 979     def get_delta_header_size(delta, index):
 980         size = 0
 981         i = 0
 982         while delta:
 983             cmd = ord(delta[index])
 984             index += 1
 985             size |= (cmd & ~0x80) << i
 986             i += 7
 987             if not cmd & 0x80:
 988                 break
 989         return size, index
 990     src_size, index = get_delta_header_size(delta, index)
 991     dest_size, index = get_delta_header_size(delta, index)
 992     assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
 993     while index < delta_length:
 994         cmd = ord(delta[index])
 995         index += 1
 996         if cmd & 0x80:
 997             cp_off = 0
 998             for i in range(4):
 999                 if cmd & (1 << i):
1000                     x = ord(delta[index])
1001                     index += 1
1002                     cp_off |= x << (i * 8)
1003             cp_size = 0
1004             for i in range(3):
1005                 if cmd & (1 << (4+i)):
1006                     x = ord(delta[index])
1007                     index += 1
1008                     cp_size |= x << (i * 8)
1009             if cp_size == 0:
1010                 cp_size = 0x10000
1011             if (cp_off + cp_size < cp_size or
1012                 cp_off + cp_size > src_size or
1013                 cp_size > dest_size):
1014                 break
1015             out.append(src_buf[cp_off:cp_off+cp_size])
1016         elif cmd != 0:
1017             out.append(delta[index:index+cmd])
1018             index += cmd
1019         else:
1020             raise ApplyDeltaError("Invalid opcode 0")
1021
1022     if index != delta_length:
1023         raise ApplyDeltaError("delta not empty: %r" % delta[index:])
1024
1025     out = ''.join(out)
1026     if dest_size != len(out):
1027         raise ApplyDeltaError("dest size incorrect")
1028
1029     return out
1030
1031
1032 def write_pack_index_v2(filename, entries, pack_checksum):
1033     """Write a new pack index file.
1034
1035     :param filename: The filename of the new pack index file.
1036     :param entries: List of tuples with object name (sha), offset_in_pack,  and
1037             crc32_checksum.
1038     :param pack_checksum: Checksum of the pack file.
1039     """
1040     f = GitFile(filename, 'wb')
1041     f = SHA1Writer(f)
1042     f.write('\377tOc') # Magic!
1043     f.write(struct.pack(">L", 2))
1044     fan_out_table = defaultdict(lambda: 0)
1045     for (name, offset, entry_checksum) in entries:
1046         fan_out_table[ord(name[0])] += 1
1047     # Fan-out table
1048     for i in range(0x100):
1049         f.write(struct.pack(">L", fan_out_table[i]))
1050         fan_out_table[i+1] += fan_out_table[i]
1051     for (name, offset, entry_checksum) in entries:
1052         f.write(name)
1053     for (name, offset, entry_checksum) in entries:
1054         f.write(struct.pack(">L", entry_checksum))
1055     for (name, offset, entry_checksum) in entries:
1056         # FIXME: handle if MSBit is set in offset
1057         f.write(struct.pack(">L", offset))
1058     # FIXME: handle table for pack files > 8 Gb
1059     assert len(pack_checksum) == 20
1060     f.write(pack_checksum)
1061     f.close()
1062
1063
1064 class Pack(object):
1065     """A Git pack object."""
1066
1067     def __init__(self, basename):
1068         self._basename = basename
1069         self._data_path = self._basename + ".pack"
1070         self._idx_path = self._basename + ".idx"
1071         self._data = None
1072         self._idx = None
1073
1074     @classmethod
1075     def from_objects(self, data, idx):
1076         """Create a new pack object from pack data and index objects."""
1077         ret = Pack("")
1078         ret._data = data
1079         ret._idx = idx
1080         return ret
1081
1082     def name(self):
1083         """The SHA over the SHAs of the objects in this pack."""
1084         return self.index.objects_sha1()
1085
1086     @property
1087     def data(self):
1088         """The pack data object being used."""
1089         if self._data is None:
1090             self._data = PackData(self._data_path)
1091             assert len(self.index) == len(self._data)
1092             idx_stored_checksum = self.index.get_pack_checksum()
1093             data_stored_checksum = self._data.get_stored_checksum()
1094             if idx_stored_checksum != data_stored_checksum:
1095                 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
1096                                        sha_to_hex(data_stored_checksum))
1097         return self._data
1098
1099     @property
1100     def index(self):
1101         """The index being used.
1102
1103         :note: This may be an in-memory index
1104         """
1105         if self._idx is None:
1106             self._idx = load_pack_index(self._idx_path)
1107         return self._idx
1108
1109     def close(self):
1110         if self._data is not None:
1111             self._data.close()
1112         self.index.close()
1113
1114     def __eq__(self, other):
1115         return type(self) == type(other) and self.index == other.index
1116
1117     def __len__(self):
1118         """Number of entries in this pack."""
1119         return len(self.index)
1120
1121     def __repr__(self):
1122         return "%s(%r)" % (self.__class__.__name__, self._basename)
1123
1124     def __iter__(self):
1125         """Iterate over all the sha1s of the objects in this pack."""
1126         return iter(self.index)
1127
1128     def check(self):
1129         """Check the integrity of this pack."""
1130         if not self.index.check():
1131             return False
1132         if not self.data.check():
1133             return False
1134         return True
1135
1136     def get_stored_checksum(self):
1137         return self.data.get_stored_checksum()
1138
1139     def __contains__(self, sha1):
1140         """Check whether this pack contains a particular SHA1."""
1141         try:
1142             self.index.object_index(sha1)
1143             return True
1144         except KeyError:
1145             return False
1146
1147     def get_raw(self, sha1, resolve_ref=None):
1148         offset = self.index.object_index(sha1)
1149         obj_type, obj = self.data.get_object_at(offset)
1150         if type(offset) is long:
1151           offset = int(offset)
1152         if resolve_ref is None:
1153             resolve_ref = self.get_raw
1154         return self.data.resolve_object(offset, obj_type, obj, resolve_ref)
1155
1156     def __getitem__(self, sha1):
1157         """Retrieve the specified SHA1."""
1158         type, uncomp = self.get_raw(sha1)
1159         return ShaFile.from_raw_string(type, uncomp)
1160
1161     def iterobjects(self, get_raw=None):
1162         """Iterate over the objects in this pack."""
1163         if get_raw is None:
1164             get_raw = self.get_raw
1165         for offset, type, obj, crc32 in self.data.iterobjects():
1166             assert isinstance(offset, int)
1167             yield ShaFile.from_raw_string(
1168                     *self.data.resolve_object(offset, type, obj, get_raw))
1169
1170
1171 try:
1172     from dulwich._pack import apply_delta, bisect_find_sha
1173 except ImportError:
1174     pass