dulwich/pack.py

   1 # pack.py -- For dealing wih packed git objects.
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copryight (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20 """Classes for dealing with packed git objects.
  21
  22 A pack is a compact representation of a bunch of objects, stored
  23 using deltas where possible.
  24
  25 They have two parts, the pack file, which stores the data, and an index
  26 that tells you where the data is.
  27
  28 To find an object you look in all of the index files 'til you find a
  29 match for the object name. You then use the pointer got from this as
  30 a pointer in to the corresponding packfile.
  31 """
  32
  33 try:
  34     from collections import defaultdict
  35 except ImportError:
  36     from misc import defaultdict
  37
  38 import difflib
  39 from itertools import (
  40     chain,
  41     imap,
  42     izip,
  43     )
  44 import mmap
  45 import os
  46 import struct
  47 try:
  48     from struct import unpack_from
  49 except ImportError:
  50     from dulwich.misc import unpack_from
  51 import sys
  52 import zlib
  53
  54 from dulwich.errors import (
  55     ApplyDeltaError,
  56     ChecksumMismatch,
  57     )
  58 from dulwich.file import GitFile
  59 from dulwich.lru_cache import (
  60     LRUSizeCache,
  61     )
  62 from dulwich.objects import (
  63     ShaFile,
  64     hex_to_sha,
  65     sha_to_hex,
  66     )
  67 from dulwich.misc import (
  68     make_sha,
  69     )
  70
  71 supports_mmap_offset = (sys.version_info[0] >= 3 or
  72         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
  73
  74
  75 def take_msb_bytes(read):
  76     """Read bytes marked with most significant bit.
  77
  78     :param read: Read function
  79     """
  80     ret = []
  81     while len(ret) == 0 or ret[-1] & 0x80:
  82         ret.append(ord(read(1)))
  83     return ret
  84
  85
  86 def read_zlib_chunks(read, buffer_size=4096):
  87     """Read chunks of zlib data from a buffer.
  88
  89     :param read: Read function
  90     :return: Tuple with list of chunks and length of
  91         compressed data length
  92     """
  93     obj = zlib.decompressobj()
  94     ret = []
  95     fed = 0
  96     while obj.unused_data == "":
  97         add = read(buffer_size)
  98         if len(add) < buffer_size:
  99             add += "Z"
 100         fed += len(add)
 101         ret.append(obj.decompress(add))
 102     comp_len = fed-len(obj.unused_data)
 103     return ret, comp_len
 104
 105
 106 def read_zlib(read, dec_size):
 107     """Read zlib-compressed data from a buffer.
 108
 109     :param read: Read function
 110     :param dec_size: Size of the decompressed buffer
 111     :return: Uncompressed buffer and compressed buffer length.
 112     """
 113     ret, comp_len = read_zlib_chunks(read)
 114     x = "".join(ret)
 115     assert len(x) == dec_size
 116     return x, comp_len
 117
 118
 119 def iter_sha1(iter):
 120     """Return the hexdigest of the SHA1 over a set of names.
 121
 122     :param iter: Iterator over string objects
 123     :return: 40-byte hex sha1 digest
 124     """
 125     sha1 = make_sha()
 126     for name in iter:
 127         sha1.update(name)
 128     return sha1.hexdigest()
 129
 130
 131 def load_pack_index(path):
 132     """Load an index file by path.
 133
 134     :param filename: Path to the index file
 135     """
 136     f = GitFile(path, 'rb')
 137     return load_pack_index_file(path, f)
 138
 139
 140 def load_pack_index_file(path, f):
 141     """Load an index file from a file-like object.
 142
 143     :param path: Path for the index file
 144     :param f: File-like object
 145     """
 146     if f.read(4) == '\377tOc':
 147         version = struct.unpack(">L", f.read(4))[0]
 148         if version == 2:
 149             f.seek(0)
 150             return PackIndex2(path, file=f)
 151         else:
 152             raise KeyError("Unknown pack index format %d" % version)
 153     else:
 154         f.seek(0)
 155         return PackIndex1(path, file=f)
 156
 157
 158 def bisect_find_sha(start, end, sha, unpack_name):
 159     """Find a SHA in a data blob with sorted SHAs.
 160
 161     :param start: Start index of range to search
 162     :param end: End index of range to search
 163     :param sha: Sha to find
 164     :param unpack_name: Callback to retrieve SHA by index
 165     :return: Index of the SHA, or None if it wasn't found
 166     """
 167     assert start <= end
 168     while start <= end:
 169         i = (start + end)/2
 170         file_sha = unpack_name(i)
 171         x = cmp(file_sha, sha)
 172         if x < 0:
 173             start = i + 1
 174         elif x > 0:
 175             end = i - 1
 176         else:
 177             return i
 178     return None
 179
 180
 181 class PackIndex(object):
 182     """An index in to a packfile.
 183
 184     Given a sha id of an object a pack index can tell you the location in the
 185     packfile of that object if it has it.
 186
 187     To do the loop it opens the file, and indexes first 256 4 byte groups
 188     with the first byte of the sha id. The value in the four byte group indexed
 189     is the end of the group that shares the same starting byte. Subtract one
 190     from the starting byte and index again to find the start of the group.
 191     The values are sorted by sha id within the group, so do the math to find
 192     the start and end offset and then bisect in to find if the value is present.
 193     """
 194
 195     def __init__(self, filename, file=None, size=None):
 196         """Create a pack index object.
 197
 198         Provide it with the name of the index file to consider, and it will map
 199         it whenever required.
 200         """
 201         self._filename = filename
 202         # Take the size now, so it can be checked each time we map the file to
 203         # ensure that it hasn't changed.
 204         if file is None:
 205             self._file = GitFile(filename, 'rb')
 206         else:
 207             self._file = file
 208         fileno = getattr(self._file, 'fileno', None)
 209         if fileno is not None:
 210             fd = self._file.fileno()
 211             if size is None:
 212                 self._size = os.fstat(fd).st_size
 213             else:
 214                 self._size = size
 215             self._contents = mmap.mmap(fd, self._size,
 216                 access=mmap.ACCESS_READ)
 217         else:
 218             self._file.seek(0)
 219             self._contents = self._file.read()
 220             self._size = len(self._contents)
 221
 222     def __eq__(self, other):
 223         if not isinstance(other, PackIndex):
 224             return False
 225
 226         if self._fan_out_table != other._fan_out_table:
 227             return False
 228
 229         for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
 230             if name1 != name2:
 231                 return False
 232         return True
 233
 234     def __ne__(self, other):
 235         return not self.__eq__(other)
 236
 237     def close(self):
 238         self._file.close()
 239
 240     def __len__(self):
 241         """Return the number of entries in this pack index."""
 242         return self._fan_out_table[-1]
 243
 244     def _unpack_entry(self, i):
 245         """Unpack the i-th entry in the index file.
 246
 247         :return: Tuple with object name (SHA), offset in pack file and
 248               CRC32 checksum (if known)."""
 249         raise NotImplementedError(self._unpack_entry)
 250
 251     def _unpack_name(self, i):
 252         """Unpack the i-th name from the index file."""
 253         raise NotImplementedError(self._unpack_name)
 254
 255     def _unpack_offset(self, i):
 256         """Unpack the i-th object offset from the index file."""
 257         raise NotImplementedError(self._unpack_offset)
 258
 259     def _unpack_crc32_checksum(self, i):
 260         """Unpack the crc32 checksum for the i-th object from the index file."""
 261         raise NotImplementedError(self._unpack_crc32_checksum)
 262
 263     def __iter__(self):
 264         """Iterate over the SHAs in this pack."""
 265         return imap(sha_to_hex, self._itersha())
 266
 267     def _itersha(self):
 268         for i in range(len(self)):
 269             yield self._unpack_name(i)
 270
 271     def objects_sha1(self):
 272         """Return the hex SHA1 over all the shas of all objects in this pack.
 273
 274         :note: This is used for the filename of the pack.
 275         """
 276         return iter_sha1(self._itersha())
 277
 278     def iterentries(self):
 279         """Iterate over the entries in this pack index.
 280
 281         Will yield tuples with object name, offset in packfile and crc32 checksum.
 282         """
 283         for i in range(len(self)):
 284             yield self._unpack_entry(i)
 285
 286     def _read_fan_out_table(self, start_offset):
 287         ret = []
 288         for i in range(0x100):
 289             ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
 290         return ret
 291
 292     def check(self):
 293         """Check that the stored checksum matches the actual checksum."""
 294         # TODO: Check pack contents, too
 295         return self.calculate_checksum() == self.get_stored_checksum()
 296
 297     def calculate_checksum(self):
 298         """Calculate the SHA1 checksum over this pack index.
 299
 300         :return: This is a 20-byte binary digest
 301         """
 302         return make_sha(self._contents[:-20]).digest()
 303
 304     def get_pack_checksum(self):
 305         """Return the SHA1 checksum stored for the corresponding packfile.
 306
 307         :return: 20-byte binary digest
 308         """
 309         return str(self._contents[-40:-20])
 310
 311     def get_stored_checksum(self):
 312         """Return the SHA1 checksum stored for this index.
 313
 314         :return: 20-byte binary digest
 315         """
 316         return str(self._contents[-20:])
 317
 318     def object_index(self, sha):
 319         """Return the index in to the corresponding packfile for the object.
 320
 321         Given the name of an object it will return the offset that object lives
 322         at within the corresponding pack file. If the pack file doesn't have the
 323         object then None will be returned.
 324         """
 325         if len(sha) == 40:
 326             sha = hex_to_sha(sha)
 327         return self._object_index(sha)
 328
 329     def _object_index(self, sha):
 330         """See object_index.
 331
 332         :param sha: A *binary* SHA string. (20 characters long)_
 333         """
 334         assert len(sha) == 20
 335         idx = ord(sha[0])
 336         if idx == 0:
 337             start = 0
 338         else:
 339             start = self._fan_out_table[idx-1]
 340         end = self._fan_out_table[idx]
 341         i = bisect_find_sha(start, end, sha, self._unpack_name)
 342         if i is None:
 343             raise KeyError(sha)
 344         return self._unpack_offset(i)
 345
 346
 347
 348 class PackIndex1(PackIndex):
 349     """Version 1 Pack Index."""
 350
 351     def __init__(self, filename, file=None, size=None):
 352         PackIndex.__init__(self, filename, file, size)
 353         self.version = 1
 354         self._fan_out_table = self._read_fan_out_table(0)
 355
 356     def _unpack_entry(self, i):
 357         (offset, name) = unpack_from(">L20s", self._contents,
 358             (0x100 * 4) + (i * 24))
 359         return (name, offset, None)
 360
 361     def _unpack_name(self, i):
 362         offset = (0x100 * 4) + (i * 24) + 4
 363         return self._contents[offset:offset+20]
 364
 365     def _unpack_offset(self, i):
 366         offset = (0x100 * 4) + (i * 24)
 367         return unpack_from(">L", self._contents, offset)[0]
 368
 369     def _unpack_crc32_checksum(self, i):
 370         # Not stored in v1 index files
 371         return None
 372
 373
 374 class PackIndex2(PackIndex):
 375     """Version 2 Pack Index."""
 376
 377     def __init__(self, filename, file=None, size=None):
 378         PackIndex.__init__(self, filename, file, size)
 379         assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
 380         (self.version, ) = unpack_from(">L", self._contents, 4)
 381         assert self.version == 2, "Version was %d" % self.version
 382         self._fan_out_table = self._read_fan_out_table(8)
 383         self._name_table_offset = 8 + 0x100 * 4
 384         self._crc32_table_offset = self._name_table_offset + 20 * len(self)
 385         self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
 386
 387     def _unpack_entry(self, i):
 388         return (self._unpack_name(i), self._unpack_offset(i),
 389                 self._unpack_crc32_checksum(i))
 390
 391     def _unpack_name(self, i):
 392         offset = self._name_table_offset + i * 20
 393         return self._contents[offset:offset+20]
 394
 395     def _unpack_offset(self, i):
 396         offset = self._pack_offset_table_offset + i * 4
 397         return unpack_from(">L", self._contents, offset)[0]
 398
 399     def _unpack_crc32_checksum(self, i):
 400         return unpack_from(">L", self._contents,
 401                           self._crc32_table_offset + i * 4)[0]
 402
 403
 404
 405 def read_pack_header(f):
 406     """Read the header of a pack file.
 407
 408     :param f: File-like object to read from
 409     """
 410     header = f.read(12)
 411     assert header[:4] == "PACK"
 412     (version,) = unpack_from(">L", header, 4)
 413     assert version in (2, 3), "Version was %d" % version
 414     (num_objects,) = unpack_from(">L", header, 8)
 415     return (version, num_objects)
 416
 417
 418 def unpack_object(read):
 419     """Unpack a Git object.
 420
 421     :return: tuple with type, uncompressed data and compressed size
 422     """
 423     bytes = take_msb_bytes(read)
 424     type = (bytes[0] >> 4) & 0x07
 425     size = bytes[0] & 0x0f
 426     for i, byte in enumerate(bytes[1:]):
 427         size += (byte & 0x7f) << ((i * 7) + 4)
 428     raw_base = len(bytes)
 429     if type == 6: # offset delta
 430         bytes = take_msb_bytes(read)
 431         raw_base += len(bytes)
 432         assert not (bytes[-1] & 0x80)
 433         delta_base_offset = bytes[0] & 0x7f
 434         for byte in bytes[1:]:
 435             delta_base_offset += 1
 436             delta_base_offset <<= 7
 437             delta_base_offset += (byte & 0x7f)
 438         uncomp, comp_len = read_zlib(read, size)
 439         assert size == len(uncomp)
 440         return type, (delta_base_offset, uncomp), comp_len+raw_base
 441     elif type == 7: # ref delta
 442         basename = map.read(20)
 443         raw_base += 20
 444         uncomp, comp_len = read_zlib(read, size)
 445         assert size == len(uncomp)
 446         return type, (basename, uncomp), comp_len+raw_base
 447     else:
 448         uncomp, comp_len = read_zlib(read, size)
 449         assert len(uncomp) == size
 450         return type, uncomp, comp_len+raw_base
 451
 452
 453 def _compute_object_size((num, obj)):
 454     """Compute the size of a unresolved object for use with LRUSizeCache.
 455     """
 456     if num in (6, 7):
 457         return len(obj[1])
 458     assert isinstance(obj, str)
 459     return len(obj)
 460
 461
 462 class PackData(object):
 463     """The data contained in a packfile.
 464
 465     Pack files can be accessed both sequentially for exploding a pack, and
 466     directly with the help of an index to retrieve a specific object.
 467
 468     The objects within are either complete or a delta aginst another.
 469
 470     The header is variable length. If the MSB of each byte is set then it
 471     indicates that the subsequent byte is still part of the header.
 472     For the first byte the next MS bits are the type, which tells you the type
 473     of object, and whether it is a delta. The LS byte is the lowest bits of the
 474     size. For each subsequent byte the LS 7 bits are the next MS bits of the
 475     size, i.e. the last byte of the header contains the MS bits of the size.
 476
 477     For the complete objects the data is stored as zlib deflated data.
 478     The size in the header is the uncompressed object size, so to uncompress
 479     you need to just keep feeding data to zlib until you get an object back,
 480     or it errors on bad data. This is done here by just giving the complete
 481     buffer from the start of the deflated object on. This is bad, but until I
 482     get mmap sorted out it will have to do.
 483
 484     Currently there are no integrity checks done. Also no attempt is made to try
 485     and detect the delta case, or a request for an object at the wrong position.
 486     It will all just throw a zlib or KeyError.
 487     """
 488
 489     def __init__(self, filename, file=None, size=None):
 490         """Create a PackData object that represents the pack in the given filename.
 491
 492         The file must exist and stay readable until the object is disposed of. It
 493         must also stay the same size. It will be mapped whenever needed.
 494
 495         Currently there is a restriction on the size of the pack as the python
 496         mmap implementation is flawed.
 497         """
 498         self._filename = filename
 499         self._size = size
 500         self._header_size = 12
 501         if file is None:
 502             self._file = GitFile(self._filename, 'rb')
 503         else:
 504             self._file = file
 505         self._read_header()
 506         self._offset_cache = LRUSizeCache(1024*1024*20,
 507             compute_size=_compute_object_size)
 508
 509     @classmethod
 510     def from_file(cls, file, size):
 511         return cls(str(file), file=file, size=size)
 512
 513     @classmethod
 514     def from_path(cls, path):
 515         return cls(filename=path)
 516
 517     def close(self):
 518         self._file.close()
 519
 520     def _get_size(self):
 521         self._size = os.path.getsize(self._filename)
 522         assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (self._filename, self._size, self._header_size)
 523
 524     def _read_header(self):
 525         (version, self._num_objects) = read_pack_header(self._file)
 526
 527     def __len__(self):
 528         """Returns the number of objects in this pack."""
 529         return self._num_objects
 530
 531     def calculate_checksum(self):
 532         """Calculate the checksum for this pack.
 533
 534         :return: 20-byte binary SHA1 digest
 535         """
 536         s = make_sha()
 537         self._file.seek(0)
 538         todo = self._get_size() - 20
 539         while todo > 0:
 540             x = self._file.read(min(todo, 1<<16))
 541             s.update(x)
 542             todo -= len(x)
 543         return s.digest()
 544
 545     def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
 546         """Resolve an object, possibly resolving deltas when necessary.
 547
 548         :return: Tuple with object type and contents.
 549         """
 550         if type not in (6, 7): # Not a delta
 551             return type, obj
 552
 553         if get_offset is None:
 554             get_offset = self.get_object_at
 555
 556         if type == 6: # offset delta
 557             (delta_offset, delta) = obj
 558             assert isinstance(delta_offset, int)
 559             assert isinstance(delta, str)
 560             base_offset = offset-delta_offset
 561             type, base_obj = get_offset(base_offset)
 562             assert isinstance(type, int)
 563         elif type == 7: # ref delta
 564             (basename, delta) = obj
 565             assert isinstance(basename, str) and len(basename) == 20
 566             assert isinstance(delta, str)
 567             type, base_obj = get_ref(basename)
 568             assert isinstance(type, int)
 569             # Can't be a ofs delta, as we wouldn't know the base offset
 570             assert type != 6
 571             base_offset = None
 572         type, base_text = self.resolve_object(base_offset, type, base_obj, get_ref)
 573         if base_offset is not None:
 574             self._offset_cache[base_offset] = type, base_text
 575         ret = (type, apply_delta(base_text, delta))
 576         return ret
 577
 578     def iterobjects(self, progress=None):
 579
 580         class ObjectIterator(object):
 581
 582             def __init__(self, pack):
 583                 self.i = 0
 584                 self.offset = pack._header_size
 585                 self.num = len(pack)
 586                 self.map = pack._file
 587
 588             def __iter__(self):
 589                 return self
 590
 591             def __len__(self):
 592                 return self.num
 593
 594             def next(self):
 595                 if self.i == self.num:
 596                     raise StopIteration
 597                 self.map.seek(self.offset)
 598                 (type, obj, total_size) = unpack_object(self.map.read)
 599                 self.map.seek(self.offset)
 600                 crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff
 601                 ret = (self.offset, type, obj, crc32)
 602                 self.offset += total_size
 603                 if progress:
 604                     progress(self.i, self.num)
 605                 self.i+=1
 606                 return ret
 607         return ObjectIterator(self)
 608
 609     def iterentries(self, ext_resolve_ref=None, progress=None):
 610         """Yield entries summarizing the contents of this pack.
 611
 612         :param ext_resolve_ref: Optional function to resolve base
 613             objects (in case this is a thin pack)
 614         :param progress: Progress function, called with current and
 615             total object count.
 616
 617         This will yield tuples with (sha, offset, crc32)
 618         """
 619         found = {}
 620         postponed = defaultdict(list)
 621         class Postpone(Exception):
 622             """Raised to postpone delta resolving."""
 623
 624         def get_ref_text(sha):
 625             assert len(sha) == 20
 626             if sha in found:
 627                 return self.get_object_at(found[sha])
 628             if ext_resolve_ref:
 629                 try:
 630                     return ext_resolve_ref(sha)
 631                 except KeyError:
 632                     pass
 633             raise Postpone, (sha, )
 634         extra = []
 635         todo = chain(self.iterobjects(progress=progress), extra)
 636         for (offset, type, obj, crc32) in todo:
 637             assert isinstance(offset, int)
 638             assert isinstance(type, int)
 639             assert isinstance(obj, tuple) or isinstance(obj, str)
 640             try:
 641                 type, obj = self.resolve_object(offset, type, obj, get_ref_text)
 642             except Postpone, (sha, ):
 643                 postponed[sha].append((offset, type, obj))
 644             else:
 645                 shafile = ShaFile.from_raw_string(type, obj)
 646                 sha = shafile.sha().digest()
 647                 found[sha] = offset
 648                 yield sha, offset, crc32
 649                 extra.extend(postponed.get(sha, []))
 650         if postponed:
 651             raise KeyError([sha_to_hex(h) for h in postponed.keys()])
 652
 653     def sorted_entries(self, resolve_ext_ref=None, progress=None):
 654         """Return entries in this pack, sorted by SHA.
 655
 656         :param ext_resolve_ref: Optional function to resolve base
 657             objects (in case this is a thin pack)
 658         :param progress: Progress function, called with current and
 659             total object count.
 660         :return: List of tuples with (sha, offset, crc32)
 661         """
 662         ret = list(self.iterentries(resolve_ext_ref, progress=progress))
 663         ret.sort()
 664         return ret
 665
 666     def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
 667         """Create a version 1 file for this data file.
 668
 669         :param filename: Index filename.
 670         :param resolve_ext_ref: Function to use for resolving externally referenced
 671             SHA1s (for thin packs)
 672         :param progress: Progress report function
 673         """
 674         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 675         write_pack_index_v1(filename, entries, self.calculate_checksum())
 676
 677     def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
 678         """Create a version 2 index file for this data file.
 679
 680         :param filename: Index filename.
 681         :param resolve_ext_ref: Function to use for resolving externally referenced
 682             SHA1s (for thin packs)
 683         :param progress: Progress report function
 684         """
 685         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 686         write_pack_index_v2(filename, entries, self.calculate_checksum())
 687
 688     def create_index(self, filename, resolve_ext_ref=None, progress=None,
 689                      version=2):
 690         """Create an  index file for this data file.
 691
 692         :param filename: Index filename.
 693         :param resolve_ext_ref: Function to use for resolving externally referenced
 694             SHA1s (for thin packs)
 695         :param progress: Progress report function
 696         """
 697         if version == 1:
 698             self.create_index_v1(filename, resolve_ext_ref, progress)
 699         elif version == 2:
 700             self.create_index_v2(filename, resolve_ext_ref, progress)
 701         else:
 702             raise ValueError("unknown index format %d" % version)
 703
 704     def get_stored_checksum(self):
 705         """Return the expected checksum stored in this pack."""
 706         self._file.seek(self._get_size()-20)
 707         return self._file.read(20)
 708
 709     def check(self):
 710         """Check the consistency of this pack."""
 711         return (self.calculate_checksum() == self.get_stored_checksum())
 712
 713     def get_object_at(self, offset):
 714         """Given an offset in to the packfile return the object that is there.
 715
 716         Using the associated index the location of an object can be looked up, and
 717         then the packfile can be asked directly for that object using this
 718         function.
 719         """
 720         if offset in self._offset_cache:
 721             return self._offset_cache[offset]
 722         assert isinstance(offset, long) or isinstance(offset, int),\
 723                 "offset was %r" % offset
 724         assert offset >= self._header_size
 725         self._file.seek(offset)
 726         return unpack_object(self._file.read)[:2]
 727
 728
 729 class SHA1Reader(object):
 730     """Wrapper around a file-like object that remembers the SHA1 of
 731     the data read from it."""
 732
 733     def __init__(self, f):
 734         self.f = f
 735         self.sha1 = make_sha("")
 736
 737     def read(self, num=None):
 738         data = self.f.read(num)
 739         self.sha1.update(data)
 740         return data
 741
 742     def check_sha(self):
 743         stored = self.f.read(20)
 744         if stored != self.sha1.digest():
 745             raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
 746
 747     def close(self):
 748         return self.f.close()
 749
 750     def tell(self):
 751         return self.f.tell()
 752
 753
 754 class SHA1Writer(object):
 755     """Wrapper around a file-like object that remembers the SHA1 of
 756     the data written to it."""
 757
 758     def __init__(self, f):
 759         self.f = f
 760         self.sha1 = make_sha("")
 761
 762     def write(self, data):
 763         self.sha1.update(data)
 764         self.f.write(data)
 765
 766     def write_sha(self):
 767         sha = self.sha1.digest()
 768         assert len(sha) == 20
 769         self.f.write(sha)
 770         return sha
 771
 772     def close(self):
 773         sha = self.write_sha()
 774         self.f.close()
 775         return sha
 776
 777     def tell(self):
 778         return self.f.tell()
 779
 780
 781 def write_pack_object(f, type, object):
 782     """Write pack object to a file.
 783
 784     :param f: File to write to
 785     :param o: Object to write
 786     :return: Tuple with offset at which the object was written, and crc32
 787     """
 788     offset = f.tell()
 789     packed_data_hdr = ""
 790     if type == 6: # offset delta
 791         (delta_base_offset, object) = object
 792     elif type == 7: # ref delta
 793         (basename, object) = object
 794     size = len(object)
 795     c = (type << 4) | (size & 15)
 796     size >>= 4
 797     while size:
 798         packed_data_hdr += (chr(c | 0x80))
 799         c = size & 0x7f
 800         size >>= 7
 801     packed_data_hdr += chr(c)
 802     if type == 6: # offset delta
 803         ret = [delta_base_offset & 0x7f]
 804         delta_base_offset >>= 7
 805         while delta_base_offset:
 806             delta_base_offset -= 1
 807             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
 808             delta_base_offset >>= 7
 809         packed_data_hdr += "".join([chr(x) for x in ret])
 810     elif type == 7: # ref delta
 811         assert len(basename) == 20
 812         packed_data_hdr += basename
 813     packed_data = packed_data_hdr + zlib.compress(object)
 814     f.write(packed_data)
 815     return (offset, (zlib.crc32(packed_data) & 0xffffffff))
 816
 817
 818 def write_pack(filename, objects, num_objects):
 819     """Write a new pack data file.
 820
 821     :param filename: Path to the new pack file (without .pack extension)
 822     :param objects: Iterable over (object, path) tuples to write
 823     :param num_objects: Number of objects to write
 824     """
 825     f = GitFile(filename + ".pack", 'wb')
 826     try:
 827         entries, data_sum = write_pack_data(f, objects, num_objects)
 828     finally:
 829         f.close()
 830     entries.sort()
 831     write_pack_index_v2(filename + ".idx", entries, data_sum)
 832
 833
 834 def write_pack_data(f, objects, num_objects, window=10):
 835     """Write a new pack file.
 836
 837     :param filename: The filename of the new pack file.
 838     :param objects: List of objects to write (tuples with object and path)
 839     :return: List with (name, offset, crc32 checksum) entries, pack checksum
 840     """
 841     recency = list(objects)
 842     # FIXME: Somehow limit delta depth
 843     # FIXME: Make thin-pack optional (its not used when cloning a pack)
 844     # Build a list of objects ordered by the magic Linus heuristic
 845     # This helps us find good objects to diff against us
 846     magic = []
 847     for obj, path in recency:
 848         magic.append( (obj.type, path, 1, -len(obj.as_raw_string()), obj) )
 849     magic.sort()
 850     # Build a map of objects and their index in magic - so we can find preceeding objects
 851     # to diff against
 852     offs = {}
 853     for i in range(len(magic)):
 854         offs[magic[i][4]] = i
 855     # Write the pack
 856     entries = []
 857     f = SHA1Writer(f)
 858     f.write("PACK")               # Pack header
 859     f.write(struct.pack(">L", 2)) # Pack version
 860     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
 861     for o, path in recency:
 862         sha1 = o.sha().digest()
 863         orig_t = o.type
 864         raw = o.as_raw_string()
 865         winner = raw
 866         t = orig_t
 867         #for i in range(offs[o]-window, window):
 868         #    if i < 0 or i >= len(offs): continue
 869         #    b = magic[i][4]
 870         #    if b.type != orig_t: continue
 871         #    base = b.as_raw_string()
 872         #    delta = create_delta(base, raw)
 873         #    if len(delta) < len(winner):
 874         #        winner = delta
 875         #        t = 6 if magic[i][2] == 1 else 7
 876         offset, crc32 = write_pack_object(f, t, winner)
 877         entries.append((sha1, offset, crc32))
 878     return entries, f.write_sha()
 879
 880
 881 def write_pack_index_v1(filename, entries, pack_checksum):
 882     """Write a new pack index file.
 883
 884     :param filename: The filename of the new pack index file.
 885     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 886             crc32_checksum.
 887     :param pack_checksum: Checksum of the pack file.
 888     """
 889     f = GitFile(filename, 'wb')
 890     f = SHA1Writer(f)
 891     fan_out_table = defaultdict(lambda: 0)
 892     for (name, offset, entry_checksum) in entries:
 893         fan_out_table[ord(name[0])] += 1
 894     # Fan-out table
 895     for i in range(0x100):
 896         f.write(struct.pack(">L", fan_out_table[i]))
 897         fan_out_table[i+1] += fan_out_table[i]
 898     for (name, offset, entry_checksum) in entries:
 899         f.write(struct.pack(">L20s", offset, name))
 900     assert len(pack_checksum) == 20
 901     f.write(pack_checksum)
 902     f.close()
 903
 904
 905 def create_delta(base_buf, target_buf):
 906     """Use python difflib to work out how to transform base_buf to target_buf.
 907
 908     :param base_buf: Base buffer
 909     :param target_buf: Target buffer
 910     """
 911     assert isinstance(base_buf, str)
 912     assert isinstance(target_buf, str)
 913     out_buf = ""
 914     # write delta header
 915     def encode_size(size):
 916         ret = ""
 917         c = size & 0x7f
 918         size >>= 7
 919         while size:
 920             ret += chr(c | 0x80)
 921             c = size & 0x7f
 922             size >>= 7
 923         ret += chr(c)
 924         return ret
 925     out_buf += encode_size(len(base_buf))
 926     out_buf += encode_size(len(target_buf))
 927     # write out delta opcodes
 928     seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
 929     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
 930         # Git patch opcodes don't care about deletes!
 931         #if opcode == "replace" or opcode == "delete":
 932         #    pass
 933         if opcode == "equal":
 934             # If they are equal, unpacker will use data from base_buf
 935             # Write out an opcode that says what range to use
 936             scratch = ""
 937             op = 0x80
 938             o = i1
 939             for i in range(4):
 940                 if o & 0xff << i*8:
 941                     scratch += chr((o >> i*8) & 0xff)
 942                     op |= 1 << i
 943             s = i2 - i1
 944             for i in range(2):
 945                 if s & 0xff << i*8:
 946                     scratch += chr((s >> i*8) & 0xff)
 947                     op |= 1 << (4+i)
 948             out_buf += chr(op)
 949             out_buf += scratch
 950         if opcode == "replace" or opcode == "insert":
 951             # If we are replacing a range or adding one, then we just
 952             # output it to the stream (prefixed by its size)
 953             s = j2 - j1
 954             o = j1
 955             while s > 127:
 956                 out_buf += chr(127)
 957                 out_buf += target_buf[o:o+127]
 958                 s -= 127
 959                 o += 127
 960             out_buf += chr(s)
 961             out_buf += target_buf[o:o+s]
 962     return out_buf
 963
 964
 965 def apply_delta(src_buf, delta):
 966     """Based on the similar function in git's patch-delta.c.
 967
 968     :param src_buf: Source buffer
 969     :param delta: Delta instructions
 970     """
 971     assert isinstance(src_buf, str), "was %r" % (src_buf,)
 972     assert isinstance(delta, str)
 973     out = []
 974     index = 0
 975     delta_length = len(delta)
 976     def get_delta_header_size(delta, index):
 977         size = 0
 978         i = 0
 979         while delta:
 980             cmd = ord(delta[index])
 981             index += 1
 982             size |= (cmd & ~0x80) << i
 983             i += 7
 984             if not cmd & 0x80:
 985                 break
 986         return size, index
 987     src_size, index = get_delta_header_size(delta, index)
 988     dest_size, index = get_delta_header_size(delta, index)
 989     assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
 990     while index < delta_length:
 991         cmd = ord(delta[index])
 992         index += 1
 993         if cmd & 0x80:
 994             cp_off = 0
 995             for i in range(4):
 996                 if cmd & (1 << i):
 997                     x = ord(delta[index])
 998                     index += 1
 999                     cp_off |= x << (i * 8)
1000             cp_size = 0
1001             for i in range(3):
1002                 if cmd & (1 << (4+i)):
1003                     x = ord(delta[index])
1004                     index += 1
1005                     cp_size |= x << (i * 8)
1006             if cp_size == 0:
1007                 cp_size = 0x10000
1008             if (cp_off + cp_size < cp_size or
1009                 cp_off + cp_size > src_size or
1010                 cp_size > dest_size):
1011                 break
1012             out.append(src_buf[cp_off:cp_off+cp_size])
1013         elif cmd != 0:
1014             out.append(delta[index:index+cmd])
1015             index += cmd
1016         else:
1017             raise ApplyDeltaError("Invalid opcode 0")
1018
1019     if index != delta_length:
1020         raise ApplyDeltaError("delta not empty: %r" % delta[index:])
1021
1022     out = ''.join(out)
1023     if dest_size != len(out):
1024         raise ApplyDeltaError("dest size incorrect")
1025
1026     return out
1027
1028
1029 def write_pack_index_v2(filename, entries, pack_checksum):
1030     """Write a new pack index file.
1031
1032     :param filename: The filename of the new pack index file.
1033     :param entries: List of tuples with object name (sha), offset_in_pack,  and
1034             crc32_checksum.
1035     :param pack_checksum: Checksum of the pack file.
1036     """
1037     f = GitFile(filename, 'wb')
1038     f = SHA1Writer(f)
1039     f.write('\377tOc') # Magic!
1040     f.write(struct.pack(">L", 2))
1041     fan_out_table = defaultdict(lambda: 0)
1042     for (name, offset, entry_checksum) in entries:
1043         fan_out_table[ord(name[0])] += 1
1044     # Fan-out table
1045     for i in range(0x100):
1046         f.write(struct.pack(">L", fan_out_table[i]))
1047         fan_out_table[i+1] += fan_out_table[i]
1048     for (name, offset, entry_checksum) in entries:
1049         f.write(name)
1050     for (name, offset, entry_checksum) in entries:
1051         f.write(struct.pack(">L", entry_checksum))
1052     for (name, offset, entry_checksum) in entries:
1053         # FIXME: handle if MSBit is set in offset
1054         f.write(struct.pack(">L", offset))
1055     # FIXME: handle table for pack files > 8 Gb
1056     assert len(pack_checksum) == 20
1057     f.write(pack_checksum)
1058     f.close()
1059
1060
1061 class Pack(object):
1062     """A Git pack object."""
1063
1064     def __init__(self, basename):
1065         self._basename = basename
1066         self._data_path = self._basename + ".pack"
1067         self._idx_path = self._basename + ".idx"
1068         self._data = None
1069         self._idx = None
1070
1071     @classmethod
1072     def from_objects(self, data, idx):
1073         """Create a new pack object from pack data and index objects."""
1074         ret = Pack("")
1075         ret._data = data
1076         ret._idx = idx
1077         return ret
1078
1079     def name(self):
1080         """The SHA over the SHAs of the objects in this pack."""
1081         return self.index.objects_sha1()
1082
1083     @property
1084     def data(self):
1085         """The pack data object being used."""
1086         if self._data is None:
1087             self._data = PackData(self._data_path)
1088             assert len(self.index) == len(self._data)
1089             idx_stored_checksum = self.index.get_pack_checksum()
1090             data_stored_checksum = self._data.get_stored_checksum()
1091             if idx_stored_checksum != data_stored_checksum:
1092                 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
1093                                        sha_to_hex(data_stored_checksum))
1094         return self._data
1095
1096     @property
1097     def index(self):
1098         """The index being used.
1099
1100         :note: This may be an in-memory index
1101         """
1102         if self._idx is None:
1103             self._idx = load_pack_index(self._idx_path)
1104         return self._idx
1105
1106     def close(self):
1107         if self._data is not None:
1108             self._data.close()
1109         self.index.close()
1110
1111     def __eq__(self, other):
1112         return type(self) == type(other) and self.index == other.index
1113
1114     def __len__(self):
1115         """Number of entries in this pack."""
1116         return len(self.index)
1117
1118     def __repr__(self):
1119         return "%s(%r)" % (self.__class__.__name__, self._basename)
1120
1121     def __iter__(self):
1122         """Iterate over all the sha1s of the objects in this pack."""
1123         return iter(self.index)
1124
1125     def check(self):
1126         """Check the integrity of this pack."""
1127         if not self.index.check():
1128             return False
1129         if not self.data.check():
1130             return False
1131         return True
1132
1133     def get_stored_checksum(self):
1134         return self.data.get_stored_checksum()
1135
1136     def __contains__(self, sha1):
1137         """Check whether this pack contains a particular SHA1."""
1138         try:
1139             self.index.object_index(sha1)
1140             return True
1141         except KeyError:
1142             return False
1143
1144     def get_raw(self, sha1, resolve_ref=None):
1145         offset = self.index.object_index(sha1)
1146         obj_type, obj = self.data.get_object_at(offset)
1147         if type(offset) is long:
1148           offset = int(offset)
1149         if resolve_ref is None:
1150             resolve_ref = self.get_raw
1151         return self.data.resolve_object(offset, obj_type, obj, resolve_ref)
1152
1153     def __getitem__(self, sha1):
1154         """Retrieve the specified SHA1."""
1155         type, uncomp = self.get_raw(sha1)
1156         return ShaFile.from_raw_string(type, uncomp)
1157
1158     def iterobjects(self, get_raw=None):
1159         """Iterate over the objects in this pack."""
1160         if get_raw is None:
1161             get_raw = self.get_raw
1162         for offset, type, obj, crc32 in self.data.iterobjects():
1163             assert isinstance(offset, int)
1164             yield ShaFile.from_raw_string(
1165                     *self.data.resolve_object(offset, type, obj, get_raw))
1166
1167
1168 try:
1169     from dulwich._pack import apply_delta, bisect_find_sha
1170 except ImportError:
1171     pass