dulwich/pack.py

   1 # pack.py -- For dealing wih packed git objects.
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copryight (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20 """Classes for dealing with packed git objects.
  21
  22 A pack is a compact representation of a bunch of objects, stored
  23 using deltas where possible.
  24
  25 They have two parts, the pack file, which stores the data, and an index
  26 that tells you where the data is.
  27
  28 To find an object you look in all of the index files 'til you find a
  29 match for the object name. You then use the pointer got from this as
  30 a pointer in to the corresponding packfile.
  31 """
  32
  33 try:
  34     from collections import defaultdict
  35 except ImportError:
  36     from misc import defaultdict
  37
  38 import difflib
  39 from itertools import (
  40     chain,
  41     imap,
  42     izip,
  43     )
  44 import mmap
  45 import os
  46 import struct
  47 try:
  48     from struct import unpack_from
  49 except ImportError:
  50     from dulwich.misc import unpack_from
  51 import sys
  52 import zlib
  53
  54 from dulwich.errors import (
  55     ApplyDeltaError,
  56     ChecksumMismatch,
  57     )
  58 from dulwich.file import GitFile
  59 from dulwich.lru_cache import (
  60     LRUSizeCache,
  61     )
  62 from dulwich.objects import (
  63     ShaFile,
  64     hex_to_sha,
  65     sha_to_hex,
  66     )
  67 from dulwich.misc import (
  68     make_sha,
  69     )
  70
  71 supports_mmap_offset = (sys.version_info[0] >= 3 or
  72         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
  73
  74
  75 def take_msb_bytes(read):
  76     """Read bytes marked with most significant bit.
  77
  78     :param read: Read function
  79     """
  80     ret = []
  81     while len(ret) == 0 or ret[-1] & 0x80:
  82         ret.append(ord(read(1)))
  83     return ret
  84
  85
  86 def read_zlib_chunks(read, buffer_size=4096):
  87     """Read chunks of zlib data from a buffer.
  88
  89     :param read: Read function
  90     :return: Tuple with list of chunks, length of
  91         compressed data length and unused read data
  92     """
  93     obj = zlib.decompressobj()
  94     ret = []
  95     fed = 0
  96     while obj.unused_data == "":
  97         add = read(buffer_size)
  98         if len(add) < buffer_size:
  99             add += "Z"
 100         fed += len(add)
 101         ret.append(obj.decompress(add))
 102     comp_len = fed-len(obj.unused_data)
 103     return ret, comp_len, obj.unused_data
 104
 105
 106 def iter_sha1(iter):
 107     """Return the hexdigest of the SHA1 over a set of names.
 108
 109     :param iter: Iterator over string objects
 110     :return: 40-byte hex sha1 digest
 111     """
 112     sha1 = make_sha()
 113     for name in iter:
 114         sha1.update(name)
 115     return sha1.hexdigest()
 116
 117
 118 def load_pack_index(path):
 119     """Load an index file by path.
 120
 121     :param filename: Path to the index file
 122     """
 123     f = GitFile(path, 'rb')
 124     return load_pack_index_file(path, f)
 125
 126
 127 def load_pack_index_file(path, f):
 128     """Load an index file from a file-like object.
 129
 130     :param path: Path for the index file
 131     :param f: File-like object
 132     """
 133     if f.read(4) == '\377tOc':
 134         version = struct.unpack(">L", f.read(4))[0]
 135         if version == 2:
 136             f.seek(0)
 137             return PackIndex2(path, file=f)
 138         else:
 139             raise KeyError("Unknown pack index format %d" % version)
 140     else:
 141         f.seek(0)
 142         return PackIndex1(path, file=f)
 143
 144
 145 def bisect_find_sha(start, end, sha, unpack_name):
 146     """Find a SHA in a data blob with sorted SHAs.
 147
 148     :param start: Start index of range to search
 149     :param end: End index of range to search
 150     :param sha: Sha to find
 151     :param unpack_name: Callback to retrieve SHA by index
 152     :return: Index of the SHA, or None if it wasn't found
 153     """
 154     assert start <= end
 155     while start <= end:
 156         i = (start + end)/2
 157         file_sha = unpack_name(i)
 158         x = cmp(file_sha, sha)
 159         if x < 0:
 160             start = i + 1
 161         elif x > 0:
 162             end = i - 1
 163         else:
 164             return i
 165     return None
 166
 167
 168 class PackIndex(object):
 169     """An index in to a packfile.
 170
 171     Given a sha id of an object a pack index can tell you the location in the
 172     packfile of that object if it has it.
 173
 174     To do the loop it opens the file, and indexes first 256 4 byte groups
 175     with the first byte of the sha id. The value in the four byte group indexed
 176     is the end of the group that shares the same starting byte. Subtract one
 177     from the starting byte and index again to find the start of the group.
 178     The values are sorted by sha id within the group, so do the math to find
 179     the start and end offset and then bisect in to find if the value is present.
 180     """
 181
 182     def __init__(self, filename, file=None, size=None):
 183         """Create a pack index object.
 184
 185         Provide it with the name of the index file to consider, and it will map
 186         it whenever required.
 187         """
 188         self._filename = filename
 189         # Take the size now, so it can be checked each time we map the file to
 190         # ensure that it hasn't changed.
 191         if file is None:
 192             self._file = GitFile(filename, 'rb')
 193         else:
 194             self._file = file
 195         fileno = getattr(self._file, 'fileno', None)
 196         if fileno is not None:
 197             fd = self._file.fileno()
 198             if size is None:
 199                 self._size = os.fstat(fd).st_size
 200             else:
 201                 self._size = size
 202             self._contents = mmap.mmap(fd, self._size,
 203                 access=mmap.ACCESS_READ)
 204         else:
 205             self._file.seek(0)
 206             self._contents = self._file.read()
 207             self._size = len(self._contents)
 208
 209     def __eq__(self, other):
 210         if not isinstance(other, PackIndex):
 211             return False
 212
 213         if self._fan_out_table != other._fan_out_table:
 214             return False
 215
 216         for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
 217             if name1 != name2:
 218                 return False
 219         return True
 220
 221     def __ne__(self, other):
 222         return not self.__eq__(other)
 223
 224     def close(self):
 225         self._file.close()
 226
 227     def __len__(self):
 228         """Return the number of entries in this pack index."""
 229         return self._fan_out_table[-1]
 230
 231     def _unpack_entry(self, i):
 232         """Unpack the i-th entry in the index file.
 233
 234         :return: Tuple with object name (SHA), offset in pack file and
 235               CRC32 checksum (if known)."""
 236         raise NotImplementedError(self._unpack_entry)
 237
 238     def _unpack_name(self, i):
 239         """Unpack the i-th name from the index file."""
 240         raise NotImplementedError(self._unpack_name)
 241
 242     def _unpack_offset(self, i):
 243         """Unpack the i-th object offset from the index file."""
 244         raise NotImplementedError(self._unpack_offset)
 245
 246     def _unpack_crc32_checksum(self, i):
 247         """Unpack the crc32 checksum for the i-th object from the index file."""
 248         raise NotImplementedError(self._unpack_crc32_checksum)
 249
 250     def __iter__(self):
 251         """Iterate over the SHAs in this pack."""
 252         return imap(sha_to_hex, self._itersha())
 253
 254     def _itersha(self):
 255         for i in range(len(self)):
 256             yield self._unpack_name(i)
 257
 258     def objects_sha1(self):
 259         """Return the hex SHA1 over all the shas of all objects in this pack.
 260
 261         :note: This is used for the filename of the pack.
 262         """
 263         return iter_sha1(self._itersha())
 264
 265     def iterentries(self):
 266         """Iterate over the entries in this pack index.
 267
 268         Will yield tuples with object name, offset in packfile and crc32 checksum.
 269         """
 270         for i in range(len(self)):
 271             yield self._unpack_entry(i)
 272
 273     def _read_fan_out_table(self, start_offset):
 274         ret = []
 275         for i in range(0x100):
 276             ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
 277         return ret
 278
 279     def check(self):
 280         """Check that the stored checksum matches the actual checksum."""
 281         # TODO: Check pack contents, too
 282         return self.calculate_checksum() == self.get_stored_checksum()
 283
 284     def calculate_checksum(self):
 285         """Calculate the SHA1 checksum over this pack index.
 286
 287         :return: This is a 20-byte binary digest
 288         """
 289         return make_sha(self._contents[:-20]).digest()
 290
 291     def get_pack_checksum(self):
 292         """Return the SHA1 checksum stored for the corresponding packfile.
 293
 294         :return: 20-byte binary digest
 295         """
 296         return str(self._contents[-40:-20])
 297
 298     def get_stored_checksum(self):
 299         """Return the SHA1 checksum stored for this index.
 300
 301         :return: 20-byte binary digest
 302         """
 303         return str(self._contents[-20:])
 304
 305     def object_index(self, sha):
 306         """Return the index in to the corresponding packfile for the object.
 307
 308         Given the name of an object it will return the offset that object lives
 309         at within the corresponding pack file. If the pack file doesn't have the
 310         object then None will be returned.
 311         """
 312         if len(sha) == 40:
 313             sha = hex_to_sha(sha)
 314         return self._object_index(sha)
 315
 316     def _object_index(self, sha):
 317         """See object_index.
 318
 319         :param sha: A *binary* SHA string. (20 characters long)_
 320         """
 321         assert len(sha) == 20
 322         idx = ord(sha[0])
 323         if idx == 0:
 324             start = 0
 325         else:
 326             start = self._fan_out_table[idx-1]
 327         end = self._fan_out_table[idx]
 328         i = bisect_find_sha(start, end, sha, self._unpack_name)
 329         if i is None:
 330             raise KeyError(sha)
 331         return self._unpack_offset(i)
 332
 333
 334
 335 class PackIndex1(PackIndex):
 336     """Version 1 Pack Index."""
 337
 338     def __init__(self, filename, file=None, size=None):
 339         PackIndex.__init__(self, filename, file, size)
 340         self.version = 1
 341         self._fan_out_table = self._read_fan_out_table(0)
 342
 343     def _unpack_entry(self, i):
 344         (offset, name) = unpack_from(">L20s", self._contents,
 345             (0x100 * 4) + (i * 24))
 346         return (name, offset, None)
 347
 348     def _unpack_name(self, i):
 349         offset = (0x100 * 4) + (i * 24) + 4
 350         return self._contents[offset:offset+20]
 351
 352     def _unpack_offset(self, i):
 353         offset = (0x100 * 4) + (i * 24)
 354         return unpack_from(">L", self._contents, offset)[0]
 355
 356     def _unpack_crc32_checksum(self, i):
 357         # Not stored in v1 index files
 358         return None
 359
 360
 361 class PackIndex2(PackIndex):
 362     """Version 2 Pack Index."""
 363
 364     def __init__(self, filename, file=None, size=None):
 365         PackIndex.__init__(self, filename, file, size)
 366         assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
 367         (self.version, ) = unpack_from(">L", self._contents, 4)
 368         assert self.version == 2, "Version was %d" % self.version
 369         self._fan_out_table = self._read_fan_out_table(8)
 370         self._name_table_offset = 8 + 0x100 * 4
 371         self._crc32_table_offset = self._name_table_offset + 20 * len(self)
 372         self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
 373
 374     def _unpack_entry(self, i):
 375         return (self._unpack_name(i), self._unpack_offset(i),
 376                 self._unpack_crc32_checksum(i))
 377
 378     def _unpack_name(self, i):
 379         offset = self._name_table_offset + i * 20
 380         return self._contents[offset:offset+20]
 381
 382     def _unpack_offset(self, i):
 383         offset = self._pack_offset_table_offset + i * 4
 384         return unpack_from(">L", self._contents, offset)[0]
 385
 386     def _unpack_crc32_checksum(self, i):
 387         return unpack_from(">L", self._contents,
 388                           self._crc32_table_offset + i * 4)[0]
 389
 390
 391
 392 def read_pack_header(f):
 393     """Read the header of a pack file.
 394
 395     :param f: File-like object to read from
 396     """
 397     header = f.read(12)
 398     assert header[:4] == "PACK"
 399     (version,) = unpack_from(">L", header, 4)
 400     assert version in (2, 3), "Version was %d" % version
 401     (num_objects,) = unpack_from(">L", header, 8)
 402     return (version, num_objects)
 403
 404
 405 def chunks_length(chunks):
 406     return sum(imap(len, chunks))
 407
 408
 409 def unpack_object(read):
 410     """Unpack a Git object.
 411
 412     :return: tuple with type, uncompressed data as chunks, compressed size and
 413         tail data
 414     """
 415     bytes = take_msb_bytes(read)
 416     type = (bytes[0] >> 4) & 0x07
 417     size = bytes[0] & 0x0f
 418     for i, byte in enumerate(bytes[1:]):
 419         size += (byte & 0x7f) << ((i * 7) + 4)
 420     raw_base = len(bytes)
 421     if type == 6: # offset delta
 422         bytes = take_msb_bytes(read)
 423         raw_base += len(bytes)
 424         assert not (bytes[-1] & 0x80)
 425         delta_base_offset = bytes[0] & 0x7f
 426         for byte in bytes[1:]:
 427             delta_base_offset += 1
 428             delta_base_offset <<= 7
 429             delta_base_offset += (byte & 0x7f)
 430         uncomp, comp_len, unused = read_zlib_chunks(read, size)
 431         assert size == chunks_length(uncomp)
 432         return type, (delta_base_offset, uncomp), comp_len+raw_base, unused
 433     elif type == 7: # ref delta
 434         basename = read(20)
 435         raw_base += 20
 436         uncomp, comp_len, unused = read_zlib_chunks(read, size)
 437         assert size == chunks_length(uncomp)
 438         return type, (basename, uncomp), comp_len+raw_base, unused
 439     else:
 440         uncomp, comp_len, unused = read_zlib_chunks(read, size)
 441         assert chunks_length(uncomp) == size
 442         return type, uncomp, comp_len+raw_base, unused
 443
 444
 445 def _compute_object_size((num, obj)):
 446     """Compute the size of a unresolved object for use with LRUSizeCache.
 447     """
 448     if num in (6, 7):
 449         return len(obj[1])
 450     assert isinstance(obj, str)
 451     return len(obj)
 452
 453
 454 class PackData(object):
 455     """The data contained in a packfile.
 456
 457     Pack files can be accessed both sequentially for exploding a pack, and
 458     directly with the help of an index to retrieve a specific object.
 459
 460     The objects within are either complete or a delta aginst another.
 461
 462     The header is variable length. If the MSB of each byte is set then it
 463     indicates that the subsequent byte is still part of the header.
 464     For the first byte the next MS bits are the type, which tells you the type
 465     of object, and whether it is a delta. The LS byte is the lowest bits of the
 466     size. For each subsequent byte the LS 7 bits are the next MS bits of the
 467     size, i.e. the last byte of the header contains the MS bits of the size.
 468
 469     For the complete objects the data is stored as zlib deflated data.
 470     The size in the header is the uncompressed object size, so to uncompress
 471     you need to just keep feeding data to zlib until you get an object back,
 472     or it errors on bad data. This is done here by just giving the complete
 473     buffer from the start of the deflated object on. This is bad, but until I
 474     get mmap sorted out it will have to do.
 475
 476     Currently there are no integrity checks done. Also no attempt is made to try
 477     and detect the delta case, or a request for an object at the wrong position.
 478     It will all just throw a zlib or KeyError.
 479     """
 480
 481     def __init__(self, filename, file=None, size=None):
 482         """Create a PackData object that represents the pack in the given filename.
 483
 484         The file must exist and stay readable until the object is disposed of. It
 485         must also stay the same size. It will be mapped whenever needed.
 486
 487         Currently there is a restriction on the size of the pack as the python
 488         mmap implementation is flawed.
 489         """
 490         self._filename = filename
 491         self._size = size
 492         self._header_size = 12
 493         if file is None:
 494             self._file = GitFile(self._filename, 'rb')
 495         else:
 496             self._file = file
 497         (version, self._num_objects) = read_pack_header(self._file)
 498         self._offset_cache = LRUSizeCache(1024*1024*20,
 499             compute_size=_compute_object_size)
 500
 501     @classmethod
 502     def from_file(cls, file, size):
 503         return cls(str(file), file=file, size=size)
 504
 505     @classmethod
 506     def from_path(cls, path):
 507         return cls(filename=path)
 508
 509     def close(self):
 510         self._file.close()
 511
 512     def _get_size(self):
 513         if self._size is not None:
 514             return self._size
 515         self._size = os.path.getsize(self._filename)
 516         assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (self._filename, self._size, self._header_size)
 517         return self._size
 518
 519     def __len__(self):
 520         """Returns the number of objects in this pack."""
 521         return self._num_objects
 522
 523     def calculate_checksum(self):
 524         """Calculate the checksum for this pack.
 525
 526         :return: 20-byte binary SHA1 digest
 527         """
 528         s = make_sha()
 529         self._file.seek(0)
 530         todo = self._get_size() - 20
 531         while todo > 0:
 532             x = self._file.read(min(todo, 1<<16))
 533             s.update(x)
 534             todo -= len(x)
 535         return s.digest()
 536
 537     def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
 538         """Resolve an object, possibly resolving deltas when necessary.
 539
 540         :return: Tuple with object type and contents.
 541         """
 542         if type not in (6, 7): # Not a delta
 543             return type, obj
 544
 545         if get_offset is None:
 546             get_offset = self.get_object_at
 547
 548         if type == 6: # offset delta
 549             (delta_offset, delta) = obj
 550             assert isinstance(delta_offset, int)
 551             assert isinstance(delta, str)
 552             base_offset = offset-delta_offset
 553             type, base_obj = get_offset(base_offset)
 554             assert isinstance(type, int)
 555         elif type == 7: # ref delta
 556             (basename, delta) = obj
 557             assert isinstance(basename, str) and len(basename) == 20
 558             assert isinstance(delta, str)
 559             type, base_obj = get_ref(basename)
 560             assert isinstance(type, int)
 561             # Can't be a ofs delta, as we wouldn't know the base offset
 562             assert type != 6
 563             base_offset = None
 564         type, base_chunks = self.resolve_object(base_offset, type, base_obj,
 565             get_ref)
 566         if base_offset is not None:
 567             self._offset_cache[base_offset] = type, base_chunks
 568         return (type, apply_delta(base_chunks, delta))
 569
 570     def iterobjects(self, progress=None):
 571
 572         class ObjectIterator(object):
 573
 574             def __init__(self, pack):
 575                 self.i = 0
 576                 self.offset = pack._header_size
 577                 self.num = len(pack)
 578                 self.map = pack._file
 579
 580             def __iter__(self):
 581                 return self
 582
 583             def __len__(self):
 584                 return self.num
 585
 586             def next(self):
 587                 if self.i == self.num:
 588                     raise StopIteration
 589                 self.map.seek(self.offset)
 590                 (type, obj, total_size, unused) = unpack_object(self.map.read)
 591                 self.map.seek(self.offset)
 592                 crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff
 593                 ret = (self.offset, type, obj, crc32)
 594                 self.offset += total_size
 595                 if progress:
 596                     progress(self.i, self.num)
 597                 self.i+=1
 598                 return ret
 599         return ObjectIterator(self)
 600
 601     def iterentries(self, ext_resolve_ref=None, progress=None):
 602         """Yield entries summarizing the contents of this pack.
 603
 604         :param ext_resolve_ref: Optional function to resolve base
 605             objects (in case this is a thin pack)
 606         :param progress: Progress function, called with current and
 607             total object count.
 608
 609         This will yield tuples with (sha, offset, crc32)
 610         """
 611         found = {}
 612         postponed = defaultdict(list)
 613         class Postpone(Exception):
 614             """Raised to postpone delta resolving."""
 615
 616         def get_ref_text(sha):
 617             assert len(sha) == 20
 618             if sha in found:
 619                 return self.get_object_at(found[sha])
 620             if ext_resolve_ref:
 621                 try:
 622                     return ext_resolve_ref(sha)
 623                 except KeyError:
 624                     pass
 625             raise Postpone, (sha, )
 626         extra = []
 627         todo = chain(self.iterobjects(progress=progress), extra)
 628         for (offset, type, obj, crc32) in todo:
 629             assert isinstance(offset, int)
 630             assert isinstance(type, int)
 631             assert isinstance(obj, list) or isinstance(obj, str)
 632             try:
 633                 type, obj = self.resolve_object(offset, type, obj, get_ref_text)
 634             except Postpone, (sha, ):
 635                 postponed[sha].append((offset, type, obj))
 636             else:
 637                 shafile = ShaFile.from_raw_chunks(type, obj)
 638                 sha = shafile.sha().digest()
 639                 found[sha] = offset
 640                 yield sha, offset, crc32
 641                 extra.extend(postponed.get(sha, []))
 642         if postponed:
 643             raise KeyError([sha_to_hex(h) for h in postponed.keys()])
 644
 645     def sorted_entries(self, resolve_ext_ref=None, progress=None):
 646         """Return entries in this pack, sorted by SHA.
 647
 648         :param ext_resolve_ref: Optional function to resolve base
 649             objects (in case this is a thin pack)
 650         :param progress: Progress function, called with current and
 651             total object count.
 652         :return: List of tuples with (sha, offset, crc32)
 653         """
 654         ret = list(self.iterentries(resolve_ext_ref, progress=progress))
 655         ret.sort()
 656         return ret
 657
 658     def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
 659         """Create a version 1 file for this data file.
 660
 661         :param filename: Index filename.
 662         :param resolve_ext_ref: Function to use for resolving externally referenced
 663             SHA1s (for thin packs)
 664         :param progress: Progress report function
 665         """
 666         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 667         write_pack_index_v1(filename, entries, self.calculate_checksum())
 668
 669     def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
 670         """Create a version 2 index file for this data file.
 671
 672         :param filename: Index filename.
 673         :param resolve_ext_ref: Function to use for resolving externally referenced
 674             SHA1s (for thin packs)
 675         :param progress: Progress report function
 676         """
 677         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 678         write_pack_index_v2(filename, entries, self.calculate_checksum())
 679
 680     def create_index(self, filename, resolve_ext_ref=None, progress=None,
 681                      version=2):
 682         """Create an  index file for this data file.
 683
 684         :param filename: Index filename.
 685         :param resolve_ext_ref: Function to use for resolving externally referenced
 686             SHA1s (for thin packs)
 687         :param progress: Progress report function
 688         """
 689         if version == 1:
 690             self.create_index_v1(filename, resolve_ext_ref, progress)
 691         elif version == 2:
 692             self.create_index_v2(filename, resolve_ext_ref, progress)
 693         else:
 694             raise ValueError("unknown index format %d" % version)
 695
 696     def get_stored_checksum(self):
 697         """Return the expected checksum stored in this pack."""
 698         self._file.seek(self._get_size()-20)
 699         return self._file.read(20)
 700
 701     def check(self):
 702         """Check the consistency of this pack."""
 703         return (self.calculate_checksum() == self.get_stored_checksum())
 704
 705     def get_object_at(self, offset):
 706         """Given an offset in to the packfile return the object that is there.
 707
 708         Using the associated index the location of an object can be looked up, and
 709         then the packfile can be asked directly for that object using this
 710         function.
 711         """
 712         if offset in self._offset_cache:
 713             return self._offset_cache[offset]
 714         assert isinstance(offset, long) or isinstance(offset, int),\
 715                 "offset was %r" % offset
 716         assert offset >= self._header_size
 717         self._file.seek(offset)
 718         return unpack_object(self._file.read)[:2]
 719
 720
 721 class SHA1Reader(object):
 722     """Wrapper around a file-like object that remembers the SHA1 of
 723     the data read from it."""
 724
 725     def __init__(self, f):
 726         self.f = f
 727         self.sha1 = make_sha("")
 728
 729     def read(self, num=None):
 730         data = self.f.read(num)
 731         self.sha1.update(data)
 732         return data
 733
 734     def check_sha(self):
 735         stored = self.f.read(20)
 736         if stored != self.sha1.digest():
 737             raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
 738
 739     def close(self):
 740         return self.f.close()
 741
 742     def tell(self):
 743         return self.f.tell()
 744
 745
 746 class SHA1Writer(object):
 747     """Wrapper around a file-like object that remembers the SHA1 of
 748     the data written to it."""
 749
 750     def __init__(self, f):
 751         self.f = f
 752         self.sha1 = make_sha("")
 753
 754     def write(self, data):
 755         self.sha1.update(data)
 756         self.f.write(data)
 757
 758     def write_sha(self):
 759         sha = self.sha1.digest()
 760         assert len(sha) == 20
 761         self.f.write(sha)
 762         return sha
 763
 764     def close(self):
 765         sha = self.write_sha()
 766         self.f.close()
 767         return sha
 768
 769     def tell(self):
 770         return self.f.tell()
 771
 772
 773 def write_pack_object(f, type, object):
 774     """Write pack object to a file.
 775
 776     :param f: File to write to
 777     :param o: Object to write
 778     :return: Tuple with offset at which the object was written, and crc32
 779     """
 780     offset = f.tell()
 781     packed_data_hdr = ""
 782     if type == 6: # offset delta
 783         (delta_base_offset, object) = object
 784     elif type == 7: # ref delta
 785         (basename, object) = object
 786     size = len(object)
 787     c = (type << 4) | (size & 15)
 788     size >>= 4
 789     while size:
 790         packed_data_hdr += (chr(c | 0x80))
 791         c = size & 0x7f
 792         size >>= 7
 793     packed_data_hdr += chr(c)
 794     if type == 6: # offset delta
 795         ret = [delta_base_offset & 0x7f]
 796         delta_base_offset >>= 7
 797         while delta_base_offset:
 798             delta_base_offset -= 1
 799             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
 800             delta_base_offset >>= 7
 801         packed_data_hdr += "".join([chr(x) for x in ret])
 802     elif type == 7: # ref delta
 803         assert len(basename) == 20
 804         packed_data_hdr += basename
 805     packed_data = packed_data_hdr + zlib.compress(object)
 806     f.write(packed_data)
 807     return (offset, (zlib.crc32(packed_data) & 0xffffffff))
 808
 809
 810 def write_pack(filename, objects, num_objects):
 811     """Write a new pack data file.
 812
 813     :param filename: Path to the new pack file (without .pack extension)
 814     :param objects: Iterable over (object, path) tuples to write
 815     :param num_objects: Number of objects to write
 816     """
 817     f = GitFile(filename + ".pack", 'wb')
 818     try:
 819         entries, data_sum = write_pack_data(f, objects, num_objects)
 820     finally:
 821         f.close()
 822     entries.sort()
 823     write_pack_index_v2(filename + ".idx", entries, data_sum)
 824
 825
 826 def write_pack_data(f, objects, num_objects, window=10):
 827     """Write a new pack file.
 828
 829     :param filename: The filename of the new pack file.
 830     :param objects: List of objects to write (tuples with object and path)
 831     :return: List with (name, offset, crc32 checksum) entries, pack checksum
 832     """
 833     recency = list(objects)
 834     # FIXME: Somehow limit delta depth
 835     # FIXME: Make thin-pack optional (its not used when cloning a pack)
 836     # Build a list of objects ordered by the magic Linus heuristic
 837     # This helps us find good objects to diff against us
 838     magic = []
 839     for obj, path in recency:
 840         magic.append( (obj.type, path, 1, -obj.raw_length(), obj) )
 841     magic.sort()
 842     # Build a map of objects and their index in magic - so we can find preceeding objects
 843     # to diff against
 844     offs = {}
 845     for i in range(len(magic)):
 846         offs[magic[i][4]] = i
 847     # Write the pack
 848     entries = []
 849     f = SHA1Writer(f)
 850     f.write("PACK")               # Pack header
 851     f.write(struct.pack(">L", 2)) # Pack version
 852     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
 853     for o, path in recency:
 854         sha1 = o.sha().digest()
 855         orig_t = o.type
 856         raw = o.as_raw_string()
 857         winner = raw
 858         t = orig_t
 859         #for i in range(offs[o]-window, window):
 860         #    if i < 0 or i >= len(offs): continue
 861         #    b = magic[i][4]
 862         #    if b.type != orig_t: continue
 863         #    base = b.as_raw_string()
 864         #    delta = create_delta(base, raw)
 865         #    if len(delta) < len(winner):
 866         #        winner = delta
 867         #        t = 6 if magic[i][2] == 1 else 7
 868         offset, crc32 = write_pack_object(f, t, winner)
 869         entries.append((sha1, offset, crc32))
 870     return entries, f.write_sha()
 871
 872
 873 def write_pack_index_v1(filename, entries, pack_checksum):
 874     """Write a new pack index file.
 875
 876     :param filename: The filename of the new pack index file.
 877     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 878             crc32_checksum.
 879     :param pack_checksum: Checksum of the pack file.
 880     """
 881     f = GitFile(filename, 'wb')
 882     f = SHA1Writer(f)
 883     fan_out_table = defaultdict(lambda: 0)
 884     for (name, offset, entry_checksum) in entries:
 885         fan_out_table[ord(name[0])] += 1
 886     # Fan-out table
 887     for i in range(0x100):
 888         f.write(struct.pack(">L", fan_out_table[i]))
 889         fan_out_table[i+1] += fan_out_table[i]
 890     for (name, offset, entry_checksum) in entries:
 891         f.write(struct.pack(">L20s", offset, name))
 892     assert len(pack_checksum) == 20
 893     f.write(pack_checksum)
 894     f.close()
 895
 896
 897 def create_delta(base_buf, target_buf):
 898     """Use python difflib to work out how to transform base_buf to target_buf.
 899
 900     :param base_buf: Base buffer
 901     :param target_buf: Target buffer
 902     """
 903     assert isinstance(base_buf, str)
 904     assert isinstance(target_buf, str)
 905     out_buf = ""
 906     # write delta header
 907     def encode_size(size):
 908         ret = ""
 909         c = size & 0x7f
 910         size >>= 7
 911         while size:
 912             ret += chr(c | 0x80)
 913             c = size & 0x7f
 914             size >>= 7
 915         ret += chr(c)
 916         return ret
 917     out_buf += encode_size(len(base_buf))
 918     out_buf += encode_size(len(target_buf))
 919     # write out delta opcodes
 920     seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
 921     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
 922         # Git patch opcodes don't care about deletes!
 923         #if opcode == "replace" or opcode == "delete":
 924         #    pass
 925         if opcode == "equal":
 926             # If they are equal, unpacker will use data from base_buf
 927             # Write out an opcode that says what range to use
 928             scratch = ""
 929             op = 0x80
 930             o = i1
 931             for i in range(4):
 932                 if o & 0xff << i*8:
 933                     scratch += chr((o >> i*8) & 0xff)
 934                     op |= 1 << i
 935             s = i2 - i1
 936             for i in range(2):
 937                 if s & 0xff << i*8:
 938                     scratch += chr((s >> i*8) & 0xff)
 939                     op |= 1 << (4+i)
 940             out_buf += chr(op)
 941             out_buf += scratch
 942         if opcode == "replace" or opcode == "insert":
 943             # If we are replacing a range or adding one, then we just
 944             # output it to the stream (prefixed by its size)
 945             s = j2 - j1
 946             o = j1
 947             while s > 127:
 948                 out_buf += chr(127)
 949                 out_buf += target_buf[o:o+127]
 950                 s -= 127
 951                 o += 127
 952             out_buf += chr(s)
 953             out_buf += target_buf[o:o+s]
 954     return out_buf
 955
 956
 957 def apply_delta(src_buf, delta):
 958     """Based on the similar function in git's patch-delta.c.
 959
 960     :param src_buf: Source buffer
 961     :param delta: Delta instructions
 962     """
 963     if type(src_buf) != str:
 964         src_buf = "".join(src_buf)
 965     assert isinstance(delta, str)
 966     out = []
 967     index = 0
 968     delta_length = len(delta)
 969     def get_delta_header_size(delta, index):
 970         size = 0
 971         i = 0
 972         while delta:
 973             cmd = ord(delta[index])
 974             index += 1
 975             size |= (cmd & ~0x80) << i
 976             i += 7
 977             if not cmd & 0x80:
 978                 break
 979         return size, index
 980     src_size, index = get_delta_header_size(delta, index)
 981     dest_size, index = get_delta_header_size(delta, index)
 982     assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
 983     while index < delta_length:
 984         cmd = ord(delta[index])
 985         index += 1
 986         if cmd & 0x80:
 987             cp_off = 0
 988             for i in range(4):
 989                 if cmd & (1 << i):
 990                     x = ord(delta[index])
 991                     index += 1
 992                     cp_off |= x << (i * 8)
 993             cp_size = 0
 994             for i in range(3):
 995                 if cmd & (1 << (4+i)):
 996                     x = ord(delta[index])
 997                     index += 1
 998                     cp_size |= x << (i * 8)
 999             if cp_size == 0:
1000                 cp_size = 0x10000
1001             if (cp_off + cp_size < cp_size or
1002                 cp_off + cp_size > src_size or
1003                 cp_size > dest_size):
1004                 break
1005             out.append(src_buf[cp_off:cp_off+cp_size])
1006         elif cmd != 0:
1007             out.append(delta[index:index+cmd])
1008             index += cmd
1009         else:
1010             raise ApplyDeltaError("Invalid opcode 0")
1011
1012     if index != delta_length:
1013         raise ApplyDeltaError("delta not empty: %r" % delta[index:])
1014
1015     if dest_size != chunks_length(out):
1016         raise ApplyDeltaError("dest size incorrect")
1017
1018     return out
1019
1020
1021 def write_pack_index_v2(filename, entries, pack_checksum):
1022     """Write a new pack index file.
1023
1024     :param filename: The filename of the new pack index file.
1025     :param entries: List of tuples with object name (sha), offset_in_pack,  and
1026             crc32_checksum.
1027     :param pack_checksum: Checksum of the pack file.
1028     """
1029     f = GitFile(filename, 'wb')
1030     f = SHA1Writer(f)
1031     f.write('\377tOc') # Magic!
1032     f.write(struct.pack(">L", 2))
1033     fan_out_table = defaultdict(lambda: 0)
1034     for (name, offset, entry_checksum) in entries:
1035         fan_out_table[ord(name[0])] += 1
1036     # Fan-out table
1037     for i in range(0x100):
1038         f.write(struct.pack(">L", fan_out_table[i]))
1039         fan_out_table[i+1] += fan_out_table[i]
1040     for (name, offset, entry_checksum) in entries:
1041         f.write(name)
1042     for (name, offset, entry_checksum) in entries:
1043         f.write(struct.pack(">L", entry_checksum))
1044     for (name, offset, entry_checksum) in entries:
1045         # FIXME: handle if MSBit is set in offset
1046         f.write(struct.pack(">L", offset))
1047     # FIXME: handle table for pack files > 8 Gb
1048     assert len(pack_checksum) == 20
1049     f.write(pack_checksum)
1050     f.close()
1051
1052
1053 class Pack(object):
1054     """A Git pack object."""
1055
1056     def __init__(self, basename):
1057         self._basename = basename
1058         self._data_path = self._basename + ".pack"
1059         self._idx_path = self._basename + ".idx"
1060         self._data = None
1061         self._idx = None
1062
1063     @classmethod
1064     def from_objects(self, data, idx):
1065         """Create a new pack object from pack data and index objects."""
1066         ret = Pack("")
1067         ret._data = data
1068         ret._idx = idx
1069         return ret
1070
1071     def name(self):
1072         """The SHA over the SHAs of the objects in this pack."""
1073         return self.index.objects_sha1()
1074
1075     @property
1076     def data(self):
1077         """The pack data object being used."""
1078         if self._data is None:
1079             self._data = PackData(self._data_path)
1080             assert len(self.index) == len(self._data)
1081             idx_stored_checksum = self.index.get_pack_checksum()
1082             data_stored_checksum = self._data.get_stored_checksum()
1083             if idx_stored_checksum != data_stored_checksum:
1084                 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
1085                                        sha_to_hex(data_stored_checksum))
1086         return self._data
1087
1088     @property
1089     def index(self):
1090         """The index being used.
1091
1092         :note: This may be an in-memory index
1093         """
1094         if self._idx is None:
1095             self._idx = load_pack_index(self._idx_path)
1096         return self._idx
1097
1098     def close(self):
1099         if self._data is not None:
1100             self._data.close()
1101         self.index.close()
1102
1103     def __eq__(self, other):
1104         return type(self) == type(other) and self.index == other.index
1105
1106     def __len__(self):
1107         """Number of entries in this pack."""
1108         return len(self.index)
1109
1110     def __repr__(self):
1111         return "%s(%r)" % (self.__class__.__name__, self._basename)
1112
1113     def __iter__(self):
1114         """Iterate over all the sha1s of the objects in this pack."""
1115         return iter(self.index)
1116
1117     def check(self):
1118         """Check the integrity of this pack."""
1119         if not self.index.check():
1120             return False
1121         if not self.data.check():
1122             return False
1123         return True
1124
1125     def get_stored_checksum(self):
1126         return self.data.get_stored_checksum()
1127
1128     def __contains__(self, sha1):
1129         """Check whether this pack contains a particular SHA1."""
1130         try:
1131             self.index.object_index(sha1)
1132             return True
1133         except KeyError:
1134             return False
1135
1136     def get_raw(self, sha1, resolve_ref=None):
1137         offset = self.index.object_index(sha1)
1138         obj_type, obj = self.data.get_object_at(offset)
1139         if type(offset) is long:
1140           offset = int(offset)
1141         if resolve_ref is None:
1142             resolve_ref = self.get_raw
1143         kind, chunks = self.data.resolve_object(offset, obj_type, obj,
1144             resolve_ref)
1145         return kind, "".join(chunks)
1146
1147     def __getitem__(self, sha1):
1148         """Retrieve the specified SHA1."""
1149         type, uncomp = self.get_raw(sha1)
1150         return ShaFile.from_raw_string(type, uncomp)
1151
1152     def iterobjects(self, get_raw=None):
1153         """Iterate over the objects in this pack."""
1154         if get_raw is None:
1155             get_raw = self.get_raw
1156         for offset, type, obj, crc32 in self.data.iterobjects():
1157             assert isinstance(offset, int)
1158             type, obj = self.data.resolve_object(offset, type, obj, get_raw)
1159             yield ShaFile.from_raw_chunks(type, obj)
1160
1161
1162 try:
1163     from dulwich._pack import apply_delta, bisect_find_sha
1164 except ImportError:
1165     pass