dulwich/pack.py

   1 # pack.py -- For dealing wih packed git objects.
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copryight (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20 """Classes for dealing with packed git objects.
  21
  22 A pack is a compact representation of a bunch of objects, stored
  23 using deltas where possible.
  24
  25 They have two parts, the pack file, which stores the data, and an index
  26 that tells you where the data is.
  27
  28 To find an object you look in all of the index files 'til you find a
  29 match for the object name. You then use the pointer got from this as
  30 a pointer in to the corresponding packfile.
  31 """
  32
  33 try:
  34     from collections import defaultdict
  35 except ImportError:
  36     from misc import defaultdict
  37
  38 import difflib
  39 from itertools import (
  40     chain,
  41     imap,
  42     izip,
  43     )
  44 import mmap
  45 import os
  46 import struct
  47 try:
  48     from struct import unpack_from
  49 except ImportError:
  50     from dulwich.misc import unpack_from
  51 import sys
  52 import zlib
  53
  54 from dulwich.errors import (
  55     ApplyDeltaError,
  56     ChecksumMismatch,
  57     )
  58 from dulwich.file import GitFile
  59 from dulwich.lru_cache import (
  60     LRUSizeCache,
  61     )
  62 from dulwich.objects import (
  63     ShaFile,
  64     hex_to_sha,
  65     sha_to_hex,
  66     )
  67 from dulwich.misc import (
  68     make_sha,
  69     )
  70
  71 supports_mmap_offset = (sys.version_info[0] >= 3 or
  72         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
  73
  74
  75 def take_msb_bytes(read):
  76     """Read bytes marked with most significant bit.
  77
  78     :param read: Read function
  79     """
  80     ret = []
  81     while len(ret) == 0 or ret[-1] & 0x80:
  82         ret.append(ord(read(1)))
  83     return ret
  84
  85
  86 def read_zlib_chunks(read, buffer_size=4096):
  87     """Read chunks of zlib data from a buffer.
  88
  89     :param read: Read function
  90     :return: Tuple with list of chunks, length of
  91         compressed data length and unused read data
  92     """
  93     obj = zlib.decompressobj()
  94     ret = []
  95     fed = 0
  96     while obj.unused_data == "":
  97         add = read(buffer_size)
  98         if len(add) < buffer_size:
  99             add += "Z"
 100         fed += len(add)
 101         ret.append(obj.decompress(add))
 102     comp_len = fed-len(obj.unused_data)
 103     return ret, comp_len, obj.unused_data
 104
 105
 106 def iter_sha1(iter):
 107     """Return the hexdigest of the SHA1 over a set of names.
 108
 109     :param iter: Iterator over string objects
 110     :return: 40-byte hex sha1 digest
 111     """
 112     sha1 = make_sha()
 113     for name in iter:
 114         sha1.update(name)
 115     return sha1.hexdigest()
 116
 117
 118 def load_pack_index(path):
 119     """Load an index file by path.
 120
 121     :param filename: Path to the index file
 122     """
 123     f = GitFile(path, 'rb')
 124     return load_pack_index_file(path, f)
 125
 126
 127 def load_pack_index_file(path, f):
 128     """Load an index file from a file-like object.
 129
 130     :param path: Path for the index file
 131     :param f: File-like object
 132     """
 133     if f.read(4) == '\377tOc':
 134         version = struct.unpack(">L", f.read(4))[0]
 135         if version == 2:
 136             f.seek(0)
 137             return PackIndex2(path, file=f)
 138         else:
 139             raise KeyError("Unknown pack index format %d" % version)
 140     else:
 141         f.seek(0)
 142         return PackIndex1(path, file=f)
 143
 144
 145 def bisect_find_sha(start, end, sha, unpack_name):
 146     """Find a SHA in a data blob with sorted SHAs.
 147
 148     :param start: Start index of range to search
 149     :param end: End index of range to search
 150     :param sha: Sha to find
 151     :param unpack_name: Callback to retrieve SHA by index
 152     :return: Index of the SHA, or None if it wasn't found
 153     """
 154     assert start <= end
 155     while start <= end:
 156         i = (start + end)/2
 157         file_sha = unpack_name(i)
 158         x = cmp(file_sha, sha)
 159         if x < 0:
 160             start = i + 1
 161         elif x > 0:
 162             end = i - 1
 163         else:
 164             return i
 165     return None
 166
 167
 168 class PackIndex(object):
 169     """An index in to a packfile.
 170
 171     Given a sha id of an object a pack index can tell you the location in the
 172     packfile of that object if it has it.
 173
 174     To do the loop it opens the file, and indexes first 256 4 byte groups
 175     with the first byte of the sha id. The value in the four byte group indexed
 176     is the end of the group that shares the same starting byte. Subtract one
 177     from the starting byte and index again to find the start of the group.
 178     The values are sorted by sha id within the group, so do the math to find
 179     the start and end offset and then bisect in to find if the value is present.
 180     """
 181
 182     def __init__(self, filename, file=None, size=None):
 183         """Create a pack index object.
 184
 185         Provide it with the name of the index file to consider, and it will map
 186         it whenever required.
 187         """
 188         self._filename = filename
 189         # Take the size now, so it can be checked each time we map the file to
 190         # ensure that it hasn't changed.
 191         if file is None:
 192             self._file = GitFile(filename, 'rb')
 193         else:
 194             self._file = file
 195         fileno = getattr(self._file, 'fileno', None)
 196         if fileno is not None:
 197             fd = self._file.fileno()
 198             if size is None:
 199                 self._size = os.fstat(fd).st_size
 200             else:
 201                 self._size = size
 202             self._contents = mmap.mmap(fd, self._size,
 203                 access=mmap.ACCESS_READ)
 204         else:
 205             self._file.seek(0)
 206             self._contents = self._file.read()
 207             self._size = len(self._contents)
 208
 209     def __eq__(self, other):
 210         if not isinstance(other, PackIndex):
 211             return False
 212
 213         if self._fan_out_table != other._fan_out_table:
 214             return False
 215
 216         for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
 217             if name1 != name2:
 218                 return False
 219         return True
 220
 221     def __ne__(self, other):
 222         return not self.__eq__(other)
 223
 224     def close(self):
 225         self._file.close()
 226
 227     def __len__(self):
 228         """Return the number of entries in this pack index."""
 229         return self._fan_out_table[-1]
 230
 231     def _unpack_entry(self, i):
 232         """Unpack the i-th entry in the index file.
 233
 234         :return: Tuple with object name (SHA), offset in pack file and
 235               CRC32 checksum (if known)."""
 236         raise NotImplementedError(self._unpack_entry)
 237
 238     def _unpack_name(self, i):
 239         """Unpack the i-th name from the index file."""
 240         raise NotImplementedError(self._unpack_name)
 241
 242     def _unpack_offset(self, i):
 243         """Unpack the i-th object offset from the index file."""
 244         raise NotImplementedError(self._unpack_offset)
 245
 246     def _unpack_crc32_checksum(self, i):
 247         """Unpack the crc32 checksum for the i-th object from the index file."""
 248         raise NotImplementedError(self._unpack_crc32_checksum)
 249
 250     def __iter__(self):
 251         """Iterate over the SHAs in this pack."""
 252         return imap(sha_to_hex, self._itersha())
 253
 254     def _itersha(self):
 255         for i in range(len(self)):
 256             yield self._unpack_name(i)
 257
 258     def objects_sha1(self):
 259         """Return the hex SHA1 over all the shas of all objects in this pack.
 260
 261         :note: This is used for the filename of the pack.
 262         """
 263         return iter_sha1(self._itersha())
 264
 265     def iterentries(self):
 266         """Iterate over the entries in this pack index.
 267
 268         Will yield tuples with object name, offset in packfile and crc32 checksum.
 269         """
 270         for i in range(len(self)):
 271             yield self._unpack_entry(i)
 272
 273     def _read_fan_out_table(self, start_offset):
 274         ret = []
 275         for i in range(0x100):
 276             ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
 277         return ret
 278
 279     def check(self):
 280         """Check that the stored checksum matches the actual checksum."""
 281         # TODO: Check pack contents, too
 282         return self.calculate_checksum() == self.get_stored_checksum()
 283
 284     def calculate_checksum(self):
 285         """Calculate the SHA1 checksum over this pack index.
 286
 287         :return: This is a 20-byte binary digest
 288         """
 289         return make_sha(self._contents[:-20]).digest()
 290
 291     def get_pack_checksum(self):
 292         """Return the SHA1 checksum stored for the corresponding packfile.
 293
 294         :return: 20-byte binary digest
 295         """
 296         return str(self._contents[-40:-20])
 297
 298     def get_stored_checksum(self):
 299         """Return the SHA1 checksum stored for this index.
 300
 301         :return: 20-byte binary digest
 302         """
 303         return str(self._contents[-20:])
 304
 305     def object_index(self, sha):
 306         """Return the index in to the corresponding packfile for the object.
 307
 308         Given the name of an object it will return the offset that object lives
 309         at within the corresponding pack file. If the pack file doesn't have the
 310         object then None will be returned.
 311         """
 312         if len(sha) == 40:
 313             sha = hex_to_sha(sha)
 314         return self._object_index(sha)
 315
 316     def _object_index(self, sha):
 317         """See object_index.
 318
 319         :param sha: A *binary* SHA string. (20 characters long)_
 320         """
 321         assert len(sha) == 20
 322         idx = ord(sha[0])
 323         if idx == 0:
 324             start = 0
 325         else:
 326             start = self._fan_out_table[idx-1]
 327         end = self._fan_out_table[idx]
 328         i = bisect_find_sha(start, end, sha, self._unpack_name)
 329         if i is None:
 330             raise KeyError(sha)
 331         return self._unpack_offset(i)
 332
 333
 334
 335 class PackIndex1(PackIndex):
 336     """Version 1 Pack Index."""
 337
 338     def __init__(self, filename, file=None, size=None):
 339         PackIndex.__init__(self, filename, file, size)
 340         self.version = 1
 341         self._fan_out_table = self._read_fan_out_table(0)
 342
 343     def _unpack_entry(self, i):
 344         (offset, name) = unpack_from(">L20s", self._contents,
 345             (0x100 * 4) + (i * 24))
 346         return (name, offset, None)
 347
 348     def _unpack_name(self, i):
 349         offset = (0x100 * 4) + (i * 24) + 4
 350         return self._contents[offset:offset+20]
 351
 352     def _unpack_offset(self, i):
 353         offset = (0x100 * 4) + (i * 24)
 354         return unpack_from(">L", self._contents, offset)[0]
 355
 356     def _unpack_crc32_checksum(self, i):
 357         # Not stored in v1 index files
 358         return None
 359
 360
 361 class PackIndex2(PackIndex):
 362     """Version 2 Pack Index."""
 363
 364     def __init__(self, filename, file=None, size=None):
 365         PackIndex.__init__(self, filename, file, size)
 366         assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
 367         (self.version, ) = unpack_from(">L", self._contents, 4)
 368         assert self.version == 2, "Version was %d" % self.version
 369         self._fan_out_table = self._read_fan_out_table(8)
 370         self._name_table_offset = 8 + 0x100 * 4
 371         self._crc32_table_offset = self._name_table_offset + 20 * len(self)
 372         self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
 373
 374     def _unpack_entry(self, i):
 375         return (self._unpack_name(i), self._unpack_offset(i),
 376                 self._unpack_crc32_checksum(i))
 377
 378     def _unpack_name(self, i):
 379         offset = self._name_table_offset + i * 20
 380         return self._contents[offset:offset+20]
 381
 382     def _unpack_offset(self, i):
 383         offset = self._pack_offset_table_offset + i * 4
 384         return unpack_from(">L", self._contents, offset)[0]
 385
 386     def _unpack_crc32_checksum(self, i):
 387         return unpack_from(">L", self._contents,
 388                           self._crc32_table_offset + i * 4)[0]
 389
 390
 391
 392 def read_pack_header(f):
 393     """Read the header of a pack file.
 394
 395     :param f: File-like object to read from
 396     """
 397     header = f.read(12)
 398     assert header[:4] == "PACK"
 399     (version,) = unpack_from(">L", header, 4)
 400     assert version in (2, 3), "Version was %d" % version
 401     (num_objects,) = unpack_from(">L", header, 8)
 402     return (version, num_objects)
 403
 404
 405 def chunks_length(chunks):
 406     return sum(imap(len, chunks))
 407
 408
 409 def unpack_object(read):
 410     """Unpack a Git object.
 411
 412     :return: tuple with type, uncompressed data as chunks, compressed size and
 413         tail data
 414     """
 415     bytes = take_msb_bytes(read)
 416     type = (bytes[0] >> 4) & 0x07
 417     size = bytes[0] & 0x0f
 418     for i, byte in enumerate(bytes[1:]):
 419         size += (byte & 0x7f) << ((i * 7) + 4)
 420     raw_base = len(bytes)
 421     if type == 6: # offset delta
 422         bytes = take_msb_bytes(read)
 423         raw_base += len(bytes)
 424         assert not (bytes[-1] & 0x80)
 425         delta_base_offset = bytes[0] & 0x7f
 426         for byte in bytes[1:]:
 427             delta_base_offset += 1
 428             delta_base_offset <<= 7
 429             delta_base_offset += (byte & 0x7f)
 430         uncomp, comp_len, unused = read_zlib_chunks(read)
 431         assert size == chunks_length(uncomp)
 432         return type, (delta_base_offset, uncomp), comp_len+raw_base, unused
 433     elif type == 7: # ref delta
 434         basename = read(20)
 435         raw_base += 20
 436         uncomp, comp_len, unused = read_zlib_chunks(read)
 437         assert size == chunks_length(uncomp)
 438         return type, (basename, uncomp), comp_len+raw_base, unused
 439     else:
 440         uncomp, comp_len, unused = read_zlib_chunks(read)
 441         assert chunks_length(uncomp) == size
 442         return type, uncomp, comp_len+raw_base, unused
 443
 444
 445 def _compute_object_size((num, obj)):
 446     """Compute the size of a unresolved object for use with LRUSizeCache.
 447     """
 448     if num in (6, 7):
 449         return chunks_length(obj[1])
 450     return chunks_length(obj)
 451
 452
 453 class PackData(object):
 454     """The data contained in a packfile.
 455
 456     Pack files can be accessed both sequentially for exploding a pack, and
 457     directly with the help of an index to retrieve a specific object.
 458
 459     The objects within are either complete or a delta aginst another.
 460
 461     The header is variable length. If the MSB of each byte is set then it
 462     indicates that the subsequent byte is still part of the header.
 463     For the first byte the next MS bits are the type, which tells you the type
 464     of object, and whether it is a delta. The LS byte is the lowest bits of the
 465     size. For each subsequent byte the LS 7 bits are the next MS bits of the
 466     size, i.e. the last byte of the header contains the MS bits of the size.
 467
 468     For the complete objects the data is stored as zlib deflated data.
 469     The size in the header is the uncompressed object size, so to uncompress
 470     you need to just keep feeding data to zlib until you get an object back,
 471     or it errors on bad data. This is done here by just giving the complete
 472     buffer from the start of the deflated object on. This is bad, but until I
 473     get mmap sorted out it will have to do.
 474
 475     Currently there are no integrity checks done. Also no attempt is made to try
 476     and detect the delta case, or a request for an object at the wrong position.
 477     It will all just throw a zlib or KeyError.
 478     """
 479
 480     def __init__(self, filename, file=None, size=None):
 481         """Create a PackData object that represents the pack in the given filename.
 482
 483         The file must exist and stay readable until the object is disposed of. It
 484         must also stay the same size. It will be mapped whenever needed.
 485
 486         Currently there is a restriction on the size of the pack as the python
 487         mmap implementation is flawed.
 488         """
 489         self._filename = filename
 490         self._size = size
 491         self._header_size = 12
 492         if file is None:
 493             self._file = GitFile(self._filename, 'rb')
 494         else:
 495             self._file = file
 496         (version, self._num_objects) = read_pack_header(self._file)
 497         self._offset_cache = LRUSizeCache(1024*1024*20,
 498             compute_size=_compute_object_size)
 499
 500     @classmethod
 501     def from_file(cls, file, size):
 502         return cls(str(file), file=file, size=size)
 503
 504     @classmethod
 505     def from_path(cls, path):
 506         return cls(filename=path)
 507
 508     def close(self):
 509         self._file.close()
 510
 511     def _get_size(self):
 512         if self._size is not None:
 513             return self._size
 514         self._size = os.path.getsize(self._filename)
 515         assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (self._filename, self._size, self._header_size)
 516         return self._size
 517
 518     def __len__(self):
 519         """Returns the number of objects in this pack."""
 520         return self._num_objects
 521
 522     def calculate_checksum(self):
 523         """Calculate the checksum for this pack.
 524
 525         :return: 20-byte binary SHA1 digest
 526         """
 527         s = make_sha()
 528         self._file.seek(0)
 529         todo = self._get_size() - 20
 530         while todo > 0:
 531             x = self._file.read(min(todo, 1<<16))
 532             s.update(x)
 533             todo -= len(x)
 534         return s.digest()
 535
 536     def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
 537         """Resolve an object, possibly resolving deltas when necessary.
 538
 539         :return: Tuple with object type and contents.
 540         """
 541         if type not in (6, 7): # Not a delta
 542             return type, obj
 543
 544         if get_offset is None:
 545             get_offset = self.get_object_at
 546
 547         if type == 6: # offset delta
 548             (delta_offset, delta) = obj
 549             assert isinstance(delta_offset, int)
 550             base_offset = offset-delta_offset
 551             type, base_obj = get_offset(base_offset)
 552             assert isinstance(type, int)
 553         elif type == 7: # ref delta
 554             (basename, delta) = obj
 555             assert isinstance(basename, str) and len(basename) == 20
 556             type, base_obj = get_ref(basename)
 557             assert isinstance(type, int)
 558             # Can't be a ofs delta, as we wouldn't know the base offset
 559             assert type != 6
 560             base_offset = None
 561         type, base_chunks = self.resolve_object(base_offset, type, base_obj,
 562             get_ref)
 563         if base_offset is not None:
 564             self._offset_cache[base_offset] = type, base_chunks
 565         return (type, apply_delta(base_chunks, delta))
 566
 567     def iterobjects(self, progress=None):
 568
 569         class ObjectIterator(object):
 570
 571             def __init__(self, pack):
 572                 self.i = 0
 573                 self.offset = pack._header_size
 574                 self.num = len(pack)
 575                 self.map = pack._file
 576
 577             def __iter__(self):
 578                 return self
 579
 580             def __len__(self):
 581                 return self.num
 582
 583             def next(self):
 584                 if self.i == self.num:
 585                     raise StopIteration
 586                 self.map.seek(self.offset)
 587                 (type, obj, total_size, unused) = unpack_object(self.map.read)
 588                 self.map.seek(self.offset)
 589                 crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff
 590                 ret = (self.offset, type, obj, crc32)
 591                 self.offset += total_size
 592                 if progress:
 593                     progress(self.i, self.num)
 594                 self.i+=1
 595                 return ret
 596         return ObjectIterator(self)
 597
 598     def iterentries(self, ext_resolve_ref=None, progress=None):
 599         """Yield entries summarizing the contents of this pack.
 600
 601         :param ext_resolve_ref: Optional function to resolve base
 602             objects (in case this is a thin pack)
 603         :param progress: Progress function, called with current and
 604             total object count.
 605
 606         This will yield tuples with (sha, offset, crc32)
 607         """
 608         found = {}
 609         postponed = defaultdict(list)
 610         class Postpone(Exception):
 611             """Raised to postpone delta resolving."""
 612
 613         def get_ref_text(sha):
 614             assert len(sha) == 20
 615             if sha in found:
 616                 return self.get_object_at(found[sha])
 617             if ext_resolve_ref:
 618                 try:
 619                     return ext_resolve_ref(sha)
 620                 except KeyError:
 621                     pass
 622             raise Postpone, (sha, )
 623         extra = []
 624         todo = chain(self.iterobjects(progress=progress), extra)
 625         for (offset, type, obj, crc32) in todo:
 626             assert isinstance(offset, int)
 627             assert isinstance(type, int)
 628             assert isinstance(obj, list) or isinstance(obj, str)
 629             try:
 630                 type, obj = self.resolve_object(offset, type, obj, get_ref_text)
 631             except Postpone, (sha, ):
 632                 postponed[sha].append((offset, type, obj))
 633             else:
 634                 shafile = ShaFile.from_raw_chunks(type, obj)
 635                 sha = shafile.sha().digest()
 636                 found[sha] = offset
 637                 yield sha, offset, crc32
 638                 extra.extend(postponed.get(sha, []))
 639         if postponed:
 640             raise KeyError([sha_to_hex(h) for h in postponed.keys()])
 641
 642     def sorted_entries(self, resolve_ext_ref=None, progress=None):
 643         """Return entries in this pack, sorted by SHA.
 644
 645         :param ext_resolve_ref: Optional function to resolve base
 646             objects (in case this is a thin pack)
 647         :param progress: Progress function, called with current and
 648             total object count.
 649         :return: List of tuples with (sha, offset, crc32)
 650         """
 651         ret = list(self.iterentries(resolve_ext_ref, progress=progress))
 652         ret.sort()
 653         return ret
 654
 655     def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
 656         """Create a version 1 file for this data file.
 657
 658         :param filename: Index filename.
 659         :param resolve_ext_ref: Function to use for resolving externally referenced
 660             SHA1s (for thin packs)
 661         :param progress: Progress report function
 662         """
 663         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 664         write_pack_index_v1(filename, entries, self.calculate_checksum())
 665
 666     def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
 667         """Create a version 2 index file for this data file.
 668
 669         :param filename: Index filename.
 670         :param resolve_ext_ref: Function to use for resolving externally referenced
 671             SHA1s (for thin packs)
 672         :param progress: Progress report function
 673         """
 674         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 675         write_pack_index_v2(filename, entries, self.calculate_checksum())
 676
 677     def create_index(self, filename, resolve_ext_ref=None, progress=None,
 678                      version=2):
 679         """Create an  index file for this data file.
 680
 681         :param filename: Index filename.
 682         :param resolve_ext_ref: Function to use for resolving externally referenced
 683             SHA1s (for thin packs)
 684         :param progress: Progress report function
 685         """
 686         if version == 1:
 687             self.create_index_v1(filename, resolve_ext_ref, progress)
 688         elif version == 2:
 689             self.create_index_v2(filename, resolve_ext_ref, progress)
 690         else:
 691             raise ValueError("unknown index format %d" % version)
 692
 693     def get_stored_checksum(self):
 694         """Return the expected checksum stored in this pack."""
 695         self._file.seek(self._get_size()-20)
 696         return self._file.read(20)
 697
 698     def check(self):
 699         """Check the consistency of this pack."""
 700         return (self.calculate_checksum() == self.get_stored_checksum())
 701
 702     def get_object_at(self, offset):
 703         """Given an offset in to the packfile return the object that is there.
 704
 705         Using the associated index the location of an object can be looked up, and
 706         then the packfile can be asked directly for that object using this
 707         function.
 708         """
 709         if offset in self._offset_cache:
 710             return self._offset_cache[offset]
 711         assert isinstance(offset, long) or isinstance(offset, int),\
 712                 "offset was %r" % offset
 713         assert offset >= self._header_size
 714         self._file.seek(offset)
 715         return unpack_object(self._file.read)[:2]
 716
 717
 718 class SHA1Reader(object):
 719     """Wrapper around a file-like object that remembers the SHA1 of
 720     the data read from it."""
 721
 722     def __init__(self, f):
 723         self.f = f
 724         self.sha1 = make_sha("")
 725
 726     def read(self, num=None):
 727         data = self.f.read(num)
 728         self.sha1.update(data)
 729         return data
 730
 731     def check_sha(self):
 732         stored = self.f.read(20)
 733         if stored != self.sha1.digest():
 734             raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
 735
 736     def close(self):
 737         return self.f.close()
 738
 739     def tell(self):
 740         return self.f.tell()
 741
 742
 743 class SHA1Writer(object):
 744     """Wrapper around a file-like object that remembers the SHA1 of
 745     the data written to it."""
 746
 747     def __init__(self, f):
 748         self.f = f
 749         self.sha1 = make_sha("")
 750
 751     def write(self, data):
 752         self.sha1.update(data)
 753         self.f.write(data)
 754
 755     def write_sha(self):
 756         sha = self.sha1.digest()
 757         assert len(sha) == 20
 758         self.f.write(sha)
 759         return sha
 760
 761     def close(self):
 762         sha = self.write_sha()
 763         self.f.close()
 764         return sha
 765
 766     def tell(self):
 767         return self.f.tell()
 768
 769
 770 def write_pack_object(f, type, object):
 771     """Write pack object to a file.
 772
 773     :param f: File to write to
 774     :param o: Object to write
 775     :return: Tuple with offset at which the object was written, and crc32
 776     """
 777     offset = f.tell()
 778     packed_data_hdr = ""
 779     if type == 6: # offset delta
 780         (delta_base_offset, object) = object
 781     elif type == 7: # ref delta
 782         (basename, object) = object
 783     size = len(object)
 784     c = (type << 4) | (size & 15)
 785     size >>= 4
 786     while size:
 787         packed_data_hdr += (chr(c | 0x80))
 788         c = size & 0x7f
 789         size >>= 7
 790     packed_data_hdr += chr(c)
 791     if type == 6: # offset delta
 792         ret = [delta_base_offset & 0x7f]
 793         delta_base_offset >>= 7
 794         while delta_base_offset:
 795             delta_base_offset -= 1
 796             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
 797             delta_base_offset >>= 7
 798         packed_data_hdr += "".join([chr(x) for x in ret])
 799     elif type == 7: # ref delta
 800         assert len(basename) == 20
 801         packed_data_hdr += basename
 802     packed_data = packed_data_hdr + zlib.compress(object)
 803     f.write(packed_data)
 804     return (offset, (zlib.crc32(packed_data) & 0xffffffff))
 805
 806
 807 def write_pack(filename, objects, num_objects):
 808     """Write a new pack data file.
 809
 810     :param filename: Path to the new pack file (without .pack extension)
 811     :param objects: Iterable over (object, path) tuples to write
 812     :param num_objects: Number of objects to write
 813     """
 814     f = GitFile(filename + ".pack", 'wb')
 815     try:
 816         entries, data_sum = write_pack_data(f, objects, num_objects)
 817     finally:
 818         f.close()
 819     entries.sort()
 820     write_pack_index_v2(filename + ".idx", entries, data_sum)
 821
 822
 823 def write_pack_data(f, objects, num_objects, window=10):
 824     """Write a new pack file.
 825
 826     :param filename: The filename of the new pack file.
 827     :param objects: List of objects to write (tuples with object and path)
 828     :return: List with (name, offset, crc32 checksum) entries, pack checksum
 829     """
 830     recency = list(objects)
 831     # FIXME: Somehow limit delta depth
 832     # FIXME: Make thin-pack optional (its not used when cloning a pack)
 833     # Build a list of objects ordered by the magic Linus heuristic
 834     # This helps us find good objects to diff against us
 835     magic = []
 836     for obj, path in recency:
 837         magic.append( (obj.type, path, 1, -obj.raw_length(), obj) )
 838     magic.sort()
 839     # Build a map of objects and their index in magic - so we can find preceeding objects
 840     # to diff against
 841     offs = {}
 842     for i in range(len(magic)):
 843         offs[magic[i][4]] = i
 844     # Write the pack
 845     entries = []
 846     f = SHA1Writer(f)
 847     f.write("PACK")               # Pack header
 848     f.write(struct.pack(">L", 2)) # Pack version
 849     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
 850     for o, path in recency:
 851         sha1 = o.sha().digest()
 852         orig_t = o.type
 853         raw = o.as_raw_string()
 854         winner = raw
 855         t = orig_t
 856         #for i in range(offs[o]-window, window):
 857         #    if i < 0 or i >= len(offs): continue
 858         #    b = magic[i][4]
 859         #    if b.type != orig_t: continue
 860         #    base = b.as_raw_string()
 861         #    delta = create_delta(base, raw)
 862         #    if len(delta) < len(winner):
 863         #        winner = delta
 864         #        t = 6 if magic[i][2] == 1 else 7
 865         offset, crc32 = write_pack_object(f, t, winner)
 866         entries.append((sha1, offset, crc32))
 867     return entries, f.write_sha()
 868
 869
 870 def write_pack_index_v1(filename, entries, pack_checksum):
 871     """Write a new pack index file.
 872
 873     :param filename: The filename of the new pack index file.
 874     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 875             crc32_checksum.
 876     :param pack_checksum: Checksum of the pack file.
 877     """
 878     f = GitFile(filename, 'wb')
 879     f = SHA1Writer(f)
 880     fan_out_table = defaultdict(lambda: 0)
 881     for (name, offset, entry_checksum) in entries:
 882         fan_out_table[ord(name[0])] += 1
 883     # Fan-out table
 884     for i in range(0x100):
 885         f.write(struct.pack(">L", fan_out_table[i]))
 886         fan_out_table[i+1] += fan_out_table[i]
 887     for (name, offset, entry_checksum) in entries:
 888         f.write(struct.pack(">L20s", offset, name))
 889     assert len(pack_checksum) == 20
 890     f.write(pack_checksum)
 891     f.close()
 892
 893
 894 def create_delta(base_buf, target_buf):
 895     """Use python difflib to work out how to transform base_buf to target_buf.
 896
 897     :param base_buf: Base buffer
 898     :param target_buf: Target buffer
 899     """
 900     assert isinstance(base_buf, str)
 901     assert isinstance(target_buf, str)
 902     out_buf = ""
 903     # write delta header
 904     def encode_size(size):
 905         ret = ""
 906         c = size & 0x7f
 907         size >>= 7
 908         while size:
 909             ret += chr(c | 0x80)
 910             c = size & 0x7f
 911             size >>= 7
 912         ret += chr(c)
 913         return ret
 914     out_buf += encode_size(len(base_buf))
 915     out_buf += encode_size(len(target_buf))
 916     # write out delta opcodes
 917     seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
 918     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
 919         # Git patch opcodes don't care about deletes!
 920         #if opcode == "replace" or opcode == "delete":
 921         #    pass
 922         if opcode == "equal":
 923             # If they are equal, unpacker will use data from base_buf
 924             # Write out an opcode that says what range to use
 925             scratch = ""
 926             op = 0x80
 927             o = i1
 928             for i in range(4):
 929                 if o & 0xff << i*8:
 930                     scratch += chr((o >> i*8) & 0xff)
 931                     op |= 1 << i
 932             s = i2 - i1
 933             for i in range(2):
 934                 if s & 0xff << i*8:
 935                     scratch += chr((s >> i*8) & 0xff)
 936                     op |= 1 << (4+i)
 937             out_buf += chr(op)
 938             out_buf += scratch
 939         if opcode == "replace" or opcode == "insert":
 940             # If we are replacing a range or adding one, then we just
 941             # output it to the stream (prefixed by its size)
 942             s = j2 - j1
 943             o = j1
 944             while s > 127:
 945                 out_buf += chr(127)
 946                 out_buf += target_buf[o:o+127]
 947                 s -= 127
 948                 o += 127
 949             out_buf += chr(s)
 950             out_buf += target_buf[o:o+s]
 951     return out_buf
 952
 953
 954 def apply_delta(src_buf, delta):
 955     """Based on the similar function in git's patch-delta.c.
 956
 957     :param src_buf: Source buffer
 958     :param delta: Delta instructions
 959     """
 960     if type(src_buf) != str:
 961         src_buf = "".join(src_buf)
 962     if type(delta) != str:
 963         delta = "".join(delta)
 964     out = []
 965     index = 0
 966     delta_length = len(delta)
 967     def get_delta_header_size(delta, index):
 968         size = 0
 969         i = 0
 970         while delta:
 971             cmd = ord(delta[index])
 972             index += 1
 973             size |= (cmd & ~0x80) << i
 974             i += 7
 975             if not cmd & 0x80:
 976                 break
 977         return size, index
 978     src_size, index = get_delta_header_size(delta, index)
 979     dest_size, index = get_delta_header_size(delta, index)
 980     assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
 981     while index < delta_length:
 982         cmd = ord(delta[index])
 983         index += 1
 984         if cmd & 0x80:
 985             cp_off = 0
 986             for i in range(4):
 987                 if cmd & (1 << i):
 988                     x = ord(delta[index])
 989                     index += 1
 990                     cp_off |= x << (i * 8)
 991             cp_size = 0
 992             for i in range(3):
 993                 if cmd & (1 << (4+i)):
 994                     x = ord(delta[index])
 995                     index += 1
 996                     cp_size |= x << (i * 8)
 997             if cp_size == 0:
 998                 cp_size = 0x10000
 999             if (cp_off + cp_size < cp_size or
1000                 cp_off + cp_size > src_size or
1001                 cp_size > dest_size):
1002                 break
1003             out.append(src_buf[cp_off:cp_off+cp_size])
1004         elif cmd != 0:
1005             out.append(delta[index:index+cmd])
1006             index += cmd
1007         else:
1008             raise ApplyDeltaError("Invalid opcode 0")
1009
1010     if index != delta_length:
1011         raise ApplyDeltaError("delta not empty: %r" % delta[index:])
1012
1013     if dest_size != chunks_length(out):
1014         raise ApplyDeltaError("dest size incorrect")
1015
1016     return out
1017
1018
1019 def write_pack_index_v2(filename, entries, pack_checksum):
1020     """Write a new pack index file.
1021
1022     :param filename: The filename of the new pack index file.
1023     :param entries: List of tuples with object name (sha), offset_in_pack,  and
1024             crc32_checksum.
1025     :param pack_checksum: Checksum of the pack file.
1026     """
1027     f = GitFile(filename, 'wb')
1028     f = SHA1Writer(f)
1029     f.write('\377tOc') # Magic!
1030     f.write(struct.pack(">L", 2))
1031     fan_out_table = defaultdict(lambda: 0)
1032     for (name, offset, entry_checksum) in entries:
1033         fan_out_table[ord(name[0])] += 1
1034     # Fan-out table
1035     for i in range(0x100):
1036         f.write(struct.pack(">L", fan_out_table[i]))
1037         fan_out_table[i+1] += fan_out_table[i]
1038     for (name, offset, entry_checksum) in entries:
1039         f.write(name)
1040     for (name, offset, entry_checksum) in entries:
1041         f.write(struct.pack(">L", entry_checksum))
1042     for (name, offset, entry_checksum) in entries:
1043         # FIXME: handle if MSBit is set in offset
1044         f.write(struct.pack(">L", offset))
1045     # FIXME: handle table for pack files > 8 Gb
1046     assert len(pack_checksum) == 20
1047     f.write(pack_checksum)
1048     f.close()
1049
1050
1051 class Pack(object):
1052     """A Git pack object."""
1053
1054     def __init__(self, basename):
1055         self._basename = basename
1056         self._data_path = self._basename + ".pack"
1057         self._idx_path = self._basename + ".idx"
1058         self._data = None
1059         self._idx = None
1060
1061     @classmethod
1062     def from_objects(self, data, idx):
1063         """Create a new pack object from pack data and index objects."""
1064         ret = Pack("")
1065         ret._data = data
1066         ret._idx = idx
1067         return ret
1068
1069     def name(self):
1070         """The SHA over the SHAs of the objects in this pack."""
1071         return self.index.objects_sha1()
1072
1073     @property
1074     def data(self):
1075         """The pack data object being used."""
1076         if self._data is None:
1077             self._data = PackData(self._data_path)
1078             assert len(self.index) == len(self._data)
1079             idx_stored_checksum = self.index.get_pack_checksum()
1080             data_stored_checksum = self._data.get_stored_checksum()
1081             if idx_stored_checksum != data_stored_checksum:
1082                 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
1083                                        sha_to_hex(data_stored_checksum))
1084         return self._data
1085
1086     @property
1087     def index(self):
1088         """The index being used.
1089
1090         :note: This may be an in-memory index
1091         """
1092         if self._idx is None:
1093             self._idx = load_pack_index(self._idx_path)
1094         return self._idx
1095
1096     def close(self):
1097         if self._data is not None:
1098             self._data.close()
1099         self.index.close()
1100
1101     def __eq__(self, other):
1102         return type(self) == type(other) and self.index == other.index
1103
1104     def __len__(self):
1105         """Number of entries in this pack."""
1106         return len(self.index)
1107
1108     def __repr__(self):
1109         return "%s(%r)" % (self.__class__.__name__, self._basename)
1110
1111     def __iter__(self):
1112         """Iterate over all the sha1s of the objects in this pack."""
1113         return iter(self.index)
1114
1115     def check(self):
1116         """Check the integrity of this pack."""
1117         if not self.index.check():
1118             return False
1119         if not self.data.check():
1120             return False
1121         return True
1122
1123     def get_stored_checksum(self):
1124         return self.data.get_stored_checksum()
1125
1126     def __contains__(self, sha1):
1127         """Check whether this pack contains a particular SHA1."""
1128         try:
1129             self.index.object_index(sha1)
1130             return True
1131         except KeyError:
1132             return False
1133
1134     def get_raw(self, sha1, resolve_ref=None):
1135         offset = self.index.object_index(sha1)
1136         obj_type, obj = self.data.get_object_at(offset)
1137         if type(offset) is long:
1138           offset = int(offset)
1139         if resolve_ref is None:
1140             resolve_ref = self.get_raw
1141         kind, chunks = self.data.resolve_object(offset, obj_type, obj,
1142             resolve_ref)
1143         return kind, "".join(chunks)
1144
1145     def __getitem__(self, sha1):
1146         """Retrieve the specified SHA1."""
1147         type, uncomp = self.get_raw(sha1)
1148         return ShaFile.from_raw_string(type, uncomp)
1149
1150     def iterobjects(self, get_raw=None):
1151         """Iterate over the objects in this pack."""
1152         if get_raw is None:
1153             get_raw = self.get_raw
1154         for offset, type, obj, crc32 in self.data.iterobjects():
1155             assert isinstance(offset, int)
1156             type, obj = self.data.resolve_object(offset, type, obj, get_raw)
1157             yield ShaFile.from_raw_chunks(type, obj)
1158
1159
1160 try:
1161     from dulwich._pack import apply_delta, bisect_find_sha
1162 except ImportError:
1163     pass