dulwich/pack.py

   1 # pack.py -- For dealing wih packed git objects.
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copryight (C) 2008 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20 """Classes for dealing with packed git objects.
  21
  22 A pack is a compact representation of a bunch of objects, stored
  23 using deltas where possible.
  24
  25 They have two parts, the pack file, which stores the data, and an index
  26 that tells you where the data is.
  27
  28 To find an object you look in all of the index files 'til you find a
  29 match for the object name. You then use the pointer got from this as
  30 a pointer in to the corresponding packfile.
  31 """
  32
  33 try:
  34     from collections import defaultdict
  35 except ImportError:
  36     from misc import defaultdict
  37
  38 from itertools import imap, izip
  39 import mmap
  40 import os
  41 import struct
  42 try:
  43     from struct import unpack_from
  44 except ImportError:
  45     from dulwich.misc import unpack_from
  46 import sys
  47 import zlib
  48 import difflib
  49
  50 from dulwich.errors import (
  51     ApplyDeltaError,
  52     ChecksumMismatch,
  53     )
  54 from dulwich.lru_cache import (
  55     LRUSizeCache,
  56     )
  57 from dulwich.objects import (
  58     ShaFile,
  59     hex_to_sha,
  60     sha_to_hex,
  61     )
  62 from dulwich.misc import make_sha
  63
  64 supports_mmap_offset = (sys.version_info[0] >= 3 or
  65         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
  66
  67
  68 def take_msb_bytes(map, offset):
  69     ret = []
  70     while len(ret) == 0 or ret[-1] & 0x80:
  71         ret.append(ord(map[offset]))
  72         offset += 1
  73     return ret
  74
  75
  76 def read_zlib(data, offset, dec_size):
  77     obj = zlib.decompressobj()
  78     x = ""
  79     fed = 0
  80     while obj.unused_data == "":
  81         base = offset+fed
  82         add = data[base:base+1024]
  83         if len(add) < 1024:
  84             add += "Z"
  85         fed += len(add)
  86         x += obj.decompress(add)
  87     assert len(x) == dec_size
  88     comp_len = fed-len(obj.unused_data)
  89     return x, comp_len
  90
  91
  92 def iter_sha1(iter):
  93     """Return the hexdigest of the SHA1 over a set of names."""
  94     sha1 = make_sha()
  95     for name in iter:
  96         sha1.update(name)
  97     return sha1.hexdigest()
  98
  99
 100 MAX_MMAP_SIZE = 1024 * 1024 * 1024
 101
 102 def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
 103     """Simple wrapper for mmap() which always supports the offset parameter.
 104
 105     :param f: File object.
 106     :param offset: Offset in the file, from the beginning of the file.
 107     :param size: Size of the mmap'ed area
 108     :param access: Access mechanism.
 109     :return: MMAP'd area.
 110     """
 111     if offset+size > MAX_MMAP_SIZE and not supports_mmap_offset:
 112         raise AssertionError("%s is larger than 256 meg, and this version "
 113             "of Python does not support the offset argument to mmap().")
 114     if supports_mmap_offset:
 115         return mmap.mmap(f.fileno(), size, access=access, offset=offset), 0
 116     else:
 117         mem = mmap.mmap(f.fileno(), size+offset, access=access)
 118         return mem, offset
 119
 120
 121 def load_pack_index(filename):
 122     f = open(filename, 'r')
 123     if f.read(4) == '\377tOc':
 124         version = struct.unpack(">L", f.read(4))[0]
 125         if version == 2:
 126             f.seek(0)
 127             return PackIndex2(filename, file=f)
 128         else:
 129             raise KeyError("Unknown pack index format %d" % version)
 130     else:
 131         f.seek(0)
 132         return PackIndex1(filename, file=f)
 133
 134
 135 class PackIndex(object):
 136     """An index in to a packfile.
 137
 138     Given a sha id of an object a pack index can tell you the location in the
 139     packfile of that object if it has it.
 140
 141     To do the loop it opens the file, and indexes first 256 4 byte groups
 142     with the first byte of the sha id. The value in the four byte group indexed
 143     is the end of the group that shares the same starting byte. Subtract one
 144     from the starting byte and index again to find the start of the group.
 145     The values are sorted by sha id within the group, so do the math to find
 146     the start and end offset and then bisect in to find if the value is present.
 147     """
 148
 149     def __init__(self, filename, file=None):
 150         """Create a pack index object.
 151
 152         Provide it with the name of the index file to consider, and it will map
 153         it whenever required.
 154         """
 155         self._filename = filename
 156         # Take the size now, so it can be checked each time we map the file to
 157         # ensure that it hasn't changed.
 158         self._size = os.path.getsize(filename)
 159         if file is None:
 160             self._file = open(filename, 'r')
 161         else:
 162             self._file = file
 163         self._contents, map_offset = simple_mmap(self._file, 0, self._size)
 164         assert map_offset == 0
 165
 166     def __eq__(self, other):
 167         if not isinstance(other, PackIndex):
 168             return False
 169
 170         if self._fan_out_table != other._fan_out_table:
 171             return False
 172
 173         for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
 174             if name1 != name2:
 175                 return False
 176         return True
 177
 178     def close(self):
 179         self._file.close()
 180
 181     def __len__(self):
 182         """Return the number of entries in this pack index."""
 183         return self._fan_out_table[-1]
 184
 185     def _unpack_entry(self, i):
 186         """Unpack the i-th entry in the index file.
 187
 188         :return: Tuple with object name (SHA), offset in pack file and
 189               CRC32 checksum (if known)."""
 190         raise NotImplementedError(self._unpack_entry)
 191
 192     def _unpack_name(self, i):
 193         """Unpack the i-th name from the index file."""
 194         raise NotImplementedError(self._unpack_name)
 195
 196     def _unpack_offset(self, i):
 197         """Unpack the i-th object offset from the index file."""
 198         raise NotImplementedError(self._unpack_offset)
 199
 200     def _unpack_crc32_checksum(self, i):
 201         """Unpack the crc32 checksum for the i-th object from the index file."""
 202         raise NotImplementedError(self._unpack_crc32_checksum)
 203
 204     def __iter__(self):
 205         return imap(sha_to_hex, self._itersha())
 206
 207     def _itersha(self):
 208         for i in range(len(self)):
 209             yield self._unpack_name(i)
 210
 211     def objects_sha1(self):
 212         """Return the hex SHA1 over all the shas of all objects in this pack.
 213
 214         :note: This is used for the filename of the pack.
 215         """
 216         return iter_sha1(self._itersha())
 217
 218     def iterentries(self):
 219         """Iterate over the entries in this pack index.
 220
 221         Will yield tuples with object name, offset in packfile and crc32 checksum.
 222         """
 223         for i in range(len(self)):
 224             yield self._unpack_entry(i)
 225
 226     def _read_fan_out_table(self, start_offset):
 227         ret = []
 228         for i in range(0x100):
 229             ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
 230         return ret
 231
 232     def check(self):
 233         """Check that the stored checksum matches the actual checksum."""
 234         return self.calculate_checksum() == self.get_stored_checksum()
 235
 236     def calculate_checksum(self):
 237         f = open(self._filename, 'r')
 238         try:
 239             return make_sha(self._contents[:-20]).digest()
 240         finally:
 241             f.close()
 242
 243     def get_pack_checksum(self):
 244         """Return the SHA1 checksum stored for the corresponding packfile."""
 245         return str(self._contents[-40:-20])
 246
 247     def get_stored_checksum(self):
 248         """Return the SHA1 checksum stored for this index."""
 249         return str(self._contents[-20:])
 250
 251     def object_index(self, sha):
 252         """Return the index in to the corresponding packfile for the object.
 253
 254         Given the name of an object it will return the offset that object lives
 255         at within the corresponding pack file. If the pack file doesn't have the
 256         object then None will be returned.
 257         """
 258         if len(sha) == 40:
 259             sha = hex_to_sha(sha)
 260         return self._object_index(sha)
 261
 262     def _object_index(self, sha):
 263         """See object_index.
 264
 265         :param sha: A *binary* SHA string. (20 characters long)_
 266         """
 267         assert len(sha) == 20
 268         idx = ord(sha[0])
 269         if idx == 0:
 270             start = 0
 271         else:
 272             start = self._fan_out_table[idx-1]
 273         end = self._fan_out_table[idx]
 274         assert start <= end
 275         while start <= end:
 276             i = (start + end)/2
 277             file_sha = self._unpack_name(i)
 278             if file_sha < sha:
 279                 start = i + 1
 280             elif file_sha > sha:
 281                 end = i - 1
 282             else:
 283                 return self._unpack_offset(i)
 284         return None
 285
 286
 287 class PackIndex1(PackIndex):
 288     """Version 1 Pack Index."""
 289
 290     def __init__(self, filename, file=None):
 291         PackIndex.__init__(self, filename, file)
 292         self.version = 1
 293         self._fan_out_table = self._read_fan_out_table(0)
 294
 295     def _unpack_entry(self, i):
 296         (offset, name) = unpack_from(">L20s", self._contents,
 297             (0x100 * 4) + (i * 24))
 298         return (name, offset, None)
 299
 300     def _unpack_name(self, i):
 301         offset = (0x100 * 4) + (i * 24) + 4
 302         return self._contents[offset:offset+20]
 303
 304     def _unpack_offset(self, i):
 305         offset = (0x100 * 4) + (i * 24)
 306         return unpack_from(">L", self._contents, offset)[0]
 307
 308     def _unpack_crc32_checksum(self, i):
 309         # Not stored in v1 index files
 310         return None
 311
 312
 313 class PackIndex2(PackIndex):
 314     """Version 2 Pack Index."""
 315
 316     def __init__(self, filename, file=None):
 317         PackIndex.__init__(self, filename, file)
 318         assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
 319         (self.version, ) = unpack_from(">L", self._contents, 4)
 320         assert self.version == 2, "Version was %d" % self.version
 321         self._fan_out_table = self._read_fan_out_table(8)
 322         self._name_table_offset = 8 + 0x100 * 4
 323         self._crc32_table_offset = self._name_table_offset + 20 * len(self)
 324         self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
 325
 326     def _unpack_entry(self, i):
 327         return (self._unpack_name(i), self._unpack_offset(i),
 328                 self._unpack_crc32_checksum(i))
 329
 330     def _unpack_name(self, i):
 331         offset = self._name_table_offset + i * 20
 332         return self._contents[offset:offset+20]
 333
 334     def _unpack_offset(self, i):
 335         offset = self._pack_offset_table_offset + i * 4
 336         return unpack_from(">L", self._contents, offset)[0]
 337
 338     def _unpack_crc32_checksum(self, i):
 339         return unpack_from(">L", self._contents,
 340                           self._crc32_table_offset + i * 4)[0]
 341
 342
 343
 344 def read_pack_header(f):
 345     header = f.read(12)
 346     assert header[:4] == "PACK"
 347     (version,) = unpack_from(">L", header, 4)
 348     assert version in (2, 3), "Version was %d" % version
 349     (num_objects,) = unpack_from(">L", header, 8)
 350     return (version, num_objects)
 351
 352
 353 def read_pack_tail(f):
 354     return (f.read(20),)
 355
 356
 357 def unpack_object(map, offset=0):
 358     bytes = take_msb_bytes(map, offset)
 359     type = (bytes[0] >> 4) & 0x07
 360     size = bytes[0] & 0x0f
 361     for i, byte in enumerate(bytes[1:]):
 362         size += (byte & 0x7f) << ((i * 7) + 4)
 363     raw_base = len(bytes)
 364     if type == 6: # offset delta
 365         bytes = take_msb_bytes(map, raw_base + offset)
 366         assert not (bytes[-1] & 0x80)
 367         delta_base_offset = bytes[0] & 0x7f
 368         for byte in bytes[1:]:
 369             delta_base_offset += 1
 370             delta_base_offset <<= 7
 371             delta_base_offset += (byte & 0x7f)
 372         raw_base+=len(bytes)
 373         uncomp, comp_len = read_zlib(map, offset + raw_base, size)
 374         assert size == len(uncomp)
 375         return type, (delta_base_offset, uncomp), comp_len+raw_base
 376     elif type == 7: # ref delta
 377         basename = map[offset+raw_base:offset+raw_base+20]
 378         uncomp, comp_len = read_zlib(map, offset+raw_base+20, size)
 379         assert size == len(uncomp)
 380         return type, (basename, uncomp), comp_len+raw_base+20
 381     else:
 382         uncomp, comp_len = read_zlib(map, offset+raw_base, size)
 383         assert len(uncomp) == size
 384         return type, uncomp, comp_len+raw_base
 385
 386
 387 def compute_object_size((num, obj)):
 388     if num in (6, 7):
 389         return len(obj[1])
 390     assert isinstance(obj, str)
 391     return len(obj)
 392
 393
 394 class PackData(object):
 395     """The data contained in a packfile.
 396
 397     Pack files can be accessed both sequentially for exploding a pack, and
 398     directly with the help of an index to retrieve a specific object.
 399
 400     The objects within are either complete or a delta aginst another.
 401
 402     The header is variable length. If the MSB of each byte is set then it
 403     indicates that the subsequent byte is still part of the header.
 404     For the first byte the next MS bits are the type, which tells you the type
 405     of object, and whether it is a delta. The LS byte is the lowest bits of the
 406     size. For each subsequent byte the LS 7 bits are the next MS bits of the
 407     size, i.e. the last byte of the header contains the MS bits of the size.
 408
 409     For the complete objects the data is stored as zlib deflated data.
 410     The size in the header is the uncompressed object size, so to uncompress
 411     you need to just keep feeding data to zlib until you get an object back,
 412     or it errors on bad data. This is done here by just giving the complete
 413     buffer from the start of the deflated object on. This is bad, but until I
 414     get mmap sorted out it will have to do.
 415
 416     Currently there are no integrity checks done. Also no attempt is made to try
 417     and detect the delta case, or a request for an object at the wrong position.
 418     It will all just throw a zlib or KeyError.
 419     """
 420
 421     def __init__(self, filename):
 422         """Create a PackData object that represents the pack in the given filename.
 423
 424         The file must exist and stay readable until the object is disposed of. It
 425         must also stay the same size. It will be mapped whenever needed.
 426
 427         Currently there is a restriction on the size of the pack as the python
 428         mmap implementation is flawed.
 429         """
 430         self._filename = filename
 431         assert os.path.exists(filename), "%s is not a packfile" % filename
 432         self._size = os.path.getsize(filename)
 433         self._header_size = 12
 434         assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (filename, self._size, self._header_size)
 435         self._read_header()
 436         self._offset_cache = LRUSizeCache(1024*1024*100,
 437             compute_size=compute_object_size)
 438
 439     def _read_header(self):
 440         f = open(self._filename, 'rb')
 441         try:
 442             (version, self._num_objects) = \
 443                     read_pack_header(f)
 444             f.seek(self._size-20)
 445             (self._stored_checksum,) = read_pack_tail(f)
 446         finally:
 447             f.close()
 448
 449     def __len__(self):
 450         """Returns the number of objects in this pack."""
 451         return self._num_objects
 452
 453     def calculate_checksum(self):
 454         """Calculate the checksum for this pack."""
 455         f = open(self._filename, 'rb')
 456         try:
 457             map, map_offset = simple_mmap(f, 0, self._size - 20)
 458             return make_sha(map[map_offset:self._size-20]).digest()
 459         finally:
 460             f.close()
 461
 462     def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
 463         """Resolve an object, possibly resolving deltas when necessary.
 464
 465         :return: Tuple with object type and contents.
 466         """
 467         if type not in (6, 7): # Not a delta
 468             return type, obj
 469
 470         if get_offset is None:
 471             get_offset = self.get_object_at
 472
 473         if type == 6: # offset delta
 474             (delta_offset, delta) = obj
 475             assert isinstance(delta_offset, int)
 476             assert isinstance(delta, str)
 477             base_offset = offset-delta_offset
 478             type, base_obj = get_offset(base_offset)
 479             assert isinstance(type, int)
 480         elif type == 7: # ref delta
 481             (basename, delta) = obj
 482             assert isinstance(basename, str) and len(basename) == 20
 483             assert isinstance(delta, str)
 484             type, base_obj = get_ref(basename)
 485             assert isinstance(type, int)
 486             # Can't be a ofs delta, as we wouldn't know the base offset
 487             assert type != 6
 488             base_offset = None
 489         type, base_text = self.resolve_object(base_offset, type, base_obj, get_ref)
 490         if base_offset is not None:
 491             self._offset_cache[base_offset] = type, base_text
 492         ret = (type, apply_delta(base_text, delta))
 493         return ret
 494
 495     def iterobjects(self):
 496         offset = self._header_size
 497         f = open(self._filename, 'rb')
 498         num = len(self)
 499         map, _ = simple_mmap(f, 0, self._size)
 500         for i in range(num):
 501             (type, obj, total_size) = unpack_object(map, offset)
 502             crc32 = zlib.crc32(map[offset:offset+total_size]) & 0xffffffff
 503             yield offset, type, obj, crc32
 504             offset += total_size
 505         f.close()
 506
 507     def iterentries(self, ext_resolve_ref=None):
 508         found = {}
 509         postponed = defaultdict(list)
 510         class Postpone(Exception):
 511             """Raised to postpone delta resolving."""
 512
 513         def get_ref_text(sha):
 514             if sha in found:
 515                 return found[sha]
 516             if ext_resolve_ref:
 517                 try:
 518                     return ext_resolve_ref(sha)
 519                 except KeyError:
 520                     pass
 521             raise Postpone, (sha, )
 522         todo = list(self.iterobjects())
 523         while todo:
 524             (offset, type, obj, crc32) = todo.pop(0)
 525             assert isinstance(offset, int)
 526             assert isinstance(type, int)
 527             assert isinstance(obj, tuple) or isinstance(obj, str)
 528             try:
 529                 type, obj = self.resolve_object(offset, type, obj, get_ref_text)
 530             except Postpone, (sha, ):
 531                 postponed[sha].append((offset, type, obj))
 532             else:
 533                 shafile = ShaFile.from_raw_string(type, obj)
 534                 sha = shafile.sha().digest()
 535                 found[sha] = (type, obj)
 536                 yield sha, offset, crc32
 537                 todo += postponed.get(sha, [])
 538         if postponed:
 539             raise KeyError([sha_to_hex(h) for h in postponed.keys()])
 540
 541     def sorted_entries(self, resolve_ext_ref=None):
 542         ret = list(self.iterentries(resolve_ext_ref))
 543         ret.sort()
 544         return ret
 545
 546     def create_index_v1(self, filename, resolve_ext_ref=None):
 547         entries = self.sorted_entries(resolve_ext_ref)
 548         write_pack_index_v1(filename, entries, self.calculate_checksum())
 549
 550     def create_index_v2(self, filename, resolve_ext_ref=None):
 551         entries = self.sorted_entries(resolve_ext_ref)
 552         write_pack_index_v2(filename, entries, self.calculate_checksum())
 553
 554     def get_stored_checksum(self):
 555         return self._stored_checksum
 556
 557     def check(self):
 558         return (self.calculate_checksum() == self.get_stored_checksum())
 559
 560     def get_object_at(self, offset):
 561         """Given an offset in to the packfile return the object that is there.
 562
 563         Using the associated index the location of an object can be looked up, and
 564         then the packfile can be asked directly for that object using this
 565         function.
 566         """
 567         if offset in self._offset_cache:
 568             return self._offset_cache[offset]
 569         assert isinstance(offset, long) or isinstance(offset, int),\
 570                 "offset was %r" % offset
 571         assert offset >= self._header_size
 572         f = open(self._filename, 'rb')
 573         try:
 574             map, map_offset = simple_mmap(f, offset, self._size-offset)
 575             ret = unpack_object(map, map_offset)[:2]
 576             return ret
 577         finally:
 578             f.close()
 579
 580
 581 class SHA1Writer(object):
 582
 583     def __init__(self, f):
 584         self.f = f
 585         self.sha1 = make_sha("")
 586
 587     def write(self, data):
 588         self.sha1.update(data)
 589         self.f.write(data)
 590
 591     def write_sha(self):
 592         sha = self.sha1.digest()
 593         assert len(sha) == 20
 594         self.f.write(sha)
 595         return sha
 596
 597     def close(self):
 598         sha = self.write_sha()
 599         self.f.close()
 600         return sha
 601
 602     def tell(self):
 603         return self.f.tell()
 604
 605
 606 def write_pack_object(f, type, object):
 607     """Write pack object to a file.
 608
 609     :param f: File to write to
 610     :param o: Object to write
 611     :return: Tuple with offset at which the object was written, and crc32
 612     """
 613     ret = f.tell()
 614     packed_data_hdr = ""
 615     if type == 6: # ref delta
 616         (delta_base_offset, object) = object
 617     elif type == 7: # offset delta
 618         (basename, object) = object
 619     size = len(object)
 620     c = (type << 4) | (size & 15)
 621     size >>= 4
 622     while size:
 623         packed_data_hdr += (chr(c | 0x80))
 624         c = size & 0x7f
 625         size >>= 7
 626     packed_data_hdr += chr(c)
 627     if type == 6: # offset delta
 628         ret = [delta_base_offset & 0x7f]
 629         delta_base_offset >>= 7
 630         while delta_base_offset:
 631             delta_base_offset -= 1
 632             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
 633             delta_base_offset >>= 7
 634         packed_data_hdr += "".join([chr(x) for x in ret])
 635     elif type == 7: # ref delta
 636         assert len(basename) == 20
 637         packed_data_hdr += basename
 638     packed_data = packed_data_hdr + zlib.compress(object)
 639     f.write(packed_data)
 640     return (f.tell(), (zlib.crc32(packed_data) & 0xffffffff))
 641
 642
 643 def write_pack(filename, objects, num_objects):
 644     f = open(filename + ".pack", 'w')
 645     try:
 646         entries, data_sum = write_pack_data(f, objects, num_objects)
 647     finally:
 648         f.close()
 649     entries.sort()
 650     write_pack_index_v2(filename + ".idx", entries, data_sum)
 651
 652
 653 def write_pack_data(f, objects, num_objects, window=10):
 654     """Write a new pack file.
 655
 656     :param filename: The filename of the new pack file.
 657     :param objects: List of objects to write (tuples with object and path)
 658     :return: List with (name, offset, crc32 checksum) entries, pack checksum
 659     """
 660     recency = list(objects)
 661     # FIXME: Somehow limit delta depth
 662     # FIXME: Make thin-pack optional (its not used when cloning a pack)
 663     # Build a list of objects ordered by the magic Linus heuristic
 664     # This helps us find good objects to diff against us
 665     magic = []
 666     for obj, path in recency:
 667         magic.append( (obj.type, path, 1, -len(obj.as_raw_string()[1]), obj) )
 668     magic.sort()
 669     # Build a map of objects and their index in magic - so we can find preceeding objects
 670     # to diff against
 671     offs = {}
 672     for i in range(len(magic)):
 673         offs[magic[i][4]] = i
 674     # Write the pack
 675     entries = []
 676     f = SHA1Writer(f)
 677     f.write("PACK")               # Pack header
 678     f.write(struct.pack(">L", 2)) # Pack version
 679     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
 680     for o, path in recency:
 681         sha1 = o.sha().digest()
 682         orig_t, raw = o.as_raw_string()
 683         winner = raw
 684         t = orig_t
 685         #for i in range(offs[o]-window, window):
 686         #    if i < 0 or i >= len(offs): continue
 687         #    b = magic[i][4]
 688         #    if b.type != orig_t: continue
 689         #    _, base = b.as_raw_string()
 690         #    delta = create_delta(base, raw)
 691         #    if len(delta) < len(winner):
 692         #        winner = delta
 693         #        t = 6 if magic[i][2] == 1 else 7
 694         offset, crc32 = write_pack_object(f, t, winner)
 695         entries.append((sha1, offset, crc32))
 696     return entries, f.write_sha()
 697
 698
 699 def write_pack_index_v1(filename, entries, pack_checksum):
 700     """Write a new pack index file.
 701
 702     :param filename: The filename of the new pack index file.
 703     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 704             crc32_checksum.
 705     :param pack_checksum: Checksum of the pack file.
 706     """
 707     f = open(filename, 'w')
 708     f = SHA1Writer(f)
 709     fan_out_table = defaultdict(lambda: 0)
 710     for (name, offset, entry_checksum) in entries:
 711         fan_out_table[ord(name[0])] += 1
 712     # Fan-out table
 713     for i in range(0x100):
 714         f.write(struct.pack(">L", fan_out_table[i]))
 715         fan_out_table[i+1] += fan_out_table[i]
 716     for (name, offset, entry_checksum) in entries:
 717         f.write(struct.pack(">L20s", offset, name))
 718     assert len(pack_checksum) == 20
 719     f.write(pack_checksum)
 720     f.close()
 721
 722
 723 def create_delta(base_buf, target_buf):
 724     """Use python difflib to work out how to transform base_buf to target_buf"""
 725     assert isinstance(base_buf, str)
 726     assert isinstance(target_buf, str)
 727     out_buf = ""
 728     # write delta header
 729     def encode_size(size):
 730         ret = ""
 731         c = size & 0x7f
 732         size >>= 7
 733         while size:
 734             ret += chr(c | 0x80)
 735             c = size & 0x7f
 736             size >>= 7
 737         ret += chr(c)
 738         return ret
 739     out_buf += encode_size(len(base_buf))
 740     out_buf += encode_size(len(target_buf))
 741     # write out delta opcodes
 742     seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
 743     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
 744         # Git patch opcodes don't care about deletes!
 745         #if opcode == "replace" or opcode == "delete":
 746         #    pass
 747         if opcode == "equal":
 748             # If they are equal, unpacker will use data from base_buf
 749             # Write out an opcode that says what range to use
 750             scratch = ""
 751             op = 0x80
 752             o = i1
 753             for i in range(4):
 754                 if o & 0xff << i*8:
 755                     scratch += chr(o >> i)
 756                     op |= 1 << i
 757             s = i2 - i1
 758             for i in range(2):
 759                 if s & 0xff << i*8:
 760                     scratch += chr(s >> i)
 761                     op |= 1 << (4+i)
 762             out_buf += chr(op)
 763             out_buf += scratch
 764         if opcode == "replace" or opcode == "insert":
 765             # If we are replacing a range or adding one, then we just
 766             # output it to the stream (prefixed by its size)
 767             s = j2 - j1
 768             o = j1
 769             while s > 127:
 770                 out_buf += chr(127)
 771                 out_buf += target_buf[o:o+127]
 772                 s -= 127
 773                 o += 127
 774             out_buf += chr(s)
 775             out_buf += target_buf[o:o+s]
 776     return out_buf
 777
 778
 779 def apply_delta(src_buf, delta):
 780     """Based on the similar function in git's patch-delta.c.
 781
 782     :param src_buf: Source buffer
 783     :param delta: Delta instructions
 784     """
 785     assert isinstance(src_buf, str), "was %r" % (src_buf,)
 786     assert isinstance(delta, str)
 787     out = []
 788     index = 0
 789     delta_length = len(delta)
 790     def get_delta_header_size(delta, index):
 791         size = 0
 792         i = 0
 793         while delta:
 794             cmd = ord(delta[index])
 795             index += 1
 796             size |= (cmd & ~0x80) << i
 797             i += 7
 798             if not cmd & 0x80:
 799                 break
 800         return size, index
 801     src_size, index = get_delta_header_size(delta, index)
 802     dest_size, index = get_delta_header_size(delta, index)
 803     assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
 804     while index < delta_length:
 805         cmd = ord(delta[index])
 806         index += 1
 807         if cmd & 0x80:
 808             cp_off = 0
 809             for i in range(4):
 810                 if cmd & (1 << i):
 811                     x = ord(delta[index])
 812                     index += 1
 813                     cp_off |= x << (i * 8)
 814             cp_size = 0
 815             for i in range(3):
 816                 if cmd & (1 << (4+i)):
 817                     x = ord(delta[index])
 818                     index += 1
 819                     cp_size |= x << (i * 8)
 820             if cp_size == 0:
 821                 cp_size = 0x10000
 822             if (cp_off + cp_size < cp_size or
 823                 cp_off + cp_size > src_size or
 824                 cp_size > dest_size):
 825                 break
 826             out.append(src_buf[cp_off:cp_off+cp_size])
 827         elif cmd != 0:
 828             out.append(delta[index:index+cmd])
 829             index += cmd
 830         else:
 831             raise ApplyDeltaError("Invalid opcode 0")
 832
 833     if index != delta_length:
 834         raise ApplyDeltaError("delta not empty: %r" % delta[index:])
 835
 836     out = ''.join(out)
 837     if dest_size != len(out):
 838         raise ApplyDeltaError("dest size incorrect")
 839
 840     return out
 841
 842
 843 def write_pack_index_v2(filename, entries, pack_checksum):
 844     """Write a new pack index file.
 845
 846     :param filename: The filename of the new pack index file.
 847     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 848             crc32_checksum.
 849     :param pack_checksum: Checksum of the pack file.
 850     """
 851     f = open(filename, 'w')
 852     f = SHA1Writer(f)
 853     f.write('\377tOc') # Magic!
 854     f.write(struct.pack(">L", 2))
 855     fan_out_table = defaultdict(lambda: 0)
 856     for (name, offset, entry_checksum) in entries:
 857         fan_out_table[ord(name[0])] += 1
 858     # Fan-out table
 859     for i in range(0x100):
 860         f.write(struct.pack(">L", fan_out_table[i]))
 861         fan_out_table[i+1] += fan_out_table[i]
 862     for (name, offset, entry_checksum) in entries:
 863         f.write(name)
 864     for (name, offset, entry_checksum) in entries:
 865         f.write(struct.pack(">L", entry_checksum))
 866     for (name, offset, entry_checksum) in entries:
 867         # FIXME: handle if MSBit is set in offset
 868         f.write(struct.pack(">L", offset))
 869     # FIXME: handle table for pack files > 8 Gb
 870     assert len(pack_checksum) == 20
 871     f.write(pack_checksum)
 872     f.close()
 873
 874
 875 class Pack(object):
 876
 877     def __init__(self, basename):
 878         self._basename = basename
 879         self._data_path = self._basename + ".pack"
 880         self._idx_path = self._basename + ".idx"
 881         self._data = None
 882         self._idx = None
 883
 884     def name(self):
 885         """The SHA over the SHAs of the objects in this pack."""
 886         return self.idx.objects_sha1()
 887
 888     @property
 889     def data(self):
 890         if self._data is None:
 891             self._data = PackData(self._data_path)
 892             assert len(self.idx) == len(self._data)
 893             idx_stored_checksum = self.idx.get_pack_checksum()
 894             data_stored_checksum = self._data.get_stored_checksum()
 895             if idx_stored_checksum != data_stored_checksum:
 896                 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
 897                                        sha_to_hex(data_stored_checksum))
 898         return self._data
 899
 900     @property
 901     def idx(self):
 902         if self._idx is None:
 903             self._idx = load_pack_index(self._idx_path)
 904         return self._idx
 905
 906     def close(self):
 907         if self._data is not None:
 908             self._data.close()
 909         self.idx.close()
 910
 911     def __eq__(self, other):
 912         return type(self) == type(other) and self.idx == other.idx
 913
 914     def __len__(self):
 915         """Number of entries in this pack."""
 916         return len(self.idx)
 917
 918     def __repr__(self):
 919         return "Pack(%r)" % self._basename
 920
 921     def __iter__(self):
 922         """Iterate over all the sha1s of the objects in this pack."""
 923         return iter(self.idx)
 924
 925     def check(self):
 926         if not self.idx.check():
 927             return False
 928         if not self.data.check():
 929             return False
 930         return True
 931
 932     def get_stored_checksum(self):
 933         return self.data.get_stored_checksum()
 934
 935     def __contains__(self, sha1):
 936         """Check whether this pack contains a particular SHA1."""
 937         return (self.idx.object_index(sha1) is not None)
 938
 939     def get_raw(self, sha1, resolve_ref=None):
 940         offset = self.idx.object_index(sha1)
 941         if offset is None:
 942             raise KeyError(sha1)
 943         type, obj = self.data.get_object_at(offset)
 944         if isinstance(offset, long):
 945           offset = int(offset)
 946         if resolve_ref is None:
 947             resolve_ref = self.get_raw
 948         assert isinstance(offset, int)
 949         return self.data.resolve_object(offset, type, obj, resolve_ref)
 950
 951     def __getitem__(self, sha1):
 952         """Retrieve the specified SHA1."""
 953         type, uncomp = self.get_raw(sha1)
 954         return ShaFile.from_raw_string(type, uncomp)
 955
 956     def iterobjects(self, get_raw=None):
 957         if get_raw is None:
 958             get_raw = self.get_raw
 959         for offset, type, obj, crc32 in self.data.iterobjects():
 960             assert isinstance(offset, int)
 961             yield ShaFile.from_raw_string(
 962                     *self.data.resolve_object(offset, type, obj, get_raw))
 963
 964
 965 def load_packs(path):
 966     if not os.path.exists(path):
 967         return
 968     for name in os.listdir(path):
 969         if name.startswith("pack-") and name.endswith(".pack"):
 970             yield Pack(os.path.join(path, name[:-len(".pack")]))
 971
 972
 973 try:
 974     from dulwich._pack import apply_delta
 975 except ImportError:
 976     pass