dulwich/pack.py

   1 # pack.py -- For dealing wih packed git objects.
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copryight (C) 2008 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20 """Classes for dealing with packed git objects.
  21
  22 A pack is a compact representation of a bunch of objects, stored
  23 using deltas where possible.
  24
  25 They have two parts, the pack file, which stores the data, and an index
  26 that tells you where the data is.
  27
  28 To find an object you look in all of the index files 'til you find a
  29 match for the object name. You then use the pointer got from this as
  30 a pointer in to the corresponding packfile.
  31 """
  32
  33 try:
  34     from collections import defaultdict
  35 except ImportError:
  36     from misc import defaultdict
  37
  38 from itertools import chain, imap, izip
  39 import mmap
  40 import os
  41 import struct
  42 try:
  43     from struct import unpack_from
  44 except ImportError:
  45     from dulwich.misc import unpack_from
  46 import sys
  47 import zlib
  48 import difflib
  49
  50 from dulwich.errors import (
  51     ApplyDeltaError,
  52     ChecksumMismatch,
  53     )
  54 from dulwich.lru_cache import (
  55     LRUSizeCache,
  56     )
  57 from dulwich.objects import (
  58     ShaFile,
  59     hex_to_sha,
  60     sha_to_hex,
  61     )
  62 from dulwich.misc import make_sha
  63
  64 supports_mmap_offset = (sys.version_info[0] >= 3 or
  65         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
  66
  67
  68 def take_msb_bytes(map, offset):
  69     ret = []
  70     while len(ret) == 0 or ret[-1] & 0x80:
  71         ret.append(ord(map[offset]))
  72         offset += 1
  73     return ret
  74
  75
  76 def read_zlib(data, offset, dec_size):
  77     obj = zlib.decompressobj()
  78     ret = []
  79     fed = 0
  80     while obj.unused_data == "":
  81         base = offset+fed
  82         add = data[base:base+1024]
  83         if len(add) < 1024:
  84             add += "Z"
  85         fed += len(add)
  86         ret.append(obj.decompress(add))
  87     x = "".join(ret)
  88     assert len(x) == dec_size
  89     comp_len = fed-len(obj.unused_data)
  90     return x, comp_len
  91
  92
  93 def iter_sha1(iter):
  94     """Return the hexdigest of the SHA1 over a set of names."""
  95     sha1 = make_sha()
  96     for name in iter:
  97         sha1.update(name)
  98     return sha1.hexdigest()
  99
 100
 101 def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
 102     """Simple wrapper for mmap() which always supports the offset parameter.
 103
 104     :param f: File object.
 105     :param offset: Offset in the file, from the beginning of the file.
 106     :param size: Size of the mmap'ed area
 107     :param access: Access mechanism.
 108     :return: MMAP'd area.
 109     """
 110     if supports_mmap_offset:
 111         return mmap.mmap(f.fileno(), size, access=access, offset=offset), 0
 112     else:
 113         mem = mmap.mmap(f.fileno(), size+offset, access=access)
 114         return mem, offset
 115
 116
 117 def load_pack_index(filename):
 118     f = open(filename, 'r')
 119     if f.read(4) == '\377tOc':
 120         version = struct.unpack(">L", f.read(4))[0]
 121         if version == 2:
 122             f.seek(0)
 123             return PackIndex2(filename, file=f)
 124         else:
 125             raise KeyError("Unknown pack index format %d" % version)
 126     else:
 127         f.seek(0)
 128         return PackIndex1(filename, file=f)
 129
 130
 131 def bisect_find_sha(start, end, sha, unpack_name):
 132     assert start <= end
 133     while start <= end:
 134         i = (start + end)/2
 135         file_sha = unpack_name(i)
 136         x = cmp(file_sha, sha)
 137         if x < 0:
 138             start = i + 1
 139         elif x > 0:
 140             end = i - 1
 141         else:
 142             return i
 143     return None
 144
 145
 146 class PackIndex(object):
 147     """An index in to a packfile.
 148
 149     Given a sha id of an object a pack index can tell you the location in the
 150     packfile of that object if it has it.
 151
 152     To do the loop it opens the file, and indexes first 256 4 byte groups
 153     with the first byte of the sha id. The value in the four byte group indexed
 154     is the end of the group that shares the same starting byte. Subtract one
 155     from the starting byte and index again to find the start of the group.
 156     The values are sorted by sha id within the group, so do the math to find
 157     the start and end offset and then bisect in to find if the value is present.
 158     """
 159
 160     def __init__(self, filename, file=None):
 161         """Create a pack index object.
 162
 163         Provide it with the name of the index file to consider, and it will map
 164         it whenever required.
 165         """
 166         self._filename = filename
 167         # Take the size now, so it can be checked each time we map the file to
 168         # ensure that it hasn't changed.
 169         self._size = os.path.getsize(filename)
 170         if file is None:
 171             self._file = open(filename, 'r')
 172         else:
 173             self._file = file
 174         self._contents, map_offset = simple_mmap(self._file, 0, self._size)
 175         assert map_offset == 0
 176
 177     def __eq__(self, other):
 178         if not isinstance(other, PackIndex):
 179             return False
 180
 181         if self._fan_out_table != other._fan_out_table:
 182             return False
 183
 184         for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
 185             if name1 != name2:
 186                 return False
 187         return True
 188
 189     def close(self):
 190         self._file.close()
 191
 192     def __len__(self):
 193         """Return the number of entries in this pack index."""
 194         return self._fan_out_table[-1]
 195
 196     def _unpack_entry(self, i):
 197         """Unpack the i-th entry in the index file.
 198
 199         :return: Tuple with object name (SHA), offset in pack file and
 200               CRC32 checksum (if known)."""
 201         raise NotImplementedError(self._unpack_entry)
 202
 203     def _unpack_name(self, i):
 204         """Unpack the i-th name from the index file."""
 205         raise NotImplementedError(self._unpack_name)
 206
 207     def _unpack_offset(self, i):
 208         """Unpack the i-th object offset from the index file."""
 209         raise NotImplementedError(self._unpack_offset)
 210
 211     def _unpack_crc32_checksum(self, i):
 212         """Unpack the crc32 checksum for the i-th object from the index file."""
 213         raise NotImplementedError(self._unpack_crc32_checksum)
 214
 215     def __iter__(self):
 216         return imap(sha_to_hex, self._itersha())
 217
 218     def _itersha(self):
 219         for i in range(len(self)):
 220             yield self._unpack_name(i)
 221
 222     def objects_sha1(self):
 223         """Return the hex SHA1 over all the shas of all objects in this pack.
 224
 225         :note: This is used for the filename of the pack.
 226         """
 227         return iter_sha1(self._itersha())
 228
 229     def iterentries(self):
 230         """Iterate over the entries in this pack index.
 231
 232         Will yield tuples with object name, offset in packfile and crc32 checksum.
 233         """
 234         for i in range(len(self)):
 235             yield self._unpack_entry(i)
 236
 237     def _read_fan_out_table(self, start_offset):
 238         ret = []
 239         for i in range(0x100):
 240             ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
 241         return ret
 242
 243     def check(self):
 244         """Check that the stored checksum matches the actual checksum."""
 245         return self.calculate_checksum() == self.get_stored_checksum()
 246
 247     def calculate_checksum(self):
 248         return make_sha(self._contents[:-20]).digest()
 249
 250     def get_pack_checksum(self):
 251         """Return the SHA1 checksum stored for the corresponding packfile."""
 252         return str(self._contents[-40:-20])
 253
 254     def get_stored_checksum(self):
 255         """Return the SHA1 checksum stored for this index."""
 256         return str(self._contents[-20:])
 257
 258     def object_index(self, sha):
 259         """Return the index in to the corresponding packfile for the object.
 260
 261         Given the name of an object it will return the offset that object lives
 262         at within the corresponding pack file. If the pack file doesn't have the
 263         object then None will be returned.
 264         """
 265         if len(sha) == 40:
 266             sha = hex_to_sha(sha)
 267         return self._object_index(sha)
 268
 269     def _object_index(self, sha):
 270         """See object_index.
 271
 272         :param sha: A *binary* SHA string. (20 characters long)_
 273         """
 274         assert len(sha) == 20
 275         idx = ord(sha[0])
 276         if idx == 0:
 277             start = 0
 278         else:
 279             start = self._fan_out_table[idx-1]
 280         end = self._fan_out_table[idx]
 281         i = bisect_find_sha(start, end, sha, self._unpack_name)
 282         if i is None:
 283             raise KeyError(sha)
 284         return self._unpack_offset(i)
 285
 286
 287
 288 class PackIndex1(PackIndex):
 289     """Version 1 Pack Index."""
 290
 291     def __init__(self, filename, file=None):
 292         PackIndex.__init__(self, filename, file)
 293         self.version = 1
 294         self._fan_out_table = self._read_fan_out_table(0)
 295
 296     def _unpack_entry(self, i):
 297         (offset, name) = unpack_from(">L20s", self._contents,
 298             (0x100 * 4) + (i * 24))
 299         return (name, offset, None)
 300
 301     def _unpack_name(self, i):
 302         offset = (0x100 * 4) + (i * 24) + 4
 303         return self._contents[offset:offset+20]
 304
 305     def _unpack_offset(self, i):
 306         offset = (0x100 * 4) + (i * 24)
 307         return unpack_from(">L", self._contents, offset)[0]
 308
 309     def _unpack_crc32_checksum(self, i):
 310         # Not stored in v1 index files
 311         return None
 312
 313
 314 class PackIndex2(PackIndex):
 315     """Version 2 Pack Index."""
 316
 317     def __init__(self, filename, file=None):
 318         PackIndex.__init__(self, filename, file)
 319         assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
 320         (self.version, ) = unpack_from(">L", self._contents, 4)
 321         assert self.version == 2, "Version was %d" % self.version
 322         self._fan_out_table = self._read_fan_out_table(8)
 323         self._name_table_offset = 8 + 0x100 * 4
 324         self._crc32_table_offset = self._name_table_offset + 20 * len(self)
 325         self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
 326
 327     def _unpack_entry(self, i):
 328         return (self._unpack_name(i), self._unpack_offset(i),
 329                 self._unpack_crc32_checksum(i))
 330
 331     def _unpack_name(self, i):
 332         offset = self._name_table_offset + i * 20
 333         return self._contents[offset:offset+20]
 334
 335     def _unpack_offset(self, i):
 336         offset = self._pack_offset_table_offset + i * 4
 337         return unpack_from(">L", self._contents, offset)[0]
 338
 339     def _unpack_crc32_checksum(self, i):
 340         return unpack_from(">L", self._contents,
 341                           self._crc32_table_offset + i * 4)[0]
 342
 343
 344
 345 def read_pack_header(f):
 346     header = f.read(12)
 347     assert header[:4] == "PACK"
 348     (version,) = unpack_from(">L", header, 4)
 349     assert version in (2, 3), "Version was %d" % version
 350     (num_objects,) = unpack_from(">L", header, 8)
 351     return (version, num_objects)
 352
 353
 354 def read_pack_tail(f):
 355     return (f.read(20),)
 356
 357
 358 def unpack_object(map, offset=0):
 359     bytes = take_msb_bytes(map, offset)
 360     type = (bytes[0] >> 4) & 0x07
 361     size = bytes[0] & 0x0f
 362     for i, byte in enumerate(bytes[1:]):
 363         size += (byte & 0x7f) << ((i * 7) + 4)
 364     raw_base = len(bytes)
 365     if type == 6: # offset delta
 366         bytes = take_msb_bytes(map, raw_base + offset)
 367         assert not (bytes[-1] & 0x80)
 368         delta_base_offset = bytes[0] & 0x7f
 369         for byte in bytes[1:]:
 370             delta_base_offset += 1
 371             delta_base_offset <<= 7
 372             delta_base_offset += (byte & 0x7f)
 373         raw_base+=len(bytes)
 374         uncomp, comp_len = read_zlib(map, offset + raw_base, size)
 375         assert size == len(uncomp)
 376         return type, (delta_base_offset, uncomp), comp_len+raw_base
 377     elif type == 7: # ref delta
 378         basename = map[offset+raw_base:offset+raw_base+20]
 379         uncomp, comp_len = read_zlib(map, offset+raw_base+20, size)
 380         assert size == len(uncomp)
 381         return type, (basename, uncomp), comp_len+raw_base+20
 382     else:
 383         uncomp, comp_len = read_zlib(map, offset+raw_base, size)
 384         assert len(uncomp) == size
 385         return type, uncomp, comp_len+raw_base
 386
 387
 388 def compute_object_size((num, obj)):
 389     if num in (6, 7):
 390         return len(obj[1])
 391     assert isinstance(obj, str)
 392     return len(obj)
 393
 394
 395 class PackData(object):
 396     """The data contained in a packfile.
 397
 398     Pack files can be accessed both sequentially for exploding a pack, and
 399     directly with the help of an index to retrieve a specific object.
 400
 401     The objects within are either complete or a delta aginst another.
 402
 403     The header is variable length. If the MSB of each byte is set then it
 404     indicates that the subsequent byte is still part of the header.
 405     For the first byte the next MS bits are the type, which tells you the type
 406     of object, and whether it is a delta. The LS byte is the lowest bits of the
 407     size. For each subsequent byte the LS 7 bits are the next MS bits of the
 408     size, i.e. the last byte of the header contains the MS bits of the size.
 409
 410     For the complete objects the data is stored as zlib deflated data.
 411     The size in the header is the uncompressed object size, so to uncompress
 412     you need to just keep feeding data to zlib until you get an object back,
 413     or it errors on bad data. This is done here by just giving the complete
 414     buffer from the start of the deflated object on. This is bad, but until I
 415     get mmap sorted out it will have to do.
 416
 417     Currently there are no integrity checks done. Also no attempt is made to try
 418     and detect the delta case, or a request for an object at the wrong position.
 419     It will all just throw a zlib or KeyError.
 420     """
 421
 422     def __init__(self, filename):
 423         """Create a PackData object that represents the pack in the given filename.
 424
 425         The file must exist and stay readable until the object is disposed of. It
 426         must also stay the same size. It will be mapped whenever needed.
 427
 428         Currently there is a restriction on the size of the pack as the python
 429         mmap implementation is flawed.
 430         """
 431         self._filename = filename
 432         assert os.path.exists(filename), "%s is not a packfile" % filename
 433         self._size = os.path.getsize(filename)
 434         self._header_size = 12
 435         assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (filename, self._size, self._header_size)
 436         self._file = open(self._filename, 'rb')
 437         self._read_header()
 438         self._offset_cache = LRUSizeCache(1024*1024*20,
 439             compute_size=compute_object_size)
 440
 441     def close(self):
 442         self._file.close()
 443
 444     def _read_header(self):
 445         (version, self._num_objects) = read_pack_header(self._file)
 446         self._file.seek(self._size-20)
 447         (self._stored_checksum,) = read_pack_tail(self._file)
 448
 449     def __len__(self):
 450         """Returns the number of objects in this pack."""
 451         return self._num_objects
 452
 453     def calculate_checksum(self):
 454         """Calculate the checksum for this pack."""
 455         map, map_offset = simple_mmap(self._file, 0, self._size - 20)
 456         try:
 457             return make_sha(map[map_offset:self._size-20]).digest()
 458         finally:
 459             map.close()
 460
 461     def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
 462         """Resolve an object, possibly resolving deltas when necessary.
 463
 464         :return: Tuple with object type and contents.
 465         """
 466         if type not in (6, 7): # Not a delta
 467             return type, obj
 468
 469         if get_offset is None:
 470             get_offset = self.get_object_at
 471
 472         if type == 6: # offset delta
 473             (delta_offset, delta) = obj
 474             assert isinstance(delta_offset, int)
 475             assert isinstance(delta, str)
 476             base_offset = offset-delta_offset
 477             type, base_obj = get_offset(base_offset)
 478             assert isinstance(type, int)
 479         elif type == 7: # ref delta
 480             (basename, delta) = obj
 481             assert isinstance(basename, str) and len(basename) == 20
 482             assert isinstance(delta, str)
 483             type, base_obj = get_ref(basename)
 484             assert isinstance(type, int)
 485             # Can't be a ofs delta, as we wouldn't know the base offset
 486             assert type != 6
 487             base_offset = None
 488         type, base_text = self.resolve_object(base_offset, type, base_obj, get_ref)
 489         if base_offset is not None:
 490             self._offset_cache[base_offset] = type, base_text
 491         ret = (type, apply_delta(base_text, delta))
 492         return ret
 493
 494     def iterobjects(self, progress=None):
 495         offset = self._header_size
 496         num = len(self)
 497         map, _ = simple_mmap(self._file, 0, self._size)
 498         try:
 499             for i in range(num):
 500                 (type, obj, total_size) = unpack_object(map, offset)
 501                 crc32 = zlib.crc32(map[offset:offset+total_size]) & 0xffffffff
 502                 yield offset, type, obj, crc32
 503                 offset += total_size
 504                 if progress:
 505                     progress(i, num)
 506         finally:
 507             map.close()
 508
 509     def iterentries(self, ext_resolve_ref=None, progress=None):
 510         found = {}
 511         postponed = defaultdict(list)
 512         class Postpone(Exception):
 513             """Raised to postpone delta resolving."""
 514
 515         def get_ref_text(sha):
 516             assert len(sha) == 20
 517             if sha in found:
 518                 return found[sha]
 519             if ext_resolve_ref:
 520                 try:
 521                     return ext_resolve_ref(sha)
 522                 except KeyError:
 523                     pass
 524             raise Postpone, (sha, )
 525         extra = []
 526         todo = chain(self.iterobjects(progress=progress), extra)
 527         for (offset, type, obj, crc32) in todo:
 528             assert isinstance(offset, int)
 529             assert isinstance(type, int)
 530             assert isinstance(obj, tuple) or isinstance(obj, str)
 531             try:
 532                 type, obj = self.resolve_object(offset, type, obj, get_ref_text)
 533             except Postpone, (sha, ):
 534                 postponed[sha].append((offset, type, obj))
 535             else:
 536                 shafile = ShaFile.from_raw_string(type, obj)
 537                 sha = shafile.sha().digest()
 538                 found[sha] = (type, obj)
 539                 yield sha, offset, crc32
 540                 extra.extend(postponed.get(sha, []))
 541         if postponed:
 542             raise KeyError([sha_to_hex(h) for h in postponed.keys()])
 543
 544     def sorted_entries(self, resolve_ext_ref=None, progress=None):
 545         ret = list(self.iterentries(resolve_ext_ref, progress=progress))
 546         ret.sort()
 547         return ret
 548
 549     def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
 550         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 551         write_pack_index_v1(filename, entries, self.calculate_checksum())
 552
 553     def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
 554         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 555         write_pack_index_v2(filename, entries, self.calculate_checksum())
 556
 557     def get_stored_checksum(self):
 558         return self._stored_checksum
 559
 560     def check(self):
 561         return (self.calculate_checksum() == self.get_stored_checksum())
 562
 563     def get_object_at(self, offset):
 564         """Given an offset in to the packfile return the object that is there.
 565
 566         Using the associated index the location of an object can be looked up, and
 567         then the packfile can be asked directly for that object using this
 568         function.
 569         """
 570         if offset in self._offset_cache:
 571             return self._offset_cache[offset]
 572         assert isinstance(offset, long) or isinstance(offset, int),\
 573                 "offset was %r" % offset
 574         assert offset >= self._header_size
 575         map, map_offset = simple_mmap(self._file, offset, self._size-offset)
 576         try:
 577             ret = unpack_object(map, map_offset)[:2]
 578             return ret
 579         finally:
 580             map.close()
 581
 582
 583 class SHA1Writer(object):
 584
 585     def __init__(self, f):
 586         self.f = f
 587         self.sha1 = make_sha("")
 588
 589     def write(self, data):
 590         self.sha1.update(data)
 591         self.f.write(data)
 592
 593     def write_sha(self):
 594         sha = self.sha1.digest()
 595         assert len(sha) == 20
 596         self.f.write(sha)
 597         return sha
 598
 599     def close(self):
 600         sha = self.write_sha()
 601         self.f.close()
 602         return sha
 603
 604     def tell(self):
 605         return self.f.tell()
 606
 607
 608 def write_pack_object(f, type, object):
 609     """Write pack object to a file.
 610
 611     :param f: File to write to
 612     :param o: Object to write
 613     :return: Tuple with offset at which the object was written, and crc32
 614     """
 615     ret = f.tell()
 616     packed_data_hdr = ""
 617     if type == 6: # ref delta
 618         (delta_base_offset, object) = object
 619     elif type == 7: # offset delta
 620         (basename, object) = object
 621     size = len(object)
 622     c = (type << 4) | (size & 15)
 623     size >>= 4
 624     while size:
 625         packed_data_hdr += (chr(c | 0x80))
 626         c = size & 0x7f
 627         size >>= 7
 628     packed_data_hdr += chr(c)
 629     if type == 6: # offset delta
 630         ret = [delta_base_offset & 0x7f]
 631         delta_base_offset >>= 7
 632         while delta_base_offset:
 633             delta_base_offset -= 1
 634             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
 635             delta_base_offset >>= 7
 636         packed_data_hdr += "".join([chr(x) for x in ret])
 637     elif type == 7: # ref delta
 638         assert len(basename) == 20
 639         packed_data_hdr += basename
 640     packed_data = packed_data_hdr + zlib.compress(object)
 641     f.write(packed_data)
 642     return (f.tell(), (zlib.crc32(packed_data) & 0xffffffff))
 643
 644
 645 def write_pack(filename, objects, num_objects):
 646     f = open(filename + ".pack", 'w')
 647     try:
 648         entries, data_sum = write_pack_data(f, objects, num_objects)
 649     finally:
 650         f.close()
 651     entries.sort()
 652     write_pack_index_v2(filename + ".idx", entries, data_sum)
 653
 654
 655 def write_pack_data(f, objects, num_objects, window=10):
 656     """Write a new pack file.
 657
 658     :param filename: The filename of the new pack file.
 659     :param objects: List of objects to write (tuples with object and path)
 660     :return: List with (name, offset, crc32 checksum) entries, pack checksum
 661     """
 662     recency = list(objects)
 663     # FIXME: Somehow limit delta depth
 664     # FIXME: Make thin-pack optional (its not used when cloning a pack)
 665     # Build a list of objects ordered by the magic Linus heuristic
 666     # This helps us find good objects to diff against us
 667     magic = []
 668     for obj, path in recency:
 669         magic.append( (obj.type, path, 1, -len(obj.as_raw_string()[1]), obj) )
 670     magic.sort()
 671     # Build a map of objects and their index in magic - so we can find preceeding objects
 672     # to diff against
 673     offs = {}
 674     for i in range(len(magic)):
 675         offs[magic[i][4]] = i
 676     # Write the pack
 677     entries = []
 678     f = SHA1Writer(f)
 679     f.write("PACK")               # Pack header
 680     f.write(struct.pack(">L", 2)) # Pack version
 681     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
 682     for o, path in recency:
 683         sha1 = o.sha().digest()
 684         orig_t, raw = o.as_raw_string()
 685         winner = raw
 686         t = orig_t
 687         #for i in range(offs[o]-window, window):
 688         #    if i < 0 or i >= len(offs): continue
 689         #    b = magic[i][4]
 690         #    if b.type != orig_t: continue
 691         #    _, base = b.as_raw_string()
 692         #    delta = create_delta(base, raw)
 693         #    if len(delta) < len(winner):
 694         #        winner = delta
 695         #        t = 6 if magic[i][2] == 1 else 7
 696         offset, crc32 = write_pack_object(f, t, winner)
 697         entries.append((sha1, offset, crc32))
 698     return entries, f.write_sha()
 699
 700
 701 def write_pack_index_v1(filename, entries, pack_checksum):
 702     """Write a new pack index file.
 703
 704     :param filename: The filename of the new pack index file.
 705     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 706             crc32_checksum.
 707     :param pack_checksum: Checksum of the pack file.
 708     """
 709     f = open(filename, 'w')
 710     f = SHA1Writer(f)
 711     fan_out_table = defaultdict(lambda: 0)
 712     for (name, offset, entry_checksum) in entries:
 713         fan_out_table[ord(name[0])] += 1
 714     # Fan-out table
 715     for i in range(0x100):
 716         f.write(struct.pack(">L", fan_out_table[i]))
 717         fan_out_table[i+1] += fan_out_table[i]
 718     for (name, offset, entry_checksum) in entries:
 719         f.write(struct.pack(">L20s", offset, name))
 720     assert len(pack_checksum) == 20
 721     f.write(pack_checksum)
 722     f.close()
 723
 724
 725 def create_delta(base_buf, target_buf):
 726     """Use python difflib to work out how to transform base_buf to target_buf"""
 727     assert isinstance(base_buf, str)
 728     assert isinstance(target_buf, str)
 729     out_buf = ""
 730     # write delta header
 731     def encode_size(size):
 732         ret = ""
 733         c = size & 0x7f
 734         size >>= 7
 735         while size:
 736             ret += chr(c | 0x80)
 737             c = size & 0x7f
 738             size >>= 7
 739         ret += chr(c)
 740         return ret
 741     out_buf += encode_size(len(base_buf))
 742     out_buf += encode_size(len(target_buf))
 743     # write out delta opcodes
 744     seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
 745     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
 746         # Git patch opcodes don't care about deletes!
 747         #if opcode == "replace" or opcode == "delete":
 748         #    pass
 749         if opcode == "equal":
 750             # If they are equal, unpacker will use data from base_buf
 751             # Write out an opcode that says what range to use
 752             scratch = ""
 753             op = 0x80
 754             o = i1
 755             for i in range(4):
 756                 if o & 0xff << i*8:
 757                     scratch += chr(o >> i)
 758                     op |= 1 << i
 759             s = i2 - i1
 760             for i in range(2):
 761                 if s & 0xff << i*8:
 762                     scratch += chr(s >> i)
 763                     op |= 1 << (4+i)
 764             out_buf += chr(op)
 765             out_buf += scratch
 766         if opcode == "replace" or opcode == "insert":
 767             # If we are replacing a range or adding one, then we just
 768             # output it to the stream (prefixed by its size)
 769             s = j2 - j1
 770             o = j1
 771             while s > 127:
 772                 out_buf += chr(127)
 773                 out_buf += target_buf[o:o+127]
 774                 s -= 127
 775                 o += 127
 776             out_buf += chr(s)
 777             out_buf += target_buf[o:o+s]
 778     return out_buf
 779
 780
 781 def apply_delta(src_buf, delta):
 782     """Based on the similar function in git's patch-delta.c.
 783
 784     :param src_buf: Source buffer
 785     :param delta: Delta instructions
 786     """
 787     assert isinstance(src_buf, str), "was %r" % (src_buf,)
 788     assert isinstance(delta, str)
 789     out = []
 790     index = 0
 791     delta_length = len(delta)
 792     def get_delta_header_size(delta, index):
 793         size = 0
 794         i = 0
 795         while delta:
 796             cmd = ord(delta[index])
 797             index += 1
 798             size |= (cmd & ~0x80) << i
 799             i += 7
 800             if not cmd & 0x80:
 801                 break
 802         return size, index
 803     src_size, index = get_delta_header_size(delta, index)
 804     dest_size, index = get_delta_header_size(delta, index)
 805     assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
 806     while index < delta_length:
 807         cmd = ord(delta[index])
 808         index += 1
 809         if cmd & 0x80:
 810             cp_off = 0
 811             for i in range(4):
 812                 if cmd & (1 << i):
 813                     x = ord(delta[index])
 814                     index += 1
 815                     cp_off |= x << (i * 8)
 816             cp_size = 0
 817             for i in range(3):
 818                 if cmd & (1 << (4+i)):
 819                     x = ord(delta[index])
 820                     index += 1
 821                     cp_size |= x << (i * 8)
 822             if cp_size == 0:
 823                 cp_size = 0x10000
 824             if (cp_off + cp_size < cp_size or
 825                 cp_off + cp_size > src_size or
 826                 cp_size > dest_size):
 827                 break
 828             out.append(src_buf[cp_off:cp_off+cp_size])
 829         elif cmd != 0:
 830             out.append(delta[index:index+cmd])
 831             index += cmd
 832         else:
 833             raise ApplyDeltaError("Invalid opcode 0")
 834
 835     if index != delta_length:
 836         raise ApplyDeltaError("delta not empty: %r" % delta[index:])
 837
 838     out = ''.join(out)
 839     if dest_size != len(out):
 840         raise ApplyDeltaError("dest size incorrect")
 841
 842     return out
 843
 844
 845 def write_pack_index_v2(filename, entries, pack_checksum):
 846     """Write a new pack index file.
 847
 848     :param filename: The filename of the new pack index file.
 849     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 850             crc32_checksum.
 851     :param pack_checksum: Checksum of the pack file.
 852     """
 853     f = open(filename, 'w')
 854     f = SHA1Writer(f)
 855     f.write('\377tOc') # Magic!
 856     f.write(struct.pack(">L", 2))
 857     fan_out_table = defaultdict(lambda: 0)
 858     for (name, offset, entry_checksum) in entries:
 859         fan_out_table[ord(name[0])] += 1
 860     # Fan-out table
 861     for i in range(0x100):
 862         f.write(struct.pack(">L", fan_out_table[i]))
 863         fan_out_table[i+1] += fan_out_table[i]
 864     for (name, offset, entry_checksum) in entries:
 865         f.write(name)
 866     for (name, offset, entry_checksum) in entries:
 867         f.write(struct.pack(">L", entry_checksum))
 868     for (name, offset, entry_checksum) in entries:
 869         # FIXME: handle if MSBit is set in offset
 870         f.write(struct.pack(">L", offset))
 871     # FIXME: handle table for pack files > 8 Gb
 872     assert len(pack_checksum) == 20
 873     f.write(pack_checksum)
 874     f.close()
 875
 876
 877 class Pack(object):
 878
 879     def __init__(self, basename):
 880         self._basename = basename
 881         self._data_path = self._basename + ".pack"
 882         self._idx_path = self._basename + ".idx"
 883         self._data = None
 884         self._idx = None
 885
 886     @classmethod
 887     def from_objects(self, data, idx):
 888         ret = Pack("")
 889         ret._data = data
 890         ret._idx = idx
 891         return ret
 892
 893     def name(self):
 894         """The SHA over the SHAs of the objects in this pack."""
 895         return self.idx.objects_sha1()
 896
 897     @property
 898     def data(self):
 899         if self._data is None:
 900             self._data = PackData(self._data_path)
 901             assert len(self.idx) == len(self._data)
 902             idx_stored_checksum = self.idx.get_pack_checksum()
 903             data_stored_checksum = self._data.get_stored_checksum()
 904             if idx_stored_checksum != data_stored_checksum:
 905                 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
 906                                        sha_to_hex(data_stored_checksum))
 907         return self._data
 908
 909     @property
 910     def idx(self):
 911         if self._idx is None:
 912             self._idx = load_pack_index(self._idx_path)
 913         return self._idx
 914
 915     def close(self):
 916         if self._data is not None:
 917             self._data.close()
 918         self.idx.close()
 919
 920     def __eq__(self, other):
 921         return type(self) == type(other) and self.idx == other.idx
 922
 923     def __len__(self):
 924         """Number of entries in this pack."""
 925         return len(self.idx)
 926
 927     def __repr__(self):
 928         return "Pack(%r)" % self._basename
 929
 930     def __iter__(self):
 931         """Iterate over all the sha1s of the objects in this pack."""
 932         return iter(self.idx)
 933
 934     def check(self):
 935         if not self.idx.check():
 936             return False
 937         if not self.data.check():
 938             return False
 939         return True
 940
 941     def get_stored_checksum(self):
 942         return self.data.get_stored_checksum()
 943
 944     def __contains__(self, sha1):
 945         """Check whether this pack contains a particular SHA1."""
 946         try:
 947             self.idx.object_index(sha1)
 948             return True
 949         except KeyError:
 950             return False
 951
 952     def get_raw(self, sha1, resolve_ref=None):
 953         offset = self.idx.object_index(sha1)
 954         obj_type, obj = self.data.get_object_at(offset)
 955         if type(offset) is long:
 956           offset = int(offset)
 957         if resolve_ref is None:
 958             resolve_ref = self.get_raw
 959         return self.data.resolve_object(offset, obj_type, obj, resolve_ref)
 960
 961     def __getitem__(self, sha1):
 962         """Retrieve the specified SHA1."""
 963         type, uncomp = self.get_raw(sha1)
 964         return ShaFile.from_raw_string(type, uncomp)
 965
 966     def iterobjects(self, get_raw=None):
 967         if get_raw is None:
 968             get_raw = self.get_raw
 969         for offset, type, obj, crc32 in self.data.iterobjects():
 970             assert isinstance(offset, int)
 971             yield ShaFile.from_raw_string(
 972                     *self.data.resolve_object(offset, type, obj, get_raw))
 973
 974
 975 def load_packs(path):
 976     if not os.path.exists(path):
 977         return
 978     for name in os.listdir(path):
 979         if name.startswith("pack-") and name.endswith(".pack"):
 980             yield Pack(os.path.join(path, name[:-len(".pack")]))
 981
 982
 983 try:
 984     from dulwich._pack import apply_delta, bisect_find_sha
 985 except ImportError:
 986     pass