dulwich/pack.py

   1 # pack.py -- For dealing wih packed git objects.
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copryight (C) 2008 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20 """Classes for dealing with packed git objects.
  21
  22 A pack is a compact representation of a bunch of objects, stored
  23 using deltas where possible.
  24
  25 They have two parts, the pack file, which stores the data, and an index
  26 that tells you where the data is.
  27
  28 To find an object you look in all of the index files 'til you find a
  29 match for the object name. You then use the pointer got from this as
  30 a pointer in to the corresponding packfile.
  31 """
  32
  33 try:
  34     from collections import defaultdict
  35 except ImportError:
  36     from misc import defaultdict
  37
  38 from itertools import imap, izip
  39 import mmap
  40 import os
  41 import struct
  42 try:
  43     from struct import unpack_from
  44 except ImportError:
  45     from dulwich.misc import unpack_from
  46 import sys
  47 import zlib
  48 import difflib
  49
  50 from dulwich.errors import (
  51     ApplyDeltaError,
  52     ChecksumMismatch,
  53     )
  54 from dulwich.lru_cache import (
  55     LRUSizeCache,
  56     )
  57 from dulwich.objects import (
  58     ShaFile,
  59     hex_to_sha,
  60     sha_to_hex,
  61     )
  62 from dulwich.misc import make_sha
  63
  64 supports_mmap_offset = (sys.version_info[0] >= 3 or
  65         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
  66
  67
  68 def take_msb_bytes(map, offset):
  69     ret = []
  70     while len(ret) == 0 or ret[-1] & 0x80:
  71         ret.append(ord(map[offset]))
  72         offset += 1
  73     return ret
  74
  75
  76 def read_zlib(data, offset, dec_size):
  77     obj = zlib.decompressobj()
  78     ret = []
  79     fed = 0
  80     while obj.unused_data == "":
  81         base = offset+fed
  82         add = data[base:base+1024]
  83         if len(add) < 1024:
  84             add += "Z"
  85         fed += len(add)
  86         ret.append(obj.decompress(add))
  87     x = "".join(ret)
  88     assert len(x) == dec_size
  89     comp_len = fed-len(obj.unused_data)
  90     return x, comp_len
  91
  92
  93 def iter_sha1(iter):
  94     """Return the hexdigest of the SHA1 over a set of names."""
  95     sha1 = make_sha()
  96     for name in iter:
  97         sha1.update(name)
  98     return sha1.hexdigest()
  99
 100
 101 def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
 102     """Simple wrapper for mmap() which always supports the offset parameter.
 103
 104     :param f: File object.
 105     :param offset: Offset in the file, from the beginning of the file.
 106     :param size: Size of the mmap'ed area
 107     :param access: Access mechanism.
 108     :return: MMAP'd area.
 109     """
 110     if supports_mmap_offset:
 111         return mmap.mmap(f.fileno(), size, access=access, offset=offset), 0
 112     else:
 113         mem = mmap.mmap(f.fileno(), size+offset, access=access)
 114         return mem, offset
 115
 116
 117 def load_pack_index(filename):
 118     f = open(filename, 'r')
 119     if f.read(4) == '\377tOc':
 120         version = struct.unpack(">L", f.read(4))[0]
 121         if version == 2:
 122             f.seek(0)
 123             return PackIndex2(filename, file=f)
 124         else:
 125             raise KeyError("Unknown pack index format %d" % version)
 126     else:
 127         f.seek(0)
 128         return PackIndex1(filename, file=f)
 129
 130
 131 def bisect_find_sha(start, end, sha, unpack_name):
 132     assert start <= end
 133     while start <= end:
 134         i = (start + end)/2
 135         file_sha = unpack_name(i)
 136         x = cmp(file_sha, sha)
 137         if x < 0:
 138             start = i + 1
 139         elif x > 0:
 140             end = i - 1
 141         else:
 142             return i
 143     return None
 144
 145
 146 class PackIndex(object):
 147     """An index in to a packfile.
 148
 149     Given a sha id of an object a pack index can tell you the location in the
 150     packfile of that object if it has it.
 151
 152     To do the loop it opens the file, and indexes first 256 4 byte groups
 153     with the first byte of the sha id. The value in the four byte group indexed
 154     is the end of the group that shares the same starting byte. Subtract one
 155     from the starting byte and index again to find the start of the group.
 156     The values are sorted by sha id within the group, so do the math to find
 157     the start and end offset and then bisect in to find if the value is present.
 158     """
 159
 160     def __init__(self, filename, file=None):
 161         """Create a pack index object.
 162
 163         Provide it with the name of the index file to consider, and it will map
 164         it whenever required.
 165         """
 166         self._filename = filename
 167         # Take the size now, so it can be checked each time we map the file to
 168         # ensure that it hasn't changed.
 169         self._size = os.path.getsize(filename)
 170         if file is None:
 171             self._file = open(filename, 'r')
 172         else:
 173             self._file = file
 174         self._contents, map_offset = simple_mmap(self._file, 0, self._size)
 175         assert map_offset == 0
 176
 177     def __eq__(self, other):
 178         if not isinstance(other, PackIndex):
 179             return False
 180
 181         if self._fan_out_table != other._fan_out_table:
 182             return False
 183
 184         for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
 185             if name1 != name2:
 186                 return False
 187         return True
 188
 189     def close(self):
 190         self._file.close()
 191
 192     def __len__(self):
 193         """Return the number of entries in this pack index."""
 194         return self._fan_out_table[-1]
 195
 196     def _unpack_entry(self, i):
 197         """Unpack the i-th entry in the index file.
 198
 199         :return: Tuple with object name (SHA), offset in pack file and
 200               CRC32 checksum (if known)."""
 201         raise NotImplementedError(self._unpack_entry)
 202
 203     def _unpack_name(self, i):
 204         """Unpack the i-th name from the index file."""
 205         raise NotImplementedError(self._unpack_name)
 206
 207     def _unpack_offset(self, i):
 208         """Unpack the i-th object offset from the index file."""
 209         raise NotImplementedError(self._unpack_offset)
 210
 211     def _unpack_crc32_checksum(self, i):
 212         """Unpack the crc32 checksum for the i-th object from the index file."""
 213         raise NotImplementedError(self._unpack_crc32_checksum)
 214
 215     def __iter__(self):
 216         return imap(sha_to_hex, self._itersha())
 217
 218     def _itersha(self):
 219         for i in range(len(self)):
 220             yield self._unpack_name(i)
 221
 222     def objects_sha1(self):
 223         """Return the hex SHA1 over all the shas of all objects in this pack.
 224
 225         :note: This is used for the filename of the pack.
 226         """
 227         return iter_sha1(self._itersha())
 228
 229     def iterentries(self):
 230         """Iterate over the entries in this pack index.
 231
 232         Will yield tuples with object name, offset in packfile and crc32 checksum.
 233         """
 234         for i in range(len(self)):
 235             yield self._unpack_entry(i)
 236
 237     def _read_fan_out_table(self, start_offset):
 238         ret = []
 239         for i in range(0x100):
 240             ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
 241         return ret
 242
 243     def check(self):
 244         """Check that the stored checksum matches the actual checksum."""
 245         return self.calculate_checksum() == self.get_stored_checksum()
 246
 247     def calculate_checksum(self):
 248         return make_sha(self._contents[:-20]).digest()
 249
 250     def get_pack_checksum(self):
 251         """Return the SHA1 checksum stored for the corresponding packfile."""
 252         return str(self._contents[-40:-20])
 253
 254     def get_stored_checksum(self):
 255         """Return the SHA1 checksum stored for this index."""
 256         return str(self._contents[-20:])
 257
 258     def object_index(self, sha):
 259         """Return the index in to the corresponding packfile for the object.
 260
 261         Given the name of an object it will return the offset that object lives
 262         at within the corresponding pack file. If the pack file doesn't have the
 263         object then None will be returned.
 264         """
 265         if len(sha) == 40:
 266             sha = hex_to_sha(sha)
 267         return self._object_index(sha)
 268
 269     def _object_index(self, sha):
 270         """See object_index.
 271
 272         :param sha: A *binary* SHA string. (20 characters long)_
 273         """
 274         assert len(sha) == 20
 275         idx = ord(sha[0])
 276         if idx == 0:
 277             start = 0
 278         else:
 279             start = self._fan_out_table[idx-1]
 280         end = self._fan_out_table[idx]
 281         i = bisect_find_sha(start, end, sha, self._unpack_name)
 282         if i is None:
 283             return None
 284         return self._unpack_offset(i)
 285
 286
 287
 288 class PackIndex1(PackIndex):
 289     """Version 1 Pack Index."""
 290
 291     def __init__(self, filename, file=None):
 292         PackIndex.__init__(self, filename, file)
 293         self.version = 1
 294         self._fan_out_table = self._read_fan_out_table(0)
 295
 296     def _unpack_entry(self, i):
 297         (offset, name) = unpack_from(">L20s", self._contents,
 298             (0x100 * 4) + (i * 24))
 299         return (name, offset, None)
 300
 301     def _unpack_name(self, i):
 302         offset = (0x100 * 4) + (i * 24) + 4
 303         return self._contents[offset:offset+20]
 304
 305     def _unpack_offset(self, i):
 306         offset = (0x100 * 4) + (i * 24)
 307         return unpack_from(">L", self._contents, offset)[0]
 308
 309     def _unpack_crc32_checksum(self, i):
 310         # Not stored in v1 index files
 311         return None
 312
 313
 314 class PackIndex2(PackIndex):
 315     """Version 2 Pack Index."""
 316
 317     def __init__(self, filename, file=None):
 318         PackIndex.__init__(self, filename, file)
 319         assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
 320         (self.version, ) = unpack_from(">L", self._contents, 4)
 321         assert self.version == 2, "Version was %d" % self.version
 322         self._fan_out_table = self._read_fan_out_table(8)
 323         self._name_table_offset = 8 + 0x100 * 4
 324         self._crc32_table_offset = self._name_table_offset + 20 * len(self)
 325         self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
 326
 327     def _unpack_entry(self, i):
 328         return (self._unpack_name(i), self._unpack_offset(i),
 329                 self._unpack_crc32_checksum(i))
 330
 331     def _unpack_name(self, i):
 332         offset = self._name_table_offset + i * 20
 333         return self._contents[offset:offset+20]
 334
 335     def _unpack_offset(self, i):
 336         offset = self._pack_offset_table_offset + i * 4
 337         return unpack_from(">L", self._contents, offset)[0]
 338
 339     def _unpack_crc32_checksum(self, i):
 340         return unpack_from(">L", self._contents,
 341                           self._crc32_table_offset + i * 4)[0]
 342
 343
 344
 345 def read_pack_header(f):
 346     header = f.read(12)
 347     assert header[:4] == "PACK"
 348     (version,) = unpack_from(">L", header, 4)
 349     assert version in (2, 3), "Version was %d" % version
 350     (num_objects,) = unpack_from(">L", header, 8)
 351     return (version, num_objects)
 352
 353
 354 def read_pack_tail(f):
 355     return (f.read(20),)
 356
 357
 358 def unpack_object(map, offset=0):
 359     bytes = take_msb_bytes(map, offset)
 360     type = (bytes[0] >> 4) & 0x07
 361     size = bytes[0] & 0x0f
 362     for i, byte in enumerate(bytes[1:]):
 363         size += (byte & 0x7f) << ((i * 7) + 4)
 364     raw_base = len(bytes)
 365     if type == 6: # offset delta
 366         bytes = take_msb_bytes(map, raw_base + offset)
 367         assert not (bytes[-1] & 0x80)
 368         delta_base_offset = bytes[0] & 0x7f
 369         for byte in bytes[1:]:
 370             delta_base_offset += 1
 371             delta_base_offset <<= 7
 372             delta_base_offset += (byte & 0x7f)
 373         raw_base+=len(bytes)
 374         uncomp, comp_len = read_zlib(map, offset + raw_base, size)
 375         assert size == len(uncomp)
 376         return type, (delta_base_offset, uncomp), comp_len+raw_base
 377     elif type == 7: # ref delta
 378         basename = map[offset+raw_base:offset+raw_base+20]
 379         uncomp, comp_len = read_zlib(map, offset+raw_base+20, size)
 380         assert size == len(uncomp)
 381         return type, (basename, uncomp), comp_len+raw_base+20
 382     else:
 383         uncomp, comp_len = read_zlib(map, offset+raw_base, size)
 384         assert len(uncomp) == size
 385         return type, uncomp, comp_len+raw_base
 386
 387
 388 def compute_object_size((num, obj)):
 389     if num in (6, 7):
 390         return len(obj[1])
 391     assert isinstance(obj, str)
 392     return len(obj)
 393
 394
 395 class PackData(object):
 396     """The data contained in a packfile.
 397
 398     Pack files can be accessed both sequentially for exploding a pack, and
 399     directly with the help of an index to retrieve a specific object.
 400
 401     The objects within are either complete or a delta aginst another.
 402
 403     The header is variable length. If the MSB of each byte is set then it
 404     indicates that the subsequent byte is still part of the header.
 405     For the first byte the next MS bits are the type, which tells you the type
 406     of object, and whether it is a delta. The LS byte is the lowest bits of the
 407     size. For each subsequent byte the LS 7 bits are the next MS bits of the
 408     size, i.e. the last byte of the header contains the MS bits of the size.
 409
 410     For the complete objects the data is stored as zlib deflated data.
 411     The size in the header is the uncompressed object size, so to uncompress
 412     you need to just keep feeding data to zlib until you get an object back,
 413     or it errors on bad data. This is done here by just giving the complete
 414     buffer from the start of the deflated object on. This is bad, but until I
 415     get mmap sorted out it will have to do.
 416
 417     Currently there are no integrity checks done. Also no attempt is made to try
 418     and detect the delta case, or a request for an object at the wrong position.
 419     It will all just throw a zlib or KeyError.
 420     """
 421
 422     def __init__(self, filename):
 423         """Create a PackData object that represents the pack in the given filename.
 424
 425         The file must exist and stay readable until the object is disposed of. It
 426         must also stay the same size. It will be mapped whenever needed.
 427
 428         Currently there is a restriction on the size of the pack as the python
 429         mmap implementation is flawed.
 430         """
 431         self._filename = filename
 432         assert os.path.exists(filename), "%s is not a packfile" % filename
 433         self._size = os.path.getsize(filename)
 434         self._header_size = 12
 435         assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (filename, self._size, self._header_size)
 436         self._read_header()
 437         self._offset_cache = LRUSizeCache(1024*1024*100,
 438             compute_size=compute_object_size)
 439
 440     def _read_header(self):
 441         f = open(self._filename, 'rb')
 442         try:
 443             (version, self._num_objects) = \
 444                     read_pack_header(f)
 445             f.seek(self._size-20)
 446             (self._stored_checksum,) = read_pack_tail(f)
 447         finally:
 448             f.close()
 449
 450     def __len__(self):
 451         """Returns the number of objects in this pack."""
 452         return self._num_objects
 453
 454     def calculate_checksum(self):
 455         """Calculate the checksum for this pack."""
 456         f = open(self._filename, 'rb')
 457         try:
 458             map, map_offset = simple_mmap(f, 0, self._size - 20)
 459             return make_sha(map[map_offset:self._size-20]).digest()
 460         finally:
 461             f.close()
 462
 463     def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
 464         """Resolve an object, possibly resolving deltas when necessary.
 465
 466         :return: Tuple with object type and contents.
 467         """
 468         if type not in (6, 7): # Not a delta
 469             return type, obj
 470
 471         if get_offset is None:
 472             get_offset = self.get_object_at
 473
 474         if type == 6: # offset delta
 475             (delta_offset, delta) = obj
 476             assert isinstance(delta_offset, int)
 477             assert isinstance(delta, str)
 478             base_offset = offset-delta_offset
 479             type, base_obj = get_offset(base_offset)
 480             assert isinstance(type, int)
 481         elif type == 7: # ref delta
 482             (basename, delta) = obj
 483             assert isinstance(basename, str) and len(basename) == 20
 484             assert isinstance(delta, str)
 485             type, base_obj = get_ref(basename)
 486             assert isinstance(type, int)
 487             # Can't be a ofs delta, as we wouldn't know the base offset
 488             assert type != 6
 489             base_offset = None
 490         type, base_text = self.resolve_object(base_offset, type, base_obj, get_ref)
 491         if base_offset is not None:
 492             self._offset_cache[base_offset] = type, base_text
 493         ret = (type, apply_delta(base_text, delta))
 494         return ret
 495
 496     def iterobjects(self):
 497         offset = self._header_size
 498         f = open(self._filename, 'rb')
 499         num = len(self)
 500         map, _ = simple_mmap(f, 0, self._size)
 501         for i in range(num):
 502             (type, obj, total_size) = unpack_object(map, offset)
 503             crc32 = zlib.crc32(map[offset:offset+total_size]) & 0xffffffff
 504             yield offset, type, obj, crc32
 505             offset += total_size
 506         f.close()
 507
 508     def iterentries(self, ext_resolve_ref=None):
 509         found = {}
 510         postponed = defaultdict(list)
 511         class Postpone(Exception):
 512             """Raised to postpone delta resolving."""
 513
 514         def get_ref_text(sha):
 515             if sha in found:
 516                 return found[sha]
 517             if ext_resolve_ref:
 518                 try:
 519                     return ext_resolve_ref(sha)
 520                 except KeyError:
 521                     pass
 522             raise Postpone, (sha, )
 523         todo = list(self.iterobjects())
 524         while todo:
 525             (offset, type, obj, crc32) = todo.pop(0)
 526             assert isinstance(offset, int)
 527             assert isinstance(type, int)
 528             assert isinstance(obj, tuple) or isinstance(obj, str)
 529             try:
 530                 type, obj = self.resolve_object(offset, type, obj, get_ref_text)
 531             except Postpone, (sha, ):
 532                 postponed[sha].append((offset, type, obj))
 533             else:
 534                 shafile = ShaFile.from_raw_string(type, obj)
 535                 sha = shafile.sha().digest()
 536                 found[sha] = (type, obj)
 537                 yield sha, offset, crc32
 538                 todo += postponed.get(sha, [])
 539         if postponed:
 540             raise KeyError([sha_to_hex(h) for h in postponed.keys()])
 541
 542     def sorted_entries(self, resolve_ext_ref=None):
 543         ret = list(self.iterentries(resolve_ext_ref))
 544         ret.sort()
 545         return ret
 546
 547     def create_index_v1(self, filename, resolve_ext_ref=None):
 548         entries = self.sorted_entries(resolve_ext_ref)
 549         write_pack_index_v1(filename, entries, self.calculate_checksum())
 550
 551     def create_index_v2(self, filename, resolve_ext_ref=None):
 552         entries = self.sorted_entries(resolve_ext_ref)
 553         write_pack_index_v2(filename, entries, self.calculate_checksum())
 554
 555     def get_stored_checksum(self):
 556         return self._stored_checksum
 557
 558     def check(self):
 559         return (self.calculate_checksum() == self.get_stored_checksum())
 560
 561     def get_object_at(self, offset):
 562         """Given an offset in to the packfile return the object that is there.
 563
 564         Using the associated index the location of an object can be looked up, and
 565         then the packfile can be asked directly for that object using this
 566         function.
 567         """
 568         if offset in self._offset_cache:
 569             return self._offset_cache[offset]
 570         assert isinstance(offset, long) or isinstance(offset, int),\
 571                 "offset was %r" % offset
 572         assert offset >= self._header_size
 573         f = open(self._filename, 'rb')
 574         try:
 575             map, map_offset = simple_mmap(f, offset, self._size-offset)
 576             ret = unpack_object(map, map_offset)[:2]
 577             return ret
 578         finally:
 579             f.close()
 580
 581
 582 class SHA1Writer(object):
 583
 584     def __init__(self, f):
 585         self.f = f
 586         self.sha1 = make_sha("")
 587
 588     def write(self, data):
 589         self.sha1.update(data)
 590         self.f.write(data)
 591
 592     def write_sha(self):
 593         sha = self.sha1.digest()
 594         assert len(sha) == 20
 595         self.f.write(sha)
 596         return sha
 597
 598     def close(self):
 599         sha = self.write_sha()
 600         self.f.close()
 601         return sha
 602
 603     def tell(self):
 604         return self.f.tell()
 605
 606
 607 def write_pack_object(f, type, object):
 608     """Write pack object to a file.
 609
 610     :param f: File to write to
 611     :param o: Object to write
 612     :return: Tuple with offset at which the object was written, and crc32
 613     """
 614     ret = f.tell()
 615     packed_data_hdr = ""
 616     if type == 6: # ref delta
 617         (delta_base_offset, object) = object
 618     elif type == 7: # offset delta
 619         (basename, object) = object
 620     size = len(object)
 621     c = (type << 4) | (size & 15)
 622     size >>= 4
 623     while size:
 624         packed_data_hdr += (chr(c | 0x80))
 625         c = size & 0x7f
 626         size >>= 7
 627     packed_data_hdr += chr(c)
 628     if type == 6: # offset delta
 629         ret = [delta_base_offset & 0x7f]
 630         delta_base_offset >>= 7
 631         while delta_base_offset:
 632             delta_base_offset -= 1
 633             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
 634             delta_base_offset >>= 7
 635         packed_data_hdr += "".join([chr(x) for x in ret])
 636     elif type == 7: # ref delta
 637         assert len(basename) == 20
 638         packed_data_hdr += basename
 639     packed_data = packed_data_hdr + zlib.compress(object)
 640     f.write(packed_data)
 641     return (f.tell(), (zlib.crc32(packed_data) & 0xffffffff))
 642
 643
 644 def write_pack(filename, objects, num_objects):
 645     f = open(filename + ".pack", 'w')
 646     try:
 647         entries, data_sum = write_pack_data(f, objects, num_objects)
 648     finally:
 649         f.close()
 650     entries.sort()
 651     write_pack_index_v2(filename + ".idx", entries, data_sum)
 652
 653
 654 def write_pack_data(f, objects, num_objects, window=10):
 655     """Write a new pack file.
 656
 657     :param filename: The filename of the new pack file.
 658     :param objects: List of objects to write (tuples with object and path)
 659     :return: List with (name, offset, crc32 checksum) entries, pack checksum
 660     """
 661     recency = list(objects)
 662     # FIXME: Somehow limit delta depth
 663     # FIXME: Make thin-pack optional (its not used when cloning a pack)
 664     # Build a list of objects ordered by the magic Linus heuristic
 665     # This helps us find good objects to diff against us
 666     magic = []
 667     for obj, path in recency:
 668         magic.append( (obj.type, path, 1, -len(obj.as_raw_string()[1]), obj) )
 669     magic.sort()
 670     # Build a map of objects and their index in magic - so we can find preceeding objects
 671     # to diff against
 672     offs = {}
 673     for i in range(len(magic)):
 674         offs[magic[i][4]] = i
 675     # Write the pack
 676     entries = []
 677     f = SHA1Writer(f)
 678     f.write("PACK")               # Pack header
 679     f.write(struct.pack(">L", 2)) # Pack version
 680     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
 681     for o, path in recency:
 682         sha1 = o.sha().digest()
 683         orig_t, raw = o.as_raw_string()
 684         winner = raw
 685         t = orig_t
 686         #for i in range(offs[o]-window, window):
 687         #    if i < 0 or i >= len(offs): continue
 688         #    b = magic[i][4]
 689         #    if b.type != orig_t: continue
 690         #    _, base = b.as_raw_string()
 691         #    delta = create_delta(base, raw)
 692         #    if len(delta) < len(winner):
 693         #        winner = delta
 694         #        t = 6 if magic[i][2] == 1 else 7
 695         offset, crc32 = write_pack_object(f, t, winner)
 696         entries.append((sha1, offset, crc32))
 697     return entries, f.write_sha()
 698
 699
 700 def write_pack_index_v1(filename, entries, pack_checksum):
 701     """Write a new pack index file.
 702
 703     :param filename: The filename of the new pack index file.
 704     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 705             crc32_checksum.
 706     :param pack_checksum: Checksum of the pack file.
 707     """
 708     f = open(filename, 'w')
 709     f = SHA1Writer(f)
 710     fan_out_table = defaultdict(lambda: 0)
 711     for (name, offset, entry_checksum) in entries:
 712         fan_out_table[ord(name[0])] += 1
 713     # Fan-out table
 714     for i in range(0x100):
 715         f.write(struct.pack(">L", fan_out_table[i]))
 716         fan_out_table[i+1] += fan_out_table[i]
 717     for (name, offset, entry_checksum) in entries:
 718         f.write(struct.pack(">L20s", offset, name))
 719     assert len(pack_checksum) == 20
 720     f.write(pack_checksum)
 721     f.close()
 722
 723
 724 def create_delta(base_buf, target_buf):
 725     """Use python difflib to work out how to transform base_buf to target_buf"""
 726     assert isinstance(base_buf, str)
 727     assert isinstance(target_buf, str)
 728     out_buf = ""
 729     # write delta header
 730     def encode_size(size):
 731         ret = ""
 732         c = size & 0x7f
 733         size >>= 7
 734         while size:
 735             ret += chr(c | 0x80)
 736             c = size & 0x7f
 737             size >>= 7
 738         ret += chr(c)
 739         return ret
 740     out_buf += encode_size(len(base_buf))
 741     out_buf += encode_size(len(target_buf))
 742     # write out delta opcodes
 743     seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
 744     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
 745         # Git patch opcodes don't care about deletes!
 746         #if opcode == "replace" or opcode == "delete":
 747         #    pass
 748         if opcode == "equal":
 749             # If they are equal, unpacker will use data from base_buf
 750             # Write out an opcode that says what range to use
 751             scratch = ""
 752             op = 0x80
 753             o = i1
 754             for i in range(4):
 755                 if o & 0xff << i*8:
 756                     scratch += chr(o >> i)
 757                     op |= 1 << i
 758             s = i2 - i1
 759             for i in range(2):
 760                 if s & 0xff << i*8:
 761                     scratch += chr(s >> i)
 762                     op |= 1 << (4+i)
 763             out_buf += chr(op)
 764             out_buf += scratch
 765         if opcode == "replace" or opcode == "insert":
 766             # If we are replacing a range or adding one, then we just
 767             # output it to the stream (prefixed by its size)
 768             s = j2 - j1
 769             o = j1
 770             while s > 127:
 771                 out_buf += chr(127)
 772                 out_buf += target_buf[o:o+127]
 773                 s -= 127
 774                 o += 127
 775             out_buf += chr(s)
 776             out_buf += target_buf[o:o+s]
 777     return out_buf
 778
 779
 780 def apply_delta(src_buf, delta):
 781     """Based on the similar function in git's patch-delta.c.
 782
 783     :param src_buf: Source buffer
 784     :param delta: Delta instructions
 785     """
 786     assert isinstance(src_buf, str), "was %r" % (src_buf,)
 787     assert isinstance(delta, str)
 788     out = []
 789     index = 0
 790     delta_length = len(delta)
 791     def get_delta_header_size(delta, index):
 792         size = 0
 793         i = 0
 794         while delta:
 795             cmd = ord(delta[index])
 796             index += 1
 797             size |= (cmd & ~0x80) << i
 798             i += 7
 799             if not cmd & 0x80:
 800                 break
 801         return size, index
 802     src_size, index = get_delta_header_size(delta, index)
 803     dest_size, index = get_delta_header_size(delta, index)
 804     assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
 805     while index < delta_length:
 806         cmd = ord(delta[index])
 807         index += 1
 808         if cmd & 0x80:
 809             cp_off = 0
 810             for i in range(4):
 811                 if cmd & (1 << i):
 812                     x = ord(delta[index])
 813                     index += 1
 814                     cp_off |= x << (i * 8)
 815             cp_size = 0
 816             for i in range(3):
 817                 if cmd & (1 << (4+i)):
 818                     x = ord(delta[index])
 819                     index += 1
 820                     cp_size |= x << (i * 8)
 821             if cp_size == 0:
 822                 cp_size = 0x10000
 823             if (cp_off + cp_size < cp_size or
 824                 cp_off + cp_size > src_size or
 825                 cp_size > dest_size):
 826                 break
 827             out.append(src_buf[cp_off:cp_off+cp_size])
 828         elif cmd != 0:
 829             out.append(delta[index:index+cmd])
 830             index += cmd
 831         else:
 832             raise ApplyDeltaError("Invalid opcode 0")
 833
 834     if index != delta_length:
 835         raise ApplyDeltaError("delta not empty: %r" % delta[index:])
 836
 837     out = ''.join(out)
 838     if dest_size != len(out):
 839         raise ApplyDeltaError("dest size incorrect")
 840
 841     return out
 842
 843
 844 def write_pack_index_v2(filename, entries, pack_checksum):
 845     """Write a new pack index file.
 846
 847     :param filename: The filename of the new pack index file.
 848     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 849             crc32_checksum.
 850     :param pack_checksum: Checksum of the pack file.
 851     """
 852     f = open(filename, 'w')
 853     f = SHA1Writer(f)
 854     f.write('\377tOc') # Magic!
 855     f.write(struct.pack(">L", 2))
 856     fan_out_table = defaultdict(lambda: 0)
 857     for (name, offset, entry_checksum) in entries:
 858         fan_out_table[ord(name[0])] += 1
 859     # Fan-out table
 860     for i in range(0x100):
 861         f.write(struct.pack(">L", fan_out_table[i]))
 862         fan_out_table[i+1] += fan_out_table[i]
 863     for (name, offset, entry_checksum) in entries:
 864         f.write(name)
 865     for (name, offset, entry_checksum) in entries:
 866         f.write(struct.pack(">L", entry_checksum))
 867     for (name, offset, entry_checksum) in entries:
 868         # FIXME: handle if MSBit is set in offset
 869         f.write(struct.pack(">L", offset))
 870     # FIXME: handle table for pack files > 8 Gb
 871     assert len(pack_checksum) == 20
 872     f.write(pack_checksum)
 873     f.close()
 874
 875
 876 class Pack(object):
 877
 878     def __init__(self, basename):
 879         self._basename = basename
 880         self._data_path = self._basename + ".pack"
 881         self._idx_path = self._basename + ".idx"
 882         self._data = None
 883         self._idx = None
 884
 885     def name(self):
 886         """The SHA over the SHAs of the objects in this pack."""
 887         return self.idx.objects_sha1()
 888
 889     @property
 890     def data(self):
 891         if self._data is None:
 892             self._data = PackData(self._data_path)
 893             assert len(self.idx) == len(self._data)
 894             idx_stored_checksum = self.idx.get_pack_checksum()
 895             data_stored_checksum = self._data.get_stored_checksum()
 896             if idx_stored_checksum != data_stored_checksum:
 897                 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
 898                                        sha_to_hex(data_stored_checksum))
 899         return self._data
 900
 901     @property
 902     def idx(self):
 903         if self._idx is None:
 904             self._idx = load_pack_index(self._idx_path)
 905         return self._idx
 906
 907     def close(self):
 908         if self._data is not None:
 909             self._data.close()
 910         self.idx.close()
 911
 912     def __eq__(self, other):
 913         return type(self) == type(other) and self.idx == other.idx
 914
 915     def __len__(self):
 916         """Number of entries in this pack."""
 917         return len(self.idx)
 918
 919     def __repr__(self):
 920         return "Pack(%r)" % self._basename
 921
 922     def __iter__(self):
 923         """Iterate over all the sha1s of the objects in this pack."""
 924         return iter(self.idx)
 925
 926     def check(self):
 927         if not self.idx.check():
 928             return False
 929         if not self.data.check():
 930             return False
 931         return True
 932
 933     def get_stored_checksum(self):
 934         return self.data.get_stored_checksum()
 935
 936     def __contains__(self, sha1):
 937         """Check whether this pack contains a particular SHA1."""
 938         return (self.idx.object_index(sha1) is not None)
 939
 940     def get_raw(self, sha1, resolve_ref=None):
 941         offset = self.idx.object_index(sha1)
 942         if offset is None:
 943             raise KeyError(sha1)
 944         type, obj = self.data.get_object_at(offset)
 945         if isinstance(offset, long):
 946           offset = int(offset)
 947         if resolve_ref is None:
 948             resolve_ref = self.get_raw
 949         assert isinstance(offset, int)
 950         return self.data.resolve_object(offset, type, obj, resolve_ref)
 951
 952     def __getitem__(self, sha1):
 953         """Retrieve the specified SHA1."""
 954         type, uncomp = self.get_raw(sha1)
 955         return ShaFile.from_raw_string(type, uncomp)
 956
 957     def iterobjects(self, get_raw=None):
 958         if get_raw is None:
 959             get_raw = self.get_raw
 960         for offset, type, obj, crc32 in self.data.iterobjects():
 961             assert isinstance(offset, int)
 962             yield ShaFile.from_raw_string(
 963                     *self.data.resolve_object(offset, type, obj, get_raw))
 964
 965
 966 def load_packs(path):
 967     if not os.path.exists(path):
 968         return
 969     for name in os.listdir(path):
 970         if name.startswith("pack-") and name.endswith(".pack"):
 971             yield Pack(os.path.join(path, name[:-len(".pack")]))
 972
 973
 974 try:
 975     from dulwich._pack import apply_delta, bisect_find_sha
 976 except ImportError:
 977     pass