dulwich/pack.py

   1 # pack.py -- For dealing wih packed git objects.
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copryight (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20 """Classes for dealing with packed git objects.
  21
  22 A pack is a compact representation of a bunch of objects, stored
  23 using deltas where possible.
  24
  25 They have two parts, the pack file, which stores the data, and an index
  26 that tells you where the data is.
  27
  28 To find an object you look in all of the index files 'til you find a
  29 match for the object name. You then use the pointer got from this as
  30 a pointer in to the corresponding packfile.
  31 """
  32
  33 try:
  34     from collections import defaultdict
  35 except ImportError:
  36     from misc import defaultdict
  37
  38 from itertools import chain, imap, izip
  39 import mmap
  40 import os
  41 import struct
  42 try:
  43     from struct import unpack_from
  44 except ImportError:
  45     from dulwich.misc import unpack_from
  46 import sys
  47 import zlib
  48 import difflib
  49
  50 from dulwich.errors import (
  51     ApplyDeltaError,
  52     ChecksumMismatch,
  53     )
  54 from dulwich.lru_cache import (
  55     LRUSizeCache,
  56     )
  57 from dulwich.objects import (
  58     ShaFile,
  59     hex_to_sha,
  60     sha_to_hex,
  61     )
  62 from dulwich.misc import (
  63     make_sha,
  64     )
  65
  66 supports_mmap_offset = (sys.version_info[0] >= 3 or
  67         (sys.version_info[0] == 2 and sys.version_info[1] >= 6))
  68
  69
  70 def take_msb_bytes(map, offset):
  71     ret = []
  72     while len(ret) == 0 or ret[-1] & 0x80:
  73         ret.append(ord(map[offset]))
  74         offset += 1
  75     return ret
  76
  77
  78 def read_zlib_chunks(data, offset, dec_size):
  79     obj = zlib.decompressobj()
  80     ret = []
  81     fed = 0
  82     while obj.unused_data == "":
  83         base = offset+fed
  84         add = data[base:base+1024]
  85         if len(add) < 1024:
  86             add += "Z"
  87         fed += len(add)
  88         ret.append(obj.decompress(add))
  89     comp_len = fed-len(obj.unused_data)
  90     return ret, comp_len
  91
  92
  93 def read_zlib(data, offset, dec_size):
  94     ret, comp_len = read_zlib_chunks(data, offset, dec_size)
  95     x = "".join(ret)
  96     assert len(x) == dec_size
  97     return x, comp_len
  98
  99
 100 def iter_sha1(iter):
 101     """Return the hexdigest of the SHA1 over a set of names.
 102
 103     :param iter: Iterator over string objects
 104     :return: 40-byte hex sha1 digest
 105     """
 106     sha1 = make_sha()
 107     for name in iter:
 108         sha1.update(name)
 109     return sha1.hexdigest()
 110
 111
 112 def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
 113     """Simple wrapper for mmap() which always supports the offset parameter.
 114
 115     :param f: File object.
 116     :param offset: Offset in the file, from the beginning of the file.
 117     :param size: Size of the mmap'ed area
 118     :param access: Access mechanism.
 119     :return: MMAP'd area.
 120     """
 121     mem = mmap.mmap(f.fileno(), size+offset, access=access)
 122     return mem, offset
 123
 124
 125 def load_pack_index(filename):
 126     f = open(filename, 'rb')
 127     if f.read(4) == '\377tOc':
 128         version = struct.unpack(">L", f.read(4))[0]
 129         if version == 2:
 130             f.seek(0)
 131             return PackIndex2(filename, file=f)
 132         else:
 133             raise KeyError("Unknown pack index format %d" % version)
 134     else:
 135         f.seek(0)
 136         return PackIndex1(filename, file=f)
 137
 138
 139 def bisect_find_sha(start, end, sha, unpack_name):
 140     assert start <= end
 141     while start <= end:
 142         i = (start + end)/2
 143         file_sha = unpack_name(i)
 144         x = cmp(file_sha, sha)
 145         if x < 0:
 146             start = i + 1
 147         elif x > 0:
 148             end = i - 1
 149         else:
 150             return i
 151     return None
 152
 153
 154 class PackIndex(object):
 155     """An index in to a packfile.
 156
 157     Given a sha id of an object a pack index can tell you the location in the
 158     packfile of that object if it has it.
 159
 160     To do the loop it opens the file, and indexes first 256 4 byte groups
 161     with the first byte of the sha id. The value in the four byte group indexed
 162     is the end of the group that shares the same starting byte. Subtract one
 163     from the starting byte and index again to find the start of the group.
 164     The values are sorted by sha id within the group, so do the math to find
 165     the start and end offset and then bisect in to find if the value is present.
 166     """
 167
 168     def __init__(self, filename, file=None):
 169         """Create a pack index object.
 170
 171         Provide it with the name of the index file to consider, and it will map
 172         it whenever required.
 173         """
 174         self._filename = filename
 175         # Take the size now, so it can be checked each time we map the file to
 176         # ensure that it hasn't changed.
 177         self._size = os.path.getsize(filename)
 178         if file is None:
 179             self._file = open(filename, 'rb')
 180         else:
 181             self._file = file
 182         self._contents, map_offset = simple_mmap(self._file, 0, self._size)
 183         assert map_offset == 0
 184
 185     def __eq__(self, other):
 186         if not isinstance(other, PackIndex):
 187             return False
 188
 189         if self._fan_out_table != other._fan_out_table:
 190             return False
 191
 192         for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
 193             if name1 != name2:
 194                 return False
 195         return True
 196
 197     def close(self):
 198         self._file.close()
 199
 200     def __len__(self):
 201         """Return the number of entries in this pack index."""
 202         return self._fan_out_table[-1]
 203
 204     def _unpack_entry(self, i):
 205         """Unpack the i-th entry in the index file.
 206
 207         :return: Tuple with object name (SHA), offset in pack file and
 208               CRC32 checksum (if known)."""
 209         raise NotImplementedError(self._unpack_entry)
 210
 211     def _unpack_name(self, i):
 212         """Unpack the i-th name from the index file."""
 213         raise NotImplementedError(self._unpack_name)
 214
 215     def _unpack_offset(self, i):
 216         """Unpack the i-th object offset from the index file."""
 217         raise NotImplementedError(self._unpack_offset)
 218
 219     def _unpack_crc32_checksum(self, i):
 220         """Unpack the crc32 checksum for the i-th object from the index file."""
 221         raise NotImplementedError(self._unpack_crc32_checksum)
 222
 223     def __iter__(self):
 224         return imap(sha_to_hex, self._itersha())
 225
 226     def _itersha(self):
 227         for i in range(len(self)):
 228             yield self._unpack_name(i)
 229
 230     def objects_sha1(self):
 231         """Return the hex SHA1 over all the shas of all objects in this pack.
 232
 233         :note: This is used for the filename of the pack.
 234         """
 235         return iter_sha1(self._itersha())
 236
 237     def iterentries(self):
 238         """Iterate over the entries in this pack index.
 239
 240         Will yield tuples with object name, offset in packfile and crc32 checksum.
 241         """
 242         for i in range(len(self)):
 243             yield self._unpack_entry(i)
 244
 245     def _read_fan_out_table(self, start_offset):
 246         ret = []
 247         for i in range(0x100):
 248             ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
 249         return ret
 250
 251     def check(self):
 252         """Check that the stored checksum matches the actual checksum."""
 253         return self.calculate_checksum() == self.get_stored_checksum()
 254
 255     def calculate_checksum(self):
 256         """Calculate the SHA1 checksum over this pack index.
 257
 258         :return: This is a 20-byte binary digest
 259         """
 260         return make_sha(self._contents[:-20]).digest()
 261
 262     def get_pack_checksum(self):
 263         """Return the SHA1 checksum stored for the corresponding packfile.
 264
 265         :return: 20-byte binary digest
 266         """
 267         return str(self._contents[-40:-20])
 268
 269     def get_stored_checksum(self):
 270         """Return the SHA1 checksum stored for this index.
 271
 272         :return: 20-byte binary digest
 273         """
 274         return str(self._contents[-20:])
 275
 276     def object_index(self, sha):
 277         """Return the index in to the corresponding packfile for the object.
 278
 279         Given the name of an object it will return the offset that object lives
 280         at within the corresponding pack file. If the pack file doesn't have the
 281         object then None will be returned.
 282         """
 283         if len(sha) == 40:
 284             sha = hex_to_sha(sha)
 285         return self._object_index(sha)
 286
 287     def _object_index(self, sha):
 288         """See object_index.
 289
 290         :param sha: A *binary* SHA string. (20 characters long)_
 291         """
 292         assert len(sha) == 20
 293         idx = ord(sha[0])
 294         if idx == 0:
 295             start = 0
 296         else:
 297             start = self._fan_out_table[idx-1]
 298         end = self._fan_out_table[idx]
 299         i = bisect_find_sha(start, end, sha, self._unpack_name)
 300         if i is None:
 301             raise KeyError(sha)
 302         return self._unpack_offset(i)
 303
 304
 305
 306 class PackIndex1(PackIndex):
 307     """Version 1 Pack Index."""
 308
 309     def __init__(self, filename, file=None):
 310         PackIndex.__init__(self, filename, file)
 311         self.version = 1
 312         self._fan_out_table = self._read_fan_out_table(0)
 313
 314     def _unpack_entry(self, i):
 315         (offset, name) = unpack_from(">L20s", self._contents,
 316             (0x100 * 4) + (i * 24))
 317         return (name, offset, None)
 318
 319     def _unpack_name(self, i):
 320         offset = (0x100 * 4) + (i * 24) + 4
 321         return self._contents[offset:offset+20]
 322
 323     def _unpack_offset(self, i):
 324         offset = (0x100 * 4) + (i * 24)
 325         return unpack_from(">L", self._contents, offset)[0]
 326
 327     def _unpack_crc32_checksum(self, i):
 328         # Not stored in v1 index files
 329         return None
 330
 331
 332 class PackIndex2(PackIndex):
 333     """Version 2 Pack Index."""
 334
 335     def __init__(self, filename, file=None):
 336         PackIndex.__init__(self, filename, file)
 337         assert self._contents[:4] == '\377tOc', "Not a v2 pack index file"
 338         (self.version, ) = unpack_from(">L", self._contents, 4)
 339         assert self.version == 2, "Version was %d" % self.version
 340         self._fan_out_table = self._read_fan_out_table(8)
 341         self._name_table_offset = 8 + 0x100 * 4
 342         self._crc32_table_offset = self._name_table_offset + 20 * len(self)
 343         self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
 344
 345     def _unpack_entry(self, i):
 346         return (self._unpack_name(i), self._unpack_offset(i),
 347                 self._unpack_crc32_checksum(i))
 348
 349     def _unpack_name(self, i):
 350         offset = self._name_table_offset + i * 20
 351         return self._contents[offset:offset+20]
 352
 353     def _unpack_offset(self, i):
 354         offset = self._pack_offset_table_offset + i * 4
 355         return unpack_from(">L", self._contents, offset)[0]
 356
 357     def _unpack_crc32_checksum(self, i):
 358         return unpack_from(">L", self._contents,
 359                           self._crc32_table_offset + i * 4)[0]
 360
 361
 362
 363 def read_pack_header(f):
 364     """Read the header of a pack file.
 365
 366     :param f: File-like object to read from
 367     """
 368     header = f.read(12)
 369     assert header[:4] == "PACK"
 370     (version,) = unpack_from(">L", header, 4)
 371     assert version in (2, 3), "Version was %d" % version
 372     (num_objects,) = unpack_from(">L", header, 8)
 373     return (version, num_objects)
 374
 375
 376 def read_pack_tail(f):
 377     return (f.read(20),)
 378
 379
 380 def unpack_object(map, offset=0):
 381     """Unpack a Git object.
 382
 383     :return: tuple with type, uncompressed data and compressed size
 384     """
 385     bytes = take_msb_bytes(map, offset)
 386     type = (bytes[0] >> 4) & 0x07
 387     size = bytes[0] & 0x0f
 388     for i, byte in enumerate(bytes[1:]):
 389         size += (byte & 0x7f) << ((i * 7) + 4)
 390     raw_base = len(bytes)
 391     if type == 6: # offset delta
 392         bytes = take_msb_bytes(map, raw_base + offset)
 393         assert not (bytes[-1] & 0x80)
 394         delta_base_offset = bytes[0] & 0x7f
 395         for byte in bytes[1:]:
 396             delta_base_offset += 1
 397             delta_base_offset <<= 7
 398             delta_base_offset += (byte & 0x7f)
 399         raw_base+=len(bytes)
 400         uncomp, comp_len = read_zlib(map, offset + raw_base, size)
 401         assert size == len(uncomp)
 402         return type, (delta_base_offset, uncomp), comp_len+raw_base
 403     elif type == 7: # ref delta
 404         basename = map[offset+raw_base:offset+raw_base+20]
 405         uncomp, comp_len = read_zlib(map, offset+raw_base+20, size)
 406         assert size == len(uncomp)
 407         return type, (basename, uncomp), comp_len+raw_base+20
 408     else:
 409         uncomp, comp_len = read_zlib(map, offset+raw_base, size)
 410         assert len(uncomp) == size
 411         return type, uncomp, comp_len+raw_base
 412
 413
 414 def compute_object_size((num, obj)):
 415     """Compute the size of a unresolved object for use with LRUSizeCache.
 416     """
 417     if num in (6, 7):
 418         return len(obj[1])
 419     assert isinstance(obj, str)
 420     return len(obj)
 421
 422
 423 class PackData(object):
 424     """The data contained in a packfile.
 425
 426     Pack files can be accessed both sequentially for exploding a pack, and
 427     directly with the help of an index to retrieve a specific object.
 428
 429     The objects within are either complete or a delta aginst another.
 430
 431     The header is variable length. If the MSB of each byte is set then it
 432     indicates that the subsequent byte is still part of the header.
 433     For the first byte the next MS bits are the type, which tells you the type
 434     of object, and whether it is a delta. The LS byte is the lowest bits of the
 435     size. For each subsequent byte the LS 7 bits are the next MS bits of the
 436     size, i.e. the last byte of the header contains the MS bits of the size.
 437
 438     For the complete objects the data is stored as zlib deflated data.
 439     The size in the header is the uncompressed object size, so to uncompress
 440     you need to just keep feeding data to zlib until you get an object back,
 441     or it errors on bad data. This is done here by just giving the complete
 442     buffer from the start of the deflated object on. This is bad, but until I
 443     get mmap sorted out it will have to do.
 444
 445     Currently there are no integrity checks done. Also no attempt is made to try
 446     and detect the delta case, or a request for an object at the wrong position.
 447     It will all just throw a zlib or KeyError.
 448     """
 449
 450     def __init__(self, filename):
 451         """Create a PackData object that represents the pack in the given filename.
 452
 453         The file must exist and stay readable until the object is disposed of. It
 454         must also stay the same size. It will be mapped whenever needed.
 455
 456         Currently there is a restriction on the size of the pack as the python
 457         mmap implementation is flawed.
 458         """
 459         self._filename = filename
 460         assert os.path.exists(filename), "%s is not a packfile" % filename
 461         self._size = os.path.getsize(filename)
 462         self._header_size = 12
 463         assert self._size >= self._header_size, "%s is too small for a packfile (%d < %d)" % (filename, self._size, self._header_size)
 464         self._file = open(self._filename, 'rb')
 465         self._read_header()
 466         self._offset_cache = LRUSizeCache(1024*1024*20,
 467             compute_size=compute_object_size)
 468
 469     def close(self):
 470         self._file.close()
 471
 472     def _read_header(self):
 473         (version, self._num_objects) = read_pack_header(self._file)
 474         self._file.seek(self._size-20)
 475         (self._stored_checksum,) = read_pack_tail(self._file)
 476
 477     def __len__(self):
 478         """Returns the number of objects in this pack."""
 479         return self._num_objects
 480
 481     def calculate_checksum(self):
 482         """Calculate the checksum for this pack.
 483
 484         :return: 20-byte binary SHA1 digest
 485         """
 486         map, map_offset = simple_mmap(self._file, 0, self._size - 20)
 487         try:
 488             return make_sha(map[map_offset:self._size-20]).digest()
 489         finally:
 490             map.close()
 491
 492     def resolve_object(self, offset, type, obj, get_ref, get_offset=None):
 493         """Resolve an object, possibly resolving deltas when necessary.
 494
 495         :return: Tuple with object type and contents.
 496         """
 497         if type not in (6, 7): # Not a delta
 498             return type, obj
 499
 500         if get_offset is None:
 501             get_offset = self.get_object_at
 502
 503         if type == 6: # offset delta
 504             (delta_offset, delta) = obj
 505             assert isinstance(delta_offset, int)
 506             assert isinstance(delta, str)
 507             base_offset = offset-delta_offset
 508             type, base_obj = get_offset(base_offset)
 509             assert isinstance(type, int)
 510         elif type == 7: # ref delta
 511             (basename, delta) = obj
 512             assert isinstance(basename, str) and len(basename) == 20
 513             assert isinstance(delta, str)
 514             type, base_obj = get_ref(basename)
 515             assert isinstance(type, int)
 516             # Can't be a ofs delta, as we wouldn't know the base offset
 517             assert type != 6
 518             base_offset = None
 519         type, base_text = self.resolve_object(base_offset, type, base_obj, get_ref)
 520         if base_offset is not None:
 521             self._offset_cache[base_offset] = type, base_text
 522         ret = (type, apply_delta(base_text, delta))
 523         return ret
 524
 525     def iterobjects(self, progress=None):
 526
 527         class ObjectIterator(object):
 528
 529             def __init__(self, pack):
 530                 self.i = 0
 531                 self.offset = pack._header_size
 532                 self.num = len(pack)
 533                 self.map, _ = simple_mmap(pack._file, 0, pack._size)
 534
 535             def __del__(self):
 536                 self.map.close()
 537
 538             def __iter__(self):
 539                 return self
 540
 541             def __len__(self):
 542                 return self.num
 543
 544             def next(self):
 545                 if self.i == self.num:
 546                     raise StopIteration
 547                 (type, obj, total_size) = unpack_object(self.map, self.offset)
 548                 crc32 = zlib.crc32(self.map[self.offset:self.offset+total_size]) & 0xffffffff
 549                 ret = (self.offset, type, obj, crc32)
 550                 self.offset += total_size
 551                 if progress:
 552                     progress(self.i, self.num)
 553                 self.i+=1
 554                 return ret
 555         return ObjectIterator(self)
 556
 557     def iterentries(self, ext_resolve_ref=None, progress=None):
 558         found = {}
 559         postponed = defaultdict(list)
 560         class Postpone(Exception):
 561             """Raised to postpone delta resolving."""
 562
 563         def get_ref_text(sha):
 564             assert len(sha) == 20
 565             if sha in found:
 566                 return self.get_object_at(found[sha])
 567             if ext_resolve_ref:
 568                 try:
 569                     return ext_resolve_ref(sha)
 570                 except KeyError:
 571                     pass
 572             raise Postpone, (sha, )
 573         extra = []
 574         todo = chain(self.iterobjects(progress=progress), extra)
 575         for (offset, type, obj, crc32) in todo:
 576             assert isinstance(offset, int)
 577             assert isinstance(type, int)
 578             assert isinstance(obj, tuple) or isinstance(obj, str)
 579             try:
 580                 type, obj = self.resolve_object(offset, type, obj, get_ref_text)
 581             except Postpone, (sha, ):
 582                 postponed[sha].append((offset, type, obj))
 583             else:
 584                 shafile = ShaFile.from_raw_string(type, obj)
 585                 sha = shafile.sha().digest()
 586                 found[sha] = offset
 587                 yield sha, offset, crc32
 588                 extra.extend(postponed.get(sha, []))
 589         if postponed:
 590             raise KeyError([sha_to_hex(h) for h in postponed.keys()])
 591
 592     def sorted_entries(self, resolve_ext_ref=None, progress=None):
 593         ret = list(self.iterentries(resolve_ext_ref, progress=progress))
 594         ret.sort()
 595         return ret
 596
 597     def create_index_v1(self, filename, resolve_ext_ref=None, progress=None):
 598         """Create a version 1 file for this data file.
 599
 600         :param filename: Index filename.
 601         :param resolve_ext_ref: Function to use for resolving externally referenced
 602             SHA1s (for thin packs)
 603         :param progress: Progress report function
 604         """
 605         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 606         write_pack_index_v1(filename, entries, self.calculate_checksum())
 607
 608     def create_index_v2(self, filename, resolve_ext_ref=None, progress=None):
 609         """Create a version 2 index file for this data file.
 610
 611         :param filename: Index filename.
 612         :param resolve_ext_ref: Function to use for resolving externally referenced
 613             SHA1s (for thin packs)
 614         :param progress: Progress report function
 615         """
 616         entries = self.sorted_entries(resolve_ext_ref, progress=progress)
 617         write_pack_index_v2(filename, entries, self.calculate_checksum())
 618
 619     def create_index(self, filename, resolve_ext_ref=None, progress=None,
 620                      version=2):
 621         """Create an  index file for this data file.
 622
 623         :param filename: Index filename.
 624         :param resolve_ext_ref: Function to use for resolving externally referenced
 625             SHA1s (for thin packs)
 626         :param progress: Progress report function
 627         """
 628         if version == 1:
 629             self.create_index_v1(filename, resolve_ext_ref, progress)
 630         elif version == 2:
 631             self.create_index_v2(filename, resolve_ext_ref, progress)
 632         else:
 633             raise ValueError("unknown index format %d" % version)
 634
 635     def get_stored_checksum(self):
 636         return self._stored_checksum
 637
 638     def check(self):
 639         return (self.calculate_checksum() == self.get_stored_checksum())
 640
 641     def get_object_at(self, offset):
 642         """Given an offset in to the packfile return the object that is there.
 643
 644         Using the associated index the location of an object can be looked up, and
 645         then the packfile can be asked directly for that object using this
 646         function.
 647         """
 648         if offset in self._offset_cache:
 649             return self._offset_cache[offset]
 650         assert isinstance(offset, long) or isinstance(offset, int),\
 651                 "offset was %r" % offset
 652         assert offset >= self._header_size
 653         map, map_offset = simple_mmap(self._file, offset, self._size-offset)
 654         try:
 655             ret = unpack_object(map, map_offset)[:2]
 656             return ret
 657         finally:
 658             map.close()
 659
 660
 661 class SHA1Reader(object):
 662     """Wrapper around a file-like object that remembers the SHA1 of
 663     the data read from it."""
 664
 665     def __init__(self, f):
 666         self.f = f
 667         self.sha1 = make_sha("")
 668
 669     def read(self, num=None):
 670         data = self.f.read(num)
 671         self.sha1.update(data)
 672         return data
 673
 674     def check_sha(self):
 675         stored = self.f.read(20)
 676         if stored != self.sha1.digest():
 677             raise ChecksumMismatch(self.sha1.hexdigest(), sha_to_hex(stored))
 678
 679     def close(self):
 680         return self.f.close()
 681
 682     def tell(self):
 683         return self.f.tell()
 684
 685
 686 class SHA1Writer(object):
 687     """Wrapper around a file-like object that remembers the SHA1 of
 688     the data written to it."""
 689
 690     def __init__(self, f):
 691         self.f = f
 692         self.sha1 = make_sha("")
 693
 694     def write(self, data):
 695         self.sha1.update(data)
 696         self.f.write(data)
 697
 698     def write_sha(self):
 699         sha = self.sha1.digest()
 700         assert len(sha) == 20
 701         self.f.write(sha)
 702         return sha
 703
 704     def close(self):
 705         sha = self.write_sha()
 706         self.f.close()
 707         return sha
 708
 709     def tell(self):
 710         return self.f.tell()
 711
 712
 713 def write_pack_object(f, type, object):
 714     """Write pack object to a file.
 715
 716     :param f: File to write to
 717     :param o: Object to write
 718     :return: Tuple with offset at which the object was written, and crc32
 719     """
 720     ret = f.tell()
 721     packed_data_hdr = ""
 722     if type == 6: # ref delta
 723         (delta_base_offset, object) = object
 724     elif type == 7: # offset delta
 725         (basename, object) = object
 726     size = len(object)
 727     c = (type << 4) | (size & 15)
 728     size >>= 4
 729     while size:
 730         packed_data_hdr += (chr(c | 0x80))
 731         c = size & 0x7f
 732         size >>= 7
 733     packed_data_hdr += chr(c)
 734     if type == 6: # offset delta
 735         ret = [delta_base_offset & 0x7f]
 736         delta_base_offset >>= 7
 737         while delta_base_offset:
 738             delta_base_offset -= 1
 739             ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
 740             delta_base_offset >>= 7
 741         packed_data_hdr += "".join([chr(x) for x in ret])
 742     elif type == 7: # ref delta
 743         assert len(basename) == 20
 744         packed_data_hdr += basename
 745     packed_data = packed_data_hdr + zlib.compress(object)
 746     f.write(packed_data)
 747     return (f.tell(), (zlib.crc32(packed_data) & 0xffffffff))
 748
 749
 750 def write_pack(filename, objects, num_objects):
 751     """Write a new pack data file.
 752
 753     :param filename: Path to the new pack file (without .pack extension)
 754     :param objects: Iterable over (object, path) tuples to write
 755     :param num_objects: Number of objects to write
 756     """
 757     f = open(filename + ".pack", 'wb')
 758     try:
 759         entries, data_sum = write_pack_data(f, objects, num_objects)
 760     finally:
 761         f.close()
 762     entries.sort()
 763     write_pack_index_v2(filename + ".idx", entries, data_sum)
 764
 765
 766 def write_pack_data(f, objects, num_objects, window=10):
 767     """Write a new pack file.
 768
 769     :param filename: The filename of the new pack file.
 770     :param objects: List of objects to write (tuples with object and path)
 771     :return: List with (name, offset, crc32 checksum) entries, pack checksum
 772     """
 773     recency = list(objects)
 774     # FIXME: Somehow limit delta depth
 775     # FIXME: Make thin-pack optional (its not used when cloning a pack)
 776     # Build a list of objects ordered by the magic Linus heuristic
 777     # This helps us find good objects to diff against us
 778     magic = []
 779     for obj, path in recency:
 780         magic.append( (obj.type, path, 1, -len(obj.as_raw_string()), obj) )
 781     magic.sort()
 782     # Build a map of objects and their index in magic - so we can find preceeding objects
 783     # to diff against
 784     offs = {}
 785     for i in range(len(magic)):
 786         offs[magic[i][4]] = i
 787     # Write the pack
 788     entries = []
 789     f = SHA1Writer(f)
 790     f.write("PACK")               # Pack header
 791     f.write(struct.pack(">L", 2)) # Pack version
 792     f.write(struct.pack(">L", num_objects)) # Number of objects in pack
 793     for o, path in recency:
 794         sha1 = o.sha().digest()
 795         orig_t = o.type
 796         raw = o.as_raw_string()
 797         winner = raw
 798         t = orig_t
 799         #for i in range(offs[o]-window, window):
 800         #    if i < 0 or i >= len(offs): continue
 801         #    b = magic[i][4]
 802         #    if b.type != orig_t: continue
 803         #    base = b.as_raw_string()
 804         #    delta = create_delta(base, raw)
 805         #    if len(delta) < len(winner):
 806         #        winner = delta
 807         #        t = 6 if magic[i][2] == 1 else 7
 808         offset, crc32 = write_pack_object(f, t, winner)
 809         entries.append((sha1, offset, crc32))
 810     return entries, f.write_sha()
 811
 812
 813 def write_pack_index_v1(filename, entries, pack_checksum):
 814     """Write a new pack index file.
 815
 816     :param filename: The filename of the new pack index file.
 817     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 818             crc32_checksum.
 819     :param pack_checksum: Checksum of the pack file.
 820     """
 821     f = open(filename, 'wb')
 822     f = SHA1Writer(f)
 823     fan_out_table = defaultdict(lambda: 0)
 824     for (name, offset, entry_checksum) in entries:
 825         fan_out_table[ord(name[0])] += 1
 826     # Fan-out table
 827     for i in range(0x100):
 828         f.write(struct.pack(">L", fan_out_table[i]))
 829         fan_out_table[i+1] += fan_out_table[i]
 830     for (name, offset, entry_checksum) in entries:
 831         f.write(struct.pack(">L20s", offset, name))
 832     assert len(pack_checksum) == 20
 833     f.write(pack_checksum)
 834     f.close()
 835
 836
 837 def create_delta(base_buf, target_buf):
 838     """Use python difflib to work out how to transform base_buf to target_buf.
 839
 840     :param base_buf: Base buffer
 841     :param target_buf: Target buffer
 842     """
 843     assert isinstance(base_buf, str)
 844     assert isinstance(target_buf, str)
 845     out_buf = ""
 846     # write delta header
 847     def encode_size(size):
 848         ret = ""
 849         c = size & 0x7f
 850         size >>= 7
 851         while size:
 852             ret += chr(c | 0x80)
 853             c = size & 0x7f
 854             size >>= 7
 855         ret += chr(c)
 856         return ret
 857     out_buf += encode_size(len(base_buf))
 858     out_buf += encode_size(len(target_buf))
 859     # write out delta opcodes
 860     seq = difflib.SequenceMatcher(a=base_buf, b=target_buf)
 861     for opcode, i1, i2, j1, j2 in seq.get_opcodes():
 862         # Git patch opcodes don't care about deletes!
 863         #if opcode == "replace" or opcode == "delete":
 864         #    pass
 865         if opcode == "equal":
 866             # If they are equal, unpacker will use data from base_buf
 867             # Write out an opcode that says what range to use
 868             scratch = ""
 869             op = 0x80
 870             o = i1
 871             for i in range(4):
 872                 if o & 0xff << i*8:
 873                     scratch += chr(o >> i)
 874                     op |= 1 << i
 875             s = i2 - i1
 876             for i in range(2):
 877                 if s & 0xff << i*8:
 878                     scratch += chr(s >> i)
 879                     op |= 1 << (4+i)
 880             out_buf += chr(op)
 881             out_buf += scratch
 882         if opcode == "replace" or opcode == "insert":
 883             # If we are replacing a range or adding one, then we just
 884             # output it to the stream (prefixed by its size)
 885             s = j2 - j1
 886             o = j1
 887             while s > 127:
 888                 out_buf += chr(127)
 889                 out_buf += target_buf[o:o+127]
 890                 s -= 127
 891                 o += 127
 892             out_buf += chr(s)
 893             out_buf += target_buf[o:o+s]
 894     return out_buf
 895
 896
 897 def apply_delta(src_buf, delta):
 898     """Based on the similar function in git's patch-delta.c.
 899
 900     :param src_buf: Source buffer
 901     :param delta: Delta instructions
 902     """
 903     assert isinstance(src_buf, str), "was %r" % (src_buf,)
 904     assert isinstance(delta, str)
 905     out = []
 906     index = 0
 907     delta_length = len(delta)
 908     def get_delta_header_size(delta, index):
 909         size = 0
 910         i = 0
 911         while delta:
 912             cmd = ord(delta[index])
 913             index += 1
 914             size |= (cmd & ~0x80) << i
 915             i += 7
 916             if not cmd & 0x80:
 917                 break
 918         return size, index
 919     src_size, index = get_delta_header_size(delta, index)
 920     dest_size, index = get_delta_header_size(delta, index)
 921     assert src_size == len(src_buf), "%d vs %d" % (src_size, len(src_buf))
 922     while index < delta_length:
 923         cmd = ord(delta[index])
 924         index += 1
 925         if cmd & 0x80:
 926             cp_off = 0
 927             for i in range(4):
 928                 if cmd & (1 << i):
 929                     x = ord(delta[index])
 930                     index += 1
 931                     cp_off |= x << (i * 8)
 932             cp_size = 0
 933             for i in range(3):
 934                 if cmd & (1 << (4+i)):
 935                     x = ord(delta[index])
 936                     index += 1
 937                     cp_size |= x << (i * 8)
 938             if cp_size == 0:
 939                 cp_size = 0x10000
 940             if (cp_off + cp_size < cp_size or
 941                 cp_off + cp_size > src_size or
 942                 cp_size > dest_size):
 943                 break
 944             out.append(src_buf[cp_off:cp_off+cp_size])
 945         elif cmd != 0:
 946             out.append(delta[index:index+cmd])
 947             index += cmd
 948         else:
 949             raise ApplyDeltaError("Invalid opcode 0")
 950
 951     if index != delta_length:
 952         raise ApplyDeltaError("delta not empty: %r" % delta[index:])
 953
 954     out = ''.join(out)
 955     if dest_size != len(out):
 956         raise ApplyDeltaError("dest size incorrect")
 957
 958     return out
 959
 960
 961 def write_pack_index_v2(filename, entries, pack_checksum):
 962     """Write a new pack index file.
 963
 964     :param filename: The filename of the new pack index file.
 965     :param entries: List of tuples with object name (sha), offset_in_pack,  and
 966             crc32_checksum.
 967     :param pack_checksum: Checksum of the pack file.
 968     """
 969     f = open(filename, 'wb')
 970     f = SHA1Writer(f)
 971     f.write('\377tOc') # Magic!
 972     f.write(struct.pack(">L", 2))
 973     fan_out_table = defaultdict(lambda: 0)
 974     for (name, offset, entry_checksum) in entries:
 975         fan_out_table[ord(name[0])] += 1
 976     # Fan-out table
 977     for i in range(0x100):
 978         f.write(struct.pack(">L", fan_out_table[i]))
 979         fan_out_table[i+1] += fan_out_table[i]
 980     for (name, offset, entry_checksum) in entries:
 981         f.write(name)
 982     for (name, offset, entry_checksum) in entries:
 983         f.write(struct.pack(">L", entry_checksum))
 984     for (name, offset, entry_checksum) in entries:
 985         # FIXME: handle if MSBit is set in offset
 986         f.write(struct.pack(">L", offset))
 987     # FIXME: handle table for pack files > 8 Gb
 988     assert len(pack_checksum) == 20
 989     f.write(pack_checksum)
 990     f.close()
 991
 992
 993 class Pack(object):
 994     """A Git pack object."""
 995
 996     def __init__(self, basename):
 997         self._basename = basename
 998         self._data_path = self._basename + ".pack"
 999         self._idx_path = self._basename + ".idx"
1000         self._data = None
1001         self._idx = None
1002
1003     @classmethod
1004     def from_objects(self, data, idx):
1005         """Create a new pack object from pack data and index objects."""
1006         ret = Pack("")
1007         ret._data = data
1008         ret._idx = idx
1009         return ret
1010
1011     def name(self):
1012         """The SHA over the SHAs of the objects in this pack."""
1013         return self.index.objects_sha1()
1014
1015     @property
1016     def data(self):
1017         """The pack data object being used."""
1018         if self._data is None:
1019             self._data = PackData(self._data_path)
1020             assert len(self.index) == len(self._data)
1021             idx_stored_checksum = self.index.get_pack_checksum()
1022             data_stored_checksum = self._data.get_stored_checksum()
1023             if idx_stored_checksum != data_stored_checksum:
1024                 raise ChecksumMismatch(sha_to_hex(idx_stored_checksum),
1025                                        sha_to_hex(data_stored_checksum))
1026         return self._data
1027
1028     @property
1029     def index(self):
1030         """The index being used.
1031
1032         :note: This may be an in-memory index
1033         """
1034         if self._idx is None:
1035             self._idx = load_pack_index(self._idx_path)
1036         return self._idx
1037
1038     def close(self):
1039         if self._data is not None:
1040             self._data.close()
1041         self.index.close()
1042
1043     def __eq__(self, other):
1044         return type(self) == type(other) and self.index == other.index
1045
1046     def __len__(self):
1047         """Number of entries in this pack."""
1048         return len(self.index)
1049
1050     def __repr__(self):
1051         return "%s(%r)" % (self.__class__.__name__, self._basename)
1052
1053     def __iter__(self):
1054         """Iterate over all the sha1s of the objects in this pack."""
1055         return iter(self.index)
1056
1057     def check(self):
1058         """Check the integrity of this pack."""
1059         if not self.index.check():
1060             return False
1061         if not self.data.check():
1062             return False
1063         return True
1064
1065     def get_stored_checksum(self):
1066         return self.data.get_stored_checksum()
1067
1068     def __contains__(self, sha1):
1069         """Check whether this pack contains a particular SHA1."""
1070         try:
1071             self.index.object_index(sha1)
1072             return True
1073         except KeyError:
1074             return False
1075
1076     def get_raw(self, sha1, resolve_ref=None):
1077         offset = self.index.object_index(sha1)
1078         obj_type, obj = self.data.get_object_at(offset)
1079         if type(offset) is long:
1080           offset = int(offset)
1081         if resolve_ref is None:
1082             resolve_ref = self.get_raw
1083         return self.data.resolve_object(offset, obj_type, obj, resolve_ref)
1084
1085     def __getitem__(self, sha1):
1086         """Retrieve the specified SHA1."""
1087         type, uncomp = self.get_raw(sha1)
1088         return ShaFile.from_raw_string(type, uncomp)
1089
1090     def iterobjects(self, get_raw=None):
1091         """Iterate over the objects in this pack."""
1092         if get_raw is None:
1093             get_raw = self.get_raw
1094         for offset, type, obj, crc32 in self.data.iterobjects():
1095             assert isinstance(offset, int)
1096             yield ShaFile.from_raw_string(
1097                     *self.data.resolve_object(offset, type, obj, get_raw))
1098
1099
1100 def load_packs(path):
1101     if not os.path.exists(path):
1102         return
1103     for name in os.listdir(path):
1104         if name.startswith("pack-") and name.endswith(".pack"):
1105             yield Pack(os.path.join(path, name[:-len(".pack")]))
1106
1107
1108 try:
1109     from dulwich._pack import apply_delta, bisect_find_sha
1110 except ImportError:
1111     pass