dulwich/objects.py

   1 # objects.py -- Access to base git objects
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version of the License.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20
  21 """Access to base git objects."""
  22
  23
  24 import binascii
  25 from cStringIO import (
  26     StringIO,
  27     )
  28 import mmap
  29 import os
  30 import stat
  31 import zlib
  32
  33 from dulwich.errors import (
  34     ChecksumMismatch,
  35     NotBlobError,
  36     NotCommitError,
  37     NotTagError,
  38     NotTreeError,
  39     ObjectFormatException,
  40     )
  41 from dulwich.file import GitFile
  42 from dulwich.misc import (
  43     make_sha,
  44     )
  45
  46
  47 # Header fields for commits
  48 _TREE_HEADER = "tree"
  49 _PARENT_HEADER = "parent"
  50 _AUTHOR_HEADER = "author"
  51 _COMMITTER_HEADER = "committer"
  52 _ENCODING_HEADER = "encoding"
  53
  54
  55 # Header fields for objects
  56 _OBJECT_HEADER = "object"
  57 _TYPE_HEADER = "type"
  58 _TAG_HEADER = "tag"
  59 _TAGGER_HEADER = "tagger"
  60
  61
  62 S_IFGITLINK = 0160000
  63
  64 def S_ISGITLINK(m):
  65     return (stat.S_IFMT(m) == S_IFGITLINK)
  66
  67
  68 def _decompress(string):
  69     dcomp = zlib.decompressobj()
  70     dcomped = dcomp.decompress(string)
  71     dcomped += dcomp.flush()
  72     return dcomped
  73
  74
  75 def sha_to_hex(sha):
  76     """Takes a string and returns the hex of the sha within"""
  77     hexsha = binascii.hexlify(sha)
  78     assert len(hexsha) == 40, "Incorrect length of sha1 string: %d" % hexsha
  79     return hexsha
  80
  81
  82 def hex_to_sha(hex):
  83     """Takes a hex sha and returns a binary sha"""
  84     assert len(hex) == 40, "Incorrent length of hexsha: %s" % hex
  85     return binascii.unhexlify(hex)
  86
  87
  88 def hex_to_filename(path, hex):
  89     """Takes a hex sha and returns its filename relative to the given path."""
  90     dir = hex[:2]
  91     file = hex[2:]
  92     # Check from object dir
  93     return os.path.join(path, dir, file)
  94
  95
  96 def filename_to_hex(filename):
  97     """Takes an object filename and returns its corresponding hex sha."""
  98     # grab the last (up to) two path components
  99     names = filename.rsplit(os.path.sep, 2)[-2:]
 100     errmsg = "Invalid object filename: %s" % filename
 101     assert len(names) == 2, errmsg
 102     base, rest = names
 103     assert len(base) == 2 and len(rest) == 38, errmsg
 104     hex = base + rest
 105     hex_to_sha(hex)
 106     return hex
 107
 108
 109 def serializable_property(name, docstring=None):
 110     def set(obj, value):
 111         obj._ensure_parsed()
 112         setattr(obj, "_"+name, value)
 113         obj._needs_serialization = True
 114     def get(obj):
 115         obj._ensure_parsed()
 116         return getattr(obj, "_"+name)
 117     return property(get, set, doc=docstring)
 118
 119
 120 def object_class(type):
 121     """Get the object class corresponding to the given type.
 122
 123     :param type: Either a type name string or a numeric type.
 124     :return: The ShaFile subclass corresponding to the given type, or None if
 125         type is not a valid type name/number.
 126     """
 127     return _TYPE_MAP.get(type, None)
 128
 129
 130 def check_hexsha(hex, error_msg):
 131     try:
 132         hex_to_sha(hex)
 133     except (TypeError, AssertionError):
 134         raise ObjectFormatException("%s %s" % (error_msg, hex))
 135
 136
 137 def check_identity(identity, error_msg):
 138     email_start = identity.find("<")
 139     email_end = identity.find(">")
 140     if (email_start < 0 or email_end < 0 or email_end <= email_start
 141         or identity.find("<", email_start + 1) >= 0
 142         or identity.find(">", email_end + 1) >= 0
 143         or not identity.endswith(">")):
 144         raise ObjectFormatException(error_msg)
 145
 146
 147 class FixedSha(object):
 148     """SHA object that behaves like hashlib's but is given a fixed value."""
 149
 150     def __init__(self, hexsha):
 151         self._hexsha = hexsha
 152         self._sha = hex_to_sha(hexsha)
 153
 154     def digest(self):
 155         return self._sha
 156
 157     def hexdigest(self):
 158         return self._hexsha
 159
 160
 161 class ShaFile(object):
 162     """A git SHA file."""
 163
 164     @staticmethod
 165     def _parse_legacy_object_header(magic, f):
 166         """Parse a legacy object, creating it but not reading the file."""
 167         bufsize = 1024
 168         decomp = zlib.decompressobj()
 169         header = decomp.decompress(magic)
 170         start = 0
 171         end = -1
 172         while end < 0:
 173             header += decomp.decompress(f.read(bufsize))
 174             end = header.find("\0", start)
 175             start = len(header)
 176         header = header[:end]
 177         type_name, size = header.split(" ", 1)
 178         size = int(size)  # sanity check
 179         obj_class = object_class(type_name)
 180         if not obj_class:
 181             raise ObjectFormatException("Not a known type: %s" % type_name)
 182         obj = obj_class()
 183         obj._filename = f.name
 184         return obj
 185
 186     def _parse_legacy_object(self, f):
 187         """Parse a legacy object, setting the raw string."""
 188         size = os.path.getsize(f.name)
 189         map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
 190         try:
 191             text = _decompress(map)
 192         finally:
 193             map.close()
 194         header_end = text.find('\0')
 195         if header_end < 0:
 196             raise ObjectFormatException("Invalid object header")
 197         self.set_raw_string(text[header_end+1:])
 198
 199     def as_legacy_object_chunks(self):
 200         compobj = zlib.compressobj()
 201         yield compobj.compress(self._header())
 202         for chunk in self.as_raw_chunks():
 203             yield compobj.compress(chunk)
 204         yield compobj.flush()
 205
 206     def as_legacy_object(self):
 207         return "".join(self.as_legacy_object_chunks())
 208
 209     def as_raw_chunks(self):
 210         if self._needs_parsing:
 211             self._ensure_parsed()
 212         elif self._needs_serialization:
 213             self._chunked_text = self._serialize()
 214         return self._chunked_text
 215
 216     def as_raw_string(self):
 217         return "".join(self.as_raw_chunks())
 218
 219     def __str__(self):
 220         return self.as_raw_string()
 221
 222     def __hash__(self):
 223         return hash(self.id)
 224
 225     def as_pretty_string(self):
 226         return self.as_raw_string()
 227
 228     def _ensure_parsed(self):
 229         if self._needs_parsing:
 230             if not self._chunked_text:
 231                 assert self._filename, "ShaFile needs either text or filename"
 232                 self._parse_file()
 233             self._deserialize(self._chunked_text)
 234             self._needs_parsing = False
 235
 236     def set_raw_string(self, text):
 237         if type(text) != str:
 238             raise TypeError(text)
 239         self.set_raw_chunks([text])
 240
 241     def set_raw_chunks(self, chunks):
 242         self._chunked_text = chunks
 243         self._deserialize(chunks)
 244         self._sha = None
 245         self._needs_parsing = False
 246         self._needs_serialization = False
 247
 248     @staticmethod
 249     def _parse_object_header(magic, f):
 250         """Parse a new style object, creating it but not reading the file."""
 251         num_type = (ord(magic[0]) >> 4) & 7
 252         obj_class = object_class(num_type)
 253         if not obj_class:
 254             raise ObjectFormatError("Not a known type: %d" % num_type)
 255         obj = obj_class()
 256         obj._filename = f.name
 257         return obj
 258
 259     def _parse_object(self, f):
 260         """Parse a new style object, setting self._text."""
 261         size = os.path.getsize(f.name)
 262         map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
 263         try:
 264             # skip type and size; type must have already been determined, and we
 265             # trust zlib to fail if it's otherwise corrupted
 266             byte = ord(map[0])
 267             used = 1
 268             while (byte & 0x80) != 0:
 269                 byte = ord(map[used])
 270                 used += 1
 271             raw = map[used:]
 272             self.set_raw_string(_decompress(raw))
 273         finally:
 274             map.close()
 275
 276     @classmethod
 277     def _is_legacy_object(cls, magic):
 278         b0, b1 = map(ord, magic)
 279         word = (b0 << 8) + b1
 280         return b0 == 0x78 and (word % 31) == 0
 281
 282     @classmethod
 283     def _parse_file_header(cls, f):
 284         magic = f.read(2)
 285         if cls._is_legacy_object(magic):
 286             return cls._parse_legacy_object_header(magic, f)
 287         else:
 288             return cls._parse_object_header(magic, f)
 289
 290     def __init__(self):
 291         """Don't call this directly"""
 292         self._sha = None
 293         self._filename = None
 294         self._chunked_text = []
 295         self._needs_parsing = False
 296         self._needs_serialization = True
 297
 298     def _deserialize(self, chunks):
 299         raise NotImplementedError(self._deserialize)
 300
 301     def _serialize(self):
 302         raise NotImplementedError(self._serialize)
 303
 304     def _parse_file(self):
 305         f = GitFile(self._filename, 'rb')
 306         try:
 307             magic = f.read(2)
 308             if self._is_legacy_object(magic):
 309                 self._parse_legacy_object(f)
 310             else:
 311                 self._parse_object(f)
 312         finally:
 313             f.close()
 314
 315     @classmethod
 316     def from_file(cls, filename):
 317         """Get the contents of a SHA file on disk."""
 318         f = GitFile(filename, 'rb')
 319         try:
 320             try:
 321                 obj = cls._parse_file_header(f)
 322                 obj._sha = FixedSha(filename_to_hex(filename))
 323                 obj._needs_parsing = True
 324                 obj._needs_serialization = True
 325                 return obj
 326             except (IndexError, ValueError), e:
 327                 raise ObjectFormatException("invalid object header")
 328         finally:
 329             f.close()
 330
 331     @staticmethod
 332     def from_raw_string(type_num, string):
 333         """Creates an object of the indicated type from the raw string given.
 334
 335         :param type_num: The numeric type of the object.
 336         :param string: The raw uncompressed contents.
 337         """
 338         obj = object_class(type_num)()
 339         obj.set_raw_string(string)
 340         return obj
 341
 342     @staticmethod
 343     def from_raw_chunks(type_num, chunks):
 344         """Creates an object of the indicated type from the raw chunks given.
 345
 346         :param type_num: The numeric type of the object.
 347         :param chunks: An iterable of the raw uncompressed contents.
 348         """
 349         obj = object_class(type_num)()
 350         obj.set_raw_chunks(chunks)
 351         return obj
 352
 353     @classmethod
 354     def from_string(cls, string):
 355         """Create a ShaFile from a string."""
 356         obj = cls()
 357         obj.set_raw_string(string)
 358         return obj
 359
 360     def _check_has_member(self, member, error_msg):
 361         """Check that the object has a given member variable.
 362
 363         :param member: the member variable to check for
 364         :param error_msg: the message for an error if the member is missing
 365         :raise ObjectFormatException: with the given error_msg if member is
 366             missing or is None
 367         """
 368         if getattr(self, member, None) is None:
 369             raise ObjectFormatException(error_msg)
 370
 371     def check(self):
 372         """Check this object for internal consistency.
 373
 374         :raise ObjectFormatException: if the object is malformed in some way
 375         :raise ChecksumMismatch: if the object was created with a SHA that does
 376             not match its contents
 377         """
 378         # TODO: if we find that error-checking during object parsing is a
 379         # performance bottleneck, those checks should be moved to the class's
 380         # check() method during optimization so we can still check the object
 381         # when necessary.
 382         old_sha = self.id
 383         try:
 384             self._deserialize(self.as_raw_chunks())
 385             self._sha = None
 386             new_sha = self.id
 387         except Exception, e:
 388             raise ObjectFormatException(e)
 389         if old_sha != new_sha:
 390             raise ChecksumMismatch(new_sha, old_sha)
 391
 392     def _header(self):
 393         return "%s %lu\0" % (self.type_name, self.raw_length())
 394
 395     def raw_length(self):
 396         """Returns the length of the raw string of this object."""
 397         ret = 0
 398         for chunk in self.as_raw_chunks():
 399             ret += len(chunk)
 400         return ret
 401
 402     def _make_sha(self):
 403         ret = make_sha()
 404         ret.update(self._header())
 405         for chunk in self.as_raw_chunks():
 406             ret.update(chunk)
 407         return ret
 408
 409     def sha(self):
 410         """The SHA1 object that is the name of this object."""
 411         if self._sha is None:
 412             # this is a local because as_raw_chunks() overwrites self._sha
 413             new_sha = make_sha()
 414             new_sha.update(self._header())
 415             for chunk in self.as_raw_chunks():
 416                 new_sha.update(chunk)
 417             self._sha = new_sha
 418         return self._sha
 419
 420     @property
 421     def id(self):
 422         return self.sha().hexdigest()
 423
 424     def get_type(self):
 425         return self.type_num
 426
 427     def set_type(self, type):
 428         self.type_num = type
 429
 430     # DEPRECATED: use type_num or type_name as needed.
 431     type = property(get_type, set_type)
 432
 433     def __repr__(self):
 434         return "<%s %s>" % (self.__class__.__name__, self.id)
 435
 436     def __ne__(self, other):
 437         return self.id != other.id
 438
 439     def __eq__(self, other):
 440         """Return true if the sha of the two objects match.
 441
 442         The __le__ etc methods aren't overriden as they make no sense,
 443         certainly at this level.
 444         """
 445         return self.id == other.id
 446
 447
 448 class Blob(ShaFile):
 449     """A Git Blob object."""
 450
 451     type_name = 'blob'
 452     type_num = 3
 453
 454     def __init__(self):
 455         super(Blob, self).__init__()
 456         self._chunked_text = []
 457         self._needs_parsing = False
 458         self._needs_serialization = False
 459
 460     def _get_data(self):
 461         return self.as_raw_string()
 462
 463     def _set_data(self, data):
 464         self.set_raw_string(data)
 465
 466     data = property(_get_data, _set_data,
 467                     "The text contained within the blob object.")
 468
 469     def _get_chunked(self):
 470         self._ensure_parsed()
 471         return self._chunked_text
 472
 473     def _set_chunked(self, chunks):
 474         self._chunked_text = chunks
 475
 476     def _serialize(self):
 477         if not self._chunked_text:
 478             self._ensure_parsed()
 479         self._needs_serialization = False
 480         return self._chunked_text
 481
 482     def _deserialize(self, chunks):
 483         self._chunked_text = chunks
 484
 485     chunked = property(_get_chunked, _set_chunked,
 486         "The text within the blob object, as chunks (not necessarily lines).")
 487
 488     @classmethod
 489     def from_file(cls, filename):
 490         blob = ShaFile.from_file(filename)
 491         if not isinstance(blob, cls):
 492             raise NotBlobError(filename)
 493         return blob
 494
 495     def check(self):
 496         """Check this object for internal consistency.
 497
 498         :raise ObjectFormatException: if the object is malformed in some way
 499         """
 500         super(Blob, self).check()
 501
 502
 503 def _parse_tag_or_commit(text):
 504     """Parse tag or commit text.
 505
 506     :param text: the raw text of the tag or commit object.
 507     :yield: tuples of (field, value), one per header line, in the order read
 508         from the text, possibly including duplicates. Includes a field named
 509         None for the freeform tag/commit text.
 510     """
 511     f = StringIO(text)
 512     for l in f:
 513         l = l.rstrip("\n")
 514         if l == "":
 515             # Empty line indicates end of headers
 516             break
 517         yield l.split(" ", 1)
 518     yield (None, f.read())
 519     f.close()
 520
 521
def parse_tag(text):
    """Parse the raw text of a tag object.

    :param text: the raw text of the tag object.
    :return: the generator produced by _parse_tag_or_commit(text), which
        yields one field/value pair per header line followed by a
        (None, message) tuple for the freeform text.
    """
    return _parse_tag_or_commit(text)
 524
 525
 526 class Tag(ShaFile):
 527     """A Git Tag object."""
 528
 529     type_name = 'tag'
 530     type_num = 4
 531
 532     def __init__(self):
 533         super(Tag, self).__init__()
 534         self._tag_timezone_neg_utc = False
 535
 536     @classmethod
 537     def from_file(cls, filename):
 538         tag = ShaFile.from_file(filename)
 539         if not isinstance(tag, cls):
 540             raise NotTagError(filename)
 541         return tag
 542
 543     def check(self):
 544         """Check this object for internal consistency.
 545
 546         :raise ObjectFormatException: if the object is malformed in some way
 547         """
 548         super(Tag, self).check()
 549         self._check_has_member("_object_sha", "missing object sha")
 550         self._check_has_member("_object_class", "missing object type")
 551         self._check_has_member("_name", "missing tag name")
 552
 553         if not self._name:
 554             raise ObjectFormatException("empty tag name")
 555
 556         check_hexsha(self._object_sha, "invalid object sha")
 557
 558         if getattr(self, "_tagger", None):
 559             check_identity(self._tagger, "invalid tagger")
 560
 561         last = None
 562         for field, _ in parse_tag("".join(self._chunked_text)):
 563             if field == _OBJECT_HEADER and last is not None:
 564                 raise ObjectFormatException("unexpected object")
 565             elif field == _TYPE_HEADER and last != _OBJECT_HEADER:
 566                 raise ObjectFormatException("unexpected type")
 567             elif field == _TAG_HEADER and last != _TYPE_HEADER:
 568                 raise ObjectFormatException("unexpected tag name")
 569             elif field == _TAGGER_HEADER and last != _TAG_HEADER:
 570                 raise ObjectFormatException("unexpected tagger")
 571             last = field
 572
 573     def _serialize(self):
 574         chunks = []
 575         chunks.append("%s %s\n" % (_OBJECT_HEADER, self._object_sha))
 576         chunks.append("%s %s\n" % (_TYPE_HEADER, self._object_class.type_name))
 577         chunks.append("%s %s\n" % (_TAG_HEADER, self._name))
 578         if self._tagger:
 579             if self._tag_time is None:
 580                 chunks.append("%s %s\n" % (_TAGGER_HEADER, self._tagger))
 581             else:
 582                 chunks.append("%s %s %d %s\n" % (
 583                   _TAGGER_HEADER, self._tagger, self._tag_time,
 584                   format_timezone(self._tag_timezone,
 585                     self._tag_timezone_neg_utc)))
 586         chunks.append("\n") # To close headers
 587         chunks.append(self._message)
 588         return chunks
 589
 590     def _deserialize(self, chunks):
 591         """Grab the metadata attached to the tag"""
 592         self._tagger = None
 593         for field, value in parse_tag("".join(chunks)):
 594             if field == _OBJECT_HEADER:
 595                 self._object_sha = value
 596             elif field == _TYPE_HEADER:
 597                 obj_class = object_class(value)
 598                 if not obj_class:
 599                     raise ObjectFormatException("Not a known type: %s" % value)
 600                 self._object_class = obj_class
 601             elif field == _TAG_HEADER:
 602                 self._name = value
 603             elif field == _TAGGER_HEADER:
 604                 try:
 605                     sep = value.index("> ")
 606                 except ValueError:
 607                     self._tagger = value
 608                     self._tag_time = None
 609                     self._tag_timezone = None
 610                     self._tag_timezone_neg_utc = False
 611                 else:
 612                     self._tagger = value[0:sep+1]
 613                     try:
 614                         (timetext, timezonetext) = value[sep+2:].rsplit(" ", 1)
 615                         self._tag_time = int(timetext)
 616                         self._tag_timezone, self._tag_timezone_neg_utc = \
 617                                 parse_timezone(timezonetext)
 618                     except ValueError, e:
 619                         raise ObjectFormatException(e)
 620             elif field is None:
 621                 self._message = value
 622             else:
 623                 raise ObjectFormatError("Unknown field %s" % field)
 624
 625     def _get_object(self):
 626         """Get the object pointed to by this tag.
 627
 628         :return: tuple of (object class, sha).
 629         """
 630         self._ensure_parsed()
 631         return (self._object_class, self._object_sha)
 632
 633     def _set_object(self, value):
 634         self._ensure_parsed()
 635         (self._object_class, self._object_sha) = value
 636         self._needs_serialization = True
 637
 638     object = property(_get_object, _set_object)
 639
 640     name = serializable_property("name", "The name of this tag")
 641     tagger = serializable_property("tagger",
 642         "Returns the name of the person who created this tag")
 643     tag_time = serializable_property("tag_time",
 644         "The creation timestamp of the tag.  As the number of seconds since the epoch")
 645     tag_timezone = serializable_property("tag_timezone",
 646         "The timezone that tag_time is in.")
 647     message = serializable_property("message", "The message attached to this tag")
 648
 649
 650 def parse_tree(text):
 651     """Parse a tree text.
 652
 653     :param text: Serialized text to parse
 654     :yields: tuples of (name, mode, sha)
 655     """
 656     count = 0
 657     l = len(text)
 658     while count < l:
 659         mode_end = text.index(' ', count)
 660         mode = int(text[count:mode_end], 8)
 661         name_end = text.index('\0', mode_end)
 662         name = text[mode_end+1:name_end]
 663         count = name_end+21
 664         sha = text[name_end+1:count]
 665         yield (name, mode, sha_to_hex(sha))
 666
 667
 668 def serialize_tree(items):
 669     """Serialize the items in a tree to a text.
 670
 671     :param items: Sorted iterable over (name, mode, sha) tuples
 672     :return: Serialized tree text as chunks
 673     """
 674     for name, mode, hexsha in items:
 675         yield "%04o %s\0%s" % (mode, name, hex_to_sha(hexsha))
 676
 677
 678 def sorted_tree_items(entries):
 679     """Iterate over a tree entries dictionary in the order in which
 680     the items would be serialized.
 681
 682     :param entries: Dictionary mapping names to (mode, sha) tuples
 683     :return: Iterator over (name, mode, sha)
 684     """
 685     for name, entry in sorted(entries.iteritems(), cmp=cmp_entry):
 686         yield name, entry[0], entry[1]
 687
 688
 689 def cmp_entry((name1, value1), (name2, value2)):
 690     """Compare two tree entries."""
 691     if stat.S_ISDIR(value1[0]):
 692         name1 += "/"
 693     if stat.S_ISDIR(value2[0]):
 694         name2 += "/"
 695     return cmp(name1, name2)
 696
 697
 698 class Tree(ShaFile):
 699     """A Git tree object"""
 700
 701     type_name = 'tree'
 702     type_num = 2
 703
 704     def __init__(self):
 705         super(Tree, self).__init__()
 706         self._entries = {}
 707
 708     @classmethod
 709     def from_file(cls, filename):
 710         tree = ShaFile.from_file(filename)
 711         if not isinstance(tree, cls):
 712             raise NotTreeError(filename)
 713         return tree
 714
 715     def __contains__(self, name):
 716         self._ensure_parsed()
 717         return name in self._entries
 718
 719     def __getitem__(self, name):
 720         self._ensure_parsed()
 721         return self._entries[name]
 722
 723     def __setitem__(self, name, value):
 724         assert isinstance(value, tuple)
 725         assert len(value) == 2
 726         self._ensure_parsed()
 727         self._entries[name] = value
 728         self._needs_serialization = True
 729
 730     def __delitem__(self, name):
 731         self._ensure_parsed()
 732         del self._entries[name]
 733         self._needs_serialization = True
 734
 735     def __len__(self):
 736         self._ensure_parsed()
 737         return len(self._entries)
 738
 739     def __iter__(self):
 740         self._ensure_parsed()
 741         return iter(self._entries)
 742
 743     def add(self, mode, name, hexsha):
 744         assert type(mode) == int
 745         assert type(name) == str
 746         assert type(hexsha) == str
 747         self._ensure_parsed()
 748         self._entries[name] = mode, hexsha
 749         self._needs_serialization = True
 750
 751     def entries(self):
 752         """Return a list of tuples describing the tree entries"""
 753         self._ensure_parsed()
 754         # The order of this is different from iteritems() for historical
 755         # reasons
 756         return [
 757             (mode, name, hexsha) for (name, mode, hexsha) in self.iteritems()]
 758
 759     def iteritems(self):
 760         """Iterate over entries in the order in which they would be serialized.
 761
 762         :return: Iterator over (name, mode, sha) tuples
 763         """
 764         self._ensure_parsed()
 765         return sorted_tree_items(self._entries)
 766
 767     def _deserialize(self, chunks):
 768         """Grab the entries in the tree"""
 769         try:
 770             parsed_entries = parse_tree("".join(chunks))
 771         except ValueError, e:
 772             raise ObjectFormatException(e)
 773         # TODO: list comprehension is for efficiency in the common (small) case;
 774         # if memory efficiency in the large case is a concern, use a genexp.
 775         self._entries = dict([(n, (m, s)) for n, m, s in parsed_entries])
 776
 777     def check(self):
 778         """Check this object for internal consistency.
 779
 780         :raise ObjectFormatException: if the object is malformed in some way
 781         """
 782         super(Tree, self).check()
 783         last = None
 784         allowed_modes = (stat.S_IFREG | 0755, stat.S_IFREG | 0644,
 785                          stat.S_IFLNK, stat.S_IFDIR, S_IFGITLINK,
 786                          # TODO: optionally exclude as in git fsck --strict
 787                          stat.S_IFREG | 0664)
 788         for name, mode, sha in parse_tree("".join(self._chunked_text)):
 789             check_hexsha(sha, 'invalid sha %s' % sha)
 790             if '/' in name or name in ('', '.', '..'):
 791                 raise ObjectFormatException('invalid name %s' % name)
 792
 793             if mode not in allowed_modes:
 794                 raise ObjectFormatException('invalid mode %06o' % mode)
 795
 796             entry = (name, (mode, sha))
 797             if last:
 798                 if cmp_entry(last, entry) > 0:
 799                     raise ObjectFormatException('entries not sorted')
 800                 if name == last[0]:
 801                     raise ObjectFormatException('duplicate entry %s' % name)
 802             last = entry
 803
 804     def _serialize(self):
 805         return list(serialize_tree(self.iteritems()))
 806
 807     def as_pretty_string(self):
 808         text = []
 809         for name, mode, hexsha in self.iteritems():
 810             if mode & stat.S_IFDIR:
 811                 kind = "tree"
 812             else:
 813                 kind = "blob"
 814             text.append("%04o %s %s\t%s\n" % (mode, kind, hexsha, name))
 815         return "".join(text)
 816
 817
 818 def parse_timezone(text):
 819     offset = int(text)
 820     negative_utc = (offset == 0 and text[0] == '-')
 821     signum = (offset < 0) and -1 or 1
 822     offset = abs(offset)
 823     hours = int(offset / 100)
 824     minutes = (offset % 100)
 825     return signum * (hours * 3600 + minutes * 60), negative_utc
 826
 827
 828 def format_timezone(offset, negative_utc=False):
 829     if offset % 60 != 0:
 830         raise ValueError("Unable to handle non-minute offset.")
 831     if offset < 0 or (offset == 0 and negative_utc):
 832         sign = '-'
 833     else:
 834         sign = '+'
 835     offset = abs(offset)
 836     return '%c%02d%02d' % (sign, offset / 3600, (offset / 60) % 60)
 837
 838
def parse_commit(text):
    """Parse the raw text of a commit object.

    :param text: the raw text of the commit object.
    :return: the generator produced by _parse_tag_or_commit(text), which
        yields one field/value pair per header line followed by a
        (None, message) tuple for the freeform text.
    """
    return _parse_tag_or_commit(text)
 841
 842
 843 class Commit(ShaFile):
 844     """A git commit object"""
 845
 846     type_name = 'commit'
 847     type_num = 1
 848
 849     def __init__(self):
 850         super(Commit, self).__init__()
 851         self._parents = []
 852         self._encoding = None
 853         self._extra = {}
 854         self._author_timezone_neg_utc = False
 855         self._commit_timezone_neg_utc = False
 856
 857     @classmethod
 858     def from_file(cls, filename):
 859         commit = ShaFile.from_file(filename)
 860         if not isinstance(commit, cls):
 861             raise NotCommitError(filename)
 862         return commit
 863
 864     def _deserialize(self, chunks):
 865         self._parents = []
 866         self._extra = []
 867         self._author = None
 868         for field, value in parse_commit("".join(self._chunked_text)):
 869             if field == _TREE_HEADER:
 870                 self._tree = value
 871             elif field == _PARENT_HEADER:
 872                 self._parents.append(value)
 873             elif field == _AUTHOR_HEADER:
 874                 self._author, timetext, timezonetext = value.rsplit(" ", 2)
 875                 self._author_time = int(timetext)
 876                 self._author_timezone, self._author_timezone_neg_utc =\
 877                     parse_timezone(timezonetext)
 878             elif field == _COMMITTER_HEADER:
 879                 self._committer, timetext, timezonetext = value.rsplit(" ", 2)
 880                 self._commit_time = int(timetext)
 881                 self._commit_timezone, self._commit_timezone_neg_utc =\
 882                     parse_timezone(timezonetext)
 883             elif field == _ENCODING_HEADER:
 884                 self._encoding = value
 885             elif field is None:
 886                 self._message = value
 887             else:
 888                 self._extra.append((field, value))
 889
 890     def check(self):
 891         """Check this object for internal consistency.
 892
 893         :raise ObjectFormatException: if the object is malformed in some way
 894         """
 895         super(Commit, self).check()
 896         self._check_has_member("_tree", "missing tree")
 897         self._check_has_member("_author", "missing author")
 898         self._check_has_member("_committer", "missing committer")
 899         # times are currently checked when set
 900
 901         for parent in self._parents:
 902             check_hexsha(parent, "invalid parent sha")
 903         check_hexsha(self._tree, "invalid tree sha")
 904
 905         check_identity(self._author, "invalid author")
 906         check_identity(self._committer, "invalid committer")
 907
 908         last = None
 909         for field, _ in parse_commit("".join(self._chunked_text)):
 910             if field == _TREE_HEADER and last is not None:
 911                 raise ObjectFormatException("unexpected tree")
 912             elif field == _PARENT_HEADER and last not in (_PARENT_HEADER,
 913                                                           _TREE_HEADER):
 914                 raise ObjectFormatException("unexpected parent")
 915             elif field == _AUTHOR_HEADER and last not in (_TREE_HEADER,
 916                                                           _PARENT_HEADER):
 917                 raise ObjectFormatException("unexpected author")
 918             elif field == _COMMITTER_HEADER and last != _AUTHOR_HEADER:
 919                 raise ObjectFormatException("unexpected committer")
 920             elif field == _ENCODING_HEADER and last != _COMMITTER_HEADER:
 921                 raise ObjectFormatException("unexpected encoding")
 922             last = field
 923
 924         # TODO: optionally check for duplicate parents
 925
 926     def _serialize(self):
 927         chunks = []
 928         chunks.append("%s %s\n" % (_TREE_HEADER, self._tree))
 929         for p in self._parents:
 930             chunks.append("%s %s\n" % (_PARENT_HEADER, p))
 931         chunks.append("%s %s %s %s\n" % (
 932           _AUTHOR_HEADER, self._author, str(self._author_time),
 933           format_timezone(self._author_timezone,
 934                           self._author_timezone_neg_utc)))
 935         chunks.append("%s %s %s %s\n" % (
 936           _COMMITTER_HEADER, self._committer, str(self._commit_time),
 937           format_timezone(self._commit_timezone,
 938                           self._commit_timezone_neg_utc)))
 939         if self.encoding:
 940             chunks.append("%s %s\n" % (_ENCODING_HEADER, self.encoding))
 941         for k, v in self.extra:
 942             if "\n" in k or "\n" in v:
 943                 raise AssertionError("newline in extra data: %r -> %r" % (k, v))
 944             chunks.append("%s %s\n" % (k, v))
 945         chunks.append("\n") # There must be a new line after the headers
 946         chunks.append(self._message)
 947         return chunks
 948
 949     tree = serializable_property("tree", "Tree that is the state of this commit")
 950
 951     def _get_parents(self):
 952         """Return a list of parents of this commit."""
 953         self._ensure_parsed()
 954         return self._parents
 955
 956     def _set_parents(self, value):
 957         """Set a list of parents of this commit."""
 958         self._ensure_parsed()
 959         self._needs_serialization = True
 960         self._parents = value
 961
 962     parents = property(_get_parents, _set_parents)
 963
 964     def _get_extra(self):
 965         """Return extra settings of this commit."""
 966         self._ensure_parsed()
 967         return self._extra
 968
 969     extra = property(_get_extra)
 970
 971     author = serializable_property("author",
 972         "The name of the author of the commit")
 973
 974     committer = serializable_property("committer",
 975         "The name of the committer of the commit")
 976
 977     message = serializable_property("message",
 978         "The commit message")
 979
 980     commit_time = serializable_property("commit_time",
 981         "The timestamp of the commit. As the number of seconds since the epoch.")
 982
 983     commit_timezone = serializable_property("commit_timezone",
 984         "The zone the commit time is in")
 985
 986     author_time = serializable_property("author_time",
 987         "The timestamp the commit was written. as the number of seconds since the epoch.")
 988
 989     author_timezone = serializable_property("author_timezone",
 990         "Returns the zone the author time is in.")
 991
 992     encoding = serializable_property("encoding",
 993         "Encoding of the commit message.")
 994
 995
 996 OBJECT_CLASSES = (
 997     Commit,
 998     Tree,
 999     Blob,
1000     Tag,
1001     )
1002
1003 _TYPE_MAP = {}
1004
1005 for cls in OBJECT_CLASSES:
1006     _TYPE_MAP[cls.type_name] = cls
1007     _TYPE_MAP[cls.type_num] = cls
1008
1009
1010
# Hold on to the pure-python implementations for testing.
# These aliases must be bound before the import below, because that import
# rebinds parse_tree and sorted_tree_items when it succeeds.
_parse_tree_py = parse_tree
_sorted_tree_items_py = sorted_tree_items
try:
    # Try to import C versions
    from dulwich._objects import parse_tree, sorted_tree_items
except ImportError:
    # dulwich._objects could not be imported; keep the pure-python versions.
    pass