dulwich/objects.py

   1 # objects.py -- Access to base git objects
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version of the License.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20
  21 """Access to base git objects."""
  22
  23
  24 import binascii
  25 from cStringIO import (
  26     StringIO,
  27     )
  28 import mmap
  29 import os
  30 import stat
  31 import zlib
  32
  33 from dulwich.errors import (
  34     ChecksumMismatch,
  35     NotBlobError,
  36     NotCommitError,
  37     NotTagError,
  38     NotTreeError,
  39     ObjectFormatException,
  40     )
  41 from dulwich.file import GitFile
  42 from dulwich.misc import (
  43     make_sha,
  44     )
  45
  46
# Header fields for commits (matched by Commit._deserialize)
_TREE_HEADER = "tree"
_PARENT_HEADER = "parent"
_AUTHOR_HEADER = "author"
_COMMITTER_HEADER = "committer"
_ENCODING_HEADER = "encoding"


# Header fields for tags (written and matched by the Tag class)
_OBJECT_HEADER = "object"
_TYPE_HEADER = "type"
_TAG_HEADER = "tag"
_TAGGER_HEADER = "tagger"


# File-type bits of a gitlink tree entry; S_ISGITLINK() compares them against
# stat.S_IFMT(mode).
S_IFGITLINK = 0160000
  63
  64 def S_ISGITLINK(m):
  65     return (stat.S_IFMT(m) == S_IFGITLINK)
  66
  67
  68 def _decompress(string):
  69     dcomp = zlib.decompressobj()
  70     dcomped = dcomp.decompress(string)
  71     dcomped += dcomp.flush()
  72     return dcomped
  73
  74
  75 def sha_to_hex(sha):
  76     """Takes a string and returns the hex of the sha within"""
  77     hexsha = binascii.hexlify(sha)
  78     assert len(hexsha) == 40, "Incorrect length of sha1 string: %d" % hexsha
  79     return hexsha
  80
  81
  82 def hex_to_sha(hex):
  83     """Takes a hex sha and returns a binary sha"""
  84     assert len(hex) == 40, "Incorrent length of hexsha: %s" % hex
  85     return binascii.unhexlify(hex)
  86
  87
  88 def hex_to_filename(path, hex):
  89     """Takes a hex sha and returns its filename relative to the given path."""
  90     dir = hex[:2]
  91     file = hex[2:]
  92     # Check from object dir
  93     return os.path.join(path, dir, file)
  94
  95
  96 def filename_to_hex(filename):
  97     """Takes an object filename and returns its corresponding hex sha."""
  98     # grab the last (up to) two path components
  99     names = filename.rsplit(os.path.sep, 2)[-2:]
 100     errmsg = "Invalid object filename: %s" % filename
 101     assert len(names) == 2, errmsg
 102     base, rest = names
 103     assert len(base) == 2 and len(rest) == 38, errmsg
 104     hex = base + rest
 105     hex_to_sha(hex)
 106     return hex
 107
 108
 109 def object_header(num_type, length):
 110     """Return an object header for the given numeric type and text length."""
 111     return "%s %d\0" % (object_class(num_type).type_name, length)
 112
 113
 114 def serializable_property(name, docstring=None):
 115     def set(obj, value):
 116         obj._ensure_parsed()
 117         setattr(obj, "_"+name, value)
 118         obj._needs_serialization = True
 119     def get(obj):
 120         obj._ensure_parsed()
 121         return getattr(obj, "_"+name)
 122     return property(get, set, doc=docstring)
 123
 124
 125 def object_class(type):
 126     """Get the object class corresponding to the given type.
 127
 128     :param type: Either a type name string or a numeric type.
 129     :return: The ShaFile subclass corresponding to the given type, or None if
 130         type is not a valid type name/number.
 131     """
 132     return _TYPE_MAP.get(type, None)
 133
 134
 135 def check_hexsha(hex, error_msg):
 136     try:
 137         hex_to_sha(hex)
 138     except (TypeError, AssertionError):
 139         raise ObjectFormatException("%s %s" % (error_msg, hex))
 140
 141
 142 def check_identity(identity, error_msg):
 143     email_start = identity.find("<")
 144     email_end = identity.find(">")
 145     if (email_start < 0 or email_end < 0 or email_end <= email_start
 146         or identity.find("<", email_start + 1) >= 0
 147         or identity.find(">", email_end + 1) >= 0
 148         or not identity.endswith(">")):
 149         raise ObjectFormatException(error_msg)
 150
 151
 152 class FixedSha(object):
 153     """SHA object that behaves like hashlib's but is given a fixed value."""
 154
 155     def __init__(self, hexsha):
 156         self._hexsha = hexsha
 157         self._sha = hex_to_sha(hexsha)
 158
 159     def digest(self):
 160         return self._sha
 161
 162     def hexdigest(self):
 163         return self._hexsha
 164
 165
 166 class ShaFile(object):
 167     """A git SHA file."""
 168
 169     @staticmethod
 170     def _parse_legacy_object_header(magic, f):
 171         """Parse a legacy object, creating it but not reading the file."""
 172         bufsize = 1024
 173         decomp = zlib.decompressobj()
 174         header = decomp.decompress(magic)
 175         start = 0
 176         end = -1
 177         while end < 0:
 178             header += decomp.decompress(f.read(bufsize))
 179             end = header.find("\0", start)
 180             start = len(header)
 181         header = header[:end]
 182         type_name, size = header.split(" ", 1)
 183         size = int(size)  # sanity check
 184         obj_class = object_class(type_name)
 185         if not obj_class:
 186             raise ObjectFormatException("Not a known type: %s" % type_name)
 187         obj = obj_class()
 188         obj._filename = f.name
 189         return obj
 190
 191     def _parse_legacy_object(self, f):
 192         """Parse a legacy object, setting the raw string."""
 193         size = os.path.getsize(f.name)
 194         map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
 195         try:
 196             text = _decompress(map)
 197         finally:
 198             map.close()
 199         header_end = text.find('\0')
 200         if header_end < 0:
 201             raise ObjectFormatException("Invalid object header")
 202         self.set_raw_string(text[header_end+1:])
 203
 204     def as_legacy_object_chunks(self):
 205         compobj = zlib.compressobj()
 206         yield compobj.compress(self._header())
 207         for chunk in self.as_raw_chunks():
 208             yield compobj.compress(chunk)
 209         yield compobj.flush()
 210
 211     def as_legacy_object(self):
 212         return "".join(self.as_legacy_object_chunks())
 213
 214     def as_raw_chunks(self):
 215         if self._needs_parsing:
 216             self._ensure_parsed()
 217         elif self._needs_serialization:
 218             self._chunked_text = self._serialize()
 219         return self._chunked_text
 220
 221     def as_raw_string(self):
 222         return "".join(self.as_raw_chunks())
 223
 224     def __str__(self):
 225         return self.as_raw_string()
 226
 227     def __hash__(self):
 228         return hash(self.id)
 229
 230     def as_pretty_string(self):
 231         return self.as_raw_string()
 232
 233     def _ensure_parsed(self):
 234         if self._needs_parsing:
 235             if not self._chunked_text:
 236                 assert self._filename, "ShaFile needs either text or filename"
 237                 self._parse_file()
 238             self._deserialize(self._chunked_text)
 239             self._needs_parsing = False
 240
 241     def set_raw_string(self, text):
 242         if type(text) != str:
 243             raise TypeError(text)
 244         self.set_raw_chunks([text])
 245
 246     def set_raw_chunks(self, chunks):
 247         self._chunked_text = chunks
 248         self._deserialize(chunks)
 249         self._sha = None
 250         self._needs_parsing = False
 251         self._needs_serialization = False
 252
 253     @staticmethod
 254     def _parse_object_header(magic, f):
 255         """Parse a new style object, creating it but not reading the file."""
 256         num_type = (ord(magic[0]) >> 4) & 7
 257         obj_class = object_class(num_type)
 258         if not obj_class:
 259             raise ObjectFormatException("Not a known type: %d" % num_type)
 260         obj = obj_class()
 261         obj._filename = f.name
 262         return obj
 263
 264     def _parse_object(self, f):
 265         """Parse a new style object, setting self._text."""
 266         size = os.path.getsize(f.name)
 267         map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
 268         try:
 269             # skip type and size; type must have already been determined, and we
 270             # trust zlib to fail if it's otherwise corrupted
 271             byte = ord(map[0])
 272             used = 1
 273             while (byte & 0x80) != 0:
 274                 byte = ord(map[used])
 275                 used += 1
 276             raw = map[used:]
 277             self.set_raw_string(_decompress(raw))
 278         finally:
 279             map.close()
 280
 281     @classmethod
 282     def _is_legacy_object(cls, magic):
 283         b0, b1 = map(ord, magic)
 284         word = (b0 << 8) + b1
 285         return b0 == 0x78 and (word % 31) == 0
 286
 287     @classmethod
 288     def _parse_file_header(cls, f):
 289         magic = f.read(2)
 290         if cls._is_legacy_object(magic):
 291             return cls._parse_legacy_object_header(magic, f)
 292         else:
 293             return cls._parse_object_header(magic, f)
 294
 295     def __init__(self):
 296         """Don't call this directly"""
 297         self._sha = None
 298         self._filename = None
 299         self._chunked_text = []
 300         self._needs_parsing = False
 301         self._needs_serialization = True
 302
 303     def _deserialize(self, chunks):
 304         raise NotImplementedError(self._deserialize)
 305
 306     def _serialize(self):
 307         raise NotImplementedError(self._serialize)
 308
 309     def _parse_file(self):
 310         f = GitFile(self._filename, 'rb')
 311         try:
 312             magic = f.read(2)
 313             if self._is_legacy_object(magic):
 314                 self._parse_legacy_object(f)
 315             else:
 316                 self._parse_object(f)
 317         finally:
 318             f.close()
 319
 320     @classmethod
 321     def from_path(cls, path):
 322         f = GitFile(path, 'rb')
 323         try:
 324             obj = cls.from_file(f)
 325             obj._sha = FixedSha(filename_to_hex(path))
 326             return obj
 327         finally:
 328             f.close()
 329
 330     @classmethod
 331     def from_file(cls, f):
 332         """Get the contents of a SHA file on disk."""
 333         try:
 334             obj = cls._parse_file_header(f)
 335             obj._sha = None
 336             obj._needs_parsing = True
 337             obj._needs_serialization = True
 338             return obj
 339         except (IndexError, ValueError), e:
 340             raise ObjectFormatException("invalid object header")
 341
 342     @staticmethod
 343     def from_raw_string(type_num, string):
 344         """Creates an object of the indicated type from the raw string given.
 345
 346         :param type_num: The numeric type of the object.
 347         :param string: The raw uncompressed contents.
 348         """
 349         obj = object_class(type_num)()
 350         obj.set_raw_string(string)
 351         return obj
 352
 353     @staticmethod
 354     def from_raw_chunks(type_num, chunks):
 355         """Creates an object of the indicated type from the raw chunks given.
 356
 357         :param type_num: The numeric type of the object.
 358         :param chunks: An iterable of the raw uncompressed contents.
 359         """
 360         obj = object_class(type_num)()
 361         obj.set_raw_chunks(chunks)
 362         return obj
 363
 364     @classmethod
 365     def from_string(cls, string):
 366         """Create a ShaFile from a string."""
 367         obj = cls()
 368         obj.set_raw_string(string)
 369         return obj
 370
 371     def _check_has_member(self, member, error_msg):
 372         """Check that the object has a given member variable.
 373
 374         :param member: the member variable to check for
 375         :param error_msg: the message for an error if the member is missing
 376         :raise ObjectFormatException: with the given error_msg if member is
 377             missing or is None
 378         """
 379         if getattr(self, member, None) is None:
 380             raise ObjectFormatException(error_msg)
 381
 382     def check(self):
 383         """Check this object for internal consistency.
 384
 385         :raise ObjectFormatException: if the object is malformed in some way
 386         :raise ChecksumMismatch: if the object was created with a SHA that does
 387             not match its contents
 388         """
 389         # TODO: if we find that error-checking during object parsing is a
 390         # performance bottleneck, those checks should be moved to the class's
 391         # check() method during optimization so we can still check the object
 392         # when necessary.
 393         old_sha = self.id
 394         try:
 395             self._deserialize(self.as_raw_chunks())
 396             self._sha = None
 397             new_sha = self.id
 398         except Exception, e:
 399             raise ObjectFormatException(e)
 400         if old_sha != new_sha:
 401             raise ChecksumMismatch(new_sha, old_sha)
 402
 403     def _header(self):
 404         return object_header(self.type, self.raw_length())
 405
 406     def raw_length(self):
 407         """Returns the length of the raw string of this object."""
 408         ret = 0
 409         for chunk in self.as_raw_chunks():
 410             ret += len(chunk)
 411         return ret
 412
 413     def _make_sha(self):
 414         ret = make_sha()
 415         ret.update(self._header())
 416         for chunk in self.as_raw_chunks():
 417             ret.update(chunk)
 418         return ret
 419
 420     def sha(self):
 421         """The SHA1 object that is the name of this object."""
 422         if self._sha is None:
 423             # this is a local because as_raw_chunks() overwrites self._sha
 424             new_sha = make_sha()
 425             new_sha.update(self._header())
 426             for chunk in self.as_raw_chunks():
 427                 new_sha.update(chunk)
 428             self._sha = new_sha
 429         return self._sha
 430
 431     @property
 432     def id(self):
 433         return self.sha().hexdigest()
 434
 435     def get_type(self):
 436         return self.type_num
 437
 438     def set_type(self, type):
 439         self.type_num = type
 440
 441     # DEPRECATED: use type_num or type_name as needed.
 442     type = property(get_type, set_type)
 443
 444     def __repr__(self):
 445         return "<%s %s>" % (self.__class__.__name__, self.id)
 446
 447     def __ne__(self, other):
 448         return self.id != other.id
 449
 450     def __eq__(self, other):
 451         """Return true if the sha of the two objects match.
 452
 453         The __le__ etc methods aren't overriden as they make no sense,
 454         certainly at this level.
 455         """
 456         return self.id == other.id
 457
 458
 459 class Blob(ShaFile):
 460     """A Git Blob object."""
 461
 462     type_name = 'blob'
 463     type_num = 3
 464
 465     def __init__(self):
 466         super(Blob, self).__init__()
 467         self._chunked_text = []
 468         self._needs_parsing = False
 469         self._needs_serialization = False
 470
 471     def _get_data(self):
 472         return self.as_raw_string()
 473
 474     def _set_data(self, data):
 475         self.set_raw_string(data)
 476
 477     data = property(_get_data, _set_data,
 478                     "The text contained within the blob object.")
 479
 480     def _get_chunked(self):
 481         self._ensure_parsed()
 482         return self._chunked_text
 483
 484     def _set_chunked(self, chunks):
 485         self._chunked_text = chunks
 486
 487     def _serialize(self):
 488         if not self._chunked_text:
 489             self._ensure_parsed()
 490         self._needs_serialization = False
 491         return self._chunked_text
 492
 493     def _deserialize(self, chunks):
 494         self._chunked_text = chunks
 495
 496     chunked = property(_get_chunked, _set_chunked,
 497         "The text within the blob object, as chunks (not necessarily lines).")
 498
 499     @classmethod
 500     def from_path(cls, path):
 501         blob = ShaFile.from_path(path)
 502         if not isinstance(blob, cls):
 503             raise NotBlobError(path)
 504         return blob
 505
 506     def check(self):
 507         """Check this object for internal consistency.
 508
 509         :raise ObjectFormatException: if the object is malformed in some way
 510         """
 511         super(Blob, self).check()
 512
 513
 514 def _parse_tag_or_commit(text):
 515     """Parse tag or commit text.
 516
 517     :param text: the raw text of the tag or commit object.
 518     :yield: tuples of (field, value), one per header line, in the order read
 519         from the text, possibly including duplicates. Includes a field named
 520         None for the freeform tag/commit text.
 521     """
 522     f = StringIO(text)
 523     for l in f:
 524         l = l.rstrip("\n")
 525         if l == "":
 526             # Empty line indicates end of headers
 527             break
 528         yield l.split(" ", 1)
 529     yield (None, f.read())
 530     f.close()
 531
 532
 533 def parse_tag(text):
 534     return _parse_tag_or_commit(text)
 535
 536
 537 class Tag(ShaFile):
 538     """A Git Tag object."""
 539
 540     type_name = 'tag'
 541     type_num = 4
 542
 543     def __init__(self):
 544         super(Tag, self).__init__()
 545         self._tag_timezone_neg_utc = False
 546
 547     @classmethod
 548     def from_path(cls, filename):
 549         tag = ShaFile.from_path(filename)
 550         if not isinstance(tag, cls):
 551             raise NotTagError(filename)
 552         return tag
 553
 554     def check(self):
 555         """Check this object for internal consistency.
 556
 557         :raise ObjectFormatException: if the object is malformed in some way
 558         """
 559         super(Tag, self).check()
 560         self._check_has_member("_object_sha", "missing object sha")
 561         self._check_has_member("_object_class", "missing object type")
 562         self._check_has_member("_name", "missing tag name")
 563
 564         if not self._name:
 565             raise ObjectFormatException("empty tag name")
 566
 567         check_hexsha(self._object_sha, "invalid object sha")
 568
 569         if getattr(self, "_tagger", None):
 570             check_identity(self._tagger, "invalid tagger")
 571
 572         last = None
 573         for field, _ in parse_tag("".join(self._chunked_text)):
 574             if field == _OBJECT_HEADER and last is not None:
 575                 raise ObjectFormatException("unexpected object")
 576             elif field == _TYPE_HEADER and last != _OBJECT_HEADER:
 577                 raise ObjectFormatException("unexpected type")
 578             elif field == _TAG_HEADER and last != _TYPE_HEADER:
 579                 raise ObjectFormatException("unexpected tag name")
 580             elif field == _TAGGER_HEADER and last != _TAG_HEADER:
 581                 raise ObjectFormatException("unexpected tagger")
 582             last = field
 583
 584     def _serialize(self):
 585         chunks = []
 586         chunks.append("%s %s\n" % (_OBJECT_HEADER, self._object_sha))
 587         chunks.append("%s %s\n" % (_TYPE_HEADER, self._object_class.type_name))
 588         chunks.append("%s %s\n" % (_TAG_HEADER, self._name))
 589         if self._tagger:
 590             if self._tag_time is None:
 591                 chunks.append("%s %s\n" % (_TAGGER_HEADER, self._tagger))
 592             else:
 593                 chunks.append("%s %s %d %s\n" % (
 594                   _TAGGER_HEADER, self._tagger, self._tag_time,
 595                   format_timezone(self._tag_timezone,
 596                     self._tag_timezone_neg_utc)))
 597         chunks.append("\n") # To close headers
 598         chunks.append(self._message)
 599         return chunks
 600
 601     def _deserialize(self, chunks):
 602         """Grab the metadata attached to the tag"""
 603         self._tagger = None
 604         for field, value in parse_tag("".join(chunks)):
 605             if field == _OBJECT_HEADER:
 606                 self._object_sha = value
 607             elif field == _TYPE_HEADER:
 608                 obj_class = object_class(value)
 609                 if not obj_class:
 610                     raise ObjectFormatException("Not a known type: %s" % value)
 611                 self._object_class = obj_class
 612             elif field == _TAG_HEADER:
 613                 self._name = value
 614             elif field == _TAGGER_HEADER:
 615                 try:
 616                     sep = value.index("> ")
 617                 except ValueError:
 618                     self._tagger = value
 619                     self._tag_time = None
 620                     self._tag_timezone = None
 621                     self._tag_timezone_neg_utc = False
 622                 else:
 623                     self._tagger = value[0:sep+1]
 624                     try:
 625                         (timetext, timezonetext) = value[sep+2:].rsplit(" ", 1)
 626                         self._tag_time = int(timetext)
 627                         self._tag_timezone, self._tag_timezone_neg_utc = \
 628                                 parse_timezone(timezonetext)
 629                     except ValueError, e:
 630                         raise ObjectFormatException(e)
 631             elif field is None:
 632                 self._message = value
 633             else:
 634                 raise ObjectFormatException("Unknown field %s" % field)
 635
 636     def _get_object(self):
 637         """Get the object pointed to by this tag.
 638
 639         :return: tuple of (object class, sha).
 640         """
 641         self._ensure_parsed()
 642         return (self._object_class, self._object_sha)
 643
 644     def _set_object(self, value):
 645         self._ensure_parsed()
 646         (self._object_class, self._object_sha) = value
 647         self._needs_serialization = True
 648
 649     object = property(_get_object, _set_object)
 650
 651     name = serializable_property("name", "The name of this tag")
 652     tagger = serializable_property("tagger",
 653         "Returns the name of the person who created this tag")
 654     tag_time = serializable_property("tag_time",
 655         "The creation timestamp of the tag.  As the number of seconds since the epoch")
 656     tag_timezone = serializable_property("tag_timezone",
 657         "The timezone that tag_time is in.")
 658     message = serializable_property("message", "The message attached to this tag")
 659
 660
 661 def parse_tree(text):
 662     """Parse a tree text.
 663
 664     :param text: Serialized text to parse
 665     :yields: tuples of (name, mode, sha)
 666     """
 667     count = 0
 668     l = len(text)
 669     while count < l:
 670         mode_end = text.index(' ', count)
 671         mode = int(text[count:mode_end], 8)
 672         name_end = text.index('\0', mode_end)
 673         name = text[mode_end+1:name_end]
 674         count = name_end+21
 675         sha = text[name_end+1:count]
 676         yield (name, mode, sha_to_hex(sha))
 677
 678
 679 def serialize_tree(items):
 680     """Serialize the items in a tree to a text.
 681
 682     :param items: Sorted iterable over (name, mode, sha) tuples
 683     :return: Serialized tree text as chunks
 684     """
 685     for name, mode, hexsha in items:
 686         yield "%04o %s\0%s" % (mode, name, hex_to_sha(hexsha))
 687
 688
 689 def sorted_tree_items(entries):
 690     """Iterate over a tree entries dictionary in the order in which
 691     the items would be serialized.
 692
 693     :param entries: Dictionary mapping names to (mode, sha) tuples
 694     :return: Iterator over (name, mode, sha)
 695     """
 696     for name, entry in sorted(entries.iteritems(), cmp=cmp_entry):
 697         yield name, entry[0], entry[1]
 698
 699
 700 def cmp_entry((name1, value1), (name2, value2)):
 701     """Compare two tree entries."""
 702     if stat.S_ISDIR(value1[0]):
 703         name1 += "/"
 704     if stat.S_ISDIR(value2[0]):
 705         name2 += "/"
 706     return cmp(name1, name2)
 707
 708
 709 class Tree(ShaFile):
 710     """A Git tree object"""
 711
 712     type_name = 'tree'
 713     type_num = 2
 714
 715     def __init__(self):
 716         super(Tree, self).__init__()
 717         self._entries = {}
 718
 719     @classmethod
 720     def from_path(cls, filename):
 721         tree = ShaFile.from_path(filename)
 722         if not isinstance(tree, cls):
 723             raise NotTreeError(filename)
 724         return tree
 725
 726     def __contains__(self, name):
 727         self._ensure_parsed()
 728         return name in self._entries
 729
 730     def __getitem__(self, name):
 731         self._ensure_parsed()
 732         return self._entries[name]
 733
 734     def __setitem__(self, name, value):
 735         assert isinstance(value, tuple)
 736         assert len(value) == 2
 737         self._ensure_parsed()
 738         self._entries[name] = value
 739         self._needs_serialization = True
 740
 741     def __delitem__(self, name):
 742         self._ensure_parsed()
 743         del self._entries[name]
 744         self._needs_serialization = True
 745
 746     def __len__(self):
 747         self._ensure_parsed()
 748         return len(self._entries)
 749
 750     def __iter__(self):
 751         self._ensure_parsed()
 752         return iter(self._entries)
 753
 754     def add(self, mode, name, hexsha):
 755         assert type(mode) == int
 756         assert type(name) == str
 757         assert type(hexsha) == str
 758         self._ensure_parsed()
 759         self._entries[name] = mode, hexsha
 760         self._needs_serialization = True
 761
 762     def entries(self):
 763         """Return a list of tuples describing the tree entries"""
 764         self._ensure_parsed()
 765         # The order of this is different from iteritems() for historical
 766         # reasons
 767         return [
 768             (mode, name, hexsha) for (name, mode, hexsha) in self.iteritems()]
 769
 770     def iteritems(self):
 771         """Iterate over entries in the order in which they would be serialized.
 772
 773         :return: Iterator over (name, mode, sha) tuples
 774         """
 775         self._ensure_parsed()
 776         return sorted_tree_items(self._entries)
 777
 778     def _deserialize(self, chunks):
 779         """Grab the entries in the tree"""
 780         try:
 781             parsed_entries = parse_tree("".join(chunks))
 782         except ValueError, e:
 783             raise ObjectFormatException(e)
 784         # TODO: list comprehension is for efficiency in the common (small) case;
 785         # if memory efficiency in the large case is a concern, use a genexp.
 786         self._entries = dict([(n, (m, s)) for n, m, s in parsed_entries])
 787
 788     def check(self):
 789         """Check this object for internal consistency.
 790
 791         :raise ObjectFormatException: if the object is malformed in some way
 792         """
 793         super(Tree, self).check()
 794         last = None
 795         allowed_modes = (stat.S_IFREG | 0755, stat.S_IFREG | 0644,
 796                          stat.S_IFLNK, stat.S_IFDIR, S_IFGITLINK,
 797                          # TODO: optionally exclude as in git fsck --strict
 798                          stat.S_IFREG | 0664)
 799         for name, mode, sha in parse_tree("".join(self._chunked_text)):
 800             check_hexsha(sha, 'invalid sha %s' % sha)
 801             if '/' in name or name in ('', '.', '..'):
 802                 raise ObjectFormatException('invalid name %s' % name)
 803
 804             if mode not in allowed_modes:
 805                 raise ObjectFormatException('invalid mode %06o' % mode)
 806
 807             entry = (name, (mode, sha))
 808             if last:
 809                 if cmp_entry(last, entry) > 0:
 810                     raise ObjectFormatException('entries not sorted')
 811                 if name == last[0]:
 812                     raise ObjectFormatException('duplicate entry %s' % name)
 813             last = entry
 814
 815     def _serialize(self):
 816         return list(serialize_tree(self.iteritems()))
 817
 818     def as_pretty_string(self):
 819         text = []
 820         for name, mode, hexsha in self.iteritems():
 821             if mode & stat.S_IFDIR:
 822                 kind = "tree"
 823             else:
 824                 kind = "blob"
 825             text.append("%04o %s %s\t%s\n" % (mode, kind, hexsha, name))
 826         return "".join(text)
 827
 828
 829 def parse_timezone(text):
 830     offset = int(text)
 831     negative_utc = (offset == 0 and text[0] == '-')
 832     signum = (offset < 0) and -1 or 1
 833     offset = abs(offset)
 834     hours = int(offset / 100)
 835     minutes = (offset % 100)
 836     return signum * (hours * 3600 + minutes * 60), negative_utc
 837
 838
 839 def format_timezone(offset, negative_utc=False):
 840     if offset % 60 != 0:
 841         raise ValueError("Unable to handle non-minute offset.")
 842     if offset < 0 or (offset == 0 and negative_utc):
 843         sign = '-'
 844     else:
 845         sign = '+'
 846     offset = abs(offset)
 847     return '%c%02d%02d' % (sign, offset / 3600, (offset / 60) % 60)
 848
 849
 850 def parse_commit(text):
 851     return _parse_tag_or_commit(text)
 852
 853
 854 class Commit(ShaFile):
 855     """A git commit object"""
 856
 857     type_name = 'commit'
 858     type_num = 1
 859
 860     def __init__(self):
 861         super(Commit, self).__init__()
 862         self._parents = []
 863         self._encoding = None
 864         self._extra = {}
 865         self._author_timezone_neg_utc = False
 866         self._commit_timezone_neg_utc = False
 867
 868     @classmethod
 869     def from_path(cls, path):
 870         commit = ShaFile.from_path(path)
 871         if not isinstance(commit, cls):
 872             raise NotCommitError(path)
 873         return commit
 874
 875     def _deserialize(self, chunks):
 876         self._parents = []
 877         self._extra = []
 878         self._author = None
 879         for field, value in parse_commit("".join(self._chunked_text)):
 880             if field == _TREE_HEADER:
 881                 self._tree = value
 882             elif field == _PARENT_HEADER:
 883                 self._parents.append(value)
 884             elif field == _AUTHOR_HEADER:
 885                 self._author, timetext, timezonetext = value.rsplit(" ", 2)
 886                 self._author_time = int(timetext)
 887                 self._author_timezone, self._author_timezone_neg_utc =\
 888                     parse_timezone(timezonetext)
 889             elif field == _COMMITTER_HEADER:
 890                 self._committer, timetext, timezonetext = value.rsplit(" ", 2)
 891                 self._commit_time = int(timetext)
 892                 self._commit_timezone, self._commit_timezone_neg_utc =\
 893                     parse_timezone(timezonetext)
 894             elif field == _ENCODING_HEADER:
 895                 self._encoding = value
 896             elif field is None:
 897                 self._message = value
 898             else:
 899                 self._extra.append((field, value))
 900
 901     def check(self):
 902         """Check this object for internal consistency.
 903
 904         :raise ObjectFormatException: if the object is malformed in some way
 905         """
 906         super(Commit, self).check()
 907         self._check_has_member("_tree", "missing tree")
 908         self._check_has_member("_author", "missing author")
 909         self._check_has_member("_committer", "missing committer")
 910         # times are currently checked when set
 911
 912         for parent in self._parents:
 913             check_hexsha(parent, "invalid parent sha")
 914         check_hexsha(self._tree, "invalid tree sha")
 915
 916         check_identity(self._author, "invalid author")
 917         check_identity(self._committer, "invalid committer")
 918
 919         last = None
 920         for field, _ in parse_commit("".join(self._chunked_text)):
 921             if field == _TREE_HEADER and last is not None:
 922                 raise ObjectFormatException("unexpected tree")
 923             elif field == _PARENT_HEADER and last not in (_PARENT_HEADER,
 924                                                           _TREE_HEADER):
 925                 raise ObjectFormatException("unexpected parent")
 926             elif field == _AUTHOR_HEADER and last not in (_TREE_HEADER,
 927                                                           _PARENT_HEADER):
 928                 raise ObjectFormatException("unexpected author")
 929             elif field == _COMMITTER_HEADER and last != _AUTHOR_HEADER:
 930                 raise ObjectFormatException("unexpected committer")
 931             elif field == _ENCODING_HEADER and last != _COMMITTER_HEADER:
 932                 raise ObjectFormatException("unexpected encoding")
 933             last = field
 934
 935         # TODO: optionally check for duplicate parents
 936
 937     def _serialize(self):
 938         chunks = []
 939         chunks.append("%s %s\n" % (_TREE_HEADER, self._tree))
 940         for p in self._parents:
 941             chunks.append("%s %s\n" % (_PARENT_HEADER, p))
 942         chunks.append("%s %s %s %s\n" % (
 943           _AUTHOR_HEADER, self._author, str(self._author_time),
 944           format_timezone(self._author_timezone,
 945                           self._author_timezone_neg_utc)))
 946         chunks.append("%s %s %s %s\n" % (
 947           _COMMITTER_HEADER, self._committer, str(self._commit_time),
 948           format_timezone(self._commit_timezone,
 949                           self._commit_timezone_neg_utc)))
 950         if self.encoding:
 951             chunks.append("%s %s\n" % (_ENCODING_HEADER, self.encoding))
 952         for k, v in self.extra:
 953             if "\n" in k or "\n" in v:
 954                 raise AssertionError("newline in extra data: %r -> %r" % (k, v))
 955             chunks.append("%s %s\n" % (k, v))
 956         chunks.append("\n") # There must be a new line after the headers
 957         chunks.append(self._message)
 958         return chunks
 959
 960     tree = serializable_property("tree", "Tree that is the state of this commit")
 961
 962     def _get_parents(self):
 963         """Return a list of parents of this commit."""
 964         self._ensure_parsed()
 965         return self._parents
 966
 967     def _set_parents(self, value):
 968         """Set a list of parents of this commit."""
 969         self._ensure_parsed()
 970         self._needs_serialization = True
 971         self._parents = value
 972
 973     parents = property(_get_parents, _set_parents)
 974
 975     def _get_extra(self):
 976         """Return extra settings of this commit."""
 977         self._ensure_parsed()
 978         return self._extra
 979
 980     extra = property(_get_extra)
 981
 982     author = serializable_property("author",
 983         "The name of the author of the commit")
 984
 985     committer = serializable_property("committer",
 986         "The name of the committer of the commit")
 987
 988     message = serializable_property("message",
 989         "The commit message")
 990
 991     commit_time = serializable_property("commit_time",
 992         "The timestamp of the commit. As the number of seconds since the epoch.")
 993
 994     commit_timezone = serializable_property("commit_timezone",
 995         "The zone the commit time is in")
 996
 997     author_time = serializable_property("author_time",
 998         "The timestamp the commit was written. as the number of seconds since the epoch.")
 999
1000     author_timezone = serializable_property("author_timezone",
1001         "Returns the zone the author time is in.")
1002
1003     encoding = serializable_property("encoding",
1004         "Encoding of the commit message.")
1005
1006
# The object classes that are registered in the type lookup table below.
OBJECT_CLASSES = (Commit, Tree, Blob, Tag)

# Maps both the type name and the type number of each class to the class.
_TYPE_MAP = {}

for cls in OBJECT_CLASSES:
    _TYPE_MAP.update({cls.type_name: cls, cls.type_num: cls})
1019
1020
1021
# Hold on to the pure-python implementations for testing. These aliases have
# to be bound before the import below, because that import rebinds the public
# names parse_tree and sorted_tree_items when it succeeds.
_parse_tree_py = parse_tree
_sorted_tree_items_py = sorted_tree_items
try:
    # Try to import C versions
    from dulwich._objects import parse_tree, sorted_tree_items
except ImportError:
    # dulwich._objects could not be imported; the pure-python definitions
    # stay bound to the public names.
    pass