dulwich/objects.py

   1 # objects.py -- Access to base git objects
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version of the License.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20
  21 """Access to base git objects."""
  22
  23
  24 import binascii
  25 from cStringIO import (
  26     StringIO,
  27     )
  28 import mmap
  29 import os
  30 import stat
  31 import zlib
  32
  33 from dulwich.errors import (
  34     ChecksumMismatch,
  35     NotBlobError,
  36     NotCommitError,
  37     NotTagError,
  38     NotTreeError,
  39     ObjectFormatException,
  40     )
  41 from dulwich.file import GitFile
  42 from dulwich.misc import (
  43     make_sha,
  44     )
  45
  46
# Header fields for commits
_TREE_HEADER = "tree"
_PARENT_HEADER = "parent"
_AUTHOR_HEADER = "author"
_COMMITTER_HEADER = "committer"
_ENCODING_HEADER = "encoding"


# Header fields for tags
_OBJECT_HEADER = "object"
_TYPE_HEADER = "type"
_TAG_HEADER = "tag"
_TAGGER_HEADER = "tagger"


# File-type bits (as extracted by stat.S_IFMT) that mark a tree entry as a
# gitlink.
S_IFGITLINK = 0160000
  63
  64 def S_ISGITLINK(m):
  65     return (stat.S_IFMT(m) == S_IFGITLINK)
  66
  67
  68 def _decompress(string):
  69     dcomp = zlib.decompressobj()
  70     dcomped = dcomp.decompress(string)
  71     dcomped += dcomp.flush()
  72     return dcomped
  73
  74
  75 def sha_to_hex(sha):
  76     """Takes a string and returns the hex of the sha within"""
  77     hexsha = binascii.hexlify(sha)
  78     assert len(hexsha) == 40, "Incorrect length of sha1 string: %d" % hexsha
  79     return hexsha
  80
  81
  82 def hex_to_sha(hex):
  83     """Takes a hex sha and returns a binary sha"""
  84     assert len(hex) == 40, "Incorrent length of hexsha: %s" % hex
  85     return binascii.unhexlify(hex)
  86
  87
  88 def hex_to_filename(path, hex):
  89     """Takes a hex sha and returns its filename relative to the given path."""
  90     dir = hex[:2]
  91     file = hex[2:]
  92     # Check from object dir
  93     return os.path.join(path, dir, file)
  94
  95
  96 def filename_to_hex(filename):
  97     """Takes an object filename and returns its corresponding hex sha."""
  98     # grab the last (up to) two path components
  99     names = filename.rsplit(os.path.sep, 2)[-2:]
 100     errmsg = "Invalid object filename: %s" % filename
 101     assert len(names) == 2, errmsg
 102     base, rest = names
 103     assert len(base) == 2 and len(rest) == 38, errmsg
 104     hex = base + rest
 105     hex_to_sha(hex)
 106     return hex
 107
 108
 109 def object_header(num_type, length):
 110     """Return an object header for the given numeric type and text length."""
 111     return "%s %d\0" % (object_class(num_type).type_name, length)
 112
 113
 114 def serializable_property(name, docstring=None):
 115     def set(obj, value):
 116         obj._ensure_parsed()
 117         setattr(obj, "_"+name, value)
 118         obj._needs_serialization = True
 119     def get(obj):
 120         obj._ensure_parsed()
 121         return getattr(obj, "_"+name)
 122     return property(get, set, doc=docstring)
 123
 124
 125 def object_class(type):
 126     """Get the object class corresponding to the given type.
 127
 128     :param type: Either a type name string or a numeric type.
 129     :return: The ShaFile subclass corresponding to the given type, or None if
 130         type is not a valid type name/number.
 131     """
 132     return _TYPE_MAP.get(type, None)
 133
 134
 135 def check_hexsha(hex, error_msg):
 136     try:
 137         hex_to_sha(hex)
 138     except (TypeError, AssertionError):
 139         raise ObjectFormatException("%s %s" % (error_msg, hex))
 140
 141
 142 def check_identity(identity, error_msg):
 143     """Check if the specified identity is valid.
 144
 145     This will raise an exception if the identity is not valid.
 146
 147     :param identity: Identity string
 148     :param error_msg: Error message to use in exception
 149     """
 150     email_start = identity.find("<")
 151     email_end = identity.find(">")
 152     if (email_start < 0 or email_end < 0 or email_end <= email_start
 153         or identity.find("<", email_start + 1) >= 0
 154         or identity.find(">", email_end + 1) >= 0
 155         or not identity.endswith(">")):
 156         raise ObjectFormatException(error_msg)
 157
 158
 159 class FixedSha(object):
 160     """SHA object that behaves like hashlib's but is given a fixed value."""
 161
 162     def __init__(self, hexsha):
 163         self._hexsha = hexsha
 164         self._sha = hex_to_sha(hexsha)
 165
 166     def digest(self):
 167         return self._sha
 168
 169     def hexdigest(self):
 170         return self._hexsha
 171
 172
 173 class ShaFile(object):
 174     """A git SHA file."""
 175
 176     @staticmethod
 177     def _parse_legacy_object_header(magic, f):
 178         """Parse a legacy object, creating it but not reading the file."""
 179         bufsize = 1024
 180         decomp = zlib.decompressobj()
 181         header = decomp.decompress(magic)
 182         start = 0
 183         end = -1
 184         while end < 0:
 185             header += decomp.decompress(f.read(bufsize))
 186             end = header.find("\0", start)
 187             start = len(header)
 188         header = header[:end]
 189         type_name, size = header.split(" ", 1)
 190         size = int(size)  # sanity check
 191         obj_class = object_class(type_name)
 192         if not obj_class:
 193             raise ObjectFormatException("Not a known type: %s" % type_name)
 194         return obj_class()
 195
 196     def _parse_legacy_object(self, f):
 197         """Parse a legacy object, setting the raw string."""
 198         size = os.path.getsize(f.name)
 199         map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
 200         try:
 201             text = _decompress(map)
 202         finally:
 203             map.close()
 204         header_end = text.find('\0')
 205         if header_end < 0:
 206             raise ObjectFormatException("Invalid object header")
 207         self.set_raw_string(text[header_end+1:])
 208
 209     def as_legacy_object_chunks(self):
 210         compobj = zlib.compressobj()
 211         yield compobj.compress(self._header())
 212         for chunk in self.as_raw_chunks():
 213             yield compobj.compress(chunk)
 214         yield compobj.flush()
 215
 216     def as_legacy_object(self):
 217         return "".join(self.as_legacy_object_chunks())
 218
 219     def as_raw_chunks(self):
 220         if self._needs_parsing:
 221             self._ensure_parsed()
 222         elif self._needs_serialization:
 223             self._chunked_text = self._serialize()
 224         return self._chunked_text
 225
 226     def as_raw_string(self):
 227         return "".join(self.as_raw_chunks())
 228
 229     def __str__(self):
 230         return self.as_raw_string()
 231
 232     def __hash__(self):
 233         return hash(self.id)
 234
 235     def as_pretty_string(self):
 236         return self.as_raw_string()
 237
 238     def _ensure_parsed(self):
 239         if self._needs_parsing:
 240             if not self._chunked_text:
 241                 if self._file is not None:
 242                     self._parse_file(self._file)
 243                 elif self._path is not None:
 244                     self._parse_path()
 245                 else:
 246                     raise AssertionError(
 247                         "ShaFile needs either text or filename")
 248             self._deserialize(self._chunked_text)
 249             self._needs_parsing = False
 250
 251     def set_raw_string(self, text):
 252         if type(text) != str:
 253             raise TypeError(text)
 254         self.set_raw_chunks([text])
 255
 256     def set_raw_chunks(self, chunks):
 257         self._chunked_text = chunks
 258         self._deserialize(chunks)
 259         self._sha = None
 260         self._needs_parsing = False
 261         self._needs_serialization = False
 262
 263     @staticmethod
 264     def _parse_object_header(magic, f):
 265         """Parse a new style object, creating it but not reading the file."""
 266         num_type = (ord(magic[0]) >> 4) & 7
 267         obj_class = object_class(num_type)
 268         if not obj_class:
 269             raise ObjectFormatException("Not a known type: %d" % num_type)
 270         return obj_class()
 271
 272     def _parse_object(self, f):
 273         """Parse a new style object, setting self._text."""
 274         size = os.path.getsize(f.name)
 275         map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
 276         try:
 277             # skip type and size; type must have already been determined, and
 278             # we trust zlib to fail if it's otherwise corrupted
 279             byte = ord(map[0])
 280             used = 1
 281             while (byte & 0x80) != 0:
 282                 byte = ord(map[used])
 283                 used += 1
 284             raw = map[used:]
 285             self.set_raw_string(_decompress(raw))
 286         finally:
 287             map.close()
 288
 289     @classmethod
 290     def _is_legacy_object(cls, magic):
 291         b0, b1 = map(ord, magic)
 292         word = (b0 << 8) + b1
 293         return b0 == 0x78 and (word % 31) == 0
 294
 295     @classmethod
 296     def _parse_file_header(cls, f):
 297         magic = f.read(2)
 298         if cls._is_legacy_object(magic):
 299             return cls._parse_legacy_object_header(magic, f)
 300         else:
 301             return cls._parse_object_header(magic, f)
 302
 303     def __init__(self):
 304         """Don't call this directly"""
 305         self._sha = None
 306         self._path = None
 307         self._file = None
 308         self._chunked_text = []
 309         self._needs_parsing = False
 310         self._needs_serialization = True
 311
 312     def _deserialize(self, chunks):
 313         raise NotImplementedError(self._deserialize)
 314
 315     def _serialize(self):
 316         raise NotImplementedError(self._serialize)
 317
 318     def _parse_path(self):
 319         f = GitFile(self._path, 'rb')
 320         try:
 321             self._parse_file(f)
 322         finally:
 323             f.close()
 324
 325     def _parse_file(self, f):
 326         magic = f.read(2)
 327         if self._is_legacy_object(magic):
 328             self._parse_legacy_object(f)
 329         else:
 330             self._parse_object(f)
 331
 332     @classmethod
 333     def from_path(cls, path):
 334         f = GitFile(path, 'rb')
 335         try:
 336             obj = cls.from_file(f)
 337             obj._path = path
 338             obj._sha = FixedSha(filename_to_hex(path))
 339             obj._file = None
 340             return obj
 341         finally:
 342             f.close()
 343
 344     @classmethod
 345     def from_file(cls, f):
 346         """Get the contents of a SHA file on disk."""
 347         try:
 348             obj = cls._parse_file_header(f)
 349             obj._sha = None
 350             obj._needs_parsing = True
 351             obj._needs_serialization = True
 352             obj._file = f
 353             return obj
 354         except (IndexError, ValueError), e:
 355             raise ObjectFormatException("invalid object header")
 356
 357     @staticmethod
 358     def from_raw_string(type_num, string):
 359         """Creates an object of the indicated type from the raw string given.
 360
 361         :param type_num: The numeric type of the object.
 362         :param string: The raw uncompressed contents.
 363         """
 364         obj = object_class(type_num)()
 365         obj.set_raw_string(string)
 366         return obj
 367
 368     @staticmethod
 369     def from_raw_chunks(type_num, chunks):
 370         """Creates an object of the indicated type from the raw chunks given.
 371
 372         :param type_num: The numeric type of the object.
 373         :param chunks: An iterable of the raw uncompressed contents.
 374         """
 375         obj = object_class(type_num)()
 376         obj.set_raw_chunks(chunks)
 377         return obj
 378
 379     @classmethod
 380     def from_string(cls, string):
 381         """Create a ShaFile from a string."""
 382         obj = cls()
 383         obj.set_raw_string(string)
 384         return obj
 385
 386     def _check_has_member(self, member, error_msg):
 387         """Check that the object has a given member variable.
 388
 389         :param member: the member variable to check for
 390         :param error_msg: the message for an error if the member is missing
 391         :raise ObjectFormatException: with the given error_msg if member is
 392             missing or is None
 393         """
 394         if getattr(self, member, None) is None:
 395             raise ObjectFormatException(error_msg)
 396
 397     def check(self):
 398         """Check this object for internal consistency.
 399
 400         :raise ObjectFormatException: if the object is malformed in some way
 401         :raise ChecksumMismatch: if the object was created with a SHA that does
 402             not match its contents
 403         """
 404         # TODO: if we find that error-checking during object parsing is a
 405         # performance bottleneck, those checks should be moved to the class's
 406         # check() method during optimization so we can still check the object
 407         # when necessary.
 408         old_sha = self.id
 409         try:
 410             self._deserialize(self.as_raw_chunks())
 411             self._sha = None
 412             new_sha = self.id
 413         except Exception, e:
 414             raise ObjectFormatException(e)
 415         if old_sha != new_sha:
 416             raise ChecksumMismatch(new_sha, old_sha)
 417
 418     def _header(self):
 419         return object_header(self.type, self.raw_length())
 420
 421     def raw_length(self):
 422         """Returns the length of the raw string of this object."""
 423         ret = 0
 424         for chunk in self.as_raw_chunks():
 425             ret += len(chunk)
 426         return ret
 427
 428     def _make_sha(self):
 429         ret = make_sha()
 430         ret.update(self._header())
 431         for chunk in self.as_raw_chunks():
 432             ret.update(chunk)
 433         return ret
 434
 435     def sha(self):
 436         """The SHA1 object that is the name of this object."""
 437         if self._sha is None or self._needs_serialization:
 438             # this is a local because as_raw_chunks() overwrites self._sha
 439             new_sha = make_sha()
 440             new_sha.update(self._header())
 441             for chunk in self.as_raw_chunks():
 442                 new_sha.update(chunk)
 443             self._sha = new_sha
 444         return self._sha
 445
 446     @property
 447     def id(self):
 448         return self.sha().hexdigest()
 449
 450     def get_type(self):
 451         return self.type_num
 452
 453     def set_type(self, type):
 454         self.type_num = type
 455
 456     # DEPRECATED: use type_num or type_name as needed.
 457     type = property(get_type, set_type)
 458
 459     def __repr__(self):
 460         return "<%s %s>" % (self.__class__.__name__, self.id)
 461
 462     def __ne__(self, other):
 463         return self.id != other.id
 464
 465     def __eq__(self, other):
 466         """Return true if the sha of the two objects match.
 467
 468         The __le__ etc methods aren't overriden as they make no sense,
 469         certainly at this level.
 470         """
 471         return self.id == other.id
 472
 473
 474 class Blob(ShaFile):
 475     """A Git Blob object."""
 476
 477     type_name = 'blob'
 478     type_num = 3
 479
 480     def __init__(self):
 481         super(Blob, self).__init__()
 482         self._chunked_text = []
 483         self._needs_parsing = False
 484         self._needs_serialization = False
 485
 486     def _get_data(self):
 487         return self.as_raw_string()
 488
 489     def _set_data(self, data):
 490         self.set_raw_string(data)
 491
 492     data = property(_get_data, _set_data,
 493                     "The text contained within the blob object.")
 494
 495     def _get_chunked(self):
 496         self._ensure_parsed()
 497         return self._chunked_text
 498
 499     def _set_chunked(self, chunks):
 500         self._chunked_text = chunks
 501
 502     def _serialize(self):
 503         if not self._chunked_text:
 504             self._ensure_parsed()
 505         self._needs_serialization = False
 506         return self._chunked_text
 507
 508     def _deserialize(self, chunks):
 509         self._chunked_text = chunks
 510
 511     chunked = property(_get_chunked, _set_chunked,
 512         "The text within the blob object, as chunks (not necessarily lines).")
 513
 514     @classmethod
 515     def from_path(cls, path):
 516         blob = ShaFile.from_path(path)
 517         if not isinstance(blob, cls):
 518             raise NotBlobError(path)
 519         return blob
 520
 521     def check(self):
 522         """Check this object for internal consistency.
 523
 524         :raise ObjectFormatException: if the object is malformed in some way
 525         """
 526         super(Blob, self).check()
 527
 528
 529 def _parse_tag_or_commit(text):
 530     """Parse tag or commit text.
 531
 532     :param text: the raw text of the tag or commit object.
 533     :yield: tuples of (field, value), one per header line, in the order read
 534         from the text, possibly including duplicates. Includes a field named
 535         None for the freeform tag/commit text.
 536     """
 537     f = StringIO(text)
 538     for l in f:
 539         l = l.rstrip("\n")
 540         if l == "":
 541             # Empty line indicates end of headers
 542             break
 543         yield l.split(" ", 1)
 544     yield (None, f.read())
 545     f.close()
 546
 547
 548 def parse_tag(text):
 549     return _parse_tag_or_commit(text)
 550
 551
 552 class Tag(ShaFile):
 553     """A Git Tag object."""
 554
 555     type_name = 'tag'
 556     type_num = 4
 557
 558     def __init__(self):
 559         super(Tag, self).__init__()
 560         self._tag_timezone_neg_utc = False
 561
 562     @classmethod
 563     def from_path(cls, filename):
 564         tag = ShaFile.from_path(filename)
 565         if not isinstance(tag, cls):
 566             raise NotTagError(filename)
 567         return tag
 568
 569     def check(self):
 570         """Check this object for internal consistency.
 571
 572         :raise ObjectFormatException: if the object is malformed in some way
 573         """
 574         super(Tag, self).check()
 575         self._check_has_member("_object_sha", "missing object sha")
 576         self._check_has_member("_object_class", "missing object type")
 577         self._check_has_member("_name", "missing tag name")
 578
 579         if not self._name:
 580             raise ObjectFormatException("empty tag name")
 581
 582         check_hexsha(self._object_sha, "invalid object sha")
 583
 584         if getattr(self, "_tagger", None):
 585             check_identity(self._tagger, "invalid tagger")
 586
 587         last = None
 588         for field, _ in parse_tag("".join(self._chunked_text)):
 589             if field == _OBJECT_HEADER and last is not None:
 590                 raise ObjectFormatException("unexpected object")
 591             elif field == _TYPE_HEADER and last != _OBJECT_HEADER:
 592                 raise ObjectFormatException("unexpected type")
 593             elif field == _TAG_HEADER and last != _TYPE_HEADER:
 594                 raise ObjectFormatException("unexpected tag name")
 595             elif field == _TAGGER_HEADER and last != _TAG_HEADER:
 596                 raise ObjectFormatException("unexpected tagger")
 597             last = field
 598
 599     def _serialize(self):
 600         chunks = []
 601         chunks.append("%s %s\n" % (_OBJECT_HEADER, self._object_sha))
 602         chunks.append("%s %s\n" % (_TYPE_HEADER, self._object_class.type_name))
 603         chunks.append("%s %s\n" % (_TAG_HEADER, self._name))
 604         if self._tagger:
 605             if self._tag_time is None:
 606                 chunks.append("%s %s\n" % (_TAGGER_HEADER, self._tagger))
 607             else:
 608                 chunks.append("%s %s %d %s\n" % (
 609                   _TAGGER_HEADER, self._tagger, self._tag_time,
 610                   format_timezone(self._tag_timezone,
 611                     self._tag_timezone_neg_utc)))
 612         chunks.append("\n") # To close headers
 613         chunks.append(self._message)
 614         return chunks
 615
 616     def _deserialize(self, chunks):
 617         """Grab the metadata attached to the tag"""
 618         self._tagger = None
 619         for field, value in parse_tag("".join(chunks)):
 620             if field == _OBJECT_HEADER:
 621                 self._object_sha = value
 622             elif field == _TYPE_HEADER:
 623                 obj_class = object_class(value)
 624                 if not obj_class:
 625                     raise ObjectFormatException("Not a known type: %s" % value)
 626                 self._object_class = obj_class
 627             elif field == _TAG_HEADER:
 628                 self._name = value
 629             elif field == _TAGGER_HEADER:
 630                 try:
 631                     sep = value.index("> ")
 632                 except ValueError:
 633                     self._tagger = value
 634                     self._tag_time = None
 635                     self._tag_timezone = None
 636                     self._tag_timezone_neg_utc = False
 637                 else:
 638                     self._tagger = value[0:sep+1]
 639                     try:
 640                         (timetext, timezonetext) = value[sep+2:].rsplit(" ", 1)
 641                         self._tag_time = int(timetext)
 642                         self._tag_timezone, self._tag_timezone_neg_utc = \
 643                                 parse_timezone(timezonetext)
 644                     except ValueError, e:
 645                         raise ObjectFormatException(e)
 646             elif field is None:
 647                 self._message = value
 648             else:
 649                 raise ObjectFormatException("Unknown field %s" % field)
 650
 651     def _get_object(self):
 652         """Get the object pointed to by this tag.
 653
 654         :return: tuple of (object class, sha).
 655         """
 656         self._ensure_parsed()
 657         return (self._object_class, self._object_sha)
 658
 659     def _set_object(self, value):
 660         self._ensure_parsed()
 661         (self._object_class, self._object_sha) = value
 662         self._needs_serialization = True
 663
 664     object = property(_get_object, _set_object)
 665
 666     name = serializable_property("name", "The name of this tag")
 667     tagger = serializable_property("tagger",
 668         "Returns the name of the person who created this tag")
 669     tag_time = serializable_property("tag_time",
 670         "The creation timestamp of the tag.  As the number of seconds since the epoch")
 671     tag_timezone = serializable_property("tag_timezone",
 672         "The timezone that tag_time is in.")
 673     message = serializable_property("message", "The message attached to this tag")
 674
 675
 676 def parse_tree(text):
 677     """Parse a tree text.
 678
 679     :param text: Serialized text to parse
 680     :yields: tuples of (name, mode, sha)
 681     """
 682     count = 0
 683     l = len(text)
 684     while count < l:
 685         mode_end = text.index(' ', count)
 686         mode = int(text[count:mode_end], 8)
 687         name_end = text.index('\0', mode_end)
 688         name = text[mode_end+1:name_end]
 689         count = name_end+21
 690         sha = text[name_end+1:count]
 691         yield (name, mode, sha_to_hex(sha))
 692
 693
 694 def serialize_tree(items):
 695     """Serialize the items in a tree to a text.
 696
 697     :param items: Sorted iterable over (name, mode, sha) tuples
 698     :return: Serialized tree text as chunks
 699     """
 700     for name, mode, hexsha in items:
 701         yield "%04o %s\0%s" % (mode, name, hex_to_sha(hexsha))
 702
 703
 704 def sorted_tree_items(entries):
 705     """Iterate over a tree entries dictionary in the order in which
 706     the items would be serialized.
 707
 708     :param entries: Dictionary mapping names to (mode, sha) tuples
 709     :return: Iterator over (name, mode, hexsha)
 710     """
 711     for name, entry in sorted(entries.iteritems(), cmp=cmp_entry):
 712         mode, hexsha = entry
 713         # Stricter type checks than normal to mirror checks in the C version.
 714         mode = int(mode)
 715         if not isinstance(hexsha, str):
 716             raise TypeError('Expected a string for SHA, got %r' % hexsha)
 717         yield name, mode, hexsha
 718
 719
 720 def cmp_entry((name1, value1), (name2, value2)):
 721     """Compare two tree entries."""
 722     if stat.S_ISDIR(value1[0]):
 723         name1 += "/"
 724     if stat.S_ISDIR(value2[0]):
 725         name2 += "/"
 726     return cmp(name1, name2)
 727
 728
 729 class Tree(ShaFile):
 730     """A Git tree object"""
 731
 732     type_name = 'tree'
 733     type_num = 2
 734
 735     def __init__(self):
 736         super(Tree, self).__init__()
 737         self._entries = {}
 738
 739     @classmethod
 740     def from_path(cls, filename):
 741         tree = ShaFile.from_path(filename)
 742         if not isinstance(tree, cls):
 743             raise NotTreeError(filename)
 744         return tree
 745
 746     def __contains__(self, name):
 747         self._ensure_parsed()
 748         return name in self._entries
 749
 750     def __getitem__(self, name):
 751         self._ensure_parsed()
 752         return self._entries[name]
 753
 754     def __setitem__(self, name, value):
 755         """Set a tree entry by name.
 756
 757         :param name: The name of the entry, as a string.
 758         :param value: A tuple of (mode, hexsha), where mode is the mode of the
 759             entry as an integral type and hexsha is the hex SHA of the entry as
 760             a string.
 761         """
 762         mode, hexsha = value
 763         self._ensure_parsed()
 764         self._entries[name] = (mode, hexsha)
 765         self._needs_serialization = True
 766
 767     def __delitem__(self, name):
 768         self._ensure_parsed()
 769         del self._entries[name]
 770         self._needs_serialization = True
 771
 772     def __len__(self):
 773         self._ensure_parsed()
 774         return len(self._entries)
 775
 776     def __iter__(self):
 777         self._ensure_parsed()
 778         return iter(self._entries)
 779
 780     def add(self, mode, name, hexsha):
 781         """Add an entry to the tree.
 782
 783         :param mode: The mode of the entry as an integral type. Not all possible
 784             modes are supported by git; see check() for details.
 785         :param name: The name of the entry, as a string.
 786         :param hexsha: The hex SHA of the entry as a string.
 787         """
 788         self._ensure_parsed()
 789         self._entries[name] = mode, hexsha
 790         self._needs_serialization = True
 791
 792     def entries(self):
 793         """Return a list of tuples describing the tree entries"""
 794         self._ensure_parsed()
 795         # The order of this is different from iteritems() for historical
 796         # reasons
 797         return [
 798             (mode, name, hexsha) for (name, mode, hexsha) in self.iteritems()]
 799
 800     def iteritems(self):
 801         """Iterate over entries in the order in which they would be serialized.
 802
 803         :return: Iterator over (name, mode, sha) tuples
 804         """
 805         self._ensure_parsed()
 806         return sorted_tree_items(self._entries)
 807
 808     def _deserialize(self, chunks):
 809         """Grab the entries in the tree"""
 810         try:
 811             parsed_entries = parse_tree("".join(chunks))
 812         except ValueError, e:
 813             raise ObjectFormatException(e)
 814         # TODO: list comprehension is for efficiency in the common (small) case;
 815         # if memory efficiency in the large case is a concern, use a genexp.
 816         self._entries = dict([(n, (m, s)) for n, m, s in parsed_entries])
 817
 818     def check(self):
 819         """Check this object for internal consistency.
 820
 821         :raise ObjectFormatException: if the object is malformed in some way
 822         """
 823         super(Tree, self).check()
 824         last = None
 825         allowed_modes = (stat.S_IFREG | 0755, stat.S_IFREG | 0644,
 826                          stat.S_IFLNK, stat.S_IFDIR, S_IFGITLINK,
 827                          # TODO: optionally exclude as in git fsck --strict
 828                          stat.S_IFREG | 0664)
 829         for name, mode, sha in parse_tree("".join(self._chunked_text)):
 830             check_hexsha(sha, 'invalid sha %s' % sha)
 831             if '/' in name or name in ('', '.', '..'):
 832                 raise ObjectFormatException('invalid name %s' % name)
 833
 834             if mode not in allowed_modes:
 835                 raise ObjectFormatException('invalid mode %06o' % mode)
 836
 837             entry = (name, (mode, sha))
 838             if last:
 839                 if cmp_entry(last, entry) > 0:
 840                     raise ObjectFormatException('entries not sorted')
 841                 if name == last[0]:
 842                     raise ObjectFormatException('duplicate entry %s' % name)
 843             last = entry
 844
 845     def _serialize(self):
 846         return list(serialize_tree(self.iteritems()))
 847
 848     def as_pretty_string(self):
 849         text = []
 850         for name, mode, hexsha in self.iteritems():
 851             if mode & stat.S_IFDIR:
 852                 kind = "tree"
 853             else:
 854                 kind = "blob"
 855             text.append("%04o %s %s\t%s\n" % (mode, kind, hexsha, name))
 856         return "".join(text)
 857
 858
 859 def parse_timezone(text):
 860     offset = int(text)
 861     negative_utc = (offset == 0 and text[0] == '-')
 862     signum = (offset < 0) and -1 or 1
 863     offset = abs(offset)
 864     hours = int(offset / 100)
 865     minutes = (offset % 100)
 866     return signum * (hours * 3600 + minutes * 60), negative_utc
 867
 868
 869 def format_timezone(offset, negative_utc=False):
 870     if offset % 60 != 0:
 871         raise ValueError("Unable to handle non-minute offset.")
 872     if offset < 0 or (offset == 0 and negative_utc):
 873         sign = '-'
 874     else:
 875         sign = '+'
 876     offset = abs(offset)
 877     return '%c%02d%02d' % (sign, offset / 3600, (offset / 60) % 60)
 878
 879
 880 def parse_commit(text):
 881     return _parse_tag_or_commit(text)
 882
 883
 884 class Commit(ShaFile):
 885     """A git commit object"""
 886
 887     type_name = 'commit'
 888     type_num = 1
 889
 890     def __init__(self):
 891         super(Commit, self).__init__()
 892         self._parents = []
 893         self._encoding = None
 894         self._extra = {}
 895         self._author_timezone_neg_utc = False
 896         self._commit_timezone_neg_utc = False
 897
 898     @classmethod
 899     def from_path(cls, path):
 900         commit = ShaFile.from_path(path)
 901         if not isinstance(commit, cls):
 902             raise NotCommitError(path)
 903         return commit
 904
 905     def _deserialize(self, chunks):
 906         self._parents = []
 907         self._extra = []
 908         self._author = None
 909         for field, value in parse_commit("".join(self._chunked_text)):
 910             if field == _TREE_HEADER:
 911                 self._tree = value
 912             elif field == _PARENT_HEADER:
 913                 self._parents.append(value)
 914             elif field == _AUTHOR_HEADER:
 915                 self._author, timetext, timezonetext = value.rsplit(" ", 2)
 916                 self._author_time = int(timetext)
 917                 self._author_timezone, self._author_timezone_neg_utc =\
 918                     parse_timezone(timezonetext)
 919             elif field == _COMMITTER_HEADER:
 920                 self._committer, timetext, timezonetext = value.rsplit(" ", 2)
 921                 self._commit_time = int(timetext)
 922                 self._commit_timezone, self._commit_timezone_neg_utc =\
 923                     parse_timezone(timezonetext)
 924             elif field == _ENCODING_HEADER:
 925                 self._encoding = value
 926             elif field is None:
 927                 self._message = value
 928             else:
 929                 self._extra.append((field, value))
 930
 931     def check(self):
 932         """Check this object for internal consistency.
 933
 934         :raise ObjectFormatException: if the object is malformed in some way
 935         """
 936         super(Commit, self).check()
 937         self._check_has_member("_tree", "missing tree")
 938         self._check_has_member("_author", "missing author")
 939         self._check_has_member("_committer", "missing committer")
 940         # times are currently checked when set
 941
 942         for parent in self._parents:
 943             check_hexsha(parent, "invalid parent sha")
 944         check_hexsha(self._tree, "invalid tree sha")
 945
 946         check_identity(self._author, "invalid author")
 947         check_identity(self._committer, "invalid committer")
 948
 949         last = None
 950         for field, _ in parse_commit("".join(self._chunked_text)):
 951             if field == _TREE_HEADER and last is not None:
 952                 raise ObjectFormatException("unexpected tree")
 953             elif field == _PARENT_HEADER and last not in (_PARENT_HEADER,
 954                                                           _TREE_HEADER):
 955                 raise ObjectFormatException("unexpected parent")
 956             elif field == _AUTHOR_HEADER and last not in (_TREE_HEADER,
 957                                                           _PARENT_HEADER):
 958                 raise ObjectFormatException("unexpected author")
 959             elif field == _COMMITTER_HEADER and last != _AUTHOR_HEADER:
 960                 raise ObjectFormatException("unexpected committer")
 961             elif field == _ENCODING_HEADER and last != _COMMITTER_HEADER:
 962                 raise ObjectFormatException("unexpected encoding")
 963             last = field
 964
 965         # TODO: optionally check for duplicate parents
 966
 967     def _serialize(self):
 968         chunks = []
 969         chunks.append("%s %s\n" % (_TREE_HEADER, self._tree))
 970         for p in self._parents:
 971             chunks.append("%s %s\n" % (_PARENT_HEADER, p))
 972         chunks.append("%s %s %s %s\n" % (
 973           _AUTHOR_HEADER, self._author, str(self._author_time),
 974           format_timezone(self._author_timezone,
 975                           self._author_timezone_neg_utc)))
 976         chunks.append("%s %s %s %s\n" % (
 977           _COMMITTER_HEADER, self._committer, str(self._commit_time),
 978           format_timezone(self._commit_timezone,
 979                           self._commit_timezone_neg_utc)))
 980         if self.encoding:
 981             chunks.append("%s %s\n" % (_ENCODING_HEADER, self.encoding))
 982         for k, v in self.extra:
 983             if "\n" in k or "\n" in v:
 984                 raise AssertionError("newline in extra data: %r -> %r" % (k, v))
 985             chunks.append("%s %s\n" % (k, v))
 986         chunks.append("\n") # There must be a new line after the headers
 987         chunks.append(self._message)
 988         return chunks
 989
 990     tree = serializable_property("tree", "Tree that is the state of this commit")
 991
 992     def _get_parents(self):
 993         """Return a list of parents of this commit."""
 994         self._ensure_parsed()
 995         return self._parents
 996
 997     def _set_parents(self, value):
 998         """Set a list of parents of this commit."""
 999         self._ensure_parsed()
1000         self._needs_serialization = True
1001         self._parents = value
1002
1003     parents = property(_get_parents, _set_parents)
1004
1005     def _get_extra(self):
1006         """Return extra settings of this commit."""
1007         self._ensure_parsed()
1008         return self._extra
1009
1010     extra = property(_get_extra)
1011
1012     author = serializable_property("author",
1013         "The name of the author of the commit")
1014
1015     committer = serializable_property("committer",
1016         "The name of the committer of the commit")
1017
1018     message = serializable_property("message",
1019         "The commit message")
1020
1021     commit_time = serializable_property("commit_time",
1022         "The timestamp of the commit. As the number of seconds since the epoch.")
1023
1024     commit_timezone = serializable_property("commit_timezone",
1025         "The zone the commit time is in")
1026
1027     author_time = serializable_property("author_time",
1028         "The timestamp the commit was written. as the number of seconds since the epoch.")
1029
1030     author_timezone = serializable_property("author_timezone",
1031         "Returns the zone the author time is in.")
1032
1033     encoding = serializable_property("encoding",
1034         "Encoding of the commit message.")
1035
1036
# The object classes defined in this module.
OBJECT_CLASSES = (Commit, Tree, Blob, Tag)

# Maps both the type name and the numeric type of each class to the class.
_TYPE_MAP = {}

for cls in OBJECT_CLASSES:
    _TYPE_MAP.update({cls.type_name: cls, cls.type_num: cls})
1049
1050
1051
# Keep references to the pure-python implementations for testing; the names
# parse_tree and sorted_tree_items may be rebound by the import below.
_parse_tree_py, _sorted_tree_items_py = parse_tree, sorted_tree_items
try:
    # Use the C versions when the extension module can be imported
    from dulwich._objects import parse_tree, sorted_tree_items
except ImportError:
    # No C extension available: the pure-python versions stay in place
    pass