dulwich/objects.py

   1 # objects.py -- Access to base git objects
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version of the License.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20 """Access to base git objects."""
  21
  22
  23 import binascii
  24 from cStringIO import (
  25     StringIO,
  26     )
  27 import os
  28 import stat
  29 import zlib
  30
  31 from dulwich.errors import (
  32     ChecksumMismatch,
  33     NotBlobError,
  34     NotCommitError,
  35     NotTagError,
  36     NotTreeError,
  37     ObjectFormatException,
  38     )
  39 from dulwich.file import GitFile
  40 from dulwich.misc import (
  41     make_sha,
  42     TreeEntry,
  43     )
  44
  45
# Header fields for commits
_TREE_HEADER = "tree"
_PARENT_HEADER = "parent"
_AUTHOR_HEADER = "author"
_COMMITTER_HEADER = "committer"
_ENCODING_HEADER = "encoding"


# Header fields for tags (written and parsed by the Tag class)
_OBJECT_HEADER = "object"
_TYPE_HEADER = "type"
_TAG_HEADER = "tag"
_TAGGER_HEADER = "tagger"


# File-type bits that mark a gitlink entry. S_ISGITLINK() compares them with
# stat.S_IFMT(mode), and Tree.check() accepts them as a valid entry mode.
S_IFGITLINK = 0160000
  62
  63 def S_ISGITLINK(m):
  64     return (stat.S_IFMT(m) == S_IFGITLINK)
  65
  66
  67 def _decompress(string):
  68     dcomp = zlib.decompressobj()
  69     dcomped = dcomp.decompress(string)
  70     dcomped += dcomp.flush()
  71     return dcomped
  72
  73
  74 def sha_to_hex(sha):
  75     """Takes a string and returns the hex of the sha within"""
  76     hexsha = binascii.hexlify(sha)
  77     assert len(hexsha) == 40, "Incorrect length of sha1 string: %d" % hexsha
  78     return hexsha
  79
  80
  81 def hex_to_sha(hex):
  82     """Takes a hex sha and returns a binary sha"""
  83     assert len(hex) == 40, "Incorrent length of hexsha: %s" % hex
  84     return binascii.unhexlify(hex)
  85
  86
  87 def hex_to_filename(path, hex):
  88     """Takes a hex sha and returns its filename relative to the given path."""
  89     dir = hex[:2]
  90     file = hex[2:]
  91     # Check from object dir
  92     return os.path.join(path, dir, file)
  93
  94
  95 def filename_to_hex(filename):
  96     """Takes an object filename and returns its corresponding hex sha."""
  97     # grab the last (up to) two path components
  98     names = filename.rsplit(os.path.sep, 2)[-2:]
  99     errmsg = "Invalid object filename: %s" % filename
 100     assert len(names) == 2, errmsg
 101     base, rest = names
 102     assert len(base) == 2 and len(rest) == 38, errmsg
 103     hex = base + rest
 104     hex_to_sha(hex)
 105     return hex
 106
 107
 108 def object_header(num_type, length):
 109     """Return an object header for the given numeric type and text length."""
 110     return "%s %d\0" % (object_class(num_type).type_name, length)
 111
 112
 113 def serializable_property(name, docstring=None):
 114     def set(obj, value):
 115         obj._ensure_parsed()
 116         setattr(obj, "_"+name, value)
 117         obj._needs_serialization = True
 118     def get(obj):
 119         obj._ensure_parsed()
 120         return getattr(obj, "_"+name)
 121     return property(get, set, doc=docstring)
 122
 123
 124 def object_class(type):
 125     """Get the object class corresponding to the given type.
 126
 127     :param type: Either a type name string or a numeric type.
 128     :return: The ShaFile subclass corresponding to the given type, or None if
 129         type is not a valid type name/number.
 130     """
 131     return _TYPE_MAP.get(type, None)
 132
 133
 134 def check_hexsha(hex, error_msg):
 135     try:
 136         hex_to_sha(hex)
 137     except (TypeError, AssertionError):
 138         raise ObjectFormatException("%s %s" % (error_msg, hex))
 139
 140
 141 def check_identity(identity, error_msg):
 142     """Check if the specified identity is valid.
 143
 144     This will raise an exception if the identity is not valid.
 145
 146     :param identity: Identity string
 147     :param error_msg: Error message to use in exception
 148     """
 149     email_start = identity.find("<")
 150     email_end = identity.find(">")
 151     if (email_start < 0 or email_end < 0 or email_end <= email_start
 152         or identity.find("<", email_start + 1) >= 0
 153         or identity.find(">", email_end + 1) >= 0
 154         or not identity.endswith(">")):
 155         raise ObjectFormatException(error_msg)
 156
 157
 158 class FixedSha(object):
 159     """SHA object that behaves like hashlib's but is given a fixed value."""
 160
 161     __slots__ = ('_hexsha', '_sha')
 162
 163     def __init__(self, hexsha):
 164         self._hexsha = hexsha
 165         self._sha = hex_to_sha(hexsha)
 166
 167     def digest(self):
 168         return self._sha
 169
 170     def hexdigest(self):
 171         return self._hexsha
 172
 173
 174 class ShaFile(object):
 175     """A git SHA file."""
 176
 177     __slots__ = ('_needs_parsing', '_chunked_text', '_file', '_path',
 178                  '_sha', '_needs_serialization', '_magic')
 179
 180     @staticmethod
 181     def _parse_legacy_object_header(magic, f):
 182         """Parse a legacy object, creating it but not reading the file."""
 183         bufsize = 1024
 184         decomp = zlib.decompressobj()
 185         header = decomp.decompress(magic)
 186         start = 0
 187         end = -1
 188         while end < 0:
 189             extra = f.read(bufsize)
 190             header += decomp.decompress(extra)
 191             magic += extra
 192             end = header.find("\0", start)
 193             start = len(header)
 194         header = header[:end]
 195         type_name, size = header.split(" ", 1)
 196         size = int(size)  # sanity check
 197         obj_class = object_class(type_name)
 198         if not obj_class:
 199             raise ObjectFormatException("Not a known type: %s" % type_name)
 200         ret = obj_class()
 201         ret._magic = magic
 202         return ret
 203
 204     def _parse_legacy_object(self, map):
 205         """Parse a legacy object, setting the raw string."""
 206         text = _decompress(map)
 207         header_end = text.find('\0')
 208         if header_end < 0:
 209             raise ObjectFormatException("Invalid object header, no \\0")
 210         self.set_raw_string(text[header_end+1:])
 211
 212     def as_legacy_object_chunks(self):
 213         compobj = zlib.compressobj()
 214         yield compobj.compress(self._header())
 215         for chunk in self.as_raw_chunks():
 216             yield compobj.compress(chunk)
 217         yield compobj.flush()
 218
 219     def as_legacy_object(self):
 220         return "".join(self.as_legacy_object_chunks())
 221
 222     def as_raw_chunks(self):
 223         if self._needs_parsing:
 224             self._ensure_parsed()
 225         elif self._needs_serialization:
 226             self._chunked_text = self._serialize()
 227         return self._chunked_text
 228
 229     def as_raw_string(self):
 230         return "".join(self.as_raw_chunks())
 231
 232     def __str__(self):
 233         return self.as_raw_string()
 234
 235     def __hash__(self):
 236         return hash(self.id)
 237
 238     def as_pretty_string(self):
 239         return self.as_raw_string()
 240
 241     def _ensure_parsed(self):
 242         if self._needs_parsing:
 243             if not self._chunked_text:
 244                 if self._file is not None:
 245                     self._parse_file(self._file)
 246                     self._file = None
 247                 elif self._path is not None:
 248                     self._parse_path()
 249                 else:
 250                     raise AssertionError(
 251                         "ShaFile needs either text or filename")
 252             self._deserialize(self._chunked_text)
 253             self._needs_parsing = False
 254
 255     def set_raw_string(self, text):
 256         if type(text) != str:
 257             raise TypeError(text)
 258         self.set_raw_chunks([text])
 259
 260     def set_raw_chunks(self, chunks):
 261         self._chunked_text = chunks
 262         self._deserialize(chunks)
 263         self._sha = None
 264         self._needs_parsing = False
 265         self._needs_serialization = False
 266
 267     @staticmethod
 268     def _parse_object_header(magic, f):
 269         """Parse a new style object, creating it but not reading the file."""
 270         num_type = (ord(magic[0]) >> 4) & 7
 271         obj_class = object_class(num_type)
 272         if not obj_class:
 273             raise ObjectFormatException("Not a known type %d" % num_type)
 274         ret = obj_class()
 275         ret._magic = magic
 276         return ret
 277
 278     def _parse_object(self, map):
 279         """Parse a new style object, setting self._text."""
 280         # skip type and size; type must have already been determined, and
 281         # we trust zlib to fail if it's otherwise corrupted
 282         byte = ord(map[0])
 283         used = 1
 284         while (byte & 0x80) != 0:
 285             byte = ord(map[used])
 286             used += 1
 287         raw = map[used:]
 288         self.set_raw_string(_decompress(raw))
 289
 290     @classmethod
 291     def _is_legacy_object(cls, magic):
 292         b0, b1 = map(ord, magic)
 293         word = (b0 << 8) + b1
 294         return b0 == 0x78 and (word % 31) == 0
 295
 296     @classmethod
 297     def _parse_file_header(cls, f):
 298         magic = f.read(2)
 299         if cls._is_legacy_object(magic):
 300             return cls._parse_legacy_object_header(magic, f)
 301         else:
 302             return cls._parse_object_header(magic, f)
 303
 304     def __init__(self):
 305         """Don't call this directly"""
 306         self._sha = None
 307         self._path = None
 308         self._file = None
 309         self._magic = None
 310         self._chunked_text = []
 311         self._needs_parsing = False
 312         self._needs_serialization = True
 313
 314     def _deserialize(self, chunks):
 315         raise NotImplementedError(self._deserialize)
 316
 317     def _serialize(self):
 318         raise NotImplementedError(self._serialize)
 319
 320     def _parse_path(self):
 321         f = GitFile(self._path, 'rb')
 322         try:
 323             self._parse_file(f)
 324         finally:
 325             f.close()
 326
 327     def _parse_file(self, f):
 328         magic = self._magic
 329         if magic is None:
 330             magic = f.read(2)
 331         map = magic + f.read()
 332         if self._is_legacy_object(magic[:2]):
 333             self._parse_legacy_object(map)
 334         else:
 335             self._parse_object(map)
 336
 337     @classmethod
 338     def from_path(cls, path):
 339         f = GitFile(path, 'rb')
 340         try:
 341             obj = cls.from_file(f)
 342             obj._path = path
 343             obj._sha = FixedSha(filename_to_hex(path))
 344             obj._file = None
 345             obj._magic = None
 346             return obj
 347         finally:
 348             f.close()
 349
 350     @classmethod
 351     def from_file(cls, f):
 352         """Get the contents of a SHA file on disk."""
 353         try:
 354             obj = cls._parse_file_header(f)
 355             obj._sha = None
 356             obj._needs_parsing = True
 357             obj._needs_serialization = True
 358             obj._file = f
 359             return obj
 360         except (IndexError, ValueError), e:
 361             raise ObjectFormatException("invalid object header")
 362
 363     @staticmethod
 364     def from_raw_string(type_num, string):
 365         """Creates an object of the indicated type from the raw string given.
 366
 367         :param type_num: The numeric type of the object.
 368         :param string: The raw uncompressed contents.
 369         """
 370         obj = object_class(type_num)()
 371         obj.set_raw_string(string)
 372         return obj
 373
 374     @staticmethod
 375     def from_raw_chunks(type_num, chunks):
 376         """Creates an object of the indicated type from the raw chunks given.
 377
 378         :param type_num: The numeric type of the object.
 379         :param chunks: An iterable of the raw uncompressed contents.
 380         """
 381         obj = object_class(type_num)()
 382         obj.set_raw_chunks(chunks)
 383         return obj
 384
 385     @classmethod
 386     def from_string(cls, string):
 387         """Create a ShaFile from a string."""
 388         obj = cls()
 389         obj.set_raw_string(string)
 390         return obj
 391
 392     def _check_has_member(self, member, error_msg):
 393         """Check that the object has a given member variable.
 394
 395         :param member: the member variable to check for
 396         :param error_msg: the message for an error if the member is missing
 397         :raise ObjectFormatException: with the given error_msg if member is
 398             missing or is None
 399         """
 400         if getattr(self, member, None) is None:
 401             raise ObjectFormatException(error_msg)
 402
 403     def check(self):
 404         """Check this object for internal consistency.
 405
 406         :raise ObjectFormatException: if the object is malformed in some way
 407         :raise ChecksumMismatch: if the object was created with a SHA that does
 408             not match its contents
 409         """
 410         # TODO: if we find that error-checking during object parsing is a
 411         # performance bottleneck, those checks should be moved to the class's
 412         # check() method during optimization so we can still check the object
 413         # when necessary.
 414         old_sha = self.id
 415         try:
 416             self._deserialize(self.as_raw_chunks())
 417             self._sha = None
 418             new_sha = self.id
 419         except Exception, e:
 420             raise ObjectFormatException(e)
 421         if old_sha != new_sha:
 422             raise ChecksumMismatch(new_sha, old_sha)
 423
 424     def _header(self):
 425         return object_header(self.type, self.raw_length())
 426
 427     def raw_length(self):
 428         """Returns the length of the raw string of this object."""
 429         ret = 0
 430         for chunk in self.as_raw_chunks():
 431             ret += len(chunk)
 432         return ret
 433
 434     def _make_sha(self):
 435         ret = make_sha()
 436         ret.update(self._header())
 437         for chunk in self.as_raw_chunks():
 438             ret.update(chunk)
 439         return ret
 440
 441     def sha(self):
 442         """The SHA1 object that is the name of this object."""
 443         if self._sha is None or self._needs_serialization:
 444             # this is a local because as_raw_chunks() overwrites self._sha
 445             new_sha = make_sha()
 446             new_sha.update(self._header())
 447             for chunk in self.as_raw_chunks():
 448                 new_sha.update(chunk)
 449             self._sha = new_sha
 450         return self._sha
 451
 452     @property
 453     def id(self):
 454         return self.sha().hexdigest()
 455
 456     def get_type(self):
 457         return self.type_num
 458
 459     def set_type(self, type):
 460         self.type_num = type
 461
 462     # DEPRECATED: use type_num or type_name as needed.
 463     type = property(get_type, set_type)
 464
 465     def __repr__(self):
 466         return "<%s %s>" % (self.__class__.__name__, self.id)
 467
 468     def __ne__(self, other):
 469         return self.id != other.id
 470
 471     def __eq__(self, other):
 472         """Return true if the sha of the two objects match.
 473
 474         The __le__ etc methods aren't overriden as they make no sense,
 475         certainly at this level.
 476         """
 477         return self.id == other.id
 478
 479
 480 class Blob(ShaFile):
 481     """A Git Blob object."""
 482
 483     __slots__ = ()
 484
 485     type_name = 'blob'
 486     type_num = 3
 487
 488     def __init__(self):
 489         super(Blob, self).__init__()
 490         self._chunked_text = []
 491         self._needs_parsing = False
 492         self._needs_serialization = False
 493
 494     def _get_data(self):
 495         return self.as_raw_string()
 496
 497     def _set_data(self, data):
 498         self.set_raw_string(data)
 499
 500     data = property(_get_data, _set_data,
 501                     "The text contained within the blob object.")
 502
 503     def _get_chunked(self):
 504         self._ensure_parsed()
 505         return self._chunked_text
 506
 507     def _set_chunked(self, chunks):
 508         self._chunked_text = chunks
 509
 510     def _serialize(self):
 511         if not self._chunked_text:
 512             self._ensure_parsed()
 513         self._needs_serialization = False
 514         return self._chunked_text
 515
 516     def _deserialize(self, chunks):
 517         self._chunked_text = chunks
 518
 519     chunked = property(_get_chunked, _set_chunked,
 520         "The text within the blob object, as chunks (not necessarily lines).")
 521
 522     @classmethod
 523     def from_path(cls, path):
 524         blob = ShaFile.from_path(path)
 525         if not isinstance(blob, cls):
 526             raise NotBlobError(path)
 527         return blob
 528
 529     def check(self):
 530         """Check this object for internal consistency.
 531
 532         :raise ObjectFormatException: if the object is malformed in some way
 533         """
 534         super(Blob, self).check()
 535
 536
 537 def _parse_tag_or_commit(text):
 538     """Parse tag or commit text.
 539
 540     :param text: the raw text of the tag or commit object.
 541     :return: iterator of tuples of (field, value), one per header line, in the
 542         order read from the text, possibly including duplicates. Includes a
 543         field named None for the freeform tag/commit text.
 544     """
 545     f = StringIO(text)
 546     for l in f:
 547         l = l.rstrip("\n")
 548         if l == "":
 549             # Empty line indicates end of headers
 550             break
 551         yield l.split(" ", 1)
 552     yield (None, f.read())
 553     f.close()
 554
 555
 556 def parse_tag(text):
 557     return _parse_tag_or_commit(text)
 558
 559
 560 class Tag(ShaFile):
 561     """A Git Tag object."""
 562
 563     type_name = 'tag'
 564     type_num = 4
 565
 566     __slots__ = ('_tag_timezone_neg_utc', '_name', '_object_sha',
 567                  '_object_class', '_tag_time', '_tag_timezone',
 568                  '_tagger', '_message')
 569
 570     def __init__(self):
 571         super(Tag, self).__init__()
 572         self._tag_timezone_neg_utc = False
 573
 574     @classmethod
 575     def from_path(cls, filename):
 576         tag = ShaFile.from_path(filename)
 577         if not isinstance(tag, cls):
 578             raise NotTagError(filename)
 579         return tag
 580
 581     def check(self):
 582         """Check this object for internal consistency.
 583
 584         :raise ObjectFormatException: if the object is malformed in some way
 585         """
 586         super(Tag, self).check()
 587         self._check_has_member("_object_sha", "missing object sha")
 588         self._check_has_member("_object_class", "missing object type")
 589         self._check_has_member("_name", "missing tag name")
 590
 591         if not self._name:
 592             raise ObjectFormatException("empty tag name")
 593
 594         check_hexsha(self._object_sha, "invalid object sha")
 595
 596         if getattr(self, "_tagger", None):
 597             check_identity(self._tagger, "invalid tagger")
 598
 599         last = None
 600         for field, _ in parse_tag("".join(self._chunked_text)):
 601             if field == _OBJECT_HEADER and last is not None:
 602                 raise ObjectFormatException("unexpected object")
 603             elif field == _TYPE_HEADER and last != _OBJECT_HEADER:
 604                 raise ObjectFormatException("unexpected type")
 605             elif field == _TAG_HEADER and last != _TYPE_HEADER:
 606                 raise ObjectFormatException("unexpected tag name")
 607             elif field == _TAGGER_HEADER and last != _TAG_HEADER:
 608                 raise ObjectFormatException("unexpected tagger")
 609             last = field
 610
 611     def _serialize(self):
 612         chunks = []
 613         chunks.append("%s %s\n" % (_OBJECT_HEADER, self._object_sha))
 614         chunks.append("%s %s\n" % (_TYPE_HEADER, self._object_class.type_name))
 615         chunks.append("%s %s\n" % (_TAG_HEADER, self._name))
 616         if self._tagger:
 617             if self._tag_time is None:
 618                 chunks.append("%s %s\n" % (_TAGGER_HEADER, self._tagger))
 619             else:
 620                 chunks.append("%s %s %d %s\n" % (
 621                   _TAGGER_HEADER, self._tagger, self._tag_time,
 622                   format_timezone(self._tag_timezone,
 623                     self._tag_timezone_neg_utc)))
 624         chunks.append("\n") # To close headers
 625         chunks.append(self._message)
 626         return chunks
 627
 628     def _deserialize(self, chunks):
 629         """Grab the metadata attached to the tag"""
 630         self._tagger = None
 631         for field, value in parse_tag("".join(chunks)):
 632             if field == _OBJECT_HEADER:
 633                 self._object_sha = value
 634             elif field == _TYPE_HEADER:
 635                 obj_class = object_class(value)
 636                 if not obj_class:
 637                     raise ObjectFormatException("Not a known type: %s" % value)
 638                 self._object_class = obj_class
 639             elif field == _TAG_HEADER:
 640                 self._name = value
 641             elif field == _TAGGER_HEADER:
 642                 try:
 643                     sep = value.index("> ")
 644                 except ValueError:
 645                     self._tagger = value
 646                     self._tag_time = None
 647                     self._tag_timezone = None
 648                     self._tag_timezone_neg_utc = False
 649                 else:
 650                     self._tagger = value[0:sep+1]
 651                     try:
 652                         (timetext, timezonetext) = value[sep+2:].rsplit(" ", 1)
 653                         self._tag_time = int(timetext)
 654                         self._tag_timezone, self._tag_timezone_neg_utc = \
 655                                 parse_timezone(timezonetext)
 656                     except ValueError, e:
 657                         raise ObjectFormatException(e)
 658             elif field is None:
 659                 self._message = value
 660             else:
 661                 raise ObjectFormatException("Unknown field %s" % field)
 662
 663     def _get_object(self):
 664         """Get the object pointed to by this tag.
 665
 666         :return: tuple of (object class, sha).
 667         """
 668         self._ensure_parsed()
 669         return (self._object_class, self._object_sha)
 670
 671     def _set_object(self, value):
 672         self._ensure_parsed()
 673         (self._object_class, self._object_sha) = value
 674         self._needs_serialization = True
 675
 676     object = property(_get_object, _set_object)
 677
 678     name = serializable_property("name", "The name of this tag")
 679     tagger = serializable_property("tagger",
 680         "Returns the name of the person who created this tag")
 681     tag_time = serializable_property("tag_time",
 682         "The creation timestamp of the tag.  As the number of seconds since the epoch")
 683     tag_timezone = serializable_property("tag_timezone",
 684         "The timezone that tag_time is in.")
 685     message = serializable_property("message", "The message attached to this tag")
 686
 687
 688 def parse_tree(text):
 689     """Parse a tree text.
 690
 691     :param text: Serialized text to parse
 692     :return: iterator of tuples of (name, mode, sha)
 693     """
 694     count = 0
 695     l = len(text)
 696     while count < l:
 697         mode_end = text.index(' ', count)
 698         mode_text = text[count:mode_end]
 699         assert mode_text[0] != '0'
 700         try:
 701             mode = int(mode_text, 8)
 702         except ValueError:
 703             raise ObjectFormatException("Invalid mode '%s'" % mode_text)
 704         name_end = text.index('\0', mode_end)
 705         name = text[mode_end+1:name_end]
 706         count = name_end+21
 707         sha = text[name_end+1:count]
 708         if len(sha) != 20:
 709             raise ObjectFormatException("Sha has invalid length")
 710         hexsha = sha_to_hex(sha)
 711         yield (name, mode, hexsha)
 712
 713
 714 def serialize_tree(items):
 715     """Serialize the items in a tree to a text.
 716
 717     :param items: Sorted iterable over (name, mode, sha) tuples
 718     :return: Serialized tree text as chunks
 719     """
 720     for name, mode, hexsha in items:
 721         yield "%04o %s\0%s" % (mode, name, hex_to_sha(hexsha))
 722
 723
 724 def sorted_tree_items(entries):
 725     """Iterate over a tree entries dictionary in the order in which
 726     the items would be serialized.
 727
 728     :param entries: Dictionary mapping names to (mode, sha) tuples
 729     :return: Iterator over (name, mode, hexsha)
 730     """
 731     for name, entry in sorted(entries.iteritems(), cmp=cmp_entry):
 732         mode, hexsha = entry
 733         # Stricter type checks than normal to mirror checks in the C version.
 734         mode = int(mode)
 735         if not isinstance(hexsha, str):
 736             raise TypeError('Expected a string for SHA, got %r' % hexsha)
 737         yield TreeEntry(name, mode, hexsha)
 738
 739
 740 def cmp_entry((name1, value1), (name2, value2)):
 741     """Compare two tree entries."""
 742     if stat.S_ISDIR(value1[0]):
 743         name1 += "/"
 744     if stat.S_ISDIR(value2[0]):
 745         name2 += "/"
 746     return cmp(name1, name2)
 747
 748
 749 class Tree(ShaFile):
 750     """A Git tree object"""
 751
 752     type_name = 'tree'
 753     type_num = 2
 754
 755     __slots__ = ('_entries')
 756
 757     def __init__(self):
 758         super(Tree, self).__init__()
 759         self._entries = {}
 760
 761     @classmethod
 762     def from_path(cls, filename):
 763         tree = ShaFile.from_path(filename)
 764         if not isinstance(tree, cls):
 765             raise NotTreeError(filename)
 766         return tree
 767
 768     def __contains__(self, name):
 769         self._ensure_parsed()
 770         return name in self._entries
 771
 772     def __getitem__(self, name):
 773         self._ensure_parsed()
 774         return self._entries[name]
 775
 776     def __setitem__(self, name, value):
 777         """Set a tree entry by name.
 778
 779         :param name: The name of the entry, as a string.
 780         :param value: A tuple of (mode, hexsha), where mode is the mode of the
 781             entry as an integral type and hexsha is the hex SHA of the entry as
 782             a string.
 783         """
 784         mode, hexsha = value
 785         self._ensure_parsed()
 786         self._entries[name] = (mode, hexsha)
 787         self._needs_serialization = True
 788
 789     def __delitem__(self, name):
 790         self._ensure_parsed()
 791         del self._entries[name]
 792         self._needs_serialization = True
 793
 794     def __len__(self):
 795         self._ensure_parsed()
 796         return len(self._entries)
 797
 798     def __iter__(self):
 799         self._ensure_parsed()
 800         return iter(self._entries)
 801
 802     def add(self, mode, name, hexsha):
 803         """Add an entry to the tree.
 804
 805         :param mode: The mode of the entry as an integral type. Not all possible
 806             modes are supported by git; see check() for details.
 807         :param name: The name of the entry, as a string.
 808         :param hexsha: The hex SHA of the entry as a string.
 809         """
 810         self._ensure_parsed()
 811         self._entries[name] = mode, hexsha
 812         self._needs_serialization = True
 813
 814     def entries(self):
 815         """Return a list of tuples describing the tree entries.
 816
 817         :note: The order of the tuples that are returned is different from that
 818             returned by the items and iteritems methods. This function will be
 819             deprecated in the future.
 820         """
 821         self._ensure_parsed()
 822         # The order of this is different from iteritems() for historical
 823         # reasons
 824         return [
 825             (mode, name, hexsha) for (name, mode, hexsha) in self.iteritems()]
 826
 827     def iteritems(self):
 828         """Iterate over entries in the order in which they would be serialized.
 829
 830         :return: Iterator over (name, mode, sha) tuples
 831         """
 832         self._ensure_parsed()
 833         return sorted_tree_items(self._entries)
 834
 835     def items(self):
 836         """Return the sorted entries in this tree.
 837
 838         :return: List with (name, mode, sha) tuples
 839         """
 840         return list(self.iteritems())
 841
 842     def _deserialize(self, chunks):
 843         """Grab the entries in the tree"""
 844         try:
 845             parsed_entries = parse_tree("".join(chunks))
 846         except ValueError, e:
 847             raise ObjectFormatException(e)
 848         # TODO: list comprehension is for efficiency in the common (small) case;
 849         # if memory efficiency in the large case is a concern, use a genexp.
 850         self._entries = dict([(n, (m, s)) for n, m, s in parsed_entries])
 851
 852     def check(self):
 853         """Check this object for internal consistency.
 854
 855         :raise ObjectFormatException: if the object is malformed in some way
 856         """
 857         super(Tree, self).check()
 858         last = None
 859         allowed_modes = (stat.S_IFREG | 0755, stat.S_IFREG | 0644,
 860                          stat.S_IFLNK, stat.S_IFDIR, S_IFGITLINK,
 861                          # TODO: optionally exclude as in git fsck --strict
 862                          stat.S_IFREG | 0664)
 863         for name, mode, sha in parse_tree("".join(self._chunked_text)):
 864             check_hexsha(sha, 'invalid sha %s' % sha)
 865             if '/' in name or name in ('', '.', '..'):
 866                 raise ObjectFormatException('invalid name %s' % name)
 867
 868             if mode not in allowed_modes:
 869                 raise ObjectFormatException('invalid mode %06o' % mode)
 870
 871             entry = (name, (mode, sha))
 872             if last:
 873                 if cmp_entry(last, entry) > 0:
 874                     raise ObjectFormatException('entries not sorted')
 875                 if name == last[0]:
 876                     raise ObjectFormatException('duplicate entry %s' % name)
 877             last = entry
 878
 879     def _serialize(self):
 880         return list(serialize_tree(self.iteritems()))
 881
 882     def as_pretty_string(self):
 883         text = []
 884         for name, mode, hexsha in self.iteritems():
 885             if mode & stat.S_IFDIR:
 886                 kind = "tree"
 887             else:
 888                 kind = "blob"
 889             text.append("%04o %s %s\t%s\n" % (mode, kind, hexsha, name))
 890         return "".join(text)
 891
 892
 893 def parse_timezone(text):
 894     """Parse a timezone text fragment (e.g. '+0100').
 895
 896     :param text: Text to parse.
 897     :return: Tuple with timezone as seconds difference to UTC
 898         and a boolean indicating whether this was a UTC timezone
 899         prefixed with a negative sign (-0000).
 900     """
 901     offset = int(text)
 902     negative_utc = (offset == 0 and text[0] == '-')
 903     signum = (offset < 0) and -1 or 1
 904     offset = abs(offset)
 905     hours = int(offset / 100)
 906     minutes = (offset % 100)
 907     return signum * (hours * 3600 + minutes * 60), negative_utc
 908
 909
 910 def format_timezone(offset, negative_utc=False):
 911     """Format a timezone for Git serialization.
 912
 913     :param offset: Timezone offset as seconds difference to UTC
 914     :param negative_utc: Whether to use a minus sign for UTC
 915         (-0000 rather than +0000).
 916     """
 917     if offset % 60 != 0:
 918         raise ValueError("Unable to handle non-minute offset.")
 919     if offset < 0 or (offset == 0 and negative_utc):
 920         sign = '-'
 921     else:
 922         sign = '+'
 923     offset = abs(offset)
 924     return '%c%02d%02d' % (sign, offset / 3600, (offset / 60) % 60)
 925
 926
 927 def parse_commit(text):
 928     return _parse_tag_or_commit(text)
 929
 930
 931 class Commit(ShaFile):
 932     """A git commit object"""
 933
 934     type_name = 'commit'
 935     type_num = 1
 936
 937     __slots__ = ('_parents', '_encoding', '_extra', '_author_timezone_neg_utc',
 938                  '_commit_timezone_neg_utc', '_commit_time',
 939                  '_author_time', '_author_timezone', '_commit_timezone',
 940                  '_author', '_committer', '_parents', '_extra',
 941                  '_encoding', '_tree', '_message')
 942
 943     def __init__(self):
 944         super(Commit, self).__init__()
 945         self._parents = []
 946         self._encoding = None
 947         self._extra = {}
 948         self._author_timezone_neg_utc = False
 949         self._commit_timezone_neg_utc = False
 950
 951     @classmethod
 952     def from_path(cls, path):
 953         commit = ShaFile.from_path(path)
 954         if not isinstance(commit, cls):
 955             raise NotCommitError(path)
 956         return commit
 957
 958     def _deserialize(self, chunks):
 959         self._parents = []
 960         self._extra = []
 961         self._author = None
 962         for field, value in parse_commit("".join(self._chunked_text)):
 963             if field == _TREE_HEADER:
 964                 self._tree = value
 965             elif field == _PARENT_HEADER:
 966                 self._parents.append(value)
 967             elif field == _AUTHOR_HEADER:
 968                 self._author, timetext, timezonetext = value.rsplit(" ", 2)
 969                 self._author_time = int(timetext)
 970                 self._author_timezone, self._author_timezone_neg_utc =\
 971                     parse_timezone(timezonetext)
 972             elif field == _COMMITTER_HEADER:
 973                 self._committer, timetext, timezonetext = value.rsplit(" ", 2)
 974                 self._commit_time = int(timetext)
 975                 self._commit_timezone, self._commit_timezone_neg_utc =\
 976                     parse_timezone(timezonetext)
 977             elif field == _ENCODING_HEADER:
 978                 self._encoding = value
 979             elif field is None:
 980                 self._message = value
 981             else:
 982                 self._extra.append((field, value))
 983
 984     def check(self):
 985         """Check this object for internal consistency.
 986
 987         :raise ObjectFormatException: if the object is malformed in some way
 988         """
 989         super(Commit, self).check()
 990         self._check_has_member("_tree", "missing tree")
 991         self._check_has_member("_author", "missing author")
 992         self._check_has_member("_committer", "missing committer")
 993         # times are currently checked when set
 994
 995         for parent in self._parents:
 996             check_hexsha(parent, "invalid parent sha")
 997         check_hexsha(self._tree, "invalid tree sha")
 998
 999         check_identity(self._author, "invalid author")
1000         check_identity(self._committer, "invalid committer")
1001
1002         last = None
1003         for field, _ in parse_commit("".join(self._chunked_text)):
1004             if field == _TREE_HEADER and last is not None:
1005                 raise ObjectFormatException("unexpected tree")
1006             elif field == _PARENT_HEADER and last not in (_PARENT_HEADER,
1007                                                           _TREE_HEADER):
1008                 raise ObjectFormatException("unexpected parent")
1009             elif field == _AUTHOR_HEADER and last not in (_TREE_HEADER,
1010                                                           _PARENT_HEADER):
1011                 raise ObjectFormatException("unexpected author")
1012             elif field == _COMMITTER_HEADER and last != _AUTHOR_HEADER:
1013                 raise ObjectFormatException("unexpected committer")
1014             elif field == _ENCODING_HEADER and last != _COMMITTER_HEADER:
1015                 raise ObjectFormatException("unexpected encoding")
1016             last = field
1017
1018         # TODO: optionally check for duplicate parents
1019
1020     def _serialize(self):
1021         chunks = []
1022         chunks.append("%s %s\n" % (_TREE_HEADER, self._tree))
1023         for p in self._parents:
1024             chunks.append("%s %s\n" % (_PARENT_HEADER, p))
1025         chunks.append("%s %s %s %s\n" % (
1026           _AUTHOR_HEADER, self._author, str(self._author_time),
1027           format_timezone(self._author_timezone,
1028                           self._author_timezone_neg_utc)))
1029         chunks.append("%s %s %s %s\n" % (
1030           _COMMITTER_HEADER, self._committer, str(self._commit_time),
1031           format_timezone(self._commit_timezone,
1032                           self._commit_timezone_neg_utc)))
1033         if self.encoding:
1034             chunks.append("%s %s\n" % (_ENCODING_HEADER, self.encoding))
1035         for k, v in self.extra:
1036             if "\n" in k or "\n" in v:
1037                 raise AssertionError("newline in extra data: %r -> %r" % (k, v))
1038             chunks.append("%s %s\n" % (k, v))
1039         chunks.append("\n") # There must be a new line after the headers
1040         chunks.append(self._message)
1041         return chunks
1042
1043     tree = serializable_property("tree", "Tree that is the state of this commit")
1044
1045     def _get_parents(self):
1046         """Return a list of parents of this commit."""
1047         self._ensure_parsed()
1048         return self._parents
1049
1050     def _set_parents(self, value):
1051         """Set a list of parents of this commit."""
1052         self._ensure_parsed()
1053         self._needs_serialization = True
1054         self._parents = value
1055
1056     parents = property(_get_parents, _set_parents)
1057
1058     def _get_extra(self):
1059         """Return extra settings of this commit."""
1060         self._ensure_parsed()
1061         return self._extra
1062
1063     extra = property(_get_extra)
1064
1065     author = serializable_property("author",
1066         "The name of the author of the commit")
1067
1068     committer = serializable_property("committer",
1069         "The name of the committer of the commit")
1070
1071     message = serializable_property("message",
1072         "The commit message")
1073
1074     commit_time = serializable_property("commit_time",
1075         "The timestamp of the commit. As the number of seconds since the epoch.")
1076
1077     commit_timezone = serializable_property("commit_timezone",
1078         "The zone the commit time is in")
1079
1080     author_time = serializable_property("author_time",
1081         "The timestamp the commit was written. as the number of seconds since the epoch.")
1082
1083     author_timezone = serializable_property("author_timezone",
1084         "Returns the zone the author time is in.")
1085
1086     encoding = serializable_property("encoding",
1087         "Encoding of the commit message.")
1088
1089
# The object classes that can be looked up through _TYPE_MAP.
OBJECT_CLASSES = (Commit, Tree, Blob, Tag)

# Maps both the type name and the type number of each class to the class.
_TYPE_MAP = {}

for cls in OBJECT_CLASSES:
    _TYPE_MAP.update({cls.type_name: cls, cls.type_num: cls})
1102
1103
1104
# Hold on to the pure-python implementations for testing: the public names
# may be rebound by the import below, so keep them reachable under private
# aliases first.
_parse_tree_py = parse_tree
_sorted_tree_items_py = sorted_tree_items
try:
    # Try to import C versions; on success they replace the module-level
    # parse_tree and sorted_tree_items.
    from dulwich._objects import parse_tree, sorted_tree_items
except ImportError:
    # dulwich._objects could not be imported: keep the pure-python versions.
    pass