dulwich/objects.py

   1 # objects.py -- Access to base git objects
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version of the License.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20
  21 """Access to base git objects."""
  22
  23
  24 import binascii
  25 from cStringIO import (
  26     StringIO,
  27     )
  28 import mmap
  29 import os
  30 import stat
  31 import time
  32 import zlib
  33
  34 from dulwich.errors import (
  35     NotBlobError,
  36     NotCommitError,
  37     NotTreeError,
  38     )
  39 from dulwich.file import GitFile
  40 from dulwich.misc import (
  41     make_sha,
  42     )
  43
# Names of the four object types; each class below uses one as its ``_type``.
BLOB_ID = "blob"
TAG_ID = "tag"
TREE_ID = "tree"
COMMIT_ID = "commit"
# Field names that start the header lines of serialized commits and tags.
PARENT_ID = "parent"
AUTHOR_ID = "author"
COMMITTER_ID = "committer"
OBJECT_ID = "object"
TYPE_ID = "type"
TAGGER_ID = "tagger"
ENCODING_ID = "encoding"

# File-type bits that S_ISGITLINK() compares a mode against.
S_IFGITLINK     = 0160000
  57 def S_ISGITLINK(m):
  58     return (stat.S_IFMT(m) == S_IFGITLINK)
  59
  60 def _decompress(string):
  61     dcomp = zlib.decompressobj()
  62     dcomped = dcomp.decompress(string)
  63     dcomped += dcomp.flush()
  64     return dcomped
  65
  66
  67 def sha_to_hex(sha):
  68     """Takes a string and returns the hex of the sha within"""
  69     hexsha = binascii.hexlify(sha)
  70     assert len(hexsha) == 40, "Incorrect length of sha1 string: %d" % hexsha
  71     return hexsha
  72
  73
  74 def hex_to_sha(hex):
  75     """Takes a hex sha and returns a binary sha"""
  76     assert len(hex) == 40, "Incorrent length of hexsha: %s" % hex
  77     return binascii.unhexlify(hex)
  78
  79
  80 def serializable_property(name, docstring=None):
  81     def set(obj, value):
  82         obj._ensure_parsed()
  83         setattr(obj, "_"+name, value)
  84         obj._needs_serialization = True
  85     def get(obj):
  86         obj._ensure_parsed()
  87         return getattr(obj, "_"+name)
  88     return property(get, set, doc=docstring)
  89
  90
  91 class ShaFile(object):
  92     """A git SHA file."""
  93
  94     @classmethod
  95     def _parse_legacy_object(cls, map):
  96         """Parse a legacy object, creating it and setting object._text"""
  97         text = _decompress(map)
  98         object = None
  99         for posstype in type_map.keys():
 100             if text.startswith(posstype):
 101                 object = type_map[posstype]()
 102                 text = text[len(posstype):]
 103                 break
 104         assert object is not None, "%s is not a known object type" % text[:9]
 105         assert text[0] == ' ', "%s is not a space" % text[0]
 106         text = text[1:]
 107         size = 0
 108         i = 0
 109         while text[0] >= '0' and text[0] <= '9':
 110             if i > 0 and size == 0:
 111                 raise AssertionError("Size is not in canonical format")
 112             size = (size * 10) + int(text[0])
 113             text = text[1:]
 114             i += 1
 115         object._size = size
 116         assert text[0] == "\0", "Size not followed by null"
 117         text = text[1:]
 118         object.set_raw_string(text)
 119         return object
 120
 121     def as_legacy_object(self):
 122         text = self.as_raw_string()
 123         return zlib.compress("%s %d\0%s" % (self._type, len(text), text))
 124
 125     def as_raw_string(self):
 126         if self._needs_serialization:
 127             self.serialize()
 128         return self._text
 129
 130     def __str__(self):
 131         return self.as_raw_string()
 132
 133     def __hash__(self):
 134         return hash(self.id)
 135
 136     def as_pretty_string(self):
 137         return self.as_raw_string()
 138
 139     def _ensure_parsed(self):
 140         if self._needs_parsing:
 141             self._parse_text()
 142
 143     def set_raw_string(self, text):
 144         if type(text) != str:
 145             raise TypeError(text)
 146         self._text = text
 147         self._sha = None
 148         self._needs_parsing = True
 149         self._needs_serialization = False
 150
 151     @classmethod
 152     def _parse_object(cls, map):
 153         """Parse a new style object , creating it and setting object._text"""
 154         used = 0
 155         byte = ord(map[used])
 156         used += 1
 157         num_type = (byte >> 4) & 7
 158         try:
 159             object = num_type_map[num_type]()
 160         except KeyError:
 161             raise AssertionError("Not a known type: %d" % num_type)
 162         while (byte & 0x80) != 0:
 163             byte = ord(map[used])
 164             used += 1
 165         raw = map[used:]
 166         object.set_raw_string(_decompress(raw))
 167         return object
 168
 169     @classmethod
 170     def _parse_file(cls, map):
 171         word = (ord(map[0]) << 8) + ord(map[1])
 172         if ord(map[0]) == 0x78 and (word % 31) == 0:
 173             return cls._parse_legacy_object(map)
 174         else:
 175             return cls._parse_object(map)
 176
 177     def __init__(self):
 178         """Don't call this directly"""
 179         self._sha = None
 180
 181     def _parse_text(self):
 182         """For subclasses to do initialisation time parsing"""
 183
 184     @classmethod
 185     def from_file(cls, filename):
 186         """Get the contents of a SHA file on disk"""
 187         size = os.path.getsize(filename)
 188         f = GitFile(filename, 'rb')
 189         try:
 190             map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
 191             shafile = cls._parse_file(map)
 192             return shafile
 193         finally:
 194             f.close()
 195
 196     @classmethod
 197     def from_raw_string(cls, type, string):
 198         """Creates an object of the indicated type from the raw string given.
 199
 200         Type is the numeric type of an object. String is the raw uncompressed
 201         contents.
 202         """
 203         real_class = num_type_map[type]
 204         obj = real_class()
 205         obj.type = type
 206         obj.set_raw_string(string)
 207         return obj
 208
 209     @classmethod
 210     def from_string(cls, string):
 211         """Create a blob from a string."""
 212         shafile = cls()
 213         shafile.set_raw_string(string)
 214         return shafile
 215
 216     def _header(self):
 217         return "%s %lu\0" % (self._type, len(self.as_raw_string()))
 218
 219     def sha(self):
 220         """The SHA1 object that is the name of this object."""
 221         if self._needs_serialization or self._sha is None:
 222             self._sha = make_sha()
 223             self._sha.update(self._header())
 224             self._sha.update(self.as_raw_string())
 225         return self._sha
 226
 227     @property
 228     def id(self):
 229         return self.sha().hexdigest()
 230
 231     def get_type(self):
 232         return self._num_type
 233
 234     def set_type(self, type):
 235         self._num_type = type
 236
 237     type = property(get_type, set_type)
 238
 239     def __repr__(self):
 240         return "<%s %s>" % (self.__class__.__name__, self.id)
 241
 242     def __ne__(self, other):
 243         return self.id != other.id
 244
 245     def __eq__(self, other):
 246         """Return true id the sha of the two objects match.
 247
 248         The __le__ etc methods aren't overriden as they make no sense,
 249         certainly at this level.
 250         """
 251         return self.id == other.id
 252
 253
 254 class Blob(ShaFile):
 255     """A Git Blob object."""
 256
 257     _type = BLOB_ID
 258     _num_type = 3
 259     _needs_serialization = False
 260     _needs_parsing = False
 261
 262     def get_data(self):
 263         return self._text
 264
 265     def set_data(self, data):
 266         self._text = data
 267
 268     data = property(get_data, set_data,
 269             "The text contained within the blob object.")
 270
 271     @classmethod
 272     def from_file(cls, filename):
 273         blob = ShaFile.from_file(filename)
 274         if blob._type != cls._type:
 275             raise NotBlobError(filename)
 276         return blob
 277
 278
 279 class Tag(ShaFile):
 280     """A Git Tag object."""
 281
 282     _type = TAG_ID
 283     _num_type = 4
 284
 285     def __init__(self):
 286         super(Tag, self).__init__()
 287         self._needs_parsing = False
 288         self._needs_serialization = True
 289
 290     @classmethod
 291     def from_file(cls, filename):
 292         blob = ShaFile.from_file(filename)
 293         if blob._type != cls._type:
 294             raise NotBlobError(filename)
 295         return blob
 296
 297     @classmethod
 298     def from_string(cls, string):
 299         """Create a blob from a string."""
 300         shafile = cls()
 301         shafile.set_raw_string(string)
 302         return shafile
 303
 304     def serialize(self):
 305         f = StringIO()
 306         f.write("%s %s\n" % (OBJECT_ID, self._object_sha))
 307         f.write("%s %s\n" % (TYPE_ID, num_type_map[self._object_type]._type))
 308         f.write("%s %s\n" % (TAG_ID, self._name))
 309         if self._tagger:
 310             if self._tag_time is None:
 311                 f.write("%s %s\n" % (TAGGER_ID, self._tagger))
 312             else:
 313                 f.write("%s %s %d %s\n" % (TAGGER_ID, self._tagger, self._tag_time, format_timezone(self._tag_timezone)))
 314         f.write("\n") # To close headers
 315         f.write(self._message)
 316         self._text = f.getvalue()
 317         self._needs_serialization = False
 318
 319     def _parse_text(self):
 320         """Grab the metadata attached to the tag"""
 321         self._tagger = None
 322         f = StringIO(self._text)
 323         for l in f:
 324             l = l.rstrip("\n")
 325             if l == "":
 326                 break # empty line indicates end of headers
 327             (field, value) = l.split(" ", 1)
 328             if field == OBJECT_ID:
 329                 self._object_sha = value
 330             elif field == TYPE_ID:
 331                 self._object_type = type_map[value]
 332             elif field == TAG_ID:
 333                 self._name = value
 334             elif field == TAGGER_ID:
 335                 try:
 336                     sep = value.index("> ")
 337                 except ValueError:
 338                     self._tagger = value
 339                     self._tag_time = None
 340                     self._tag_timezone = None
 341                 else:
 342                     self._tagger = value[0:sep+1]
 343                     (timetext, timezonetext) = value[sep+2:].rsplit(" ", 1)
 344                     try:
 345                         self._tag_time = int(timetext)
 346                     except ValueError: #Not a unix timestamp
 347                         self._tag_time = time.strptime(timetext)
 348                     self._tag_timezone = parse_timezone(timezonetext)
 349             else:
 350                 raise AssertionError("Unknown field %s" % field)
 351         self._message = f.read()
 352         self._needs_parsing = False
 353
 354     def get_object(self):
 355         """Returns the object pointed by this tag, represented as a tuple(type, sha)"""
 356         self._ensure_parsed()
 357         return (self._object_type, self._object_sha)
 358
 359     def set_object(self, value):
 360         self._ensure_parsed()
 361         (self._object_type, self._object_sha) = value
 362         self._needs_serialization = True
 363
 364     object = property(get_object, set_object)
 365
 366     name = serializable_property("name", "The name of this tag")
 367     tagger = serializable_property("tagger",
 368         "Returns the name of the person who created this tag")
 369     tag_time = serializable_property("tag_time",
 370         "The creation timestamp of the tag.  As the number of seconds since the epoch")
 371     tag_timezone = serializable_property("tag_timezone",
 372         "The timezone that tag_time is in.")
 373     message = serializable_property("message", "The message attached to this tag")
 374
 375
 376 def parse_tree(text):
 377     ret = {}
 378     count = 0
 379     l = len(text)
 380     while count < l:
 381         mode_end = text.index(' ', count)
 382         mode = int(text[count:mode_end], 8)
 383
 384         name_end = text.index('\0', mode_end)
 385         name = text[mode_end+1:name_end]
 386
 387         count = name_end+21
 388
 389         sha = text[name_end+1:count]
 390
 391         ret[name] = (mode, sha_to_hex(sha))
 392
 393     return ret
 394
 395
 396 class Tree(ShaFile):
 397     """A Git tree object"""
 398
 399     _type = TREE_ID
 400     _num_type = 2
 401
 402     def __init__(self):
 403         super(Tree, self).__init__()
 404         self._entries = {}
 405         self._needs_parsing = False
 406         self._needs_serialization = True
 407
 408     @classmethod
 409     def from_file(cls, filename):
 410         tree = ShaFile.from_file(filename)
 411         if tree._type != cls._type:
 412             raise NotTreeError(filename)
 413         return tree
 414
 415     def __contains__(self, name):
 416         self._ensure_parsed()
 417         return name in self._entries
 418
 419     def __getitem__(self, name):
 420         self._ensure_parsed()
 421         return self._entries[name]
 422
 423     def __setitem__(self, name, value):
 424         assert isinstance(value, tuple)
 425         assert len(value) == 2
 426         self._ensure_parsed()
 427         self._entries[name] = value
 428         self._needs_serialization = True
 429
 430     def __delitem__(self, name):
 431         self._ensure_parsed()
 432         del self._entries[name]
 433         self._needs_serialization = True
 434
 435     def __len__(self):
 436         self._ensure_parsed()
 437         return len(self._entries)
 438
 439     def add(self, mode, name, hexsha):
 440         assert type(mode) == int
 441         assert type(name) == str
 442         assert type(hexsha) == str
 443         self._ensure_parsed()
 444         self._entries[name] = mode, hexsha
 445         self._needs_serialization = True
 446
 447     def entries(self):
 448         """Return a list of tuples describing the tree entries"""
 449         self._ensure_parsed()
 450         # The order of this is different from iteritems() for historical reasons
 451         return [(mode, name, hexsha) for (name, mode, hexsha) in self.iteritems()]
 452
 453     def iteritems(self):
 454         def cmp_entry((name1, value1), (name2, value2)):
 455             if stat.S_ISDIR(value1[0]):
 456                 name1 += "/"
 457             if stat.S_ISDIR(value2[0]):
 458                 name2 += "/"
 459             return cmp(name1, name2)
 460         self._ensure_parsed()
 461         for name, entry in sorted(self._entries.iteritems(), cmp=cmp_entry):
 462             yield name, entry[0], entry[1]
 463
 464     def _parse_text(self):
 465         """Grab the entries in the tree"""
 466         self._entries = parse_tree(self._text)
 467         self._needs_parsing = False
 468
 469     def serialize(self):
 470         f = StringIO()
 471         for name, mode, hexsha in self.iteritems():
 472             f.write("%04o %s\0%s" % (mode, name, hex_to_sha(hexsha)))
 473         self._text = f.getvalue()
 474         self._needs_serialization = False
 475
 476     def as_pretty_string(self):
 477         text = ""
 478         for name, mode, hexsha in self.iteritems():
 479             if mode & stat.S_IFDIR:
 480                 kind = "tree"
 481             else:
 482                 kind = "blob"
 483             text += "%04o %s %s\t%s\n" % (mode, kind, hexsha, name)
 484         return text
 485
 486
 487 def parse_timezone(text):
 488     offset = int(text)
 489     signum = (offset < 0) and -1 or 1
 490     offset = abs(offset)
 491     hours = int(offset / 100)
 492     minutes = (offset % 100)
 493     return signum * (hours * 3600 + minutes * 60)
 494
 495
 496 def format_timezone(offset):
 497     if offset % 60 != 0:
 498         raise ValueError("Unable to handle non-minute offset.")
 499     sign = (offset < 0) and '-' or '+'
 500     offset = abs(offset)
 501     return '%c%02d%02d' % (sign, offset / 3600, (offset / 60) % 60)
 502
 503
 504 class Commit(ShaFile):
 505     """A git commit object"""
 506
 507     _type = COMMIT_ID
 508     _num_type = 1
 509
 510     def __init__(self):
 511         super(Commit, self).__init__()
 512         self._parents = []
 513         self._encoding = None
 514         self._needs_parsing = False
 515         self._needs_serialization = True
 516         self._extra = {}
 517
 518     @classmethod
 519     def from_file(cls, filename):
 520         commit = ShaFile.from_file(filename)
 521         if commit._type != cls._type:
 522             raise NotCommitError(filename)
 523         return commit
 524
 525     def _parse_text(self):
 526         self._parents = []
 527         self._extra = []
 528         self._author = None
 529         f = StringIO(self._text)
 530         for l in f:
 531             l = l.rstrip("\n")
 532             if l == "":
 533                 # Empty line indicates end of headers
 534                 break
 535             (field, value) = l.split(" ", 1)
 536             if field == TREE_ID:
 537                 self._tree = value
 538             elif field == PARENT_ID:
 539                 self._parents.append(value)
 540             elif field == AUTHOR_ID:
 541                 self._author, timetext, timezonetext = value.rsplit(" ", 2)
 542                 self._author_time = int(timetext)
 543                 self._author_timezone = parse_timezone(timezonetext)
 544             elif field == COMMITTER_ID:
 545                 self._committer, timetext, timezonetext = value.rsplit(" ", 2)
 546                 self._commit_time = int(timetext)
 547                 self._commit_timezone = parse_timezone(timezonetext)
 548             elif field == ENCODING_ID:
 549                 self._encoding = value
 550             else:
 551                 self._extra.append((field, value))
 552         self._message = f.read()
 553         self._needs_parsing = False
 554
 555     def serialize(self):
 556         f = StringIO()
 557         f.write("%s %s\n" % (TREE_ID, self._tree))
 558         for p in self._parents:
 559             f.write("%s %s\n" % (PARENT_ID, p))
 560         f.write("%s %s %s %s\n" % (AUTHOR_ID, self._author, str(self._author_time), format_timezone(self._author_timezone)))
 561         f.write("%s %s %s %s\n" % (COMMITTER_ID, self._committer, str(self._commit_time), format_timezone(self._commit_timezone)))
 562         if self.encoding:
 563             f.write("%s %s\n" % (ENCODING_ID, self.encoding))
 564         for k, v in self.extra:
 565             if "\n" in k or "\n" in v:
 566                 raise AssertionError("newline in extra data: %r -> %r" % (k, v))
 567             f.write("%s %s\n" % (k, v))
 568         f.write("\n") # There must be a new line after the headers
 569         f.write(self._message)
 570         self._text = f.getvalue()
 571         self._needs_serialization = False
 572
 573     tree = serializable_property("tree", "Tree that is the state of this commit")
 574
 575     def get_parents(self):
 576         """Return a list of parents of this commit."""
 577         self._ensure_parsed()
 578         return self._parents
 579
 580     def set_parents(self, value):
 581         """Return a list of parents of this commit."""
 582         self._ensure_parsed()
 583         self._needs_serialization = True
 584         self._parents = value
 585
 586     parents = property(get_parents, set_parents)
 587
 588     def get_extra(self):
 589         """Return extra settings of this commit."""
 590         self._ensure_parsed()
 591         return self._extra
 592
 593     extra = property(get_extra)
 594
 595     author = serializable_property("author",
 596         "The name of the author of the commit")
 597
 598     committer = serializable_property("committer",
 599         "The name of the committer of the commit")
 600
 601     message = serializable_property("message",
 602         "The commit message")
 603
 604     commit_time = serializable_property("commit_time",
 605         "The timestamp of the commit. As the number of seconds since the epoch.")
 606
 607     commit_timezone = serializable_property("commit_timezone",
 608         "The zone the commit time is in")
 609
 610     author_time = serializable_property("author_time",
 611         "The timestamp the commit was written. as the number of seconds since the epoch.")
 612
 613     author_timezone = serializable_property("author_timezone",
 614         "Returns the zone the author time is in.")
 615
 616     encoding = serializable_property("encoding",
 617         "Encoding of the commit message.")
 618
 619
# Maps the type name found in a legacy object header to its class.
type_map = {
    BLOB_ID : Blob,
    TREE_ID : Tree,
    COMMIT_ID : Commit,
    TAG_ID: Tag,
}

# Maps a numeric object type to its class; type 0 has no class.
num_type_map = {
    0: None,
    1: Commit,
    2: Tree,
    3: Blob,
    4: Tag,
    # 5 Is reserved for further expansion
}
 635
try:
    # Try to import C versions
    # When the extension module is available, its parse_tree replaces the
    # Python implementation defined earlier in this module.
    from dulwich._objects import parse_tree
except ImportError:
    # No extension module: keep using the Python implementation.
    pass