dulwich/objects.py

   1 # objects.py -- Access to base git objects
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version of the License.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20
  21 """Access to base git objects."""
  22
  23
  24 from cStringIO import (
  25     StringIO,
  26     )
  27 import mmap
  28 import os
  29 import stat
  30 import time
  31 import zlib
  32
  33 from dulwich.errors import (
  34     NotBlobError,
  35     NotCommitError,
  36     NotTreeError,
  37     )
  38 from dulwich.misc import (
  39     make_sha,
  40     )
  41
# Textual names of the four object types.  Each class below uses one as its
# ``_type``; that name is the first word of the legacy object header and the
# key into ``type_map``.  TREE_ID and TAG_ID double as header field names
# inside commits ("tree <sha>") and tags ("tag <name>").
BLOB_ID = "blob"
TAG_ID = "tag"
TREE_ID = "tree"
COMMIT_ID = "commit"
# Header field names read and written by Commit and Tag in their
# _parse_text() / serialize() methods.
PARENT_ID = "parent"
AUTHOR_ID = "author"
COMMITTER_ID = "committer"
OBJECT_ID = "object"
TYPE_ID = "type"
TAGGER_ID = "tagger"
ENCODING_ID = "encoding"

# File-type bits that S_ISGITLINK() compares against stat.S_IFMT(mode).
S_IFGITLINK     = 0160000
  55 def S_ISGITLINK(m):
  56     return (stat.S_IFMT(m) == S_IFGITLINK)
  57
  58 def _decompress(string):
  59     dcomp = zlib.decompressobj()
  60     dcomped = dcomp.decompress(string)
  61     dcomped += dcomp.flush()
  62     return dcomped
  63
  64
  65 def sha_to_hex(sha):
  66     """Takes a string and returns the hex of the sha within"""
  67     hexsha = "".join(["%02x" % ord(c) for c in sha])
  68     assert len(hexsha) == 40, "Incorrect length of sha1 string: %d" % hexsha
  69     return hexsha
  70
  71
  72 def hex_to_sha(hex):
  73     """Takes a hex sha and returns a binary sha"""
  74     assert len(hex) == 40, "Incorrent length of hexsha: %s" % hex
  75     return ''.join([chr(int(hex[i:i+2], 16)) for i in xrange(0, len(hex), 2)])
  76
  77
  78 def serializable_property(name, docstring=None):
  79     def set(obj, value):
  80         obj._ensure_parsed()
  81         setattr(obj, "_"+name, value)
  82         obj._needs_serialization = True
  83     def get(obj):
  84         obj._ensure_parsed()
  85         return getattr(obj, "_"+name)
  86     return property(get, set, doc=docstring)
  87
  88
  89 class ShaFile(object):
  90     """A git SHA file."""
  91
  92     @classmethod
  93     def _parse_legacy_object(cls, map):
  94         """Parse a legacy object, creating it and setting object._text"""
  95         text = _decompress(map)
  96         object = None
  97         for posstype in type_map.keys():
  98             if text.startswith(posstype):
  99                 object = type_map[posstype]()
 100                 text = text[len(posstype):]
 101                 break
 102         assert object is not None, "%s is not a known object type" % text[:9]
 103         assert text[0] == ' ', "%s is not a space" % text[0]
 104         text = text[1:]
 105         size = 0
 106         i = 0
 107         while text[0] >= '0' and text[0] <= '9':
 108             if i > 0 and size == 0:
 109                 assert False, "Size is not in canonical format"
 110             size = (size * 10) + int(text[0])
 111             text = text[1:]
 112             i += 1
 113         object._size = size
 114         assert text[0] == "\0", "Size not followed by null"
 115         text = text[1:]
 116         object.set_raw_string(text)
 117         return object
 118
 119     def as_legacy_object(self):
 120         text = self.as_raw_string()
 121         return zlib.compress("%s %d\0%s" % (self._type, len(text), text))
 122
 123     def as_raw_string(self):
 124         if self._needs_serialization:
 125             self.serialize()
 126         return self._text
 127
 128     def __str__(self):
 129         return self.as_raw_string()
 130
 131     def as_pretty_string(self):
 132         return self.as_raw_string()
 133
 134     def _ensure_parsed(self):
 135         if self._needs_parsing:
 136             self._parse_text()
 137
 138     def set_raw_string(self, text):
 139         if type(text) != str:
 140             raise TypeError(text)
 141         self._text = text
 142         self._sha = None
 143         self._needs_parsing = True
 144         self._needs_serialization = False
 145
 146     @classmethod
 147     def _parse_object(cls, map):
 148         """Parse a new style object , creating it and setting object._text"""
 149         used = 0
 150         byte = ord(map[used])
 151         used += 1
 152         num_type = (byte >> 4) & 7
 153         try:
 154             object = num_type_map[num_type]()
 155         except KeyError:
 156             raise AssertionError("Not a known type: %d" % num_type)
 157         while (byte & 0x80) != 0:
 158             byte = ord(map[used])
 159             used += 1
 160         raw = map[used:]
 161         object.set_raw_string(_decompress(raw))
 162         return object
 163
 164     @classmethod
 165     def _parse_file(cls, map):
 166         word = (ord(map[0]) << 8) + ord(map[1])
 167         if ord(map[0]) == 0x78 and (word % 31) == 0:
 168             return cls._parse_legacy_object(map)
 169         else:
 170             return cls._parse_object(map)
 171
 172     def __init__(self):
 173         """Don't call this directly"""
 174         self._sha = None
 175
 176     def _parse_text(self):
 177         """For subclasses to do initialisation time parsing"""
 178
 179     @classmethod
 180     def from_file(cls, filename):
 181         """Get the contents of a SHA file on disk"""
 182         size = os.path.getsize(filename)
 183         f = open(filename, 'rb')
 184         try:
 185             map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
 186             shafile = cls._parse_file(map)
 187             return shafile
 188         finally:
 189             f.close()
 190
 191     @classmethod
 192     def from_raw_string(cls, type, string):
 193         """Creates an object of the indicated type from the raw string given.
 194
 195         Type is the numeric type of an object. String is the raw uncompressed
 196         contents.
 197         """
 198         real_class = num_type_map[type]
 199         obj = real_class()
 200         obj.type = type
 201         obj.set_raw_string(string)
 202         return obj
 203
 204     def _header(self):
 205         return "%s %lu\0" % (self._type, len(self.as_raw_string()))
 206
 207     def sha(self):
 208         """The SHA1 object that is the name of this object."""
 209         if self._needs_serialization or self._sha is None:
 210             self._sha = make_sha()
 211             self._sha.update(self._header())
 212             self._sha.update(self.as_raw_string())
 213         return self._sha
 214
 215     @property
 216     def id(self):
 217         return self.sha().hexdigest()
 218
 219     def get_type(self):
 220         return self._num_type
 221
 222     def set_type(self, type):
 223         self._num_type = type
 224
 225     type = property(get_type, set_type)
 226
 227     def __repr__(self):
 228         return "<%s %s>" % (self.__class__.__name__, self.id)
 229
 230     def __ne__(self, other):
 231         return self.id != other.id
 232
 233     def __eq__(self, other):
 234         """Return true id the sha of the two objects match.
 235
 236         The __le__ etc methods aren't overriden as they make no sense,
 237         certainly at this level.
 238         """
 239         return self.id == other.id
 240
 241
 242 class Blob(ShaFile):
 243     """A Git Blob object."""
 244
 245     _type = BLOB_ID
 246     _num_type = 3
 247     _needs_serialization = False
 248     _needs_parsing = False
 249
 250     def get_data(self):
 251         return self._text
 252
 253     def set_data(self, data):
 254         self._text = data
 255
 256     data = property(get_data, set_data,
 257             "The text contained within the blob object.")
 258
 259     @classmethod
 260     def from_file(cls, filename):
 261         blob = ShaFile.from_file(filename)
 262         if blob._type != cls._type:
 263             raise NotBlobError(filename)
 264         return blob
 265
 266     @classmethod
 267     def from_string(cls, string):
 268         """Create a blob from a string."""
 269         shafile = cls()
 270         shafile.set_raw_string(string)
 271         return shafile
 272
 273
 274 class Tag(ShaFile):
 275     """A Git Tag object."""
 276
 277     _type = TAG_ID
 278     _num_type = 4
 279
 280     def __init__(self):
 281         super(Tag, self).__init__()
 282         self._needs_parsing = False
 283         self._needs_serialization = True
 284
 285     @classmethod
 286     def from_file(cls, filename):
 287         blob = ShaFile.from_file(filename)
 288         if blob._type != cls._type:
 289             raise NotBlobError(filename)
 290         return blob
 291
 292     @classmethod
 293     def from_string(cls, string):
 294         """Create a blob from a string."""
 295         shafile = cls()
 296         shafile.set_raw_string(string)
 297         return shafile
 298
 299     def serialize(self):
 300         f = StringIO()
 301         f.write("%s %s\n" % (OBJECT_ID, self._object_sha))
 302         f.write("%s %s\n" % (TYPE_ID, num_type_map[self._object_type]._type))
 303         f.write("%s %s\n" % (TAG_ID, self._name))
 304         if self._tagger:
 305             f.write("%s %s %d %s\n" % (TAGGER_ID, self._tagger, self._tag_time, format_timezone(self._tag_timezone)))
 306         f.write("\n") # To close headers
 307         f.write(self._message)
 308         self._text = f.getvalue()
 309         self._needs_serialization = False
 310
 311     def _parse_text(self):
 312         """Grab the metadata attached to the tag"""
 313         self._tagger = None
 314         f = StringIO(self._text)
 315         for l in f:
 316             l = l.rstrip("\n")
 317             if l == "":
 318                 break # empty line indicates end of headers
 319             (field, value) = l.split(" ", 1)
 320             if field == OBJECT_ID:
 321                 self._object_sha = value
 322             elif field == TYPE_ID:
 323                 self._object_type = type_map[value]
 324             elif field == TAG_ID:
 325                 self._name = value
 326             elif field == TAGGER_ID:
 327                 sep = value.index("> ")
 328                 self._tagger = value[0:sep+1]
 329                 (timetext, timezonetext) = value[sep+2:].rsplit(" ", 1)
 330                 try:
 331                     self._tag_time = int(timetext)
 332                 except ValueError: #Not a unix timestamp
 333                     self._tag_time = time.strptime(timetext)
 334                 self._tag_timezone = parse_timezone(timezonetext)
 335             else:
 336                 raise AssertionError("Unknown field %s" % field)
 337         self._message = f.read()
 338         self._needs_parsing = False
 339
 340     def get_object(self):
 341         """Returns the object pointed by this tag, represented as a tuple(type, sha)"""
 342         self._ensure_parsed()
 343         return (self._object_type, self._object_sha)
 344
 345     def set_object(self, value):
 346         self._ensure_parsed()
 347         (self._object_type, self._object_sha) = value
 348         self._needs_serialization = True
 349
 350     object = property(get_object, set_object)
 351
 352     name = serializable_property("name", "The name of this tag")
 353     tagger = serializable_property("tagger",
 354         "Returns the name of the person who created this tag")
 355     tag_time = serializable_property("tag_time",
 356         "The creation timestamp of the tag.  As the number of seconds since the epoch")
 357     tag_timezone = serializable_property("tag_timezone",
 358         "The timezone that tag_time is in.")
 359     message = serializable_property("message", "The message attached to this tag")
 360
 361
 362 def parse_tree(text):
 363     ret = {}
 364     count = 0
 365     while count < len(text):
 366         mode = 0
 367         chr = text[count]
 368         while chr != ' ':
 369             assert chr >= '0' and chr <= '7', "%s is not a valid mode char" % chr
 370             mode = (mode << 3) + (ord(chr) - ord('0'))
 371             count += 1
 372             chr = text[count]
 373         count += 1
 374         chr = text[count]
 375         name = ''
 376         while chr != '\0':
 377             name += chr
 378             count += 1
 379             chr = text[count]
 380         count += 1
 381         chr = text[count]
 382         sha = text[count:count+20]
 383         hexsha = sha_to_hex(sha)
 384         ret[name] = (mode, hexsha)
 385         count = count + 20
 386     return ret
 387
 388
 389 class Tree(ShaFile):
 390     """A Git tree object"""
 391
 392     _type = TREE_ID
 393     _num_type = 2
 394
 395     def __init__(self):
 396         super(Tree, self).__init__()
 397         self._entries = {}
 398         self._needs_parsing = False
 399         self._needs_serialization = True
 400
 401     @classmethod
 402     def from_file(cls, filename):
 403         tree = ShaFile.from_file(filename)
 404         if tree._type != cls._type:
 405             raise NotTreeError(filename)
 406         return tree
 407
 408     def __contains__(self, name):
 409         self._ensure_parsed()
 410         return name in self._entries
 411
 412     def __getitem__(self, name):
 413         self._ensure_parsed()
 414         return self._entries[name]
 415
 416     def __setitem__(self, name, value):
 417         assert isinstance(value, tuple)
 418         assert len(value) == 2
 419         self._ensure_parsed()
 420         self._entries[name] = value
 421         self._needs_serialization = True
 422
 423     def __delitem__(self, name):
 424         self._ensure_parsed()
 425         del self._entries[name]
 426         self._needs_serialization = True
 427
 428     def add(self, mode, name, hexsha):
 429         assert type(mode) == int
 430         assert type(name) == str
 431         assert type(hexsha) == str
 432         self._ensure_parsed()
 433         self._entries[name] = mode, hexsha
 434         self._needs_serialization = True
 435
 436     def entries(self):
 437         """Return a list of tuples describing the tree entries"""
 438         self._ensure_parsed()
 439         # The order of this is different from iteritems() for historical reasons
 440         return [(mode, name, hexsha) for (name, mode, hexsha) in self.iteritems()]
 441
 442     def iteritems(self):
 443         def cmp_entry((name1, value1), (name2, value2)):
 444             if stat.S_ISDIR(value1[0]):
 445                 name1 += "/"
 446             if stat.S_ISDIR(value2[0]):
 447                 name2 += "/"
 448             return cmp(name1, name2)
 449         self._ensure_parsed()
 450         for name, entry in sorted(self._entries.iteritems(), cmp=cmp_entry):
 451             yield name, entry[0], entry[1]
 452
 453     def _parse_text(self):
 454         """Grab the entries in the tree"""
 455         self._entries = parse_tree(self._text)
 456         self._needs_parsing = False
 457
 458     def serialize(self):
 459         f = StringIO()
 460         for name, mode, hexsha in self.iteritems():
 461             f.write("%04o %s\0%s" % (mode, name, hex_to_sha(hexsha)))
 462         self._text = f.getvalue()
 463         self._needs_serialization = False
 464
 465     def as_pretty_string(self):
 466         text = ""
 467         for name, mode, hexsha in self.iteritems():
 468             if mode & stat.S_IFDIR:
 469                 kind = "tree"
 470             else:
 471                 kind = "blob"
 472             text += "%04o %s %s\t%s\n" % (mode, kind, hexsha, name)
 473         return text
 474
 475
 476 def parse_timezone(text):
 477     offset = int(text)
 478     signum = (offset < 0) and -1 or 1
 479     offset = abs(offset)
 480     hours = int(offset / 100)
 481     minutes = (offset % 100)
 482     return signum * (hours * 3600 + minutes * 60)
 483
 484
 485 def format_timezone(offset):
 486     if offset % 60 != 0:
 487         raise ValueError("Unable to handle non-minute offset.")
 488     sign = (offset < 0) and '-' or '+'
 489     offset = abs(offset)
 490     return '%c%02d%02d' % (sign, offset / 3600, (offset / 60) % 60)
 491
 492
 493 class Commit(ShaFile):
 494     """A git commit object"""
 495
 496     _type = COMMIT_ID
 497     _num_type = 1
 498
 499     def __init__(self):
 500         super(Commit, self).__init__()
 501         self._parents = []
 502         self._encoding = None
 503         self._needs_parsing = False
 504         self._needs_serialization = True
 505
 506     @classmethod
 507     def from_file(cls, filename):
 508         commit = ShaFile.from_file(filename)
 509         if commit._type != cls._type:
 510             raise NotCommitError(filename)
 511         return commit
 512
 513     def _parse_text(self):
 514         self._parents = []
 515         self._author = None
 516         f = StringIO(self._text)
 517         for l in f:
 518             l = l.rstrip("\n")
 519             if l == "":
 520                 # Empty line indicates end of headers
 521                 break
 522             (field, value) = l.split(" ", 1)
 523             if field == TREE_ID:
 524                 self._tree = value
 525             elif field == PARENT_ID:
 526                 self._parents.append(value)
 527             elif field == AUTHOR_ID:
 528                 self._author, timetext, timezonetext = value.rsplit(" ", 2)
 529                 self._author_time = int(timetext)
 530                 self._author_timezone = parse_timezone(timezonetext)
 531             elif field == COMMITTER_ID:
 532                 self._committer, timetext, timezonetext = value.rsplit(" ", 2)
 533                 self._commit_time = int(timetext)
 534                 self._commit_timezone = parse_timezone(timezonetext)
 535             elif field == ENCODING_ID:
 536                 self._encoding = value
 537             else:
 538                 raise AssertionError("Unknown field %s" % field)
 539         self._message = f.read()
 540         self._needs_parsing = False
 541
 542     def serialize(self):
 543         f = StringIO()
 544         f.write("%s %s\n" % (TREE_ID, self._tree))
 545         for p in self._parents:
 546             f.write("%s %s\n" % (PARENT_ID, p))
 547         f.write("%s %s %s %s\n" % (AUTHOR_ID, self._author, str(self._author_time), format_timezone(self._author_timezone)))
 548         f.write("%s %s %s %s\n" % (COMMITTER_ID, self._committer, str(self._commit_time), format_timezone(self._commit_timezone)))
 549         if self.encoding:
 550             f.write("%s %s\n" % (ENCODING_ID, self.encoding))
 551         f.write("\n") # There must be a new line after the headers
 552         f.write(self._message)
 553         self._text = f.getvalue()
 554         self._needs_serialization = False
 555
 556     tree = serializable_property("tree", "Tree that is the state of this commit")
 557
 558     def get_parents(self):
 559         """Return a list of parents of this commit."""
 560         self._ensure_parsed()
 561         return self._parents
 562
 563     def set_parents(self, value):
 564         """Return a list of parents of this commit."""
 565         self._ensure_parsed()
 566         self._needs_serialization = True
 567         self._parents = value
 568
 569     parents = property(get_parents, set_parents)
 570
 571     author = serializable_property("author",
 572         "The name of the author of the commit")
 573
 574     committer = serializable_property("committer",
 575         "The name of the committer of the commit")
 576
 577     message = serializable_property("message",
 578         "The commit message")
 579
 580     commit_time = serializable_property("commit_time",
 581         "The timestamp of the commit. As the number of seconds since the epoch.")
 582
 583     commit_timezone = serializable_property("commit_timezone",
 584         "The zone the commit time is in")
 585
 586     author_time = serializable_property("author_time",
 587         "The timestamp the commit was written. as the number of seconds since the epoch.")
 588
 589     author_timezone = serializable_property("author_timezone",
 590         "Returns the zone the author time is in.")
 591
 592     encoding = serializable_property("encoding",
 593         "Encoding of the commit message.")
 594
 595
# Maps a textual type name (as found in a legacy object header or in the
# "type" field of a tag) to the class implementing that object type.
type_map = {
    BLOB_ID : Blob,
    TREE_ID : Tree,
    COMMIT_ID : Commit,
    TAG_ID: Tag,
}

# Maps a numeric object type (the value each class carries as ``_num_type``)
# to the class implementing it.  Entry 0 has no class.
num_type_map = {
    0: None,
    1: Commit,
    2: Tree,
    3: Blob,
    4: Tag,
    # 5 is reserved for further expansion
}
 611
 612 try:
 613     # Try to import C versions
 614     from dulwich._objects import hex_to_sha, sha_to_hex, parse_tree
 615 except ImportError:
 616     pass
 617