dulwich/objects.py

   1 # objects.py -- Access to base git objects
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
   4 #
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; version 2
   8 # of the License or (at your option) a later version of the License.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 # MA  02110-1301, USA.
  19
  20
  21 """Access to base git objects."""
  22
  23
  24 from cStringIO import (
  25     StringIO,
  26     )
  27 import mmap
  28 import os
  29 import stat
  30 import time
  31 import zlib
  32
  33 from dulwich.errors import (
  34     NotBlobError,
  35     NotCommitError,
  36     NotTreeError,
  37     )
  38 from dulwich.misc import (
  39     make_sha,
  40     )
  41
# Textual names of the git object types.  They are used as the `_type` of
# the object classes below and as the keys of `type_map`.  TREE_ID and TAG_ID
# double as header field names inside commit and tag objects respectively.
BLOB_ID = "blob"
TAG_ID = "tag"
TREE_ID = "tree"
COMMIT_ID = "commit"
# Header field names read and written by Commit.
PARENT_ID = "parent"
AUTHOR_ID = "author"
COMMITTER_ID = "committer"
# Header field names read and written by Tag.
OBJECT_ID = "object"
TYPE_ID = "type"
TAGGER_ID = "tagger"
# Optional header field read and written by Commit.
ENCODING_ID = "encoding"
  53
  54 S_IFGITLINK     = 0160000
  55 def S_ISGITLINK(m):
  56     return (stat.S_IFMT(m) == S_IFGITLINK)
  57
  58 def _decompress(string):
  59     dcomp = zlib.decompressobj()
  60     dcomped = dcomp.decompress(string)
  61     dcomped += dcomp.flush()
  62     return dcomped
  63
  64
  65 def sha_to_hex(sha):
  66     """Takes a string and returns the hex of the sha within"""
  67     hexsha = "".join(["%02x" % ord(c) for c in sha])
  68     assert len(hexsha) == 40, "Incorrect length of sha1 string: %d" % hexsha
  69     return hexsha
  70
  71
  72 def hex_to_sha(hex):
  73     """Takes a hex sha and returns a binary sha"""
  74     assert len(hex) == 40, "Incorrent length of hexsha: %s" % hex
  75     return ''.join([chr(int(hex[i:i+2], 16)) for i in xrange(0, len(hex), 2)])
  76
  77
  78 def serializable_property(name, docstring=None):
  79     def set(obj, value):
  80         obj._ensure_parsed()
  81         setattr(obj, "_"+name, value)
  82         obj._needs_serialization = True
  83     def get(obj):
  84         obj._ensure_parsed()
  85         return getattr(obj, "_"+name)
  86     return property(get, set, doc=docstring)
  87
  88
  89 class ShaFile(object):
  90     """A git SHA file."""
  91
  92     @classmethod
  93     def _parse_legacy_object(cls, map):
  94         """Parse a legacy object, creating it and setting object._text"""
  95         text = _decompress(map)
  96         object = None
  97         for posstype in type_map.keys():
  98             if text.startswith(posstype):
  99                 object = type_map[posstype]()
 100                 text = text[len(posstype):]
 101                 break
 102         assert object is not None, "%s is not a known object type" % text[:9]
 103         assert text[0] == ' ', "%s is not a space" % text[0]
 104         text = text[1:]
 105         size = 0
 106         i = 0
 107         while text[0] >= '0' and text[0] <= '9':
 108             if i > 0 and size == 0:
 109                 raise AssertionError("Size is not in canonical format")
 110             size = (size * 10) + int(text[0])
 111             text = text[1:]
 112             i += 1
 113         object._size = size
 114         assert text[0] == "\0", "Size not followed by null"
 115         text = text[1:]
 116         object.set_raw_string(text)
 117         return object
 118
 119     def as_legacy_object(self):
 120         text = self.as_raw_string()
 121         return zlib.compress("%s %d\0%s" % (self._type, len(text), text))
 122
 123     def as_raw_string(self):
 124         if self._needs_serialization:
 125             self.serialize()
 126         return self._text
 127
 128     def __str__(self):
 129         return self.as_raw_string()
 130
 131     def __hash__(self):
 132         return hash(self.id)
 133
 134     def as_pretty_string(self):
 135         return self.as_raw_string()
 136
 137     def _ensure_parsed(self):
 138         if self._needs_parsing:
 139             self._parse_text()
 140
 141     def set_raw_string(self, text):
 142         if type(text) != str:
 143             raise TypeError(text)
 144         self._text = text
 145         self._sha = None
 146         self._needs_parsing = True
 147         self._needs_serialization = False
 148
 149     @classmethod
 150     def _parse_object(cls, map):
 151         """Parse a new style object , creating it and setting object._text"""
 152         used = 0
 153         byte = ord(map[used])
 154         used += 1
 155         num_type = (byte >> 4) & 7
 156         try:
 157             object = num_type_map[num_type]()
 158         except KeyError:
 159             raise AssertionError("Not a known type: %d" % num_type)
 160         while (byte & 0x80) != 0:
 161             byte = ord(map[used])
 162             used += 1
 163         raw = map[used:]
 164         object.set_raw_string(_decompress(raw))
 165         return object
 166
 167     @classmethod
 168     def _parse_file(cls, map):
 169         word = (ord(map[0]) << 8) + ord(map[1])
 170         if ord(map[0]) == 0x78 and (word % 31) == 0:
 171             return cls._parse_legacy_object(map)
 172         else:
 173             return cls._parse_object(map)
 174
 175     def __init__(self):
 176         """Don't call this directly"""
 177         self._sha = None
 178
 179     def _parse_text(self):
 180         """For subclasses to do initialisation time parsing"""
 181
 182     @classmethod
 183     def from_file(cls, filename):
 184         """Get the contents of a SHA file on disk"""
 185         size = os.path.getsize(filename)
 186         f = open(filename, 'rb')
 187         try:
 188             map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
 189             shafile = cls._parse_file(map)
 190             return shafile
 191         finally:
 192             f.close()
 193
 194     @classmethod
 195     def from_raw_string(cls, type, string):
 196         """Creates an object of the indicated type from the raw string given.
 197
 198         Type is the numeric type of an object. String is the raw uncompressed
 199         contents.
 200         """
 201         real_class = num_type_map[type]
 202         obj = real_class()
 203         obj.type = type
 204         obj.set_raw_string(string)
 205         return obj
 206
 207     def _header(self):
 208         return "%s %lu\0" % (self._type, len(self.as_raw_string()))
 209
 210     def sha(self):
 211         """The SHA1 object that is the name of this object."""
 212         if self._needs_serialization or self._sha is None:
 213             self._sha = make_sha()
 214             self._sha.update(self._header())
 215             self._sha.update(self.as_raw_string())
 216         return self._sha
 217
 218     @property
 219     def id(self):
 220         return self.sha().hexdigest()
 221
 222     def get_type(self):
 223         return self._num_type
 224
 225     def set_type(self, type):
 226         self._num_type = type
 227
 228     type = property(get_type, set_type)
 229
 230     def __repr__(self):
 231         return "<%s %s>" % (self.__class__.__name__, self.id)
 232
 233     def __ne__(self, other):
 234         return self.id != other.id
 235
 236     def __eq__(self, other):
 237         """Return true id the sha of the two objects match.
 238
 239         The __le__ etc methods aren't overriden as they make no sense,
 240         certainly at this level.
 241         """
 242         return self.id == other.id
 243
 244
 245 class Blob(ShaFile):
 246     """A Git Blob object."""
 247
 248     _type = BLOB_ID
 249     _num_type = 3
 250     _needs_serialization = False
 251     _needs_parsing = False
 252
 253     def get_data(self):
 254         return self._text
 255
 256     def set_data(self, data):
 257         self._text = data
 258
 259     data = property(get_data, set_data,
 260             "The text contained within the blob object.")
 261
 262     @classmethod
 263     def from_file(cls, filename):
 264         blob = ShaFile.from_file(filename)
 265         if blob._type != cls._type:
 266             raise NotBlobError(filename)
 267         return blob
 268
 269     @classmethod
 270     def from_string(cls, string):
 271         """Create a blob from a string."""
 272         shafile = cls()
 273         shafile.set_raw_string(string)
 274         return shafile
 275
 276
 277 class Tag(ShaFile):
 278     """A Git Tag object."""
 279
 280     _type = TAG_ID
 281     _num_type = 4
 282
 283     def __init__(self):
 284         super(Tag, self).__init__()
 285         self._needs_parsing = False
 286         self._needs_serialization = True
 287
 288     @classmethod
 289     def from_file(cls, filename):
 290         blob = ShaFile.from_file(filename)
 291         if blob._type != cls._type:
 292             raise NotBlobError(filename)
 293         return blob
 294
 295     @classmethod
 296     def from_string(cls, string):
 297         """Create a blob from a string."""
 298         shafile = cls()
 299         shafile.set_raw_string(string)
 300         return shafile
 301
 302     def serialize(self):
 303         f = StringIO()
 304         f.write("%s %s\n" % (OBJECT_ID, self._object_sha))
 305         f.write("%s %s\n" % (TYPE_ID, num_type_map[self._object_type]._type))
 306         f.write("%s %s\n" % (TAG_ID, self._name))
 307         if self._tagger:
 308             f.write("%s %s %d %s\n" % (TAGGER_ID, self._tagger, self._tag_time, format_timezone(self._tag_timezone)))
 309         f.write("\n") # To close headers
 310         f.write(self._message)
 311         self._text = f.getvalue()
 312         self._needs_serialization = False
 313
 314     def _parse_text(self):
 315         """Grab the metadata attached to the tag"""
 316         self._tagger = None
 317         f = StringIO(self._text)
 318         for l in f:
 319             l = l.rstrip("\n")
 320             if l == "":
 321                 break # empty line indicates end of headers
 322             (field, value) = l.split(" ", 1)
 323             if field == OBJECT_ID:
 324                 self._object_sha = value
 325             elif field == TYPE_ID:
 326                 self._object_type = type_map[value]
 327             elif field == TAG_ID:
 328                 self._name = value
 329             elif field == TAGGER_ID:
 330                 sep = value.index("> ")
 331                 self._tagger = value[0:sep+1]
 332                 (timetext, timezonetext) = value[sep+2:].rsplit(" ", 1)
 333                 try:
 334                     self._tag_time = int(timetext)
 335                 except ValueError: #Not a unix timestamp
 336                     self._tag_time = time.strptime(timetext)
 337                 self._tag_timezone = parse_timezone(timezonetext)
 338             else:
 339                 raise AssertionError("Unknown field %s" % field)
 340         self._message = f.read()
 341         self._needs_parsing = False
 342
 343     def get_object(self):
 344         """Returns the object pointed by this tag, represented as a tuple(type, sha)"""
 345         self._ensure_parsed()
 346         return (self._object_type, self._object_sha)
 347
 348     def set_object(self, value):
 349         self._ensure_parsed()
 350         (self._object_type, self._object_sha) = value
 351         self._needs_serialization = True
 352
 353     object = property(get_object, set_object)
 354
 355     name = serializable_property("name", "The name of this tag")
 356     tagger = serializable_property("tagger",
 357         "Returns the name of the person who created this tag")
 358     tag_time = serializable_property("tag_time",
 359         "The creation timestamp of the tag.  As the number of seconds since the epoch")
 360     tag_timezone = serializable_property("tag_timezone",
 361         "The timezone that tag_time is in.")
 362     message = serializable_property("message", "The message attached to this tag")
 363
 364
 365 def parse_tree(text):
 366     ret = {}
 367     count = 0
 368     while count < len(text):
 369         mode = 0
 370         chr = text[count]
 371         while chr != ' ':
 372             assert chr >= '0' and chr <= '7', "%s is not a valid mode char" % chr
 373             mode = (mode << 3) + (ord(chr) - ord('0'))
 374             count += 1
 375             chr = text[count]
 376         count += 1
 377         chr = text[count]
 378         name = ''
 379         while chr != '\0':
 380             name += chr
 381             count += 1
 382             chr = text[count]
 383         count += 1
 384         chr = text[count]
 385         sha = text[count:count+20]
 386         hexsha = sha_to_hex(sha)
 387         ret[name] = (mode, hexsha)
 388         count = count + 20
 389     return ret
 390
 391
 392 class Tree(ShaFile):
 393     """A Git tree object"""
 394
 395     _type = TREE_ID
 396     _num_type = 2
 397
 398     def __init__(self):
 399         super(Tree, self).__init__()
 400         self._entries = {}
 401         self._needs_parsing = False
 402         self._needs_serialization = True
 403
 404     @classmethod
 405     def from_file(cls, filename):
 406         tree = ShaFile.from_file(filename)
 407         if tree._type != cls._type:
 408             raise NotTreeError(filename)
 409         return tree
 410
 411     def __contains__(self, name):
 412         self._ensure_parsed()
 413         return name in self._entries
 414
 415     def __getitem__(self, name):
 416         self._ensure_parsed()
 417         return self._entries[name]
 418
 419     def __setitem__(self, name, value):
 420         assert isinstance(value, tuple)
 421         assert len(value) == 2
 422         self._ensure_parsed()
 423         self._entries[name] = value
 424         self._needs_serialization = True
 425
 426     def __delitem__(self, name):
 427         self._ensure_parsed()
 428         del self._entries[name]
 429         self._needs_serialization = True
 430
 431     def __len__(self):
 432         return len(self._entries)
 433
 434     def add(self, mode, name, hexsha):
 435         assert type(mode) == int
 436         assert type(name) == str
 437         assert type(hexsha) == str
 438         self._ensure_parsed()
 439         self._entries[name] = mode, hexsha
 440         self._needs_serialization = True
 441
 442     def entries(self):
 443         """Return a list of tuples describing the tree entries"""
 444         self._ensure_parsed()
 445         # The order of this is different from iteritems() for historical reasons
 446         return [(mode, name, hexsha) for (name, mode, hexsha) in self.iteritems()]
 447
 448     def iteritems(self):
 449         def cmp_entry((name1, value1), (name2, value2)):
 450             if stat.S_ISDIR(value1[0]):
 451                 name1 += "/"
 452             if stat.S_ISDIR(value2[0]):
 453                 name2 += "/"
 454             return cmp(name1, name2)
 455         self._ensure_parsed()
 456         for name, entry in sorted(self._entries.iteritems(), cmp=cmp_entry):
 457             yield name, entry[0], entry[1]
 458
 459     def _parse_text(self):
 460         """Grab the entries in the tree"""
 461         self._entries = parse_tree(self._text)
 462         self._needs_parsing = False
 463
 464     def serialize(self):
 465         f = StringIO()
 466         for name, mode, hexsha in self.iteritems():
 467             f.write("%04o %s\0%s" % (mode, name, hex_to_sha(hexsha)))
 468         self._text = f.getvalue()
 469         self._needs_serialization = False
 470
 471     def as_pretty_string(self):
 472         text = ""
 473         for name, mode, hexsha in self.iteritems():
 474             if mode & stat.S_IFDIR:
 475                 kind = "tree"
 476             else:
 477                 kind = "blob"
 478             text += "%04o %s %s\t%s\n" % (mode, kind, hexsha, name)
 479         return text
 480
 481
 482 def parse_timezone(text):
 483     offset = int(text)
 484     signum = (offset < 0) and -1 or 1
 485     offset = abs(offset)
 486     hours = int(offset / 100)
 487     minutes = (offset % 100)
 488     return signum * (hours * 3600 + minutes * 60)
 489
 490
 491 def format_timezone(offset):
 492     if offset % 60 != 0:
 493         raise ValueError("Unable to handle non-minute offset.")
 494     sign = (offset < 0) and '-' or '+'
 495     offset = abs(offset)
 496     return '%c%02d%02d' % (sign, offset / 3600, (offset / 60) % 60)
 497
 498
 499 class Commit(ShaFile):
 500     """A git commit object"""
 501
 502     _type = COMMIT_ID
 503     _num_type = 1
 504
 505     def __init__(self):
 506         super(Commit, self).__init__()
 507         self._parents = []
 508         self._encoding = None
 509         self._needs_parsing = False
 510         self._needs_serialization = True
 511
 512     @classmethod
 513     def from_file(cls, filename):
 514         commit = ShaFile.from_file(filename)
 515         if commit._type != cls._type:
 516             raise NotCommitError(filename)
 517         return commit
 518
 519     def _parse_text(self):
 520         self._parents = []
 521         self._author = None
 522         f = StringIO(self._text)
 523         for l in f:
 524             l = l.rstrip("\n")
 525             if l == "":
 526                 # Empty line indicates end of headers
 527                 break
 528             (field, value) = l.split(" ", 1)
 529             if field == TREE_ID:
 530                 self._tree = value
 531             elif field == PARENT_ID:
 532                 self._parents.append(value)
 533             elif field == AUTHOR_ID:
 534                 self._author, timetext, timezonetext = value.rsplit(" ", 2)
 535                 self._author_time = int(timetext)
 536                 self._author_timezone = parse_timezone(timezonetext)
 537             elif field == COMMITTER_ID:
 538                 self._committer, timetext, timezonetext = value.rsplit(" ", 2)
 539                 self._commit_time = int(timetext)
 540                 self._commit_timezone = parse_timezone(timezonetext)
 541             elif field == ENCODING_ID:
 542                 self._encoding = value
 543             else:
 544                 raise AssertionError("Unknown field %s" % field)
 545         self._message = f.read()
 546         self._needs_parsing = False
 547
 548     def serialize(self):
 549         f = StringIO()
 550         f.write("%s %s\n" % (TREE_ID, self._tree))
 551         for p in self._parents:
 552             f.write("%s %s\n" % (PARENT_ID, p))
 553         f.write("%s %s %s %s\n" % (AUTHOR_ID, self._author, str(self._author_time), format_timezone(self._author_timezone)))
 554         f.write("%s %s %s %s\n" % (COMMITTER_ID, self._committer, str(self._commit_time), format_timezone(self._commit_timezone)))
 555         if self.encoding:
 556             f.write("%s %s\n" % (ENCODING_ID, self.encoding))
 557         f.write("\n") # There must be a new line after the headers
 558         f.write(self._message)
 559         self._text = f.getvalue()
 560         self._needs_serialization = False
 561
 562     tree = serializable_property("tree", "Tree that is the state of this commit")
 563
 564     def get_parents(self):
 565         """Return a list of parents of this commit."""
 566         self._ensure_parsed()
 567         return self._parents
 568
 569     def set_parents(self, value):
 570         """Return a list of parents of this commit."""
 571         self._ensure_parsed()
 572         self._needs_serialization = True
 573         self._parents = value
 574
 575     parents = property(get_parents, set_parents)
 576
 577     author = serializable_property("author",
 578         "The name of the author of the commit")
 579
 580     committer = serializable_property("committer",
 581         "The name of the committer of the commit")
 582
 583     message = serializable_property("message",
 584         "The commit message")
 585
 586     commit_time = serializable_property("commit_time",
 587         "The timestamp of the commit. As the number of seconds since the epoch.")
 588
 589     commit_timezone = serializable_property("commit_timezone",
 590         "The zone the commit time is in")
 591
 592     author_time = serializable_property("author_time",
 593         "The timestamp the commit was written. as the number of seconds since the epoch.")
 594
 595     author_timezone = serializable_property("author_timezone",
 596         "Returns the zone the author time is in.")
 597
 598     encoding = serializable_property("encoding",
 599         "Encoding of the commit message.")
 600
 601
# Maps the textual name of an object type to the class implementing it.
# Used when parsing legacy objects and the "type" header of tags.
type_map = {
    BLOB_ID : Blob,
    TREE_ID : Tree,
    COMMIT_ID : Commit,
    TAG_ID: Tag,
}

# Maps the numeric type of an object to the class implementing it.  The
# numbers match the `_num_type` of each class; 0 has no class.
num_type_map = {
    0: None,
    1: Commit,
    2: Tree,
    3: Blob,
    4: Tag,
    # 5 is reserved for further expansion
}
 617
# When the optional dulwich._objects module can be imported, its
# hex_to_sha, sha_to_hex and parse_tree replace the Python implementations
# of the same names defined earlier in this module; otherwise the Python
# versions stay in use.
try:
    # Try to import C versions
    from dulwich._objects import hex_to_sha, sha_to_hex, parse_tree
except ImportError:
    pass
 623