dulwich/objects.py

# objects.py -- Access to base git objects
   2 # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
   3 # Copyright (C) 2008 Jelmer Vernooij <jelmer@samba.org>
   4 # The header parsing code is based on that from git itself, which is
   5 # Copyright (C) 2005 Linus Torvalds
   6 # and licensed under v2 of the GPL.
   7 #
   8 # This program is free software; you can redistribute it and/or
   9 # modify it under the terms of the GNU General Public License
  10 # as published by the Free Software Foundation; version 2
  11 # of the License or (at your option) a later version of the License.
  12 #
  13 # This program is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with this program; if not, write to the Free Software
  20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  21 # MA  02110-1301, USA.
  22
  23 import mmap
  24 import os
  25 import sha
  26 import zlib
  27
  28 from dulwich.errors import (
  29     NotBlobError,
  30     NotCommitError,
  31     NotTreeError,
  32     )
  33
# Names of the git object types. They are the keys of type_map and the
# values of each class's _type, which ShaFile._header() writes into the
# object header. TREE_ID and TAG_ID are also header keywords inside commit
# and tag objects respectively.
BLOB_ID = "blob"
TAG_ID = "tag"
TREE_ID = "tree"
COMMIT_ID = "commit"
# Header keywords read by Commit._parse_text().
PARENT_ID = "parent"
AUTHOR_ID = "author"
COMMITTER_ID = "committer"
# Header keywords read by Tag._parse_text().
OBJECT_ID = "object"
TYPE_ID = "type"
TAGGER_ID = "tagger"
  44
  45 def _decompress(string):
  46     dcomp = zlib.decompressobj()
  47     dcomped = dcomp.decompress(string)
  48     dcomped += dcomp.flush()
  49     return dcomped
  50
  51 def sha_to_hex(sha):
  52     """Takes a string and returns the hex of the sha within"""
  53     hexsha = ''
  54     for c in sha:
  55         hexsha += "%02x" % ord(c)
  56     assert len(hexsha) == 40, "Incorrect length of sha1 string: %d" % \
  57            len(hexsha)
  58     return hexsha
  59
  60 def hex_to_sha(hex):
  61     """Takes a hex sha and returns a binary sha"""
  62     sha = ''
  63     for i in range(0, len(hex), 2):
  64         sha += chr(int(hex[i:i+2], 16))
  65     assert len(sha) == 20, "Incorrent length of sha1: %d" % len(sha)
  66     return sha
  67
  68 class ShaFile(object):
  69     """A git SHA file."""
  70
  71     @classmethod
  72     def _parse_legacy_object(cls, map):
  73         """Parse a legacy object, creating it and setting object._text"""
  74         text = _decompress(map)
  75         object = None
  76         for posstype in type_map.keys():
  77             if text.startswith(posstype):
  78                 object = type_map[posstype]()
  79                 text = text[len(posstype):]
  80                 break
  81         assert object is not None, "%s is not a known object type" % text[:9]
  82         assert text[0] == ' ', "%s is not a space" % text[0]
  83         text = text[1:]
  84         size = 0
  85         i = 0
  86         while text[0] >= '0' and text[0] <= '9':
  87             if i > 0 and size == 0:
  88                 assert False, "Size is not in canonical format"
  89             size = (size * 10) + int(text[0])
  90             text = text[1:]
  91             i += 1
  92         object._size = size
  93         assert text[0] == "\0", "Size not followed by null"
  94         text = text[1:]
  95         object._text = text
  96         return object
  97
  98     def as_raw_string(self):
  99         return self._num_type, self._text
 100
 101     @classmethod
 102     def _parse_object(cls, map):
 103         """Parse a new style object , creating it and setting object._text"""
 104         used = 0
 105         byte = ord(map[used])
 106         used += 1
 107         num_type = (byte >> 4) & 7
 108         try:
 109             object = num_type_map[num_type]()
 110         except KeyError:
 111             assert False, "Not a known type: %d" % num_type
 112         while((byte & 0x80) != 0):
 113             byte = ord(map[used])
 114             used += 1
 115         raw = map[used:]
 116         object._text = _decompress(raw)
 117         return object
 118
 119     @classmethod
 120     def _parse_file(cls, map):
 121         word = (ord(map[0]) << 8) + ord(map[1])
 122         if ord(map[0]) == 0x78 and (word % 31) == 0:
 123             return cls._parse_legacy_object(map)
 124         else:
 125             return cls._parse_object(map)
 126
 127     def __init__(self):
 128         """Don't call this directly"""
 129
 130     def _parse_text(self):
 131         """For subclasses to do initialisation time parsing"""
 132
 133     @classmethod
 134     def from_file(cls, filename):
 135         """Get the contents of a SHA file on disk"""
 136         size = os.path.getsize(filename)
 137         f = open(filename, 'rb')
 138         try:
 139             map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
 140             shafile = cls._parse_file(map)
 141             shafile._parse_text()
 142             return shafile
 143         finally:
 144             f.close()
 145
 146     @classmethod
 147     def from_raw_string(cls, type, string):
 148         """Creates an object of the indicated type from the raw string given.
 149
 150         Type is the numeric type of an object. String is the raw uncompressed
 151         contents.
 152         """
 153         real_class = num_type_map[type]
 154         obj = real_class()
 155         obj._num_type = type
 156         obj._text = string
 157         obj._parse_text()
 158         return obj
 159
 160     def _header(self):
 161         return "%s %lu\0" % (self._type, len(self._text))
 162
 163     def crc32(self):
 164         return zlib.crc32(self._text) & 0xffffffff
 165
 166     def sha(self):
 167         """The SHA1 object that is the name of this object."""
 168         ressha = sha.new()
 169         ressha.update(self._header())
 170         ressha.update(self._text)
 171         return ressha
 172
 173     @property
 174     def id(self):
 175         return self.sha().hexdigest()
 176
 177     @property
 178     def type(self):
 179         return self._num_type
 180
 181     def __repr__(self):
 182         return "<%s %s>" % (self.__class__.__name__, self.id)
 183
 184     def __eq__(self, other):
 185         """Return true id the sha of the two objects match.
 186
 187         The __le__ etc methods aren't overriden as they make no sense,
 188         certainly at this level.
 189         """
 190         return self.sha().digest() == other.sha().digest()
 191
 192
 193 class Blob(ShaFile):
 194   """A Git Blob object."""
 195
 196   _type = BLOB_ID
 197   _num_type = 3
 198
 199   @property
 200   def data(self):
 201     """The text contained within the blob object."""
 202     return self._text
 203
 204   @classmethod
 205   def from_file(cls, filename):
 206     blob = ShaFile.from_file(filename)
 207     if blob._type != cls._type:
 208       raise NotBlobError(filename)
 209     return blob
 210
 211   @classmethod
 212   def from_string(cls, string):
 213     """Create a blob from a string."""
 214     shafile = cls()
 215     shafile._text = string
 216     return shafile
 217
 218
 219 class Tag(ShaFile):
 220   """A Git Tag object."""
 221
 222   _type = TAG_ID
 223
 224   @classmethod
 225   def from_file(cls, filename):
 226     blob = ShaFile.from_file(filename)
 227     if blob._type != cls._type:
 228       raise NotBlobError(filename)
 229     return blob
 230
 231   @classmethod
 232   def from_string(cls, string):
 233     """Create a blob from a string."""
 234     shafile = cls()
 235     shafile._text = string
 236     return shafile
 237
 238   def _parse_text(self):
 239     """Grab the metadata attached to the tag"""
 240     text = self._text
 241     count = 0
 242     assert text.startswith(OBJECT_ID), "Invalid tag object, " \
 243          "must start with %s" % OBJECT_ID
 244     count += len(OBJECT_ID)
 245     assert text[count] == ' ', "Invalid tag object, " \
 246          "%s must be followed by space not %s" % (OBJECT_ID, text[count])
 247     count += 1
 248     self._object_sha = text[count:count+40]
 249     count += 40
 250     assert text[count] == '\n', "Invalid tag object, " \
 251          "%s sha must be followed by newline" % OBJECT_ID
 252     count += 1
 253     assert text[count:].startswith(TYPE_ID), "Invalid tag object, " \
 254          "%s sha must be followed by %s" % (OBJECT_ID, TYPE_ID)
 255     count += len(TYPE_ID)
 256     assert text[count] == ' ', "Invalid tag object, " \
 257         "%s must be followed by space not %s" % (TAG_ID, text[count])
 258     count += 1
 259     self._object_type = ""
 260     while text[count] != '\n':
 261         self._object_type += text[count]
 262         count += 1
 263     count += 1
 264     assert self._object_type in (COMMIT_ID, BLOB_ID, TREE_ID, TAG_ID), "Invalid tag object, " \
 265         "unexpected object type %s" % self._object_type
 266     self._object_type = type_map[self._object_type]
 267
 268     assert text[count:].startswith(TAG_ID), "Invalid tag object, " \
 269         "object type must be followed by %s" % (TAG_ID)
 270     count += len(TAG_ID)
 271     assert text[count] == ' ', "Invalid tag object, " \
 272         "%s must be followed by space not %s" % (TAG_ID, text[count])
 273     count += 1
 274     self._name = ""
 275     while text[count] != '\n':
 276         self._name += text[count]
 277         count += 1
 278     count += 1
 279
 280     assert text[count:].startswith(TAGGER_ID), "Invalid tag object, " \
 281         "%s must be followed by %s" % (TAG_ID, TAGGER_ID)
 282     count += len(TAGGER_ID)
 283     assert text[count] == ' ', "Invalid tag object, " \
 284         "%s must be followed by space not %s" % (TAGGER_ID, text[count])
 285     count += 1
 286     self._tagger = ""
 287     while text[count] != '>':
 288         assert text[count] != '\n', "Malformed tagger information"
 289         self._tagger += text[count]
 290         count += 1
 291     self._tagger += text[count]
 292     count += 1
 293     assert text[count] == ' ', "Invalid tag object, " \
 294         "tagger information must be followed by space not %s" % text[count]
 295     count += 1
 296     self._tag_time = int(text[count:count+10])
 297     while text[count] != '\n':
 298         count += 1
 299     count += 1
 300     assert text[count] == '\n', "There must be a new line after the headers"
 301     count += 1
 302     self._message = text[count:]
 303
 304   @property
 305   def object(self):
 306     """Returns the object pointed by this tag, represented as a tuple(type, sha)"""
 307     return (self._object_type, self._object_sha)
 308
 309   @property
 310   def name(self):
 311     """Returns the name of this tag"""
 312     return self._name
 313
 314   @property
 315   def tagger(self):
 316     """Returns the name of the person who created this tag"""
 317     return self._tagger
 318
 319   @property
 320   def tag_time(self):
 321     """Returns the creation timestamp of the tag.
 322
 323     Returns it as the number of seconds since the epoch"""
 324     return self._tag_time
 325
 326   @property
 327   def message(self):
 328     """Returns the message attached to this tag"""
 329     return self._message
 330
 331
 332 class Tree(ShaFile):
 333   """A Git tree object"""
 334
 335   _type = TREE_ID
 336   _num_type = 2
 337
 338   def __init__(self):
 339     self._entries = []
 340
 341   @classmethod
 342   def from_file(cls, filename):
 343     tree = ShaFile.from_file(filename)
 344     if tree._type != cls._type:
 345       raise NotTreeError(filename)
 346     return tree
 347
 348   def add(self, mode, name, hexsha):
 349     self._entries.append((mode, name, hexsha))
 350
 351   def entries(self):
 352     """Return a list of tuples describing the tree entries"""
 353     return self._entries
 354
 355   def _parse_text(self):
 356     """Grab the entries in the tree"""
 357     count = 0
 358     while count < len(self._text):
 359       mode = 0
 360       chr = self._text[count]
 361       while chr != ' ':
 362         assert chr >= '0' and chr <= '7', "%s is not a valid mode char" % chr
 363         mode = (mode << 3) + (ord(chr) - ord('0'))
 364         count += 1
 365         chr = self._text[count]
 366       count += 1
 367       chr = self._text[count]
 368       name = ''
 369       while chr != '\0':
 370         name += chr
 371         count += 1
 372         chr = self._text[count]
 373       count += 1
 374       chr = self._text[count]
 375       sha = self._text[count:count+20]
 376       hexsha = sha_to_hex(sha)
 377       self.add(mode, name, hexsha)
 378       count = count + 20
 379
 380   def serialize(self):
 381     self._text = ""
 382     for mode, name, hexsha in self._entries:
 383         self._text += "%04o %s\0%s" % (mode, name, hex_to_sha(hexsha))
 384
 385
 386 class Commit(ShaFile):
 387   """A git commit object"""
 388
 389   _type = COMMIT_ID
 390   _num_type = 1
 391
 392   def __init__(self):
 393     self._parents = []
 394
 395   @classmethod
 396   def from_file(cls, filename):
 397     commit = ShaFile.from_file(filename)
 398     if commit._type != cls._type:
 399       raise NotCommitError(filename)
 400     return commit
 401
 402   def _parse_text(self):
 403     text = self._text
 404     count = 0
 405     assert text.startswith(TREE_ID), "Invalid commit object, " \
 406          "must start with %s" % TREE_ID
 407     count += len(TREE_ID)
 408     assert text[count] == ' ', "Invalid commit object, " \
 409          "%s must be followed by space not %s" % (TREE_ID, text[count])
 410     count += 1
 411     self._tree = text[count:count+40]
 412     count = count + 40
 413     assert text[count] == "\n", "Invalid commit object, " \
 414          "tree sha must be followed by newline"
 415     count += 1
 416     self._parents = []
 417     while text[count:].startswith(PARENT_ID):
 418       count += len(PARENT_ID)
 419       assert text[count] == ' ', "Invalid commit object, " \
 420            "%s must be followed by space not %s" % (PARENT_ID, text[count])
 421       count += 1
 422       self._parents.append(text[count:count+40])
 423       count += 40
 424       assert text[count] == "\n", "Invalid commit object, " \
 425            "parent sha must be followed by newline"
 426       count += 1
 427     self._author = None
 428     if text[count:].startswith(AUTHOR_ID):
 429       count += len(AUTHOR_ID)
 430       assert text[count] == ' ', "Invalid commit object, " \
 431            "%s must be followed by space not %s" % (AUTHOR_ID, text[count])
 432       count += 1
 433       self._author = ''
 434       while text[count] != '>':
 435         assert text[count] != '\n', "Malformed author information"
 436         self._author += text[count]
 437         count += 1
 438       self._author += text[count]
 439       count += 1
 440       while text[count] != '\n':
 441         count += 1
 442       count += 1
 443     self._committer = None
 444     if text[count:].startswith(COMMITTER_ID):
 445       count += len(COMMITTER_ID)
 446       assert text[count] == ' ', "Invalid commit object, " \
 447            "%s must be followed by space not %s" % (COMMITTER_ID, text[count])
 448       count += 1
 449       self._committer = ''
 450       while text[count] != '>':
 451         assert text[count] != '\n', "Malformed committer information"
 452         self._committer += text[count]
 453         count += 1
 454       self._committer += text[count]
 455       count += 1
 456       assert text[count] == ' ', "Invalid commit object, " \
 457            "commiter information must be followed by space not %s" % text[count]
 458       count += 1
 459       self._commit_time = int(text[count:count+10])
 460       while text[count] != '\n':
 461         count += 1
 462       count += 1
 463     assert text[count] == '\n', "There must be a new line after the headers"
 464     count += 1
 465     # XXX: There can be an encoding field.
 466     self._message = text[count:]
 467
 468   def serialize(self):
 469     self._text = ""
 470     self._text += "%s %s\n" % (TREE_ID, self._tree)
 471     for p in self._parents:
 472       self._text += "%s %s\n" % (PARENT_ID, p)
 473     self._text += "%s %s %s +0000\n" % (AUTHOR_ID, self._author, str(self._commit_time))
 474     self._text += "%s %s %s +0000\n" % (COMMITTER_ID, self._committer, str(self._commit_time))
 475     self._text += "\n" # There must be a new line after the headers
 476     self._text += self._message
 477
 478   @property
 479   def tree(self):
 480     """Returns the tree that is the state of this commit"""
 481     return self._tree
 482
 483   @property
 484   def parents(self):
 485     """Return a list of parents of this commit."""
 486     return self._parents
 487
 488   @property
 489   def author(self):
 490     """Returns the name of the author of the commit"""
 491     return self._author
 492
 493   @property
 494   def committer(self):
 495     """Returns the name of the committer of the commit"""
 496     return self._committer
 497
 498   @property
 499   def message(self):
 500     """Returns the commit message"""
 501     return self._message
 502
 503   @property
 504   def commit_time(self):
 505     """Returns the timestamp of the commit.
 506
 507     Returns it as the number of seconds since the epoch.
 508     """
 509     return self._commit_time
 510
 511
# Maps an object type name to its class. Used to pick the class for a legacy
# object from its header and to resolve the type named inside a tag.
type_map = {
  BLOB_ID : Blob,
  TREE_ID : Tree,
  COMMIT_ID : Commit,
  TAG_ID: Tag,
}

# Maps a numeric object type to its class. Used for new style objects and by
# ShaFile.from_raw_string(). Type 0 has no class.
num_type_map = {
  0: None,
  1: Commit,
  2: Tree,
  3: Blob,
  4: Tag,
  # 5 Is reserved for further expansion
}
 527