Add check() methods to object classes for consistency checking.
authorDave Borowitz <dborowitz@google.com>
Thu, 8 Apr 2010 00:55:09 +0000 (02:55 +0200)
committerJelmer Vernooij <jelmer@samba.org>
Thu, 8 Apr 2010 00:55:09 +0000 (02:55 +0200)
These roughly mirror the parts of git fsck that deal with object
well-formedness, including checking the presence of required headers,
formatting of author/committer names, tree ordering, and so on. This
change does *not* include consistency checks that require touching other
objects; those will go in ObjectStore.

Note that currently many types of errors will still be raised during
object parsing proper, such as ValueErrors when trying to parse invalid
integers. In the check() method, any such exceptions are caught and
wrapped in an ObjectFormatException. The separate codepath also gives
us a place to put these checks if we decide to optimize them out of the
main object-parsing codepath.

dulwich/_objects.c
dulwich/errors.py
dulwich/objects.py
dulwich/tests/test_objects.py

index 3f939c19e5d9b36242d6697f2a3786ac2ea954be..7ef7ad4c4aa2553aed094fcbc1b607da8dac71ee 100644 (file)
@@ -74,6 +74,13 @@ static PyObject *py_parse_tree(PyObject *self, PyObject *args)
                        return NULL;
                }
 
+               if (text + namelen + 20 >= end) {
+                       PyErr_SetString(PyExc_RuntimeError, "SHA truncated");
+                       Py_DECREF(ret);
+                       Py_DECREF(name);
+                       return NULL;
+               }
+
                item = Py_BuildValue("(NlN)", name, mode,
                                                         sha_to_pyhex((unsigned char *)text+namelen+1));
                if (item == NULL) {
index 873eaeb6bae1ac00147457fd9eff4fd833bdf04a..dec7c2e0aa7a04447a7926ef17dcfe22ed6f0783 100644 (file)
@@ -123,5 +123,9 @@ class PackedRefsException(FileFormatException):
     """Indicates an error parsing a packed-refs file."""
 
 
+class ObjectFormatException(FileFormatException):
+    """Indicates an error parsing an object."""
+
+
 class NoIndexPresent(Exception):
     """No index is present."""
index 5250bd7d5625566c1a26c958c82d467f9441e81e..096a9c04372ae9ac6925df8a4c25c1b0b9a51a2c 100644 (file)
@@ -36,6 +36,7 @@ from dulwich.errors import (
     NotCommitError,
     NotTagError,
     NotTreeError,
+    ObjectFormatException,
     )
 from dulwich.file import GitFile
 from dulwich.misc import (
@@ -104,6 +105,23 @@ def object_class(type):
     return _TYPE_MAP[type]
 
 
+def check_hexsha(hex, error_msg):
+    try:
+        hex_to_sha(hex)
+    except (TypeError, AssertionError):
+        raise ObjectFormatException("%s %s" % (error_msg, hex))
+
+
+def check_identity(identity, error_msg):
+    email_start = identity.find("<")
+    email_end = identity.find(">")
+    if (email_start < 0 or email_end < 0 or email_end <= email_start
+        or identity.find("<", email_start + 1) >= 0
+        or identity.find(">", email_end + 1) >= 0
+        or not identity.endswith(">")):
+        raise ObjectFormatException(error_msg)
+
+
 class ShaFile(object):
     """A git SHA file."""
 
@@ -255,6 +273,31 @@ class ShaFile(object):
         obj.set_raw_string(string)
         return obj
 
+    def _check_has_member(self, member, error_msg):
+        """Check that the object has a given member variable.
+
+        :param member: the member variable to check for
+        :param error_msg: the message for an error if the member is missing
+        :raise ObjectFormatException: with the given error_msg if member is
+            missing or is None
+        """
+        if getattr(self, member, None) is None:
+            raise ObjectFormatException(error_msg)
+
+    def check(self):
+        """Check this object for internal consistency.
+
+        :raise ObjectFormatException: if the object is malformed in some way
+        """
+        # TODO: if we find that error-checking during object parsing is a
+        # performance bottleneck, those checks should be moved to the class's
+        # check() method during optimization so we can still check the object
+        # when necessary.
+        try:
+            self._deserialize(self._chunked_text)
+        except Exception, e:
+            raise ObjectFormatException(e)
+
     def _header(self):
         return "%s %lu\0" % (self.type_name, self.raw_length())
 
@@ -343,6 +386,13 @@ class Blob(ShaFile):
             raise NotBlobError(filename)
         return blob
 
+    def check(self):
+        """Check this object for internal consistency.
+
+        :raise ObjectFormatException: if the object is malformed in some way
+        """
+        pass  # it's impossible for raw data to be malformed
+
 
 class Tag(ShaFile):
     """A Git Tag object."""
@@ -369,6 +419,25 @@ class Tag(ShaFile):
         shafile.set_raw_string(string)
         return shafile
 
+    def check(self):
+        """Check this object for internal consistency.
+
+        :raise ObjectFormatException: if the object is malformed in some way
+        """
+        super(Tag, self).check()
+        # TODO(dborowitz): check header order
+        self._check_has_member("_object_sha", "missing object sha")
+        self._check_has_member("_object_class", "missing object type")
+        self._check_has_member("_name", "missing tag name")
+
+        if not self._name:
+            raise ObjectFormatException("empty tag name")
+
+        check_hexsha(self._object_sha, "invalid object sha")
+
+        if getattr(self, "_tagger", None):
+            check_identity(self._tagger, "invalid tagger")
+
     def _serialize(self):
         chunks = []
         chunks.append("%s %s\n" % (_OBJECT_HEADER, self._object_sha))
@@ -410,10 +479,7 @@ class Tag(ShaFile):
                 else:
                     self._tagger = value[0:sep+1]
                     (timetext, timezonetext) = value[sep+2:].rsplit(" ", 1)
-                    try:
-                        self._tag_time = int(timetext)
-                    except ValueError: #Not a unix timestamp
-                        self._tag_time = time.strptime(timetext)
+                    self._tag_time = int(timetext)
                     self._tag_timezone = parse_timezone(timezonetext)
             else:
                 raise AssertionError("Unknown field %s" % field)
@@ -479,16 +545,19 @@ def sorted_tree_items(entries):
     :param entries: Dictionary mapping names to (mode, sha) tuples
     :return: Iterator over (name, mode, sha)
     """
-    def cmp_entry((name1, value1), (name2, value2)):
-        if stat.S_ISDIR(value1[0]):
-            name1 += "/"
-        if stat.S_ISDIR(value2[0]):
-            name2 += "/"
-        return cmp(name1, name2)
     for name, entry in sorted(entries.iteritems(), cmp=cmp_entry):
         yield name, entry[0], entry[1]
 
 
+def cmp_entry((name1, value1), (name2, value2)):
+    """Compare two tree entries."""
+    if stat.S_ISDIR(value1[0]):
+        name1 += "/"
+    if stat.S_ISDIR(value2[0]):
+        name2 += "/"
+    return cmp(name1, name2)
+
+
 class Tree(ShaFile):
     """A Git tree object"""
 
@@ -549,8 +618,7 @@ class Tree(ShaFile):
             (mode, name, hexsha) for (name, mode, hexsha) in self.iteritems()]
 
     def iteritems(self):
-        """Iterate over all entries in the order in which they would be
-        serialized.
+        """Iterate over entries in the order in which they would be serialized.
 
         :return: Iterator over (name, mode, sha) tuples
         """
@@ -565,6 +633,33 @@ class Tree(ShaFile):
         self._entries = dict([(n, (m, s)) for n, m, s in parsed_entries])
         self._needs_parsing = False
 
+    def check(self):
+        """Check this object for internal consistency.
+
+        :raise ObjectFormatException: if the object is malformed in some way
+        """
+        super(Tree, self).check()
+        last = None
+        allowed_modes = (stat.S_IFREG | 0755, stat.S_IFREG | 0644,
+                         stat.S_IFLNK, stat.S_IFDIR, S_IFGITLINK,
+                         # TODO: optionally exclude as in git fsck --strict
+                         stat.S_IFREG | 0664)
+        for name, mode, sha in parse_tree("".join(self._chunked_text)):
+            check_hexsha(sha, 'invalid sha %s' % sha)
+            if '/' in name or name in ('', '.', '..'):
+                raise ObjectFormatException('invalid name %s' % name)
+
+            if mode not in allowed_modes:
+                raise ObjectFormatException('invalid mode %06o' % mode)
+
+            entry = (name, (mode, sha))
+            if last:
+                if cmp_entry(last, entry) > 0:
+                    raise ObjectFormatException('entries not sorted')
+                if name == last[0]:
+                    raise ObjectFormatException('duplicate entry %s' % name)
+            last = entry
+
     def _serialize(self):
         return list(serialize_tree(self.iteritems()))
 
@@ -646,6 +741,26 @@ class Commit(ShaFile):
                 self._extra.append((field, value))
         self._message = f.read()
 
+    def check(self):
+        """Check this object for internal consistency.
+
+        :raise ObjectFormatException: if the object is malformed in some way
+        """
+        super(Commit, self).check()
+        # TODO(dborowitz): check header order
+        # TODO(dborowitz): check for duplicate headers
+        self._check_has_member("_tree", "missing tree")
+        self._check_has_member("_author", "missing author")
+        self._check_has_member("_committer", "missing committer")
+        # times are currently checked when set
+
+        for parent in self._parents:
+            check_hexsha(parent, "invalid parent sha")
+        check_hexsha(self._tree, "invalid tree sha")
+
+        check_identity(self._author, "invalid author")
+        check_identity(self._committer, "invalid committer")
+
     def _serialize(self):
         chunks = []
         chunks.append("%s %s\n" % (_TREE_HEADER, self._tree))
index 63b7d7ee37544654a6a8a639056a4e0e02478b1f..c117edfa533ce5a01c5096c6b3acd2d30bc166d6 100644 (file)
 # TODO: Round-trip parse-serialize-parse and serialize-parse-serialize tests.
 
 
+import datetime
 import os
 import stat
 import unittest
 
+from dulwich.errors import (
+    ObjectFormatException,
+    )
 from dulwich.objects import (
     Blob,
     Tree,
@@ -34,6 +38,8 @@ from dulwich.objects import (
     Tag,
     format_timezone,
     hex_to_sha,
+    check_hexsha,
+    check_identity,
     parse_timezone,
     parse_tree,
     _parse_tree_py,
@@ -169,7 +175,21 @@ class BlobReadTests(unittest.TestCase):
         self.assertEqual(c.commit_timezone, 0)
         self.assertEqual(c.author_timezone, 0)
         self.assertEqual(c.message, 'Merge ../b\n')
-  
+
+
+class ShaFileCheckTests(unittest.TestCase):
+
+    def assertCheckFails(self, obj, data):
+        obj.set_raw_string(data)
+        self.assertRaises(ObjectFormatException, obj.check)
+
+    def assertCheckSucceeds(self, obj, data):
+        obj.set_raw_string(data)
+        try:
+            obj.check()
+        except ObjectFormatException, e:
+            raise
+            self.fail(e)
 
 
 class CommitSerializationTests(unittest.TestCase):
@@ -226,42 +246,87 @@ class CommitSerializationTests(unittest.TestCase):
         self.assertTrue(" -0100\n" in c.as_raw_string())
 
 
-class CommitDeserializationTests(unittest.TestCase):
+default_committer = 'James Westby <jw+debian@jameswestby.net> 1174773719 +0000'
+
+class CommitParseTests(ShaFileCheckTests):
+
+    def make_commit_text(self,
+                         tree='d80c186a03f423a81b39df39dc87fd269736ca86',
+                         parents=['ab64bbdcc51b170d21588e5c5d391ee5c0c96dfd',
+                                  '4cffe90e0a41ad3f5190079d7c8f036bde29cbe6'],
+                         author=default_committer,
+                         committer=default_committer,
+                         encoding=None,
+                         message='Merge ../b\n',
+                         extra=None):
+        lines = []
+        if tree is not None:
+            lines.append('tree %s' % tree)
+        if parents is not None:
+            lines.extend('parent %s' % p for p in parents)
+        if author is not None:
+            lines.append('author %s' % author)
+        if committer is not None:
+            lines.append('committer %s' % committer)
+        if encoding is not None:
+            lines.append('encoding %s' % encoding)
+        if extra is not None:
+            for name, value in sorted(extra.iteritems()):
+                lines.append('%s %s' % (name, value))
+        lines.append('')
+        if message is not None:
+            lines.append(message)
+        return '\n'.join(lines)
 
     def test_simple(self):
-        c = Commit.from_string(
-                'tree d80c186a03f423a81b39df39dc87fd269736ca86\n'
-                'parent ab64bbdcc51b170d21588e5c5d391ee5c0c96dfd\n'
-                'parent 4cffe90e0a41ad3f5190079d7c8f036bde29cbe6\n'
-                'author James Westby <jw+debian@jameswestby.net> 1174773719 +0000\n'
-                'committer James Westby <jw+debian@jameswestby.net> 1174773719 +0000\n'
-                '\n'
-                'Merge ../b\n')
+        c = Commit.from_string(self.make_commit_text())
         self.assertEquals('Merge ../b\n', c.message)
+        self.assertEquals('James Westby <jw+debian@jameswestby.net>', c.author)
         self.assertEquals('James Westby <jw+debian@jameswestby.net>',
-            c.author)
-        self.assertEquals('James Westby <jw+debian@jameswestby.net>',
-            c.committer)
-        self.assertEquals('d80c186a03f423a81b39df39dc87fd269736ca86',
-            c.tree)
+                          c.committer)
+        self.assertEquals('d80c186a03f423a81b39df39dc87fd269736ca86', c.tree)
         self.assertEquals(['ab64bbdcc51b170d21588e5c5d391ee5c0c96dfd',
-                          '4cffe90e0a41ad3f5190079d7c8f036bde29cbe6'],
-            c.parents)
+                           '4cffe90e0a41ad3f5190079d7c8f036bde29cbe6'],
+                          c.parents)
+        expected_time = datetime.datetime(2007, 3, 24, 15, 1, 59)
+        self.assertEquals(expected_time,
+                          datetime.datetime.fromtimestamp(c.commit_time))
+        self.assertEquals(0, c.commit_timezone)
+        self.assertEquals(expected_time,
+                          datetime.datetime.fromtimestamp(c.author_time))
+        self.assertEquals(0, c.author_timezone)
+        self.assertEquals(None, c.encoding)
 
     def test_custom(self):
-        c = Commit.from_string(
-                'tree d80c186a03f423a81b39df39dc87fd269736ca86\n'
-                'parent ab64bbdcc51b170d21588e5c5d391ee5c0c96dfd\n'
-                'parent 4cffe90e0a41ad3f5190079d7c8f036bde29cbe6\n'
-                'author James Westby <jw+debian@jameswestby.net> 1174773719 +0000\n'
-                'committer James Westby <jw+debian@jameswestby.net> 1174773719 +0000\n'
-                'extra-field data\n'
-                '\n'
-                'Merge ../b\n')
+        c = Commit.from_string(self.make_commit_text(
+          extra={'extra-field': 'data'}))
         self.assertEquals([('extra-field', 'data')], c.extra)
 
-
-class TreeTests(unittest.TestCase):
+    def test_encoding(self):
+        c = Commit.from_string(self.make_commit_text(encoding='UTF-8'))
+        self.assertEquals('UTF-8', c.encoding)
+
+    def test_check(self):
+        self.assertCheckSucceeds(Commit(), self.make_commit_text())
+        self.assertCheckSucceeds(Commit(), self.make_commit_text(parents=None))
+        self.assertCheckSucceeds(Commit(),
+                                 self.make_commit_text(encoding='UTF-8'))
+
+        self.assertCheckFails(Commit(), self.make_commit_text(tree='xxx'))
+        self.assertCheckFails(Commit(), self.make_commit_text(
+          parents=[a_sha, 'xxx']))
+        bad_committer = "some guy without an email address 1174773719 +0000"
+        self.assertCheckFails(Commit(),
+                              self.make_commit_text(committer=bad_committer))
+        self.assertCheckFails(Commit(),
+                              self.make_commit_text(author=bad_committer))
+        self.assertCheckFails(Commit(), self.make_commit_text(author=None))
+        self.assertCheckFails(Commit(), self.make_commit_text(committer=None))
+        self.assertCheckFails(Commit(), self.make_commit_text(
+          author=None, committer=None))
+
+
+class TreeTests(ShaFileCheckTests):
 
     def test_simple(self):
         myhexsha = "d80c186a03f423a81b39df39dc87fd269736ca86"
@@ -291,6 +356,37 @@ class TreeTests(unittest.TestCase):
             raise TestSkipped('parse_tree extension not found')
         self._do_test_parse_tree(parse_tree)
 
+    def test_check(self):
+        t = Tree()
+        sha = hex_to_sha(a_sha)
+
+        # filenames
+        self.assertCheckSucceeds(t, '100644 .a\0%s' % sha)
+        self.assertCheckFails(t, '100644 \0%s' % sha)
+        self.assertCheckFails(t, '100644 .\0%s' % sha)
+        self.assertCheckFails(t, '100644 a/a\0%s' % sha)
+        self.assertCheckFails(t, '100644 ..\0%s' % sha)
+
+        # modes
+        self.assertCheckSucceeds(t, '100644 a\0%s' % sha)
+        self.assertCheckSucceeds(t, '100755 a\0%s' % sha)
+        self.assertCheckSucceeds(t, '160000 a\0%s' % sha)
+        # TODO more whitelisted modes
+        self.assertCheckFails(t, '123456 a\0%s' % sha)
+        self.assertCheckFails(t, '123abc a\0%s' % sha)
+
+        # shas
+        self.assertCheckFails(t, '100644 a\0%s' % ('x' * 5))
+        self.assertCheckFails(t, '100644 a\0%s' % ('x' * 18 + '\0'))
+        self.assertCheckFails(t, '100644 a\0%s\n100644 b\0%s' % ('x' * 21, sha))
+
+        # ordering
+        sha2 = hex_to_sha(b_sha)
+        self.assertCheckSucceeds(t, '100644 a\0%s\n100644 b\0%s' % (sha, sha))
+        self.assertCheckSucceeds(t, '100644 a\0%s\n100644 b\0%s' % (sha, sha2))
+        self.assertCheckFails(t, '100644 a\0%s\n100755 a\0%s' % (sha, sha2))
+        self.assertCheckFails(t, '100644 b\0%s\n100644 a\0%s' % (sha2, sha))
+
 
 class TagSerializeTests(unittest.TestCase):
 
@@ -310,16 +406,9 @@ tagger Jelmer Vernooij <jelmer@samba.org> 423423423 +0000
 Tag 0.1""", x.as_raw_string())
 
 
-class TagParseTests(unittest.TestCase):
-
-    def test_parse_ctime(self):
-        x = Tag()
-        x.set_raw_string("""object a38d6181ff27824c79fc7df825164a212eff6a3f
-type commit
-tag v2.6.22-rc7
-tagger Linus Torvalds <torvalds@woody.linux-foundation.org> Sun Jul 1 12:54:34 2007 -0700
-
-Linux 2.6.22-rc7
+default_tagger = ('Linus Torvalds <torvalds@woody.linux-foundation.org> '
+                  '1183319674 -0700')
+default_message = """Linux 2.6.22-rc7
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1.4.7 (GNU/Linux)
 
@@ -327,28 +416,95 @@ iD8DBQBGiAaAF3YsRnbiHLsRAitMAKCiLboJkQECM/jpYsY3WPfvUgLXkACgg3ql
 OK2XeQOiEeXtT76rV4t2WR4=
 =ivrA
 -----END PGP SIGNATURE-----
-""")
-        self.assertEquals("Linus Torvalds <torvalds@woody.linux-foundation.org>", x.tagger)
+"""
+
+
+class TagParseTests(ShaFileCheckTests):
+    def make_tag_text(self,
+                      object_sha="a38d6181ff27824c79fc7df825164a212eff6a3f",
+                      object_type_name="commit",
+                      name="v2.6.22-rc7",
+                      tagger=default_tagger,
+                      message=default_message):
+        lines = []
+        if object_sha is not None:
+            lines.append("object %s" % object_sha)
+        if object_type_name is not None:
+            lines.append("type %s" % object_type_name)
+        if name is not None:
+            lines.append("tag %s" % name)
+        if tagger is not None:
+            lines.append("tagger %s" % tagger)
+        lines.append("")
+        if message is not None:
+            lines.append(message)
+        return "\n".join(lines)
+
+    def test_parse(self):
+        x = Tag()
+        x.set_raw_string(self.make_tag_text())
+        self.assertEquals(
+            "Linus Torvalds <torvalds@woody.linux-foundation.org>", x.tagger)
         self.assertEquals("v2.6.22-rc7", x.name)
+        object_type, object_sha = x.object
+        self.assertEquals("a38d6181ff27824c79fc7df825164a212eff6a3f",
+                          object_sha)
+        self.assertEquals(Commit, object_type)
+        self.assertEquals(datetime.datetime.fromtimestamp(x.tag_time),
+                          datetime.datetime(2007, 7, 1, 12, 54, 34))
+        self.assertEquals(-25200, x.tag_timezone)
 
     def test_parse_no_tagger(self):
         x = Tag()
-        x.set_raw_string("""object a38d6181ff27824c79fc7df825164a212eff6a3f
-type commit
-tag v2.6.22-rc7
-
-Linux 2.6.22-rc7
------BEGIN PGP SIGNATURE-----
-Version: GnuPG v1.4.7 (GNU/Linux)
-
-iD8DBQBGiAaAF3YsRnbiHLsRAitMAKCiLboJkQECM/jpYsY3WPfvUgLXkACgg3ql
-OK2XeQOiEeXtT76rV4t2WR4=
-=ivrA
------END PGP SIGNATURE-----
-""")
+        x.set_raw_string(self.make_tag_text(tagger=None))
         self.assertEquals(None, x.tagger)
         self.assertEquals("v2.6.22-rc7", x.name)
 
+    def test_check(self):
+        self.assertCheckSucceeds(Tag(), self.make_tag_text())
+        self.assertCheckFails(Tag(), self.make_tag_text(object_sha=None))
+        self.assertCheckFails(Tag(), self.make_tag_text(object_type_name=None))
+        self.assertCheckFails(Tag(), self.make_tag_text(name=None))
+        self.assertCheckFails(Tag(), self.make_tag_text(name=''))
+        self.assertCheckFails(Tag(), self.make_tag_text(
+          object_type_name="foobar"))
+        self.assertCheckFails(Tag(), self.make_tag_text(
+          tagger="some guy without an email address 1183319674 -0700"))
+        self.assertCheckFails(Tag(), self.make_tag_text(
+          tagger=("Linus Torvalds <torvalds@woody.linux-foundation.org> "
+                  "Sun 7 Jul 2007 12:54:34 +0700")))
+        self.assertCheckFails(Tag(), self.make_tag_text(object_sha="xxx"))
+
+
+class CheckTests(unittest.TestCase):
+    def test_check_hexsha(self):
+        check_hexsha(a_sha, "failed to check good sha")
+        self.assertRaises(ObjectFormatException, check_hexsha, '1' * 39,
+                          'sha too short')
+        self.assertRaises(ObjectFormatException, check_hexsha, '1' * 41,
+                          'sha too long')
+        self.assertRaises(ObjectFormatException, check_hexsha, 'x' * 40,
+                          'invalid characters')
+
+    def test_check_identity(self):
+        check_identity("Dave Borowitz <dborowitz@google.com>",
+                       "failed to check good identity")
+        check_identity("<dborowitz@google.com>",
+                       "failed to check good identity")
+        self.assertRaises(ObjectFormatException, check_identity,
+                          "Dave Borowitz", "no email")
+        self.assertRaises(ObjectFormatException, check_identity,
+                          "Dave Borowitz <dborowitz", "incomplete email")
+        self.assertRaises(ObjectFormatException, check_identity,
+                          "dborowitz@google.com>", "incomplete email")
+        self.assertRaises(ObjectFormatException, check_identity,
+                          "Dave Borowitz <<dborowitz@google.com>", "typo")
+        self.assertRaises(ObjectFormatException, check_identity,
+                          "Dave Borowitz <dborowitz@google.com>>", "typo")
+        self.assertRaises(ObjectFormatException, check_identity,
+                          "Dave Borowitz <dborowitz@google.com>xxx",
+                          "trailing characters")
+
 
 class TimezoneTests(unittest.TestCase):