From b55892fd97530529ff943cce49643fdbd9c9b1d9 Mon Sep 17 00:00:00 2001
From: Dave Borowitz <dborowitz@google.com>
Date: Mon, 5 Apr 2010 21:48:27 +0200
Subject: [PATCH] Change parse_tree to return a list rather than a dict.

For future consistency checks, we will need to ensure that objects in
the tree are stored in sorted order when we parse the tree, so it makes
sense to be able to reuse parse_tree for this purpose.

Also added tests for both the C and Python versions of parse_tree. To
do this in one test run, we also need to hold onto the pure-python
implementation before importing the C version if available.
---
 dulwich/_objects.c            | 10 ++++++----
 dulwich/objects.py            | 16 +++++++++++-----
 dulwich/tests/test_objects.py | 20 +++++++++++++++++++-
 3 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/dulwich/_objects.c b/dulwich/_objects.c
index 0273d74..3f939c1 100644
--- a/dulwich/_objects.c
+++ b/dulwich/_objects.c
@@ -44,7 +44,10 @@ static PyObject *py_parse_tree(PyObject *self, PyObject *args)
 	if (!PyArg_ParseTuple(args, "s#", &text, &len))
 		return NULL;
 
-	ret = PyDict_New();
+	/* TODO: currently this returns a list; if memory usage is a concern,
+	* consider rewriting as a custom iterator object */
+	ret = PyList_New(0);
+
 	if (ret == NULL) {
 		return NULL;
 	}
@@ -71,19 +74,18 @@ static PyObject *py_parse_tree(PyObject *self, PyObject *args)
 			return NULL;
 		}
 
-		item = Py_BuildValue("(lN)", mode,
+		item = Py_BuildValue("(NlN)", name, mode,
 							 sha_to_pyhex((unsigned char *)text+namelen+1));
 		if (item == NULL) {
 			Py_DECREF(ret);
 			Py_DECREF(name);
 			return NULL;
 		}
-		if (PyDict_SetItem(ret, name, item) == -1) {
+		if (PyList_Append(ret, item) == -1) {
 			Py_DECREF(ret);
 			Py_DECREF(item);
 			return NULL;
 		}
-		Py_DECREF(name);
 		Py_DECREF(item);
 
 		text += namelen+21;
diff --git a/dulwich/objects.py b/dulwich/objects.py
index 9a77cd3..b05f932 100644
--- a/dulwich/objects.py
+++ b/dulwich/objects.py
@@ -447,9 +447,8 @@ def parse_tree(text):
     """Parse a tree text.
 
     :param text: Serialized text to parse
-    :return: Dictionary with names as keys, (mode, sha) tuples as values
+    :yields: tuples of (name, mode, sha)
     """
-    ret = {}
     count = 0
     l = len(text)
     while count < l:
@@ -459,8 +458,7 @@ def parse_tree(text):
         name = text[mode_end+1:name_end]
         count = name_end+21
         sha = text[name_end+1:count]
-        ret[name] = (mode, sha_to_hex(sha))
-    return ret
+        yield (name, mode, sha_to_hex(sha))
 
 
 def serialize_tree(items):
@@ -560,7 +558,11 @@ class Tree(ShaFile):
 
     def _deserialize(self, chunks):
         """Grab the entries in the tree"""
-        self._entries = parse_tree("".join(chunks))
+        parsed_entries = parse_tree("".join(chunks))
+        # TODO: list comprehension is for efficiency in the common (small) case;
+        # if memory efficiency in the large case is a concern, use a genexp.
+        self._entries = dict([(n, (m, s)) for n, m, s in parsed_entries])
+        self._needs_parsing = False
 
     def _serialize(self):
         return list(serialize_tree(self.iteritems()))
@@ -725,6 +727,10 @@ for cls in OBJECT_CLASSES:
     _TYPE_MAP[cls.type_num] = cls
 
 
+
+# Hold on to the pure-python implementations for testing
+_parse_tree_py = parse_tree
+_sorted_tree_items_py = sorted_tree_items
 try:
     # Try to import C versions
     from dulwich._objects import parse_tree, sorted_tree_items
diff --git a/dulwich/tests/test_objects.py b/dulwich/tests/test_objects.py
index c5555a3..17a40df 100644
--- a/dulwich/tests/test_objects.py
+++ b/dulwich/tests/test_objects.py
@@ -27,6 +27,8 @@ import os
 import stat
 import unittest
 
+import nose
+
 from dulwich.objects import (
     Blob,
     Tree,
@@ -35,6 +37,8 @@ from dulwich.objects import (
     format_timezone,
     hex_to_sha,
     parse_timezone,
+    parse_tree,
+    _parse_tree_py,
     )
 
 a_sha = '6f670c0fb53f9463760b7295fbb814e965fb20c8'
@@ -256,7 +260,7 @@ class CommitDeserializationTests(unittest.TestCase):
         self.assertEquals([('extra-field', 'data')], c.extra)
 
 
-class TreeSerializationTests(unittest.TestCase):
+class TreeTests(unittest.TestCase):
 
     def test_simple(self):
         myhexsha = "d80c186a03f423a81b39df39dc87fd269736ca86"
@@ -272,6 +276,20 @@ class TreeSerializationTests(unittest.TestCase):
         x["a/c"] = (stat.S_IFDIR, "d80c186a03f423a81b39df39dc87fd269736ca86")
         self.assertEquals(["a.c", "a", "a/c"], [p[0] for p in x.iteritems()])
 
+    def _do_test_parse_tree(self, parse_tree):
+        o = Tree.from_file(os.path.join(os.path.dirname(__file__), 'data',
+                                        'trees', tree_sha))
+        self.assertEquals([('a', 0100644, a_sha), ('b', 0100644, b_sha)],
+                          list(parse_tree(o.as_raw_string())))
+
+    def test_parse_tree(self):
+        self._do_test_parse_tree(_parse_tree_py)
+
+    def test_parse_tree_extension(self):
+        if parse_tree is _parse_tree_py:
+            raise nose.SkipTest('parse_tree extension not found')
+        self._do_test_parse_tree(parse_tree)
+
 
 class TagSerializeTests(unittest.TestCase):
 
-- 
2.34.1