# object_store.py -- Object store for git objects
-# Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
+# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@samba.org>
+# and others
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
"""Git object store interfaces and implementation."""
+from io import BytesIO
import errno
-import itertools
+from itertools import chain
import os
import stat
import tempfile
from dulwich.pack import (
Pack,
PackData,
+ PackInflater,
iter_sha1,
write_pack_header,
write_pack_index_v2,
def __getitem__(self, sha):
"""Obtain an object by SHA1."""
type_num, uncomp = self.get_raw(sha)
- return ShaFile.from_raw_string(type_num, uncomp)
+ return ShaFile.from_raw_string(type_num, uncomp, sha=sha)
def __iter__(self):
"""Iterate over the SHAs that are present in this store."""
def tree_changes(self, source, target, want_unchanged=False):
"""Find the differences between the contents of two trees
- :param object_store: Object store to use for retrieving tree contents
- :param tree: SHA1 of the root tree
+ :param source: SHA1 of the source tree
+ :param target: SHA1 of the target tree
:param want_unchanged: Whether unchanged files should be reported
:return: Iterator over tuples with
(oldpath, newpath), (oldmode, newmode), (oldsha, newsha)
yield entry
def find_missing_objects(self, haves, wants, progress=None,
- get_tagged=None):
+ get_tagged=None,
+ get_parents=lambda commit: commit.parents):
"""Find the missing objects required for a set of revisions.
:param haves: Iterable over SHAs already in common.
updated progress strings.
:param get_tagged: Function that returns a dict of pointed-to sha -> tag
sha for including tags.
+ :param get_parents: Optional function for getting the parents of a commit.
:return: Iterator over (sha, path) pairs.
"""
- finder = MissingObjectFinder(self, haves, wants, progress, get_tagged)
+ finder = MissingObjectFinder(self, haves, wants, progress, get_tagged, get_parents=get_parents)
return iter(finder.next, None)
def find_common_revisions(self, graphwalker):
:return: List of SHAs that are in common
"""
haves = []
- sha = graphwalker.next()
+ sha = next(graphwalker)
while sha:
if sha in self:
haves.append(sha)
graphwalker.ack(sha)
- sha = graphwalker.next()
+ sha = next(graphwalker)
return haves
- def get_graph_walker(self, heads):
- """Obtain a graph walker for this object store.
-
- :param heads: Local heads to start search with
- :return: GraphWalker object
- """
- return ObjectStoreGraphWalker(heads, lambda sha: self[sha].parents)
-
def generate_pack_contents(self, have, want, progress=None):
"""Iterate over the contents of a pack file.
obj = self[sha]
return obj
+ def _collect_ancestors(self, heads, common=set(),
+ get_parents=lambda commit: commit.parents):
+ """Collect all ancestors of heads up to (excluding) those in common.
+
+ :param heads: commits to start from
+ :param common: commits to end at, or empty set to walk repository
+ completely
+ :param get_parents: Optional function for getting the parents of a commit.
+ :return: a tuple (A, B) where A - all commits reachable
+ from heads but not present in common, B - common (shared) elements
+ that are directly reachable from heads
+ """
+ bases = set()
+ commits = set()
+ queue = []
+ queue.extend(heads)
+ while queue:
+ e = queue.pop(0)
+ if e in common:
+ bases.add(e)
+ elif e not in commits:
+ commits.add(e)
+ cmt = self[e]
+ queue.extend(get_parents(cmt))
+ return (commits, bases)
+
+ def close(self):
+ """Close any files opened by this object store."""
+ # Default implementation is a NO-OP
+
class PackBasedObjectStore(BaseObjectStore):
def __init__(self):
- self._pack_cache = None
+ self._pack_cache = {}
@property
def alternates(self):
return []
def contains_packed(self, sha):
- """Check if a particular object is present by SHA1 and is packed."""
+ """Check if a particular object is present by SHA1 and is packed.
+
+ This does not check alternates.
+ """
for pack in self.packs:
if sha in pack:
return True
return False
- def _load_packs(self):
- raise NotImplementedError(self._load_packs)
+ def __contains__(self, sha):
+ """Check if a particular object is present by SHA1.
+
+ This method makes no distinction between loose and packed objects.
+ """
+ if self.contains_packed(sha) or self.contains_loose(sha):
+ return True
+ for alternate in self.alternates:
+ if sha in alternate:
+ return True
+ return False
    def _pack_cache_stale(self):
        """Check whether the pack cache is stale.

        Abstract hook: subclasses (e.g. disk-backed stores) decide when the
        cached set of packs must be re-read from the backing storage.
        """
        raise NotImplementedError(self._pack_cache_stale)
- def _add_known_pack(self, pack):
+ def _add_known_pack(self, base_name, pack):
"""Add a newly appeared pack to the cache by path.
"""
- if self._pack_cache is not None:
- self._pack_cache.append(pack)
+ self._pack_cache[base_name] = pack
+
+ def close(self):
+ pack_cache = self._pack_cache
+ self._pack_cache = {}
+ while pack_cache:
+ (name, pack) = pack_cache.popitem()
+ pack.close()
    @property
    def packs(self):
        """List with pack objects."""
        # Refill the cache when it has never been populated or when the
        # subclass reports that the backing store changed since the last
        # scan (_pack_cache_stale / _update_pack_cache are subclass hooks).
        if self._pack_cache is None or self._pack_cache_stale():
            self._update_pack_cache()

        return self._pack_cache.values()
+
+ def _iter_alternate_objects(self):
+ """Iterate over the SHAs of all the objects in alternate stores."""
+ for alternate in self.alternates:
+ for alternate_object in alternate:
+ yield alternate_object
def _iter_loose_objects(self):
"""Iterate over the SHAs of all loose objects."""
def pack_loose_objects(self):
"""Pack loose objects.
-
+
:return: Number of objects packed
"""
objects = set()
def __iter__(self):
"""Iterate over the SHAs that are present in this store."""
- iterables = self.packs + [self._iter_loose_objects()]
- return itertools.chain(*iterables)
+ iterables = self.packs + [self._iter_loose_objects()] + [self._iter_alternate_objects()]
+ return chain(*iterables)
def contains_loose(self, sha):
- """Check if a particular object is present by SHA1 and is loose."""
+ """Check if a particular object is present by SHA1 and is loose.
+
+ This does not check alternates.
+ """
return self._get_loose_object(sha) is not None
def get_raw(self, name):
if len(objects) == 0:
# Don't bother writing an empty pack file
return
- f, commit = self.add_pack()
- write_pack_objects(f, objects)
- return commit()
+ f, commit, abort = self.add_pack()
+ try:
+ write_pack_objects(f, objects)
+ except:
+ abort()
+ raise
+ else:
+ return commit()
class DiskObjectStore(PackBasedObjectStore):
self.path = path
self.pack_dir = os.path.join(self.path, PACKDIR)
self._pack_cache_time = 0
+ self._pack_cache = {}
self._alternates = None
+ def __repr__(self):
+ return "<%s(%r)>" % (self.__class__.__name__, self.path)
+
@property
def alternates(self):
if self._alternates is not None:
try:
f = GitFile(os.path.join(self.path, "info", "alternates"),
'rb')
- except (OSError, IOError), e:
+ except (OSError, IOError) as e:
if e.errno == errno.ENOENT:
return []
raise
l = l.rstrip("\n")
if l[0] == "#":
continue
- if not os.path.isabs(l):
- continue
- ret.append(l)
+ if os.path.isabs(l):
+ ret.append(l)
+ else:
+ ret.append(os.path.join(self.path, l))
return ret
finally:
f.close()
"""
try:
os.mkdir(os.path.join(self.path, "info"))
- except OSError, e:
+ except OSError as e:
if e.errno != errno.EEXIST:
raise
alternates_path = os.path.join(self.path, "info/alternates")
try:
try:
orig_f = open(alternates_path, 'rb')
- except (OSError, IOError), e:
+ except (OSError, IOError) as e:
if e.errno != errno.ENOENT:
raise
else:
f.write("%s\n" % path)
finally:
f.close()
+
+ if not os.path.isabs(path):
+ path = os.path.join(self.path, path)
self.alternates.append(DiskObjectStore(path))
- def _load_packs(self):
- pack_files = []
    def _update_pack_cache(self):
        """Re-scan the pack directory and reconcile the in-memory cache.

        Opens packs that appeared on disk since the last scan and closes/
        drops packs that disappeared. Updates ``_pack_cache_time`` to the
        directory mtime so ``_pack_cache_stale`` can detect later changes.
        """
        try:
            pack_dir_contents = os.listdir(self.pack_dir)
        except OSError as e:
            if e.errno == errno.ENOENT:
                # No pack dir at all: reset the timestamp and drop every
                # cached pack via close().
                self._pack_cache_time = 0
                self.close()
                return
            raise
        # Record the mtime *before* opening packs so a concurrent write is
        # treated as stale on the next check rather than missed.
        self._pack_cache_time = os.stat(self.pack_dir).st_mtime
        pack_files = set()
        for name in pack_dir_contents:
            # TODO: verify that idx exists first
            if name.startswith("pack-") and name.endswith(".pack"):
                pack_files.add(name[:-len(".pack")])

        # Open newly appeared pack files
        for f in pack_files:
            if f not in self._pack_cache:
                self._pack_cache[f] = Pack(os.path.join(self.pack_dir, f))
        # Remove disappeared pack files
        for f in set(self._pack_cache) - pack_files:
            self._pack_cache.pop(f).close()
def _pack_cache_stale(self):
try:
return os.stat(self.pack_dir).st_mtime > self._pack_cache_time
- except OSError, e:
+ except OSError as e:
if e.errno == errno.ENOENT:
return True
raise
path = self._get_shafile_path(sha)
try:
return ShaFile.from_path(path)
- except (OSError, IOError), e:
+ except (OSError, IOError) as e:
if e.errno == errno.ENOENT:
return None
raise
f.seek(0)
write_pack_header(f, len(entries) + len(indexer.ext_refs()))
+ # Must flush before reading (http://bugs.python.org/issue3207)
+ f.flush()
+
# Rescan the rest of the pack, computing the SHA with the new header.
new_sha = compute_file_sha(f, end_ofs=-20)
+ # Must reposition before writing (http://bugs.python.org/issue3207)
+ f.seek(0, os.SEEK_CUR)
+
# Complete the pack.
for ext_sha in indexer.ext_refs():
+ assert len(ext_sha) == 20
type_num, data = self.get_raw(ext_sha)
offset = f.tell()
crc32 = write_pack_object(f, type_num, data, sha=new_sha)
# Add the pack to the store and return it.
final_pack = Pack(pack_base_name)
final_pack.check_length_and_checksum()
- self._add_known_pack(final_pack)
+ self._add_known_pack(pack_base_name, final_pack)
return final_pack
def add_thin_pack(self, read_all, read_some):
:param path: Path to the pack file.
"""
p = PackData(path)
- entries = p.sorted_entries()
- basename = os.path.join(self.pack_dir,
- "pack-%s" % iter_sha1(entry[0] for entry in entries))
- f = GitFile(basename+".idx", "wb")
try:
- write_pack_index_v2(f, entries, p.get_stored_checksum())
+ entries = p.sorted_entries()
+ basename = os.path.join(self.pack_dir,
+ "pack-%s" % iter_sha1(entry[0] for entry in entries))
+ f = GitFile(basename+".idx", "wb")
+ try:
+ write_pack_index_v2(f, entries, p.get_stored_checksum())
+ finally:
+ f.close()
finally:
- f.close()
- p.close()
+ p.close()
os.rename(path, basename + ".pack")
final_pack = Pack(basename)
- self._add_known_pack(final_pack)
+ self._add_known_pack(basename, final_pack)
return final_pack
def add_pack(self):
"""Add a new pack to this object store.
- :return: Fileobject to write to and a commit function to
- call when the pack is finished.
+ :return: Fileobject to write to, a commit function to
+ call when the pack is finished and an abort
+ function.
"""
fd, path = tempfile.mkstemp(dir=self.pack_dir, suffix=".pack")
f = os.fdopen(fd, 'wb')
else:
os.remove(path)
return None
- return f, commit
+ def abort():
+ f.close()
+ os.remove(path)
+ return f, commit, abort
def add_object(self, obj):
"""Add a single object to this object store.
dir = os.path.join(self.path, obj.id[:2])
try:
os.mkdir(dir)
- except OSError, e:
+ except OSError as e:
if e.errno != errno.EEXIST:
raise
path = os.path.join(dir, obj.id[2:])
def init(cls, path):
try:
os.mkdir(path)
- except OSError, e:
+ except OSError as e:
if e.errno != errno.EEXIST:
raise
os.mkdir(os.path.join(path, "info"))
super(MemoryObjectStore, self).__init__()
self._data = {}
+ def _to_hexsha(self, sha):
+ if len(sha) == 40:
+ return sha
+ elif len(sha) == 20:
+ return sha_to_hex(sha)
+ else:
+ raise ValueError("Invalid sha %r" % (sha,))
+
def contains_loose(self, sha):
"""Check if a particular object is present by SHA1 and is loose."""
- return sha in self._data
+ return self._to_hexsha(sha) in self._data
def contains_packed(self, sha):
"""Check if a particular object is present by SHA1 and is packed."""
:param name: sha for the object.
:return: tuple with numeric type and object contents.
"""
- obj = self[name]
+ obj = self[self._to_hexsha(name)]
return obj.type_num, obj.as_raw_string()
def __getitem__(self, name):
- return self._data[name]
+ return self._data[self._to_hexsha(name)]
def __delitem__(self, name):
"""Delete an object from this store, for testing only."""
- del self._data[name]
+ del self._data[self._to_hexsha(name)]
def add_object(self, obj):
"""Add a single object to this object store.
for obj, path in objects:
self._data[obj.id] = obj
+ def add_pack(self):
+ """Add a new pack to this object store.
+
+ Because this object store doesn't support packs, we extract and add the
+ individual objects.
+
+ :return: Fileobject to write to and a commit function to
+ call when the pack is finished.
+ """
+ f = BytesIO()
+ def commit():
+ p = PackData.from_file(BytesIO(f.getvalue()), f.tell())
+ f.close()
+ for obj in PackInflater.for_pack_data(p):
+ self._data[obj.id] = obj
+ def abort():
+ pass
+ return f, commit, abort
+
    def _complete_thin_pack(self, f, indexer):
        """Complete a thin pack by adding external references.

        :param f: Open file object for the pack.
        :param indexer: A PackIndexer for indexing the pack.
        """
        # Draining the indexer gives the entries for objects already in the
        # pack; external refs are collected as a side effect.
        entries = list(indexer)

        # Update the header with the new number of objects.
        f.seek(0)
        write_pack_header(f, len(entries) + len(indexer.ext_refs()))

        # Rescan the rest of the pack, computing the SHA with the new header.
        new_sha = compute_file_sha(f, end_ofs=-20)

        # Complete the pack.
        for ext_sha in indexer.ext_refs():
            # External refs are binary SHAs (20 bytes), not hex.
            assert len(ext_sha) == 20
            type_num, data = self.get_raw(ext_sha)
            # Appending here overwrites the old trailing checksum region;
            # write_pack_object also folds the bytes into new_sha.
            write_pack_object(f, type_num, data, sha=new_sha)
        # Finish with the recomputed pack checksum.
        pack_sha = new_sha.digest()
        f.write(pack_sha)
+
+ def add_thin_pack(self, read_all, read_some):
+ """Add a new thin pack to this object store.
+
+ Thin packs are packs that contain deltas with parents that exist outside
+ the pack. Because this object store doesn't support packs, we extract
+ and add the individual objects.
+
+ :param read_all: Read function that blocks until the number of requested
+ bytes are read.
+ :param read_some: Read function that returns at least one byte, but may
+ not return the number of bytes requested.
+ """
+ f, commit, abort = self.add_pack()
+ try:
+ indexer = PackIndexer(f, resolve_ext_ref=self.get_raw)
+ copier = PackStreamCopier(read_all, read_some, f, delta_iter=indexer)
+ copier.verify()
+ self._complete_thin_pack(f, indexer)
+ except:
+ abort()
+ raise
+ else:
+ commit()
+
class ObjectImporter(object):
"""Interface for importing objects."""
return tree.lookup_path(lookup_obj, path)
def _collect_filetree_revs(obj_store, tree_sha, kset):
    """Collect SHA1s of files and directories for specified tree.

    :param obj_store: Object store to get objects by SHA from
    :param tree_sha: tree reference to walk
    :param kset: set to fill with references to files and directories
    """
    # Iterative depth-first walk (equivalent to the recursive form).
    stack = [tree_sha]
    while stack:
        current = stack.pop()
        filetree = obj_store[current]
        for name, mode, sha in filetree.iteritems():
            # Submodule (gitlink) entries and already-seen SHAs are skipped.
            if S_ISGITLINK(mode) or sha in kset:
                continue
            kset.add(sha)
            if stat.S_ISDIR(mode):
                stack.append(sha)
+
+
def _split_commits_and_tags(obj_store, lst, ignore_unknown=False):
    """Split object id list into two list with commit SHA1s and tag SHA1s.

    Commits referenced by tags are included into commits
    list as well. Only SHA1s known in this repository will get
    through, and unless ignore_unknown argument is True, KeyError
    is thrown for SHA1 missing in the repository

    :param obj_store: Object store to get objects by SHA1 from
    :param lst: Collection of commit and tag SHAs
    :param ignore_unknown: True to skip SHA1 missing in the repository
        silently.
    :return: A tuple of (commits, tags) SHA1s
    """
    commits = set()
    tags = set()
    for e in lst:
        try:
            o = obj_store[e]
        except KeyError:
            if ignore_unknown:
                continue
            raise
        if isinstance(o, Commit):
            commits.add(e)
        elif isinstance(o, Tag):
            tags.add(e)
            # A tag's target commit is wanted as well.
            commits.add(o.object[1])
        else:
            raise KeyError('Not a commit or a tag: %s' % e)
    return (commits, tags)
+
+
class MissingObjectFinder(object):
"""Find the objects missing from another object store.
:param progress: Optional function to report progress to.
:param get_tagged: Function that returns a dict of pointed-to sha -> tag
sha for including tags.
+ :param get_parents: Optional function for getting the parents of a commit.
:param tagged: dict of pointed-to sha -> tag sha for including tags
"""
def __init__(self, object_store, haves, wants, progress=None,
- get_tagged=None):
- haves = set(haves)
- self.sha_done = haves
- self.objects_to_send = set([(w, None, False) for w in wants
- if w not in haves])
+ get_tagged=None, get_parents=lambda commit: commit.parents):
self.object_store = object_store
+ self._get_parents = get_parents
+ # process Commits and Tags differently
+ # Note, while haves may list commits/tags not available locally,
+ # and such SHAs would get filtered out by _split_commits_and_tags,
+ # wants shall list only known SHAs, and otherwise
+ # _split_commits_and_tags fails with KeyError
+ have_commits, have_tags = \
+ _split_commits_and_tags(object_store, haves, True)
+ want_commits, want_tags = \
+ _split_commits_and_tags(object_store, wants, False)
+ # all_ancestors is a set of commits that shall not be sent
+ # (complete repository up to 'haves')
+ all_ancestors = object_store._collect_ancestors(
+ have_commits,
+ get_parents=self._get_parents)[0]
+ # all_missing - complete set of commits between haves and wants
+ # common - commits from all_ancestors we hit into while
+ # traversing parent hierarchy of wants
+ missing_commits, common_commits = object_store._collect_ancestors(
+ want_commits,
+ all_ancestors,
+ get_parents=self._get_parents);
+ self.sha_done = set()
+ # Now, fill sha_done with commits and revisions of
+ # files and directories known to be both locally
+ # and on target. Thus these commits and files
+ # won't get selected for fetch
+ for h in common_commits:
+ self.sha_done.add(h)
+ cmt = object_store[h]
+ _collect_filetree_revs(object_store, cmt.tree, self.sha_done)
+ # record tags we have as visited, too
+ for t in have_tags:
+ self.sha_done.add(t)
+
+ missing_tags = want_tags.difference(have_tags)
+ # in fact, what we 'want' is commits and tags
+ # we've found missing
+ wants = missing_commits.union(missing_tags)
+
+ self.objects_to_send = set([(w, None, False) for w in wants])
+
if progress is None:
self.progress = lambda x: None
else:
self.objects_to_send.update([e for e in entries
if not e[0] in self.sha_done])
- def parse_tree(self, tree):
- self.add_todo([(sha, name, not stat.S_ISDIR(mode))
- for name, mode, sha in tree.iteritems()
- if not S_ISGITLINK(mode)])
-
- def parse_commit(self, commit):
- self.add_todo([(commit.tree, "", False)])
- self.add_todo([(p, None, False) for p in commit.parents])
-
- def parse_tag(self, tag):
- self.add_todo([(tag.object[1], None, False)])
-
def next(self):
while True:
if not self.objects_to_send:
if not leaf:
o = self.object_store[sha]
if isinstance(o, Commit):
- self.parse_commit(o)
+ self.add_todo([(o.tree, "", False)])
elif isinstance(o, Tree):
- self.parse_tree(o)
+ self.add_todo([(s, n, not stat.S_ISDIR(m))
+ for n, m, s in o.iteritems()
+ if not S_ISGITLINK(m)])
elif isinstance(o, Tag):
- self.parse_tag(o)
+ self.add_todo([(o.object[1], None, False)])
if sha in self._tagged:
self.add_todo([(self._tagged[sha], None, True)])
self.sha_done.add(sha)
self.progress("counting objects: %d\r" % len(self.sha_done))
return (sha, name)
+ __next__ = next
+
class ObjectStoreGraphWalker(object):
"""Graph walker that finds what commits are missing from an object store.
self.heads.update([p for p in ps if not p in self.parents])
return ret
return None
+
+ __next__ = next