implicitly rename children on export when directory renamed
[jelmer/python-fastimport.git] / bzr_exporter.py
1 # -*- coding: utf-8 -*-
2
3 # Copyright (C) 2008 Canonical Ltd
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18 #
19 # Original Copyright (c) 2008 Adeodato Simó
20 # Original License: MIT (See exporters/bzr-fast-export.LICENSE)
21 #
22 # vim: fileencoding=utf-8
23
24 """Core engine for the fast-export command."""
25
26 # TODO: if a new_git_branch below gets merged repeatedly, the tip of the branch
27 # is not updated (because the parent of commit is already merged, so we don't
28 # set new_git_branch to the previously used name)
29
30 from email.Utils import parseaddr
31 import sys, time
32
33 import bzrlib.branch
34 import bzrlib.revision
35 from bzrlib import (
36     builtins,
37     errors as bazErrors,
38     osutils,
39     progress,
40     trace,
41     )
42
43 from bzrlib.plugins.fastimport import commands, helpers, marks_file
44
45
class BzrFastExporter(object):
    """Export the history of a Bazaar branch as a fast-import stream.

    Typical use is to construct an exporter and call run(), which walks
    the interesting revisions oldest-first and writes one command per
    revision (plus feature, tag and checkpoint commands) to the
    destination stream.
    """

    # Mark stored in revid_to_mark by emit_commit() for ghost revisions,
    # i.e. revisions that are referenced but not present in the repository.
    _GHOST_MARK = -1

    def __init__(self, source, destination, git_branch=None, checkpoint=-1,
        import_marks_file=None, export_marks_file=None, revision=None,
        verbose=False, plain_format=False):
        """Export branch data in fast import format.

        :param source: location of the branch to export
        :param destination: path of the output file; None or '-' means
          standard output and a name ending in 'gz' is written through gzip
        :param git_branch: name used to build the 'refs/heads/...' ref
        :param checkpoint: if positive, emit a checkpoint (and save the
          marks) every that many commits
        :param import_marks_file: marks file to load before exporting
        :param export_marks_file: marks file to write during/after export
        :param revision: optional revision range restricting the export
        :param verbose: if True, report progress more often
        :param plain_format: if True, 'classic' fast-import format is
          used without any extended features; if False, the generated
          data is richer and includes information like multiple
          authors, revision properties, etc.
        """
        self.source = source
        # Only streams opened here are closed at the end of run(); the
        # standard output stream belongs to the caller and is just flushed.
        self._owns_outf = False
        if destination is None or destination == '-':
            self.outf = helpers.binary_stream(sys.stdout)
        elif destination.endswith('gz'):
            import gzip
            self.outf = gzip.open(destination, 'wb')
            self._owns_outf = True
        else:
            self.outf = open(destination, 'wb')
            self._owns_outf = True
        self.git_branch = git_branch
        self.checkpoint = checkpoint
        self.import_marks_file = import_marks_file
        self.export_marks_file = export_marks_file
        self.revision = revision
        self.excluded_revisions = set()
        self.plain_format = plain_format
        self._multi_author_api_available = hasattr(bzrlib.revision.Revision,
            'get_apparent_authors')
        self.properties_to_exclude = ['authors', 'author']

        # Progress reporting stuff
        self.verbose = verbose
        if verbose:
            self.progress_every = 100
        else:
            self.progress_every = 1000
        self._start_time = time.time()
        self._commit_total = 0

        # Load the marks and initialise things accordingly
        self.revid_to_mark = {}
        self.branch_names = {}
        if self.import_marks_file:
            marks_info = marks_file.import_marks(self.import_marks_file)
            if marks_info is not None:
                # The marks file maps mark => revision-id; we need the
                # reverse lookup.
                self.revid_to_mark = dict((r, m) for m, r in
                    marks_info[0].items())
                self.branch_names = marks_info[1]

    def interesting_history(self):
        """Return the revision-ids to export, oldest first.

        As a side effect, when a revision range with a starting point was
        requested, self.excluded_revisions is set to the revisions
        reachable from that starting point so they are never emitted.
        """
        if self.revision:
            rev1, rev2 = builtins._get_revision_range(self.revision,
                self.branch, "fast-export")
            start_rev_id = rev1.rev_id
            end_rev_id = rev2.rev_id
        else:
            start_rev_id = None
            end_rev_id = None
        self.note("Calculating the revisions to include ...")
        view_revisions = [rev_id for rev_id, _, _, _ in
            self.branch.iter_merge_sorted_revisions(end_rev_id, start_rev_id)]
        # The iterator yields newest first; the export needs oldest first.
        view_revisions.reverse()
        # If a starting point was given, we need to later check that we don't
        # start emitting revisions from before that point. Collect the
        # revisions to exclude now ...
        if start_rev_id is not None:
            self.note("Calculating the revisions to exclude ...")
            self.excluded_revisions = set(rev_id for rev_id, _, _, _ in
                self.branch.iter_merge_sorted_revisions(start_rev_id))
        return view_revisions

    def run(self):
        """Open the source branch and write the complete export.

        The output stream is closed afterwards if this exporter opened it,
        otherwise it is flushed; this happens even if the export fails.
        """
        try:
            # Open the source
            self.branch = bzrlib.branch.Branch.open_containing(self.source)[0]

            # Export the data
            self.branch.repository.lock_read()
            try:
                interesting = self.interesting_history()
                self._commit_total = len(interesting)
                self.note("Starting export of %d revisions ..." %
                    self._commit_total)
                if not self.plain_format:
                    self.emit_features()
                for revid in interesting:
                    self.emit_commit(revid, self.git_branch)
                if self.branch.supports_tags():
                    self.emit_tags()
            finally:
                self.branch.repository.unlock()

            # Save the marks if requested
            self._save_marks()
            self.dump_stats()
        finally:
            self._finish_output()

    def _finish_output(self):
        """Close the output stream if we opened it, otherwise flush it."""
        if getattr(self, '_owns_outf', False):
            # Closing matters for gzip output: the trailer is only written
            # on close.
            self.outf.close()
        else:
            self.outf.flush()

    def note(self, msg, *args):
        """Output a note but timestamp it."""
        msg = "%s %s" % (self._time_of_day(), msg)
        trace.note(msg, *args)

    def warning(self, msg, *args):
        """Output a warning but timestamp it."""
        msg = "%s WARNING: %s" % (self._time_of_day(), msg)
        trace.warning(msg, *args)

    def _time_of_day(self):
        """Time of day as a string."""
        # Note: this is a separate method so tests can patch in a fixed value
        return time.strftime("%H:%M:%S")

    def report_progress(self, commit_count, details=''):
        """Emit a progress note every self.progress_every commits.

        :param commit_count: number of commits exported so far
        :param details: extra text appended to the note
        """
        if not commit_count or commit_count % self.progress_every != 0:
            return
        if self._commit_total:
            counts = "%d/%d" % (commit_count, self._commit_total)
        else:
            counts = "%d" % (commit_count,)
        minutes = (time.time() - self._start_time) / 60
        if minutes > 0:
            rate = commit_count * 1.0 / minutes
            if rate > 10:
                rate_str = "at %.0f/minute " % rate
            else:
                rate_str = "at %.1f/minute " % rate
        else:
            # No measurable time has elapsed (coarse or adjusted clock), so
            # a rate cannot be computed; omit it rather than divide by zero.
            rate_str = ""
        self.note("%s commits exported %s%s" % (counts, rate_str, details))

    def dump_stats(self):
        """Emit a note summarising how many revisions were exported."""
        time_required = progress.str_tdelta(time.time() - self._start_time)
        rc = len(self.revid_to_mark)
        self.note("Exported %d %s in %s",
            rc, helpers.single_plural(rc, "revision", "revisions"),
            time_required)

    def print_cmd(self, cmd):
        """Write the repr() of a command, followed by a newline."""
        # Wrap in a tuple so a tuple-like cmd is not expanded as several
        # format arguments.
        self.outf.write("%r\n" % (cmd,))

    def _save_marks(self):
        """Write the marks file, if one was requested."""
        if self.export_marks_file:
            # The marks file is keyed by mark, not by revision-id.
            revision_ids = dict((m, r) for r, m in self.revid_to_mark.items())
            marks_file.export_marks(self.export_marks_file, revision_ids,
                self.branch_names)

    def is_empty_dir(self, tree, path):
        """Return True if path is a directory without entries in tree.

        Returns False (after a warning) when the tree has no file-id for
        the path, and False when the path is not a directory.
        """
        path_id = tree.path2id(path)
        if path_id is None:
            self.warning("Skipping empty_dir detection - no file_id for %s" %
                (path,))
            return False

        # Continue if path is not a directory
        if tree.kind(path_id) != 'directory':
            return False

        # Use treewalk to find the contents of our directory. Only the first
        # block yielded (the one for the directory itself) is needed, so
        # stop there instead of walking the whole subtree.
        for dir_block in tree.walkdirs(prefix=path):
            return len(dir_block[1]) == 0
        return False

    def emit_features(self):
        """Emit one feature command per known feature, in sorted order."""
        for feature in sorted(commands.FEATURE_NAMES):
            self.print_cmd(commands.FeatureCommand(feature))

    def emit_commit(self, revid, git_branch):
        """Emit the commit command for revid, unless already done or excluded.

        :param revid: revision-id to export
        :param git_branch: branch name the commit is recorded on; replaced
          by a temporary name for additional parentless commits
        """
        if revid in self.revid_to_mark or revid in self.excluded_revisions:
            return

        # Get the Revision object
        try:
            revobj = self.branch.repository.get_revision(revid)
        except bazErrors.NoSuchRevision:
            # This is a ghost revision. Mark it as not found and next!
            self.revid_to_mark[revid] = self._GHOST_MARK
            return

        # Get the primary parent
        # TODO: Consider the excluded revisions when deciding the parents.
        # Currently, a commit with parents that are excluded ought to be
        # triggering the git_branch calculation below (and it is not).
        # IGC 20090824
        ncommits = len(self.revid_to_mark)
        nparents = len(revobj.parent_ids)
        if nparents == 0:
            if ncommits:
                # This is a parentless commit but it's not the first one
                # output. We need to create a new temporary branch for it
                # otherwise git-fast-import will assume the previous commit
                # was this one's parent
                git_branch = self._next_tmp_branch_name()
            parent = bzrlib.revision.NULL_REVISION
        else:
            parent = revobj.parent_ids[0]

        # Print the commit
        git_ref = 'refs/heads/%s' % (git_branch,)
        mark = ncommits + 1
        self.revid_to_mark[revid] = mark
        file_cmds = self._get_filecommands(parent, revid)
        self.print_cmd(self._get_commit_command(git_ref, mark, revobj,
            file_cmds))

        # Report progress and checkpoint if it's time for that
        self.report_progress(ncommits)
        if (self.checkpoint > 0 and ncommits
            and ncommits % self.checkpoint == 0):
            self.note("Exported %i commits - adding checkpoint to output"
                % ncommits)
            self._save_marks()
            self.print_cmd(commands.CheckpointCommand())

    def _get_name_email(self, user):
        """Split a user string into a (name, email) tuple."""
        if user.find('<') == -1:
            # If the email isn't inside <>, we need to use it as the name
            # in order for things to round-trip correctly.
            # (note: parseaddr('a@b.com') => name:'', email: 'a@b.com')
            return user, ''
        name, email = parseaddr(user)
        return name, email

    def _get_parent_marks(self, revobj):
        """Return (from_, merges) for the parents of revobj as mark strings.

        Excluded parents, parents without a mark and ghost parents are
        skipped. Both values are None when no usable parent remains.
        """
        parent_marks = []
        for p in revobj.parent_ids:
            if p in self.excluded_revisions:
                continue
            try:
                parent_mark = self.revid_to_mark[p]
            except KeyError:
                # ghost - ignore
                continue
            if parent_mark == self._GHOST_MARK:
                # A ghost that emit_commit() already visited - ignore too,
                # otherwise the stream would reference the bogus mark ':-1'.
                continue
            parent_marks.append(":%s" % parent_mark)
        if not parent_marks:
            return None, None
        return parent_marks[0], parent_marks[1:]

    def _get_commit_command(self, git_ref, mark, revobj, file_cmds):
        """Build the CommitCommand for a revision.

        :param git_ref: full ref name the commit is made on
        :param mark: mark assigned to this commit
        :param revobj: the Revision object being exported
        :param file_cmds: the file commands describing the tree changes
        """
        when = (revobj.timestamp, revobj.timezone)

        # Get the committer and author info
        committer = revobj.committer
        committer_info = self._get_name_email(committer) + when
        if self._multi_author_api_available:
            # Work on a copy so the list handed out by the revision object
            # is left untouched.
            authors = list(revobj.get_apparent_authors())
            author = authors[0]
            more_authors = authors[1:]
        else:
            more_authors = []
            author = revobj.get_apparent_author()
        if more_authors:
            author_info = self._get_name_email(author) + when
            more_author_info = [self._get_name_email(a) + when
                for a in more_authors]
        elif author != committer:
            author_info = self._get_name_email(author) + when
            more_author_info = None
        else:
            # A sole author identical to the committer is not repeated.
            author_info = None
            more_author_info = None

        # Get the parents in terms of marks
        from_, merges = self._get_parent_marks(revobj)

        # Filter the revision properties. Some metadata (like the
        # author information) is already exposed in other ways so
        # don't repeat it here.
        if self.plain_format:
            properties = None
        else:
            # Filter a copy: the revision object's own dict must not lose
            # its author properties as a side effect of exporting.
            properties = dict(revobj.properties)
            for prop in self.properties_to_exclude:
                properties.pop(prop, None)

        # Build and return the result
        return commands.CommitCommand(git_ref, mark, author_info,
            committer_info, revobj.message, from_, merges, iter(file_cmds),
            more_authors=more_author_info, properties=properties)

    def _get_revision_trees(self, parent, revision_id):
        """Return (tree_old, tree_new) for a parent and a revision.

        If the parent's inventory is malformed, the parent's own first
        parent is used instead. If the revision's inventory is malformed,
        tree_new is None.
        """
        repo = self.branch.repository
        try:
            tree_old = repo.revision_tree(parent)
        except bazErrors.UnexpectedInventoryFormat:
            self.warning("Parent is malformed - diffing against previous parent")
            # We can't find the old parent. Let's diff against his parent
            pp = repo.get_revision(parent)
            tree_old = repo.revision_tree(pp.parent_ids[0])
        tree_new = None
        try:
            tree_new = repo.revision_tree(revision_id)
        except bazErrors.UnexpectedInventoryFormat:
            # We can't really do anything anymore
            self.warning("Revision %s is malformed - skipping" % revision_id)
        return tree_old, tree_new

    def _get_filecommands(self, parent, revision_id):
        """Get the list of FileCommands for the changes between two revisions."""
        tree_old, tree_new = self._get_revision_trees(parent, revision_id)
        if not(tree_old and tree_new):
            # Something is wrong with this revision - ignore the filecommands
            return []

        changes = tree_new.changes_from(tree_old)

        # Make "modified" have 3-tuples, as added does
        my_modified = [ x[0:3] for x in changes.modified ]

        # The potential interaction between renames and deletes is messy.
        # Handle it here ...
        file_cmds, rd_modifies, renamed = self._process_renames_and_deletes(
            changes.renamed, changes.removed, revision_id, tree_old)

        # Map kind changes to a modification of the new kind
        for path, id_, kind1, kind2 in changes.kind_changed:
            path = self._adjust_path_for_renames(path, renamed, revision_id)
            # IGC: I don't understand why a delete is needed here.
            # In fact, it seems harmful? If you uncomment this line,
            # please file a bug explaining why you needed to.
            #file_cmds.append(commands.FileDeleteCommand(path))
            my_modified.append((path, id_, kind2))

        # Record modifications
        for path, id_, kind in changes.added + my_modified + rd_modifies:
            if kind == 'file':
                text = tree_new.get_file_text(id_)
                file_cmds.append(commands.FileModifyCommand(path, 'file',
                    tree_new.is_executable(id_), None, text))
            elif kind == 'symlink':
                file_cmds.append(commands.FileModifyCommand(path, 'symlink',
                    False, None, tree_new.get_symlink_target(id_)))
            elif kind == 'directory':
                file_cmds.append(commands.FileModifyCommand(path, 'directory',
                    False, None, None))
            else:
                self.warning("cannot export '%s' of kind %s yet - ignoring" %
                    (path, kind))
        return file_cmds

    def _process_renames_and_deletes(self, renames, deletes,
        revision_id, tree_old):
        """Turn the renames and deletes of a revision into file commands.

        :param renames: 6-tuples (oldpath, newpath, id, kind,
          text_modified, meta_modified)
        :param deletes: 3-tuples (path, id, kind)
        :param revision_id: revision being exported (used in notes only)
        :param tree_old: the tree the changes are relative to
        :return: (file_cmds, modifies, renamed) where modifies is a list
          of (newpath, id, kind) for renamed entries that also changed and
          renamed is a list of [oldpath, newpath] pairs
        """
        file_cmds = []
        modifies = []
        renamed = []

        # See https://bugs.edge.launchpad.net/bzr-fastimport/+bug/268933.
        # In a nutshell, there are several nasty cases:
        #
        # 1) bzr rm a; bzr mv b a; bzr commit
        # 2) bzr mv x/y z; bzr rm x; commit
        #
        # The first must come out with the delete first like this:
        #
        # D a
        # R b a
        #
        # The second case must come out with the rename first like this:
        #
        # R x/y z
        # D x
        #
        # So outputting all deletes first or all renames first won't work.
        # Instead, we need to make multiple passes over the various lists to
        # get the ordering right.

        must_be_renamed = {}
        old_to_new = {}
        deleted_paths = set(p for p, _, _ in deletes)
        for (oldpath, newpath, id_, kind,
                text_modified, meta_modified) in renames:
            if newpath in deleted_paths:
                # Case 1 above: the rename target must be deleted first.
                file_cmds.append(commands.FileDeleteCommand(newpath))
                deleted_paths.remove(newpath)
            if (self.is_empty_dir(tree_old, oldpath)):
                self.note("Skipping empty dir %s in rev %s" % (oldpath,
                    revision_id))
                continue
            #oldpath = self._adjust_path_for_renames(oldpath, renamed,
            #    revision_id)
            renamed.append([oldpath, newpath])
            old_to_new[oldpath] = newpath
            file_cmds.append(commands.FileRenameCommand(oldpath, newpath))
            if text_modified or meta_modified:
                modifies.append((newpath, id_, kind))

            # Renaming a directory implies all children must be renamed.
            # Note: changes_from() doesn't handle this
            if kind == 'directory':
                for p, e in tree_old.inventory.iter_entries_by_dir(from_dir=id_):
                    old_child_path = osutils.pathjoin(oldpath, p)
                    new_child_path = osutils.pathjoin(newpath, p)
                    must_be_renamed[old_child_path] = new_child_path

        # Add children not already renamed
        if must_be_renamed:
            still_to_be_renamed = set(must_be_renamed) - set(old_to_new)
            for old_child_path in sorted(still_to_be_renamed):
                new_child_path = must_be_renamed[old_child_path]
                if self.verbose:
                    self.note("implicitly renaming %s => %s" % (old_child_path,
                        new_child_path))
                file_cmds.append(commands.FileRenameCommand(old_child_path,
                    new_child_path))

        # Record remaining deletes
        for path, id_, kind in deletes:
            if path not in deleted_paths:
                continue
            #path = self._adjust_path_for_renames(path, renamed, revision_id)
            file_cmds.append(commands.FileDeleteCommand(path))
        return file_cmds, modifies, renamed

    def _adjust_path_for_renames(self, path, renamed, revision_id):
        """Return path as it reads after applying the given renames.

        :param path: the path to adjust
        :param renamed: (old, new) pairs, applied in order
        :param revision_id: revision being exported (used in notes only)
        """
        # If a previous rename is found, we should adjust the path
        for old, new in renamed:
            if path == old:
                self.note("Changing path %s given rename to %s in revision %s"
                    % (path, new, revision_id))
                path = new
            elif path.startswith(old + '/'):
                self.note(
                    "Adjusting path %s given rename of %s to %s in revision %s"
                    % (path, old, new, revision_id))
                # Swap the leading directory only: a deeper component that
                # happens to have the same name must be left alone.
                path = new + path[len(old):]
        return path

    def emit_tags(self):
        """Emit a reset command for each tag that points at an exported revision."""
        for tag, revid in self.branch.tags.get_tag_dict().items():
            mark = self.revid_to_mark.get(revid, self._GHOST_MARK)
            if mark == self._GHOST_MARK:
                # Either never exported or a ghost; there is no mark the
                # tag could point at.
                self.warning('not creating tag %r pointing to non-existent '
                    'revision %s' % (tag, revid))
            else:
                git_ref = 'refs/tags/%s' % tag
                self.print_cmd(commands.ResetCommand(git_ref, ":" + str(mark)))

    def _next_tmp_branch_name(self):
        """Return a unique branch name. The name will start with "tmp"."""
        prefix = 'tmp'
        if prefix not in self.branch_names:
            # First temporary branch: plain "tmp", counter starts at zero.
            self.branch_names[prefix] = 0
            return prefix
        self.branch_names[prefix] += 1
        return '%s.%d' % (prefix, self.branch_names[prefix])