dulwich/archive.py

# archive.py -- Creating a tarball archive from a Git tree
   2 # Copyright (C) 2015 Jonas Haag <jonas@lophus.org>
   3 # Copyright (C) 2015 Jelmer Vernooij <jelmer@jelmer.uk>
   4 #
   5 # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
   7 # or (at your option) any later version. You can redistribute it and/or
   8 # modify it under the terms of either of these two licenses.
   9 #
  10 # Unless required by applicable law or agreed to in writing, software
  11 # distributed under the License is distributed on an "AS IS" BASIS,
  12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 # See the License for the specific language governing permissions and
  14 # limitations under the License.
  15 #
  16 # You should have received a copy of the licenses; if not, see
  17 # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
  18 # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
  19 # License, Version 2.0.
  20 #
  21
  22 """Generates tarballs for Git trees.
  23
  24 """
  25
  26 import posixpath
  27 import stat
  28 import tarfile
  29 import struct
  30 from os import SEEK_END
  31 from io import BytesIO
  32 from contextlib import closing
  33
  34
  35 class ChunkedBytesIO(object):
  36     """Turn a list of bytestrings into a file-like object.
  37
  38     This is similar to creating a `BytesIO` from a concatenation of the
  39     bytestring list, but saves memory by NOT creating one giant bytestring
  40     first::
  41
  42         BytesIO(b''.join(list_of_bytestrings)) =~= ChunkedBytesIO(
  43             list_of_bytestrings)
  44     """
  45     def __init__(self, contents):
  46         self.contents = contents
  47         self.pos = (0, 0)
  48
  49     def read(self, maxbytes=None):
  50         if maxbytes < 0:
  51             maxbytes = float('inf')
  52
  53         buf = []
  54         chunk, cursor = self.pos
  55
  56         while chunk < len(self.contents):
  57             if maxbytes < len(self.contents[chunk]) - cursor:
  58                 buf.append(self.contents[chunk][cursor:cursor+maxbytes])
  59                 cursor += maxbytes
  60                 self.pos = (chunk, cursor)
  61                 break
  62             else:
  63                 buf.append(self.contents[chunk][cursor:])
  64                 maxbytes -= len(self.contents[chunk]) - cursor
  65                 chunk += 1
  66                 cursor = 0
  67                 self.pos = (chunk, cursor)
  68         return b''.join(buf)
  69
  70
  71 def tar_stream(store, tree, mtime, prefix=b'', format=''):
  72     """Generate a tar stream for the contents of a Git tree.
  73
  74     Returns a generator that lazily assembles a .tar.gz archive, yielding it in
  75     pieces (bytestrings). To obtain the complete .tar.gz binary file, simply
  76     concatenate these chunks.
  77
  78     :param store: Object store to retrieve objects from
  79     :param tree: Tree object for the tree root
  80     :param mtime: UNIX timestamp that is assigned as the modification time for
  81         all files, and the gzip header modification time if format='gz'
  82     :param format: Optional compression format for tarball
  83     :return: Bytestrings
  84     """
  85     buf = BytesIO()
  86     with closing(tarfile.open(None, "w:%s" % format, buf)) as tar:
  87         if format == 'gz':
  88             # Manually correct the gzip header file modification time so that
  89             # archives created from the same Git tree are always identical.
  90             # The gzip header file modification time is not currenctly
  91             # accessible from the tarfile API, see:
  92             # https://bugs.python.org/issue31526
  93             buf.seek(0)
  94             assert buf.read(2) == b'\x1f\x8b', 'Invalid gzip header'
  95             buf.seek(4)
  96             buf.write(struct.pack('<L', mtime))
  97             buf.seek(0, SEEK_END)
  98
  99         for entry_abspath, entry in _walk_tree(store, tree, prefix):
 100             try:
 101                 blob = store[entry.sha]
 102             except KeyError:
 103                 # Entry probably refers to a submodule, which we don't yet
 104                 # support.
 105                 continue
 106             data = ChunkedBytesIO(blob.chunked)
 107
 108             info = tarfile.TarInfo()
 109             # tarfile only works with ascii.
 110             info.name = entry_abspath.decode('ascii')
 111             info.size = blob.raw_length()
 112             info.mode = entry.mode
 113             info.mtime = mtime
 114
 115             tar.addfile(info, data)
 116             yield buf.getvalue()
 117             buf.truncate(0)
 118             buf.seek(0)
 119     yield buf.getvalue()
 120
 121
 122 def _walk_tree(store, tree, root=b''):
 123     """Recursively walk a dulwich Tree, yielding tuples of
 124     (absolute path, TreeEntry) along the way.
 125     """
 126     for entry in tree.iteritems():
 127         entry_abspath = posixpath.join(root, entry.path)
 128         if stat.S_ISDIR(entry.mode):
 129             for _ in _walk_tree(store, store[entry.sha], entry_abspath):
 130                 yield _
 131         else:
 132             yield (entry_abspath, entry)