1 # archive.py -- Creating an archive from a tarball
2 # Copyright (C) 2015 Jonas Haag <jonas@lophus.org>
3 # Copyright (C) 2015 Jelmer Vernooij <jelmer@jelmer.uk>
5 # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
6 # General Public License as public by the Free Software Foundation; version 2.0
7 # or (at your option) any later version. You can redistribute it and/or
8 # modify it under the terms of either of these two licenses.
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
16 # You should have received a copy of the licenses; if not, see
17 # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
18 # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
19 # License, Version 2.0.
22 """Generates tarballs for Git trees.
30 from os import SEEK_END
31 from io import BytesIO
32 from contextlib import closing
class ChunkedBytesIO(object):
    """Turn a list of bytestrings into a file-like object.

    This is similar to creating a `BytesIO` from a concatenation of the
    bytestring list, but saves memory by NOT creating one giant bytestring
    first::

        BytesIO(b''.join(list_of_bytestrings)) =~= ChunkedBytesIO(
            list_of_bytestrings)

    Only sequential ``read()`` is provided. The read position is kept in
    ``self.pos`` as a ``(chunk index, offset inside that chunk)`` tuple.
    """

    def __init__(self, contents):
        """Wrap ``contents``, a list of bytestrings, without copying it."""
        self.contents = contents
        # (index of the current chunk, byte offset inside that chunk)
        self.pos = (0, 0)

    def read(self, maxbytes=None):
        """Read up to ``maxbytes`` bytes starting at the current position.

        :param maxbytes: Maximum number of bytes to return. ``None`` (the
            default) or a negative number means "everything that is left".
        :return: A bytestring; empty once all chunks have been consumed
        """
        # ``None`` must be tested before the comparison: ``None < 0`` raises
        # TypeError on Python 3, which would break a plain ``read()`` call.
        if maxbytes is None or maxbytes < 0:
            remaining = float('inf')
        else:
            remaining = maxbytes

        pieces = []
        chunk, cursor = self.pos

        while chunk < len(self.contents):
            current = self.contents[chunk]
            available = len(current) - cursor
            if remaining < available:
                # The request ends inside this chunk: take a slice and stay
                # on the same chunk. ``remaining`` is always a finite number
                # here because infinity is never smaller than ``available``.
                pieces.append(current[cursor:cursor + remaining])
                cursor += remaining
                break
            # Consume the rest of this chunk and move on to the next one.
            pieces.append(current[cursor:])
            remaining -= available
            chunk += 1
            cursor = 0

        self.pos = (chunk, cursor)
        return b''.join(pieces)
def tar_stream(store, tree, mtime, prefix=b'', format=''):
    """Generate a tar stream for the contents of a Git tree.

    Returns a generator that lazily assembles a tar archive (compressed as
    requested by ``format``, e.g. a .tar.gz for format='gz'), yielding it in
    pieces (bytestrings). To obtain the complete archive file, simply
    concatenate these chunks.

    :param store: Object store to retrieve objects from
    :param tree: Tree object for the tree root
    :param mtime: UNIX timestamp that is assigned as the modification time for
        all files, and the gzip header modification time if format='gz'
    :param prefix: Optional bytestring path that is joined in front of every
        entry path in the archive
    :param format: Optional compression format for tarball
    :return: Bytestrings
    :raise AssertionError: if format='gz' but the buffer does not start with
        the gzip magic bytes
    """
    buf = BytesIO()
    with closing(tarfile.open(None, "w:%s" % format, buf)) as tar:
        if format == 'gz':
            # Manually correct the gzip header file modification time so that
            # archives created from the same Git tree are always identical.
            # The gzip header file modification time is not currently
            # accessible from the tarfile API, see:
            # https://bugs.python.org/issue31526
            buf.seek(0)
            # Explicit check instead of an ``assert`` statement so that it
            # is still performed when Python runs with -O.
            if buf.read(2) != b'\x1f\x8b':
                raise AssertionError('Invalid gzip header')
            # The 4-byte little-endian MTIME field starts at offset 4.
            buf.seek(4)
            buf.write(struct.pack('<L', mtime))
            # Go back to the end so tarfile keeps appending after the header.
            buf.seek(0, SEEK_END)

        for entry_abspath, entry in _walk_tree(store, tree, prefix):
            try:
                blob = store[entry.sha]
            except KeyError:
                # Entry probably refers to a submodule, which we don't yet
                # support.
                continue
            data = ChunkedBytesIO(blob.chunked)

            info = tarfile.TarInfo()
            # tarfile only works with ascii.
            info.name = entry_abspath.decode('ascii')
            info.size = blob.raw_length()
            info.mode = entry.mode
            info.mtime = mtime

            tar.addfile(info, data)
            # Hand out what has been written so far and empty the buffer so
            # that memory use stays bounded by a single entry.
            yield buf.getvalue()
            buf.truncate(0)
            buf.seek(0)
    # Closing the tarfile writes the archive trailer; emit it as last piece.
    yield buf.getvalue()
def _walk_tree(store, tree, root=b''):
    """Walk a dulwich Tree depth-first and yield its non-directory entries.

    Every yielded item is a ``(absolute path, TreeEntry)`` tuple, where the
    absolute path is ``root`` joined with the entry's path using POSIX
    separators. Entries whose mode marks them as directories are not yielded
    themselves; their tree is looked up in ``store`` by sha and walked
    recursively instead.
    """
    for item in tree.iteritems():
        full_path = posixpath.join(root, item.path)
        if not stat.S_ISDIR(item.mode):
            yield (full_path, item)
            continue
        subtree = store[item.sha]
        for nested in _walk_tree(store, subtree, full_path):
            yield nested