-
-
Save leth/6adb9d30f2fdcb8802532a87dfbeff77 to your computer and use it in GitHub Desktop.
Sample code to build a tar chunk-by-chunk and stream it out all at once.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#
# Building a tar file chunk-by-chunk.
#
# This is a quick bit of sample code for streaming data to a tar file,
# building it piece-by-piece. The tarfile is built on-the-fly and streamed
# back out. This is useful for web applications that need to dynamically
# build a tar file without swamping the server.
from io import BytesIO
from os import walk
from os.path import (
    abspath, join as path_join, sep as path_sep, split as path_split
)
from tarfile import TarFile, NUL, BLOCKSIZE
class FileStream:
    """Write-only, in-memory sink that can be drained piece by piece.

    Data written to the stream accumulates in an internal ``BytesIO``.
    ``read_all()`` hands back everything accumulated so far and starts a
    fresh buffer, so memory use is bounded by the amount written between
    two drains.  ``tell()`` keeps reporting the *total* number of bytes
    ever written; draining does not reset it.
    """

    def __init__(self):
        self.buffer = BytesIO()
        # Running total of bytes written; deliberately not reset by
        # read_all().
        self.offset = 0

    def write(self, s):
        """Append the bytes *s* and return how many bytes were written."""
        self.buffer.write(s)
        written = len(s)
        self.offset += written
        # File objects are expected to report the number of bytes written.
        return written

    def tell(self):
        """Return the total number of bytes written since creation."""
        return self.offset

    def close(self):
        """Close the current internal buffer."""
        self.buffer.close()

    def read_all(self):
        """Return all bytes buffered since the last drain and reset the buffer.

        The internal buffer is always replaced with a fresh one, even when
        reading the old one fails (for example because it was closed).
        """
        try:
            return self.buffer.getvalue()
        finally:
            self.buffer.close()
            self.buffer = BytesIO()
class StreamingTar:
    """Iterable that tars a directory tree and yields the archive in chunks.

    Iterating over an instance walks ``directory`` and produces the bytes of
    an uncompressed tar archive piece by piece, so the whole archive never
    has to be held in memory.  Archive member names are relative to the
    parent of ``directory`` (i.e. they start with the directory's own name).

    :param directory: path of the directory to archive (absolute or
        relative; a trailing separator is accepted).
    :param file_chunk_size: maximum number of bytes read from a file per
        step.  ``None`` or a non-positive value reads each file in one go.
    """

    def __init__(self, directory, file_chunk_size=8192):
        self._directory = directory
        self._file_chunk_size = file_chunk_size

    @staticmethod
    def _stream_file_into_tar(tarinfo, tar, fh, buf_size):
        """Copy one member's data from *fh* into the tar's output stream.

        Exactly ``tarinfo.size`` bytes are copied, because that is the length
        the already-written header announces; copying more or fewer bytes
        would corrupt the archive.  The data is then padded with NULs up to
        a multiple of ``BLOCKSIZE`` and ``tar.offset`` is advanced by the
        padded length.

        This is a generator: it yields (no value) after every chunk written
        and once more after the padding, so the caller can drain the output
        buffer between steps.

        :raises OSError: if *fh* ends before ``tarinfo.size`` bytes were read
            (e.g. the file was truncated after it was stat'ed).
        """
        out = tar.fileobj
        remaining = tarinfo.size
        if buf_size is None or buf_size <= 0:
            # Same meaning as file.read() without a limit: take it all.
            buf_size = remaining
        while remaining > 0:
            chunk = fh.read(min(buf_size, remaining))
            if not chunk:
                raise OSError("unexpected end of data")
            out.write(chunk)
            remaining -= len(chunk)
            yield
        blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
        if remainder > 0:
            out.write(NUL * (BLOCKSIZE - remainder))
            blocks += 1
        # The header was accounted for by addfile(); account for the data.
        tar.offset += blocks * BLOCKSIZE
        yield

    def __iter__(self):
        """Yield the tar archive of the directory as non-empty byte chunks."""
        # Normalise first: resolves relative paths and drops a trailing
        # separator, both of which would otherwise break the split below.
        root = abspath(self._directory)
        prefix, name = path_split(root)
        # Number of leading characters to strip from walked paths to get
        # archive names.  A filesystem root (e.g. "/") already ends with the
        # separator; any other prefix is followed by one.
        if prefix.endswith(path_sep):
            prefix_len = len(prefix)
        else:
            prefix_len = len(prefix) + len(path_sep)

        out = FileStream()
        tar = TarFile(fileobj=out, mode='w')
        try:
            if name:
                # ``name`` is empty only when archiving a filesystem root,
                # which has no directory entry of its own to record.
                tar.add(name=root, arcname=name, recursive=False)
            for path, dirs, files in walk(root):
                arcpath = path[prefix_len:]
                # Add files
                for f in files:
                    filepath = path_join(path, f)
                    with open(filepath, 'rb') as fh:
                        info = tar.gettarinfo(
                            name=filepath, arcname=path_join(arcpath, f),
                            fileobj=fh)
                        if info is None:
                            # File type that tarfile cannot represent.
                            continue
                        # Header only; the data is streamed in below.
                        tar.addfile(info)
                        if info.isreg():
                            # Only regular members carry data; tarfile may
                            # record e.g. a repeated inode as a hard link
                            # with size 0.
                            for _ in self._stream_file_into_tar(
                                    info, tar, fh, self._file_chunk_size):
                                chunk = out.read_all()
                                if chunk:
                                    yield chunk
                # Add directories
                for d in dirs:
                    tar.add(
                        name=path_join(path, d),
                        arcname=path_join(arcpath, d),
                        recursive=False)
                chunk = out.read_all()
                if chunk:
                    yield chunk
            # Closing writes the end-of-archive marker into the buffer.
            tar.close()
            chunk = out.read_all()
            if chunk:
                yield chunk
        finally:
            # Also reached when the consumer stops early or an error occurs;
            # closing an already closed TarFile is a no-op.
            tar.close()
            out.close()
if __name__ == '__main__':
    # Demo: archive the ./foobar directory into out.tar, writing each chunk
    # to disk as soon as it is produced.
    t = StreamingTar(abspath('foobar'))
    with open('out.tar', 'wb') as fh:
        for chunk in t:
            fh.write(chunk)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment