Python chunked base64 encoding
import base64
import os
from binascii import b2a_base64
from math import ceil
from tempfile import TemporaryFile, NamedTemporaryFile
from timeit import timeit
def original_chunked_encode(input, input_size, output, read_size=1024):
    """
    Read a file in configurable-sized chunks and write it base64-encoded
    to an output file.

    This is an optimization over ``base64.encode``, which only reads 57
    bytes at a time from the input file. Normally this is fine if the
    file in question was opened with ``open``, because Python buffers
    the data internally and only feeds out 57 bytes at a time. But if
    the input file is something like a file stream read over the
    network, only 57 bytes will be read per call, which is very slow
    if the stream is not buffered some other way.

    This is the case for MongoDB GridFS. The GridOut file returned by
    GridFS is not a normal file on disk. Instead it's a file read in
    255 KB (261120 byte) chunks from MongoDB. If you read from it 57
    bytes at a time, GridFS will fetch a whole 255 KB chunk and make
    lots of copies of that chunk to return only 57 bytes per read. By
    reading in increments of the GridFS chunk size, performance is more
    than 200 times better:

        File size 10 MB
        Save to MongoDB took 0.271495819092 seconds
        Fast Base 64 encode (chunk size 261120) took 0.250380992889 seconds
        Base 64 encode (chunk size 57) took 62.9280769825 seconds

        File size 100 MB
        Save to MongoDB took 0.994009971619 seconds
        Fast Base 64 encode (chunk size 261120) took 2.78231501579 seconds
        Base 64 encode (chunk size 57) took 645.734956026 seconds

    For regular files on disk, there is no noticeable performance gain
    for this function over ``base64.encode`` because of Python's
    built-in buffering for disk files.

    Args:
        input (file): File-like object (implements ``read()``).
        input_size (int): Size of the input in bytes.
        output (file): File-like object (implements ``write()``).
        read_size (int): How many bytes to read from ``input`` at a time.
    """
    # 57 bytes of input become 76 bytes of base64 output
    chunk_size = base64.MAXBINSIZE
    base64_line_size = base64.MAXLINESIZE

    # Round the read size down to a multiple of the chunk size so the
    # base64 output is RFC 3548 compliant (76-char lines).
    read_size = read_size - (read_size % chunk_size)
    num_reads = int(ceil(input_size / float(read_size)))
    base64_lines_per_read = read_size / chunk_size

    input.seek(0)
    for r in xrange(num_reads):
        is_last_read = r == num_reads - 1
        s = input.read(read_size)
        if not s:
            # If this happens, ``input_size`` is wrong or the file is
            # corrupt.
            raise ValueError(
                u'Expected to need to read %d times but got no data back on read %d' % (
                    num_reads, r + 1))

        data = b2a_base64(s)
        if is_last_read:
            # The last chunk will be smaller than the others, so the
            # line count needs to be recalculated. b2a_base64 adds a
            # line break, so don't count that char.
            base64_lines_per_read = int(ceil((len(data) - 1) / float(base64_line_size)))

        # Split the encoded data into base64_lines_per_read lines, each
        # 76 chars long.
        for l in xrange(base64_lines_per_read):
            is_last_line = l == base64_lines_per_read - 1
            pos = l * base64_line_size
            if is_last_line and is_last_read:
                # Slice to the end of ``data`` so the trailing \n from
                # b2a_base64 is kept even when the final line is
                # exactly 76 chars long.
                line = data[pos:]
            else:
                line = data[pos:pos + base64_line_size]
            output.write(line)
            if not (is_last_line and is_last_read):
                # The very last line already has a \n from b2a_base64;
                # the other lines do not, so add it.
                output.write(b'\n')
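

# A quick illustration of the 57/76 relationship relied on above (an
# added sanity check, not part of the original benchmark): 57 input
# bytes encode to exactly 76 base64 chars, and b2a_base64 appends a
# newline, hence 77 bytes total.
assert len(b2a_base64(b'\x00' * 57)) == base64.MAXLINESIZE + 1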
def latest_chunked_encode(
        input, output, read_size=1024, write_size=(base64.MAXLINESIZE + 1) * 64, input_size=None):
    """
    Read a file in configurable-sized chunks and write it base64-encoded
    to an output file.

    Args:
        input (file): File-like object (implements ``read()``).
        output (file): File-like object (implements ``write()``).
        read_size (int): How many bytes to read from ``input`` at a
            time. More efficient if in increments of 57.
        write_size (int): How many bytes to write at a time. More
            efficient if in increments of 77.
    """
    # 57 bytes of input become 76 bytes of base64 output
    chunk_size = base64.MAXBINSIZE
    base64_line_size = base64.MAXLINESIZE

    # The encode size needs to be a multiple of the chunk size for the
    # base64 output to be RFC 3548 compliant (76-char lines).
    buffer_read_size = max(chunk_size, read_size - (read_size % chunk_size))

    input.seek(0)
    read_buffer = bytearray()
    write_buffer = bytearray()

    while True:
        # Read from the file into the buffer until we have enough data
        # to meet buffer_read_size
        while input and len(read_buffer) < buffer_read_size:
            s = input.read(read_size)
            if s:
                read_buffer.extend(s)
            else:
                # Nothing left to read
                input = None

        if not len(read_buffer):
            # Nothing in the buffer to encode, finished
            break

        # Base64-encode up to buffer_read_size bytes and drop the
        # trailing line break.
        data = memoryview(b2a_base64(read_buffer[:buffer_read_size]))[:-1]
        # Put any unread data back into the buffer
        read_buffer = read_buffer[buffer_read_size:]

        # Slice the encoded data into base64_line_size length lines and
        # append a line break to each.
        for pos in xrange(0, len(data), base64_line_size):
            write_buffer.extend(data[pos:pos + base64_line_size])
            write_buffer.extend(b'\n')

            if len(write_buffer) >= write_size:
                # Flush the write buffer
                output.write(write_buffer)
                del write_buffer[:]

    if len(write_buffer):
        output.write(write_buffer)
        del write_buffer[:]
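

# A minimal usage sketch (illustrative, not from the original gist):
# stream one binary file into a base64 text file using a 64 KB read
# buffer. The paths are placeholders.
def example_encode_file(src_path, dst_path):
    with open(src_path, 'rb') as src, open(dst_path, 'wb') as dst:
        latest_chunked_encode(src, dst, read_size=64 * 1024)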
def standard_base64_encode(input, output, read_size=1024, input_size=None):
    # Baseline: the standard library encoder, which reads 57 bytes at a
    # time. The extra arguments are accepted so it matches the other
    # encoders' signatures in the benchmark below.
    base64.encode(input, output)
def test1(encoder, bufsize, iterations=10):
    test_size = 1024 * 1024 * 10

    with NamedTemporaryFile() as test_file:
        # Fill the test file with 10 MB of random data
        for _ in range(test_size / 4096):
            test_file.write(os.urandom(4096))
        test_file.flush()

        def test():
            # Disable file buffering to simulate socket behavior
            with open(test_file.name, 'rb', buffering=False) as input, TemporaryFile() as output:
                encoder(input=input, output=output, read_size=bufsize, input_size=test_size)

        print('%-22s %s seconds for %s iterations' % (
            encoder.__name__, timeit(test, number=iterations), iterations))


for bufsize in (4096, 2048, 1024, 17 * 57, 100, 57):
    print('--- bufsize %4s' % bufsize)
    for f in (standard_base64_encode, original_chunked_encode, latest_chunked_encode):
        test1(encoder=f, bufsize=bufsize)
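

# A hedged sketch of the GridFS use case described in the docstring of
# original_chunked_encode. It assumes a MongoDB server on localhost and
# pymongo installed; the database name is illustrative. GridOut exposes
# ``chunk_size`` (255 KB by default), so reading in multiples of it
# avoids the copy-per-57-bytes behavior that makes ``base64.encode``
# slow here.
def gridfs_example():
    import gridfs
    from pymongo import MongoClient

    fs = gridfs.GridFS(MongoClient().gridfs_example)
    file_id = fs.put(os.urandom(1024 * 1024))
    grid_out = fs.get(file_id)  # GridOut: reads from MongoDB in chunks
    with TemporaryFile() as encoded:
        latest_chunked_encode(grid_out, encoded, read_size=grid_out.chunk_size)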