Python chunked base64 encoding
import base64
import os
from binascii import b2a_base64
from math import ceil
from tempfile import TemporaryFile, NamedTemporaryFile
from timeit import timeit
def original_chunked_encode(input, input_size, output, read_size=1024):
    """
    Read a file in configurable-sized chunks and write it base64-encoded
    to an output file.

    This is an optimization over ``base64.encode``, which only reads 57
    bytes at a time from the input file. Normally this is fine if the
    file in question was opened with ``open``, because Python buffers
    the data internally and only feeds out 57 bytes at a time. But if
    the input file is something like a file stream read over the
    network, only 57 bytes will be read per call, which is very slow
    if the stream is not buffered some other way.

    This is the case for MongoDB GridFS. The GridOut file returned by
    GridFS is not a normal file on disk. Instead it's a file read in
    255 KB (261120 byte) chunks from MongoDB. If you read from it 57
    bytes at a time, GridFS will fetch a whole 255 KB chunk and make
    lots of copies of that chunk to return only 57 bytes per read. By
    reading in increments of the GridFS chunk size, performance is more
    than 200 times better:

        File size 10 MB
        Save to MongoDB took 0.271495819092 seconds
        Fast Base 64 encode (chunk size 261120) took 0.250380992889 seconds
        Base 64 encode (chunk size 57) took 62.9280769825 seconds

        File size 100 MB
        Save to MongoDB took 0.994009971619 seconds
        Fast Base 64 encode (chunk size 261120) took 2.78231501579 seconds
        Base 64 encode (chunk size 57) took 645.734956026 seconds

    For regular files on disk, there is no noticeable performance gain
    for this function over ``base64.encode`` because of Python's
    built-in buffering for disk files.

    Args:
        input (file): File-like object (implements ``read()``).
        input_size (int): Size of the input in bytes.
        output (file): File-like object (implements ``write()``).
        read_size (int): How many bytes to read from ``input`` at a time.
    """
    # 57 bytes of input become 76 bytes of base64 output
    chunk_size = base64.MAXBINSIZE
    base64_line_size = base64.MAXLINESIZE

    # Round the read size down to a multiple of the chunk size so the
    # base64 output is RFC 3548 compliant (76-char lines).
    read_size = read_size - (read_size % chunk_size)
    num_reads = int(ceil(input_size / float(read_size)))
    base64_lines_per_read = read_size / chunk_size

    input.seek(0)
    for r in xrange(num_reads):
        is_last_read = r == num_reads - 1
        s = input.read(read_size)
        if not s:
            # If this happens, ``input_size`` is wrong or the file is
            # corrupt.
            raise ValueError(
                u'Expected to need to read %d times but got no data back on read %d' % (
                    num_reads, r + 1))

        data = b2a_base64(s)
        if is_last_read:
            # The last chunk will be smaller than the others, so the
            # line count needs to be recalculated. b2a_base64 adds a
            # line break, so don't count that char.
            base64_lines_per_read = int(ceil((len(data) - 1) / float(base64_line_size)))

        # Split the encoded data into base64_lines_per_read lines, each
        # 76 chars long.
        for l in xrange(base64_lines_per_read):
            is_last_line = l == base64_lines_per_read - 1
            pos = l * base64_line_size
            if is_last_line and is_last_read:
                # Slice to the end of ``data`` so the trailing \n from
                # b2a_base64 is kept even when the final line is
                # exactly 76 chars long.
                line = data[pos:]
            else:
                line = data[pos:pos + base64_line_size]
            output.write(line)
            if not (is_last_line and is_last_read):
                # The very last line already has a \n from b2a_base64;
                # the other lines do not, so add it.
                output.write(b'\n')
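

# A quick illustration of the 57/76 relationship relied on above (an
# added sanity check, not part of the original benchmark): 57 input
# bytes encode to exactly 76 base64 chars, and b2a_base64 appends a
# newline, hence 77 bytes total.
assert len(b2a_base64(b'\x00' * 57)) == base64.MAXLINESIZE + 1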
def latest_chunked_encode(
        input, output, read_size=1024, write_size=(base64.MAXLINESIZE + 1) * 64, input_size=None):
    """
    Read a file in configurable-sized chunks and write it base64-encoded
    to an output file.

    Args:
        input (file): File-like object (implements ``read()``).
        output (file): File-like object (implements ``write()``).
        read_size (int): How many bytes to read from ``input`` at a
            time. More efficient if in increments of 57.
        write_size (int): How many bytes to write at a time. More
            efficient if in increments of 77.
    """
    # 57 bytes of input become 76 bytes of base64 output
    chunk_size = base64.MAXBINSIZE
    base64_line_size = base64.MAXLINESIZE

    # The encode size needs to be a multiple of the chunk size for the
    # base64 output to be RFC 3548 compliant (76-char lines).
    buffer_read_size = max(chunk_size, read_size - (read_size % chunk_size))

    input.seek(0)
    read_buffer = bytearray()
    write_buffer = bytearray()

    while True:
        # Read from the file into the buffer until we have enough data
        # to meet buffer_read_size
        while input and len(read_buffer) < buffer_read_size:
            s = input.read(read_size)
            if s:
                read_buffer.extend(s)
            else:
                # Nothing left to read
                input = None

        if not len(read_buffer):
            # Nothing in the buffer to encode, finished
            break

        # Base64-encode up to buffer_read_size bytes and drop the
        # trailing line break.
        data = memoryview(b2a_base64(read_buffer[:buffer_read_size]))[:-1]
        # Put any unread data back into the buffer
        read_buffer = read_buffer[buffer_read_size:]

        # Slice the encoded data into base64_line_size length lines and
        # append a line break to each.
        for pos in xrange(0, len(data), base64_line_size):
            write_buffer.extend(data[pos:pos + base64_line_size])
            write_buffer.extend(b'\n')

            if len(write_buffer) >= write_size:
                # Flush the write buffer
                output.write(write_buffer)
                del write_buffer[:]

    if len(write_buffer):
        output.write(write_buffer)
        del write_buffer[:]
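

# A minimal usage sketch (illustrative, not from the original gist):
# stream one binary file into a base64 text file using a 64 KB read
# buffer. The paths are placeholders.
def example_encode_file(src_path, dst_path):
    with open(src_path, 'rb') as src, open(dst_path, 'wb') as dst:
        latest_chunked_encode(src, dst, read_size=64 * 1024)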
def standard_base64_encode(input, output, read_size=1024, input_size=None):
    # Baseline: the standard library encoder, which reads 57 bytes at a
    # time. The extra arguments are accepted so it matches the other
    # encoders' signatures in the benchmark below.
    base64.encode(input, output)
def test1(encoder, bufsize, iterations=10):
    test_size = 1024 * 1024 * 10

    with NamedTemporaryFile() as test_file:
        # Fill the test file with 10 MB of random data
        for _ in range(test_size / 4096):
            test_file.write(os.urandom(4096))
        test_file.flush()

        def test():
            # Disable file buffering to simulate socket behavior
            with open(test_file.name, 'rb', buffering=False) as input, TemporaryFile() as output:
                encoder(input=input, output=output, read_size=bufsize, input_size=test_size)

        print('%-22s %s seconds for %s iterations' % (
            encoder.__name__, timeit(test, number=iterations), iterations))


for bufsize in (4096, 2048, 1024, 17 * 57, 100, 57):
    print('--- bufsize %4s' % bufsize)
    for f in (standard_base64_encode, original_chunked_encode, latest_chunked_encode):
        test1(encoder=f, bufsize=bufsize)
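

# A hedged sketch of the GridFS use case described in the docstring of
# original_chunked_encode. It assumes a MongoDB server on localhost and
# pymongo installed; the database name is illustrative. GridOut exposes
# ``chunk_size`` (255 KB by default), so reading in multiples of it
# avoids the copy-per-57-bytes behavior that makes ``base64.encode``
# slow here.
def gridfs_example():
    import gridfs
    from pymongo import MongoClient

    fs = gridfs.GridFS(MongoClient().gridfs_example)
    file_id = fs.put(os.urandom(1024 * 1024))
    grid_out = fs.get(file_id)  # GridOut: reads from MongoDB in chunks
    with TemporaryFile() as encoded:
        latest_chunked_encode(grid_out, encoded, read_size=grid_out.chunk_size)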