-
-
Save JorgeMadson/dbc529d81a93a88df528954df619cd8d to your computer and use it in GitHub Desktop.
Read large files line by line without loading the entire file into memory. Supports files that are gigabytes in size.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def read_lines_from_file_as_data_chunks(file_name, chunk_size, callback, return_whole_chunk=False):
    """
    Read a text file chunk by chunk and hand its lines to a callback.

    Only one chunk, plus any unfinished line carried over from the previous
    chunk, is held in memory at a time, so the file can be of any size.
    Lines are delimited the way ``str.splitlines`` delimits them, and the
    lines delivered do not depend on where the chunk boundaries fall.

    :param file_name: path of the file to read
    :param chunk_size: number of characters to read at a time
    :param callback: callback method, prototype ----> def callback(data, eof, file_name)
        It is always called with keyword arguments. While reading, ``eof`` is
        False and ``data`` is one line without its terminator (or a list of
        such lines when ``return_whole_chunk`` is True). After the last line
        it is called once more with ``data=None`` and ``eof=True``.
    :param return_whole_chunk: if True, pass all complete lines of a chunk to
        the callback as one list instead of one call per line; the list is
        empty when a chunk did not complete any line
    :return: None
    """
    def read_in_chunks(file_obj, chunk_size=5000):
        """
        https://stackoverflow.com/a/519653/5130720
        Lazy function to read a file piece by piece.
        Default chunk size: 5000.
        """
        while True:
            data = file_obj.read(chunk_size)
            if not data:
                break
            yield data

    def deliver(lines):
        """Pass a list of complete lines to the callback in the requested form."""
        if return_whole_chunk:
            callback(data=lines, eof=False, file_name=file_name)
        else:
            for line in lines:
                callback(data=line, eof=False, file_name=file_name)

    with open(file_name, 'r') as fp:
        data_left_over = None
        for chunk in read_in_chunks(fp, chunk_size):
            # prepend the unfinished line carried over from the previous chunk
            if data_left_over is not None:
                current_chunk = data_left_over + chunk
            else:
                current_chunk = chunk
            lines = current_chunk.splitlines()
            # The completeness test must use the same notion of "line break"
            # as splitlines(); checking only for '\n' would glue two lines
            # together whenever a chunk ends on another boundary such as
            # '\x0c'. A one-character string splits to [''] exactly when that
            # character is a line boundary.
            if current_chunk[-1].splitlines() == ['']:
                data_left_over = None
            else:
                # last line is incomplete; hold it back until more data arrives
                data_left_over = lines.pop()
            deliver(lines)
        # Process remaining data (a final line without a trailing line break)
        if data_left_over is not None:
            deliver([data_left_over])
        # Notify end of file
        callback(data=None, eof=True, file_name=file_name)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from data_loading_utils import read_lines_from_file_as_data_chunks | |
# path of the file to read
file_name = 'file_name.ext'
# configure this variable depending on your machine's hardware configuration
CHUNK_SIZE = 1000000
# callback method | |
def process_lines(data, eof, file_name):
    """
    Example callback: print every line together with its running line number.

    The function is called once per line, so the count is stored on the
    function object; a plain local variable would start again from zero on
    every call and every line would be reported as number 1. The count is
    cleared at end of file so that the next file starts from 1 again.

    :param data: one single line of the file (None on the end-of-file call)
    :param eof: True on the final call that signals the end of the file
    :param file_name: name of the file being read (not used here)
    :return: None
    """
    # check if end of file reached
    if eof:
        process_lines.line_number = 0
        print('End of file reached')
        return
    # process data, data is one single line of the file
    process_lines.line_number = getattr(process_lines, 'line_number', 0) + 1
    print('Data:', process_lines.line_number, data)
if __name__ == "__main__":
    # process_lines is the callback method.
    # It will be called for all the lines, with parameter data representing
    # one single line of the file at a time
    read_lines_from_file_as_data_chunks(file_name, chunk_size=CHUNK_SIZE, callback=process_lines)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment