Skip to content

Instantly share code, notes, and snippets.

@JorgeMadson
Forked from iyvinjose/data_loading_utils.py
Last active March 27, 2025 14:05
Show Gist options
  • Save JorgeMadson/dbc529d81a93a88df528954df619cd8d to your computer and use it in GitHub Desktop.
Save JorgeMadson/dbc529d81a93a88df528954df619cd8d to your computer and use it in GitHub Desktop.
Read large files line by line without loading the entire file into memory. Supports files that are gigabytes in size.
def read_lines_from_file_as_data_chunks(file_name, chunk_size, callback, return_whole_chunk=False):
    """
    Read a text file line by line, regardless of its size.

    The file is read in pieces of ``chunk_size`` characters, so only about one
    chunk (plus any line still being assembled) is held in memory at a time.
    Lines are delimited by the newline character only, which is what a file
    opened in text mode yields for every standard line ending. The newline
    itself is not passed to the callback.

    :param file_name: path of file to read
    :param chunk_size: number of characters to read at a time
    :param callback: callback method, prototype ----> def callback(data, eof, file_name).
        It is always called with keyword arguments: first with ``eof=False``
        and ``data`` set to a line (or a list of lines), and one last time
        with ``data=None`` and ``eof=True`` after the file has been read.
    :param return_whole_chunk: if True, pass all complete lines found in a chunk
        as one list instead of line by line (the list is empty when a chunk
        does not complete any line)
    :return: None
    """
    def read_in_chunks(file_obj, chunk_size=5000):
        """
        https://stackoverflow.com/a/519653/5130720
        Lazy function to read a file
        Default chunk size: 5000.
        """
        while True:
            data = file_obj.read(chunk_size)
            if not data:
                break
            yield data

    def emit(lines):
        """Hand complete lines to the callback, as one list or one at a time."""
        if return_whole_chunk:
            callback(data=lines, eof=False, file_name=file_name)
        else:
            for line in lines:
                callback(data=line, eof=False, file_name=file_name)

    with open(file_name, 'r') as fp:
        # Text of the last, still unterminated line of the previous chunk
        data_left_over = ''
        for chunk in read_in_chunks(fp, chunk_size):
            # Split on '\n' only. str.splitlines() also breaks on characters
            # such as form feed or U+2028, and a chunk that happened to end on
            # one of them used to lose that boundary and glue two lines together.
            lines = (data_left_over + chunk).split('\n')
            # The final piece is always the unterminated remainder; it is the
            # empty string when the chunk ended exactly on a newline.
            data_left_over = lines.pop()
            emit(lines)
        # Process remaining data (a last line without a trailing newline)
        if data_left_over:
            emit([data_left_over])
    # Notify end of file
    callback(data=None, eof=True, file_name=file_name)
from data_loading_utils import read_lines_from_file_as_data_chunks
# Placeholder path of the file to read
file_name = "file_name.ext"
# Size of each read; configure this depending on your machine's hardware configuration
CHUNK_SIZE = 10 ** 6
# callback method
def process_lines(data, eof, file_name):
    """
    Example callback: print each line together with its 1-based line number.

    :param data: one single line of the file (not used when eof is True)
    :param eof: True when the end of the file has been reached
    :param file_name: name of the file being read (not used here)
    :return: None
    """
    # check if end of file reached
    if not eof:
        # The running count is stored on the function object: a plain local
        # variable would restart at 0 on every call and always print 1.
        line_number = getattr(process_lines, 'line_number', 0) + 1
        process_lines.line_number = line_number
        # process data, data is one single line of the file
        print('Data:', line_number, data)
    else:
        # end of file reached; reset so the next file is numbered from 1 again
        process_lines.line_number = 0
        print('End of file reached')
if __name__ == "__main__":
    # process_lines is the callback method: it is handed the lines of the
    # file one at a time through its data parameter.
    read_lines_from_file_as_data_chunks(
        file_name,
        callback=process_lines,
        chunk_size=CHUNK_SIZE,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment