JorgeMadson · March 27, 2025 14:05
diff --git a/data_loading_utils.py b/data_loading_utils.py
 def read_lines_from_file_as_data_chunks(file_name, chunk_size, callback, return_whole_chunk=False):
    """
    Read a text file chunk by chunk and hand its lines to a callback.

    The file is never loaded as a whole: only the current chunk plus the
    unfinished tail of the previous chunk are kept in memory.
    Lines are delivered without their trailing newline. After the last line
    the callback is called once more with data=None and eof=True.

    :param file_name: path of file to read
    :param chunk_size: number of characters to read at a time
    :param callback: callback method, prototype ----> def callback(data, eof, file_name)
    :param return_whole_chunk: if True, the callback receives a list holding all
        complete lines of a chunk (possibly empty) instead of one line per call
    :return: None
    """

    def read_in_chunks(file_obj, chunk_size=5000):
        """
        https://stackoverflow.com/a/519653/5130720
        Lazy generator yielding successive pieces of an open file.
        Default chunk size: 5000.
        """
        while True:
            data = file_obj.read(chunk_size)
            if not data:
                break
            yield data

    def deliver(lines):
        """Pass complete lines to the callback, as one list or one by one."""
        if return_whole_chunk:
            callback(data=lines, eof=False, file_name=file_name)
        else:
            for line in lines:
                callback(data=line, eof=False, file_name=file_name)

    with open(file_name, 'r') as fp:
        # text after the last newline seen so far; it is completed by the next chunk
        data_left_over = ''

        for chunk in read_in_chunks(fp, chunk_size):
            # Split on '\n' only. Text mode already translates '\r\n' and '\r'
            # to '\n', whereas str.splitlines() would also break on form feeds,
            # '\x1c', '\u2028', ... and made the result depend on where the
            # chunk boundaries happened to fall.
            lines = (data_left_over + chunk).split('\n')

            # the last piece is either an incomplete line or '' when the
            # chunk ended exactly on a newline
            data_left_over = lines.pop()

            deliver(lines)

        # Process remaining data (file did not end with a newline)
        if data_left_over:
            deliver([data_left_over])

    # Notify end of file
    callback(data=None, eof=True, file_name=file_name)
diff --git a/main.py b/main.py
 from data_loading_utils import read_lines_from_file_as_data_chunks

 # File handed to the chunked reader.
 file_name = 'file_name.ext'
 # Amount of data requested per read; tune it to the memory of your machine.
 CHUNK_SIZE = 10 ** 6

 # callback method
 def process_lines(data, eof, file_name):
    """
    Callback that prints every received line together with a running line number.

    :param data: one single line of the file while eof is False
    :param eof: True on the final call that signals the end of the file
    :param file_name: name of the file being read (not used here)
    :return: None
    """
    # check if end of file reached
    if eof:
        # restart the numbering so a following file is counted from 1 again
        process_lines.line_number = 0
        print('End of file reached')
        return

    # The counter is stored on the function object so that it survives between
    # calls; a plain local variable would start at 0 again for every line.
    process_lines.line_number = getattr(process_lines, 'line_number', 0) + 1
    print('Data:', process_lines.line_number, data)

 # Script entry point. process_lines is the callback: the reader calls it
 # for the lines of the file and once more to signal the end of the file.
 if __name__ == "__main__":
    read_lines_from_file_as_data_chunks(
        file_name=file_name,
        chunk_size=CHUNK_SIZE,
        callback=process_lines,
    )
	def read_lines_from_file_as_data_chunks(file_name, chunk_size, callback, return_whole_chunk=False):
	    """
	    Read a text file chunk by chunk and hand its lines to a callback.

	    The file is never loaded as a whole: only the current chunk plus the
	    unfinished tail of the previous chunk are kept in memory.
	    Lines are delivered without their trailing newline. After the last line
	    the callback is called once more with data=None and eof=True.

	    :param file_name: path of file to read
	    :param chunk_size: number of characters to read at a time
	    :param callback: callback method, prototype ----> def callback(data, eof, file_name)
	    :param return_whole_chunk: if True, the callback receives a list holding all
	        complete lines of a chunk (possibly empty) instead of one line per call
	    :return: None
	    """

	    def read_in_chunks(file_obj, chunk_size=5000):
	        """
	        https://stackoverflow.com/a/519653/5130720
	        Lazy generator yielding successive pieces of an open file.
	        Default chunk size: 5000.
	        """
	        while True:
	            data = file_obj.read(chunk_size)
	            if not data:
	                break
	            yield data

	    def deliver(lines):
	        """Pass complete lines to the callback, as one list or one by one."""
	        if return_whole_chunk:
	            callback(data=lines, eof=False, file_name=file_name)
	        else:
	            for line in lines:
	                callback(data=line, eof=False, file_name=file_name)

	    with open(file_name, 'r') as fp:
	        # text after the last newline seen so far; it is completed by the next chunk
	        data_left_over = ''

	        for chunk in read_in_chunks(fp, chunk_size):
	            # Split on '\n' only. Text mode already translates '\r\n' and '\r'
	            # to '\n', whereas str.splitlines() would also break on form feeds,
	            # '\x1c', '\u2028', ... and made the result depend on where the
	            # chunk boundaries happened to fall.
	            lines = (data_left_over + chunk).split('\n')

	            # the last piece is either an incomplete line or '' when the
	            # chunk ended exactly on a newline
	            data_left_over = lines.pop()

	            deliver(lines)

	        # Process remaining data (file did not end with a newline)
	        if data_left_over:
	            deliver([data_left_over])

	    # Notify end of file
	    callback(data=None, eof=True, file_name=file_name)
	from data_loading_utils import read_lines_from_file_as_data_chunks

	# File handed to the chunked reader.
	file_name = 'file_name.ext'
	# Amount of data requested per read; tune it to the memory of your machine.
	CHUNK_SIZE = 10 ** 6

	# callback method
	def process_lines(data, eof, file_name):
	    """
	    Callback that prints every received line together with a running line number.

	    :param data: one single line of the file while eof is False
	    :param eof: True on the final call that signals the end of the file
	    :param file_name: name of the file being read (not used here)
	    :return: None
	    """
	    # check if end of file reached
	    if eof:
	        # restart the numbering so a following file is counted from 1 again
	        process_lines.line_number = 0
	        print('End of file reached')
	        return

	    # The counter is stored on the function object so that it survives between
	    # calls; a plain local variable would start at 0 again for every line.
	    process_lines.line_number = getattr(process_lines, 'line_number', 0) + 1
	    print('Data:', process_lines.line_number, data)

	# Script entry point. process_lines is the callback: the reader calls it
	# for the lines of the file and once more to signal the end of the file.
	if __name__ == "__main__":
	    read_lines_from_file_as_data_chunks(
	        file_name=file_name,
	        chunk_size=CHUNK_SIZE,
	        callback=process_lines,
	    )