@erjan · Created February 26, 2025 08:31
JSON to CSV file splitter - mandy chan
import json
import os
import time
import csv


def dump_to_csv(data, filename):
    """Write a nested {target_aoi_id: {neighbor_aoi_id: distance}} dict to a tab-separated CSV file."""
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        # Columns: target_aoi_id, neighbor_aoi_id, distance
        writer.writerow(["target_aoi_id", "neighbor_aoi_id", "distance"])  # Header row
        for guid_station, contents in data.items():
            for neighbor, distance in contents.items():
                writer.writerow([guid_station, neighbor, distance])
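
# A minimal sketch of what dump_to_csv produces, assuming the nested-dict
# input shape used throughout this script ({target_aoi_id: {neighbor_aoi_id:
# distance}}); the IDs and distances below are invented for illustration:
#
#     sample = {"station_a": {"station_b": 120.5, "station_c": 340.0}}
#     dump_to_csv(sample, "sample.csv")
#
# sample.csv then holds tab-separated rows:
#
#     target_aoi_id    neighbor_aoi_id    distance
#     station_a        station_b          120.5
#     station_a        station_c          340.0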

def split_json_by_guid_station(input_file, output_dir, chunk_size_mb):
    """
    Split a JSON file into smaller CSV files, ensuring that each file
    contains complete 'guid_station' entries. A 'guid_station' is never
    split across multiple files: each station's batch of data either fits
    into the current chunk in full or starts a new one.

    Includes timing and progress output. Average completion time is about
    70 seconds for a 430 MB input file.

    Args:
        input_file (str): Path to the input JSON file.
        output_dir (str): Directory to store the split CSV files.
        chunk_size_mb (int): Approximate size of each output file in MB.
            This is a target, not a hard limit; files may be slightly
            larger to avoid splitting 'guid_station' entries.
    """
    start_time = time.time()  # Start the overall timer
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    chunk_size_bytes = chunk_size_mb * 1024 * 1024
    file_count = 0
    current_chunk = {}
    current_chunk_size = 0
    output_file = None
    print(f"Starting to process {input_file}...")
    with open(input_file, 'r') as f:
        try:
            data = json.load(f)  # Load the entire file into a Python dictionary
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            return
    if not isinstance(data, dict):
        print("Error: Input file does not contain a top-level JSON object (dictionary).")
        return
    total_guid_stations = len(data)
    guid_station_count = 0
    # Store the first guid_station of each chunk for validation
    first_guid_stations = {}
    for guid_station, contents in data.items():
        loop_start_time = time.time()  # Start a timer for this guid_station
        # Serialize the entry to JSON to measure its size in bytes
        entry_string = json.dumps({guid_station: contents})
        entry_size = len(entry_string.encode('utf-8'))
        if current_chunk_size + entry_size > chunk_size_bytes and current_chunk:
            # The entry would overflow the target size: save the current chunk
            if output_file:
                dump_to_csv(current_chunk, output_filename)
                output_file.close()
                print(f"CSV chunk saved to: {output_filename}")
                # Validation: remember the first guid_station of the saved chunk
                current_chunk_keys = list(current_chunk.keys())
                if len(current_chunk_keys) > 0:
                    first_guid_stations[file_count] = current_chunk_keys[0]
                output_file = None  # Reset after closing
            else:
                print("Chunk was empty and not saved")
            # Start a new chunk
            current_chunk = {}
            current_chunk_size = 0
        # Add the guid_station entry to the current chunk
        current_chunk[guid_station] = contents
        current_chunk_size += entry_size
        # Open a new output file if one is not already open. The handle is
        # used mainly as an "open chunk" marker; dump_to_csv reopens the
        # path itself when the chunk is flushed.
        if output_file is None:
            file_count += 1
            output_filename = os.path.join(output_dir, f"chunk_{file_count:04d}.csv")
            output_file = open(output_filename, 'w')
            # Save the first guid_station of the file to validate that
            # no station is split across files
            first_guid_stations[file_count] = guid_station
        guid_station_count += 1
        loop_end_time = time.time()  # End the timer for this guid_station
        loop_duration = loop_end_time - loop_start_time
        print(f"Processed guid_station {guid_station_count}/{total_guid_stations} "
              f"({guid_station[:20]}...): Chunk size: {current_chunk_size / (1024 * 1024):.2f} MB. "
              f"Time taken: {loop_duration:.2f} seconds. Current csv file {output_filename}")
    # Save the last chunk if it's not empty
    if current_chunk:
        if output_file:
            dump_to_csv(current_chunk, output_filename)
            output_file.close()
            print(f"Final CSV chunk saved to: {output_filename}")
            # Validation: remember the first guid_station of the final chunk
            current_chunk_keys = list(current_chunk.keys())
            if len(current_chunk_keys) > 0:
                first_guid_stations[file_count] = current_chunk_keys[0]
        else:
            print("Final chunk was empty and not saved")
    end_time = time.time()  # End the overall timer
    total_time = end_time - start_time
    print(f"JSON file split into {file_count} CSV files in: {output_dir}")
    print(f"Total time taken: {total_time:.2f} seconds")
input_file = "nearest_51_100_20250101.json" # Replace with the name of your actual JSON file
input_file_name = input_file.split('.')[0]
output_dir = f"output_csv_chunks_for_{input_file_name}"
chunk_size_mb = 13
split_json_by_guid_station(input_file, output_dir, chunk_size_mb)
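
The script collects first_guid_stations for validation but never actually checks it. The sketch below is an added helper, not part of the original gist, that verifies the no-split guarantee by re-reading the chunk files; it assumes only the chunk_NNNN.csv naming and tab-separated layout produced above.

import csv
import glob
import os

def check_no_split(output_dir):
    """Verify that each target_aoi_id appears in exactly one chunk file."""
    seen = {}  # target_aoi_id -> set of chunk files it appears in
    for path in sorted(glob.glob(os.path.join(output_dir, "chunk_*.csv"))):
        with open(path, newline='') as f:
            reader = csv.reader(f, delimiter='\t')
            next(reader, None)  # Skip the header row
            for row in reader:
                seen.setdefault(row[0], set()).add(path)
    split = {k: v for k, v in seen.items() if len(v) > 1}
    if split:
        print(f"{len(split)} guid_stations are split across chunk files")
    else:
        print(f"OK: all {len(seen)} guid_stations each live in a single chunk file")

check_no_split(output_dir)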