@erjan · Created February 26, 2025 08:31
JSON to CSV file splitter - mandy chan
import json
import os
import time
import csv


def dump_to_csv(data, filename):
    """Write a nested {target_aoi_id: {neighbor_aoi_id: distance}} dict to a tab-separated CSV file."""
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        # Columns: target_aoi_id, neighbor_aoi_id, distance
        writer.writerow(["target_aoi_id", "neighbor_aoi_id", "distance"])  # Header row
        for guid_station, contents in data.items():
            for neighbor, distance in contents.items():
                writer.writerow([guid_station, neighbor, distance])
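
# A minimal sketch of what dump_to_csv produces, assuming the nested-dict
# input shape used throughout this script ({target_aoi_id: {neighbor_aoi_id:
# distance}}); the IDs and distances below are invented for illustration:
#
#     sample = {"station_a": {"station_b": 120.5, "station_c": 340.0}}
#     dump_to_csv(sample, "sample.csv")
#
# sample.csv then holds tab-separated rows:
#
#     target_aoi_id    neighbor_aoi_id    distance
#     station_a        station_b          120.5
#     station_a        station_c          340.0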

def split_json_by_guid_station(input_file, output_dir, chunk_size_mb):
    """
    Split a JSON file into smaller CSV files, ensuring that each file
    contains complete 'guid_station' entries. A 'guid_station' is never
    split across multiple files: each station's batch of data either fits
    into the current chunk in full or starts a new one.

    Includes timing and progress output. Average completion time is about
    70 seconds for a 430 MB input file.

    Args:
        input_file (str): Path to the input JSON file.
        output_dir (str): Directory to store the split CSV files.
        chunk_size_mb (int): Approximate size of each output file in MB.
            This is a target, not a hard limit; files may be slightly
            larger to avoid splitting 'guid_station' entries.
    """
    start_time = time.time()  # Start the overall timer
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    chunk_size_bytes = chunk_size_mb * 1024 * 1024
    file_count = 0
    current_chunk = {}
    current_chunk_size = 0
    output_file = None
    print(f"Starting to process {input_file}...")
    with open(input_file, 'r') as f:
        try:
            data = json.load(f)  # Load the entire file into a Python dictionary
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            return
    if not isinstance(data, dict):
        print("Error: Input file does not contain a top-level JSON object (dictionary).")
        return
    total_guid_stations = len(data)
    guid_station_count = 0
    # Store the first guid_station of each chunk for validation
    first_guid_stations = {}
    for guid_station, contents in data.items():
        loop_start_time = time.time()  # Start a timer for this guid_station
        # Serialize the entry to JSON to measure its size in bytes
        entry_string = json.dumps({guid_station: contents})
        entry_size = len(entry_string.encode('utf-8'))
        if current_chunk_size + entry_size > chunk_size_bytes and current_chunk:
            # The entry would overflow the target size: save the current chunk
            if output_file:
                dump_to_csv(current_chunk, output_filename)
                output_file.close()
                print(f"CSV chunk saved to: {output_filename}")
                # Validation: remember the first guid_station of the saved chunk
                current_chunk_keys = list(current_chunk.keys())
                if len(current_chunk_keys) > 0:
                    first_guid_stations[file_count] = current_chunk_keys[0]
                output_file = None  # Reset after closing
            else:
                print("Chunk was empty and not saved")
            # Start a new chunk
            current_chunk = {}
            current_chunk_size = 0
        # Add the guid_station entry to the current chunk
        current_chunk[guid_station] = contents
        current_chunk_size += entry_size
        # Open a new output file if one is not already open. The handle is
        # used mainly as an "open chunk" marker; dump_to_csv reopens the
        # path itself when the chunk is flushed.
        if output_file is None:
            file_count += 1
            output_filename = os.path.join(output_dir, f"chunk_{file_count:04d}.csv")
            output_file = open(output_filename, 'w')
            # Save the first guid_station of the file to validate that
            # no station is split across files
            first_guid_stations[file_count] = guid_station
        guid_station_count += 1
        loop_end_time = time.time()  # End the timer for this guid_station
        loop_duration = loop_end_time - loop_start_time
        print(f"Processed guid_station {guid_station_count}/{total_guid_stations} "
              f"({guid_station[:20]}...): Chunk size: {current_chunk_size / (1024 * 1024):.2f} MB. "
              f"Time taken: {loop_duration:.2f} seconds. Current csv file {output_filename}")
    # Save the last chunk if it's not empty
    if current_chunk:
        if output_file:
            dump_to_csv(current_chunk, output_filename)
            output_file.close()
            print(f"Final CSV chunk saved to: {output_filename}")
            # Validation: remember the first guid_station of the final chunk
            current_chunk_keys = list(current_chunk.keys())
            if len(current_chunk_keys) > 0:
                first_guid_stations[file_count] = current_chunk_keys[0]
        else:
            print("Final chunk was empty and not saved")
    end_time = time.time()  # End the overall timer
    total_time = end_time - start_time
    print(f"JSON file split into {file_count} CSV files in: {output_dir}")
    print(f"Total time taken: {total_time:.2f} seconds")
input_file = "nearest_51_100_20250101.json" # Replace with the name of your actual JSON file
input_file_name = input_file.split('.')[0]
output_dir = f"output_csv_chunks_for_{input_file_name}"
chunk_size_mb = 13
split_json_by_guid_station(input_file, output_dir, chunk_size_mb)
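
The script collects first_guid_stations for validation but never actually checks it. The sketch below is an added helper, not part of the original gist, that verifies the no-split guarantee by re-reading the chunk files; it assumes only the chunk_NNNN.csv naming and tab-separated layout produced above.

import csv
import glob
import os

def check_no_split(output_dir):
    """Verify that each target_aoi_id appears in exactly one chunk file."""
    seen = {}  # target_aoi_id -> set of chunk files it appears in
    for path in sorted(glob.glob(os.path.join(output_dir, "chunk_*.csv"))):
        with open(path, newline='') as f:
            reader = csv.reader(f, delimiter='\t')
            next(reader, None)  # Skip the header row
            for row in reader:
                seen.setdefault(row[0], set()).add(path)
    split = {k: v for k, v in seen.items() if len(v) > 1}
    if split:
        print(f"{len(split)} guid_stations are split across chunk files")
    else:
        print(f"OK: all {len(seen)} guid_stations each live in a single chunk file")

check_no_split(output_dir)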