Created February 26, 2025 08:31
JSON to CSV file splitter - mandy chan
import json
import os
import time
import csv


def dump_to_csv(data, filename):
    """Write a {target: {neighbor: distance}} dict as tab-separated rows."""
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        writer.writerow(["target_aoi_id", "neighbor_aoi_id", "distance"])  # Header row
        for guid_station, contents in data.items():
            for content, distance in contents.items():
                writer.writerow([guid_station, content, distance])
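
# For illustration only (hypothetical sample data, not from the original gist):
# dump_to_csv expects the same nested shape as the input JSON, e.g.
#
#     sample = {"station_a": {"station_b": 12.5, "station_c": 40.1}}
#     dump_to_csv(sample, "sample.csv")
#
# which writes one tab-separated row per (target, neighbor, distance) triple.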


def split_json_by_guid_station(input_file, output_dir, chunk_size_mb):
    """
    Split a JSON file into smaller CSV files, ensuring that each file
    contains complete 'guid_station' entries; a 'guid_station' is never
    split across multiple files. Includes timing and progress output.

    Args:
        input_file (str): Path to the input JSON file.
        output_dir (str): Directory to store the split CSV files.
        chunk_size_mb (int): Approximate size of each output file in MB.
            This is a target, not a hard limit: each batch of
            guid_station data is written whole or not at all, so a file
            can exceed the target when a single entry is larger than it.

    Typical runtime: ~70 seconds for a 430 MB input file.
    """
    start_time = time.time()  # Start the overall timer

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    chunk_size_bytes = chunk_size_mb * 1024 * 1024
    file_count = 0
    current_chunk = {}
    current_chunk_size = 0
    output_filename = None  # Name of the chunk file currently being filled

    print(f"Starting to process {input_file}...")

    with open(input_file, 'r') as f:
        try:
            data = json.load(f)  # Load the entire file into a Python dictionary
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            return

    if not isinstance(data, dict):
        print("Error: input file does not contain a top-level JSON object (dictionary).")
        return

    total_guid_stations = len(data)
    guid_station_count = 0
    # Record the first guid_station of each chunk for split validation
    first_guid_stations = {}

    for guid_station, contents in data.items():
        loop_start_time = time.time()  # Timer for this guid_station

        # Serialize the entry to JSON to measure its size in bytes
        entry_string = json.dumps({guid_station: contents})
        entry_size = len(entry_string.encode('utf-8'))

        # If adding this entry would push the chunk past the target size,
        # flush the current chunk to disk first
        if current_chunk_size + entry_size > chunk_size_bytes and current_chunk:
            if output_filename:
                dump_to_csv(current_chunk, output_filename)
                print(f"CSV chunk saved to: {output_filename}")
                # Validation: record the first guid_station of the saved chunk
                first_guid_stations[file_count] = next(iter(current_chunk))
                output_filename = None  # The next entry will start a new file
            else:
                print("Chunk was empty and not saved")
            # Start a new chunk
            current_chunk = {}
            current_chunk_size = 0

        # Add the guid_station entry to the current chunk
        current_chunk[guid_station] = contents
        current_chunk_size += entry_size

        # Assign a new output filename if one is not already in use
        if output_filename is None:
            file_count += 1
            output_filename = os.path.join(output_dir, f"chunk_{file_count:04d}.csv")
            # Record the first guid_station of the file to verify no splits
            first_guid_stations[file_count] = guid_station

        guid_station_count += 1
        loop_duration = time.time() - loop_start_time
        print(f"Processed guid_station {guid_station_count}/{total_guid_stations} "
              f"({guid_station[:20]}...): chunk size: "
              f"{current_chunk_size / (1024 * 1024):.2f} MB. "
              f"Time taken: {loop_duration:.2f} seconds. "
              f"Current CSV file: {output_filename}")

    # Save the last chunk if it's not empty
    if current_chunk:
        if output_filename:
            dump_to_csv(current_chunk, output_filename)
            print(f"Final CSV chunk saved to: {output_filename}")
            # Validation: record the first guid_station of the final chunk
            first_guid_stations[file_count] = next(iter(current_chunk))
        else:
            print("Final chunk was empty and not saved")

    total_time = time.time() - start_time
    print(f"JSON file split into {file_count} CSV files in: {output_dir}")
    print(f"Total time taken: {total_time:.2f} seconds")
input_file = "nearest_51_100_20250101.json" # Replace with the name of your actual JSON file | |
input_file_name = input_file.split('.')[0] | |
output_dir = f"output_csv_chunks_for_{input_file_name}" | |
chunk_size_mb = 13 | |
split_json_by_guid_station(input_file, output_dir, chunk_size_mb) |
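
As a quick sanity check, here is a minimal validation sketch that re-reads the generated chunk files and confirms no target_aoi_id was split across two files. It assumes the chunk_NNNN.csv naming and tab-separated layout produced above; validate_chunks is a hypothetical helper, not part of the original script.

import csv
import glob
import os

def validate_chunks(output_dir):
    # Map each target_aoi_id to the chunk file where it was first seen
    seen = {}
    for path in sorted(glob.glob(os.path.join(output_dir, "chunk_*.csv"))):
        with open(path, newline='') as f:
            reader = csv.reader(f, delimiter='\t')
            next(reader, None)  # Skip the header row
            for row in reader:
                if not row:
                    continue
                target = row[0]
                if seen.get(target, path) != path:
                    print(f"Split detected: {target} appears in {seen[target]} and {path}")
                seen[target] = path
    print(f"Checked {len(seen)} target_aoi_ids across chunk files")

validate_chunks(output_dir)  # e.g. the output_dir defined above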