Last active
September 4, 2024 19:02
-
-
Save dylanmartin/b16c6588b1fe2cc6fb3766f7d0b771b8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import csv | |
import pandas as pd | |
# --- Step 1: Input Validation ---
def validate_input_file(file_path):
    """Check that ``file_path`` names an existing, readable regular file.

    On failure an error message is printed and the process is terminated
    with exit status 1; on success the function returns ``None``.

    Args:
        file_path: Path of the file to validate.

    Raises:
        SystemExit: With code 1 if the path does not exist, is not a
            regular file, or cannot be read by the current user.
    """
    if not os.path.exists(file_path):
        print(f"Error: {file_path} does not exist.")
        sys.exit(1)
    # os.path.exists() is also true for directories, which would only fail
    # later when the caller tries to open the path as a file.
    if not os.path.isfile(file_path):
        print(f"Error: {file_path} is not a file.")
        sys.exit(1)
    # The contract promises the file is "accessible", so verify read access.
    if not os.access(file_path, os.R_OK):
        print(f"Error: {file_path} is not readable.")
        sys.exit(1)
def validate_directory(directory):
    """Abort the program unless ``directory`` is an existing directory.

    Args:
        directory: Path expected to refer to a directory.

    Raises:
        SystemExit: With code 1 (after printing an error message) when
            ``os.path.isdir`` reports that the path is not a directory.
    """
    if os.path.isdir(directory):
        return
    print(f"Error: {directory} is not a valid directory.")
    sys.exit(1)
# --- Step 2: CSV File Processing ---
def check_files_in_csv_exist(csv_filename):
    """Report which files listed in the first column of a CSV are missing.

    The first row is treated as a header and skipped. For every remaining
    non-empty row, the first cell is joined onto the current working
    directory and tested with ``os.path.exists``. The outcome is printed:
    either the list of missing paths or a confirmation that all exist.

    Args:
        csv_filename: Path to the CSV file. It is validated with
            ``validate_input_file`` first, which exits the process if the
            file is not usable.
    """
    validate_input_file(csv_filename)
    cwd = os.getcwd()

    with open(csv_filename, mode='r', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        # Pass a default so an empty CSV (no header row at all) does not
        # raise StopIteration out of this function.
        next(csvreader, None)

        missing_files = []
        for row in csvreader:
            if not row:  # blank lines come back as empty lists
                continue
            full_path = os.path.join(cwd, row[0])
            if not os.path.exists(full_path):
                missing_files.append(full_path)

    if missing_files:
        print("The following files do not exist:")
        for missing_file in missing_files:
            print(missing_file)
    else:
        print("All files listed in the CSV exist.")
# --- Step 3: Checking for Duplicate Keys in ASEG Files ---
def check_repeated_keys_in_file(file_path):
    """Tell whether a tab-separated file repeats a value in its first column.

    The file is loaded with pandas using the first column as the index
    (pandas' default header handling applies), and every index entry that
    occurs more than once is flagged.

    Args:
        file_path: Path to the tab-separated file to inspect.

    Returns:
        A truthy value if at least one key is repeated, a falsy value if
        none is. ``False`` is also returned when the file cannot be read;
        in that case the error is printed rather than raised.
    """
    try:
        table = pd.read_csv(file_path, sep='\t', index_col=0)
        # keep=False marks every occurrence of a repeated key, not just
        # the second and later ones.
        repeat_mask = table.index.duplicated(keep=False)
        has_repeats = repeat_mask.any()
    except Exception as err:
        print(f"Error reading {file_path}: {err}")
        return False
    return has_repeats
def find_files_with_repeated_keys(directory):
    """Collect every ``.txt`` file under ``directory`` that has repeated keys.

    The directory tree is walked recursively. Each ``.txt`` file is passed
    to ``check_repeated_keys_in_file``; files it flags are printed as they
    are found. If nothing is flagged, a single summary line is printed.

    Args:
        directory: Root directory to search. It is validated with
            ``validate_directory`` first, which exits the process if the
            path is not a directory.

    Returns:
        A list of paths of the flagged files (empty if there are none).
    """
    validate_directory(directory)

    flagged = []
    for parent, _subdirs, names in os.walk(directory):
        for name in names:
            if not name.endswith(".txt"):
                continue
            candidate = os.path.join(parent, name)
            if not check_repeated_keys_in_file(candidate):
                continue
            print(f"Repeated rows found in: {candidate}")
            flagged.append(candidate)

    if not flagged:
        print("No files with repeated keys found.")
    return flagged
# --- Step 4: Check if ASEG Files Contain All ROIs ---
def read_first_column(file_path):
    """Read a tab-delimited file and return its first column.

    The file is parsed with ``header=None``, so the first line is returned
    as data rather than being consumed as a header.

    Args:
        file_path: Path to the tab-delimited file.

    Returns:
        A ``pandas.Series`` holding the first column. If the file cannot be
        read, the error is printed and an empty object-dtype Series is
        returned instead of raising.
    """
    try:
        df = pd.read_csv(file_path, delimiter='\t', header=None)
        return df.iloc[:, 0]
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        # An explicit dtype avoids the pandas (<2.0) deprecation warning for
        # dtype-less empty Series and keeps the result the same on every
        # pandas version.
        return pd.Series(dtype=object)
def check_rois(rois_file, aseg_file):
    """Print whether ``aseg_file`` contains every ROI listed in ``rois_file``.

    Both files are read with ``read_first_column``. The ROIs present in the
    ROIs file but absent from the ASEG file are reported, or a confirmation
    is printed when none are missing. Nothing is returned.

    Args:
        rois_file: Path to the tab-delimited file listing the required ROIs
            in its first column.
        aseg_file: Path to the tab-delimited file to check.
    """
    try:
        rois = set(read_first_column(rois_file))
        aseg = set(read_first_column(aseg_file))
        missing_rois = rois - aseg  # ROIs in rois_file but not in aseg_file
        if missing_rois:
            # Column values are not guaranteed to be strings (numeric labels
            # or NaN for empty cells), and str.join() raises TypeError on
            # those. Stringify first, and sort so the report is
            # reproducible instead of following set iteration order.
            listing = ', '.join(sorted(str(roi) for roi in missing_rois))
            print(f"{aseg_file} is missing the following ROIs: {listing}")
        else:
            print(f"{aseg_file} contains all the ROIs.")
    except Exception as e:
        # Best-effort: one bad file must not stop the remaining checks.
        print(f"An error occurred while checking ROIs: {e}")
def check_aseg_files_for_rois(rois_file, directory):
    """Check every ``.txt`` file directly inside ``directory`` for missing ROIs.

    Only the top level of ``directory`` is listed (no recursion). The ROIs
    file itself is skipped if it lives in that directory. Each remaining
    ``.txt`` file is passed to ``check_rois``, which prints the result.

    Args:
        rois_file: Path to the file listing the required ROIs. Validated
            with ``validate_input_file``, which exits on failure.
        directory: Directory containing the ASEG ``.txt`` files. Validated
            with ``validate_directory``, which exits on failure.
    """
    validate_input_file(rois_file)
    validate_directory(directory)

    # os.listdir() yields bare names while rois_file may be given with a
    # directory prefix, so compare resolved full paths, not raw strings.
    rois_real_path = os.path.realpath(rois_file)

    # Sorted so the files are reported in a stable order.
    for file in sorted(os.listdir(directory)):
        if not file.endswith('.txt'):
            continue
        aseg_file_path = os.path.join(directory, file)
        if os.path.realpath(aseg_file_path) == rois_real_path:
            continue
        check_rois(rois_file, aseg_file_path)
# --- Main Execution Flow ---
if __name__ == "__main__":
    # Command-line entry point: run the three validation steps in order.
    # All three positional arguments are required. sys.argv[0] is the script
    # name, so at least four entries must be present; checking for fewer
    # than two would let the argv[2] / argv[3] reads raise IndexError.
    if len(sys.argv) < 4:
        print("Usage: python validator.py <csv_filename> <rois_file> <directory>")
        sys.exit(1)

    # Extract input arguments
    csv_filename = sys.argv[1]
    rois_file = sys.argv[2]
    directory = sys.argv[3]

    # Validate CSV file for existence of paths
    check_files_in_csv_exist(csv_filename)

    # Check for repeated keys in .txt files in the directory
    find_files_with_repeated_keys(directory)

    # Check ASEG files for missing ROIs
    check_aseg_files_for_rois(rois_file, directory)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment