Skip to content

Instantly share code, notes, and snippets.

@dylanmartin
Last active September 4, 2024 19:02
Show Gist options
  • Save dylanmartin/b16c6588b1fe2cc6fb3766f7d0b771b8 to your computer and use it in GitHub Desktop.
Save dylanmartin/b16c6588b1fe2cc6fb3766f7d0b771b8 to your computer and use it in GitHub Desktop.
import os
import sys
import csv
import pandas as pd
# --- Step 1: Input Validation ---
def validate_input_file(file_path):
    """Exit with status 1 unless *file_path* is an existing, readable file.

    Args:
        file_path: Path of the file to validate.

    Side effects:
        Prints an error message and calls ``sys.exit(1)`` when the path
        does not exist, is not a regular file, or cannot be read.
    """
    if not os.path.exists(file_path):
        print(f"Error: {file_path} does not exist.")
        sys.exit(1)
    # A directory "exists" too, but opening it as a file would fail later
    # with a less helpful traceback, so reject it here.
    if not os.path.isfile(file_path) or not os.access(file_path, os.R_OK):
        print(f"Error: {file_path} is not a readable file.")
        sys.exit(1)
def validate_directory(directory):
    """Exit with status 1 unless *directory* is an existing directory.

    Prints an error message before exiting; returns ``None`` when the
    directory is valid.
    """
    if os.path.isdir(directory):
        return
    print(f"Error: {directory} is not a valid directory.")
    sys.exit(1)
# --- Step 2: CSV File Processing ---
def check_files_in_csv_exist(csv_filename):
    """Report which paths in the first column of a CSV do not exist.

    The first row is treated as a header and skipped. Each remaining
    first-column value is resolved against the current working directory.

    Args:
        csv_filename: Path to the CSV file to inspect.

    Returns:
        A list of the resolved paths that do not exist (empty when every
        listed file is present).

    Side effects:
        Prints a summary to stdout. Exits via ``validate_input_file`` if
        the CSV itself is not usable.
    """
    validate_input_file(csv_filename)
    cwd = os.getcwd()
    missing_files = []
    # Open and read the CSV file
    with open(csv_filename, mode='r', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        # Skip the header; the default keeps an empty CSV from raising
        # StopIteration.
        next(csvreader, None)
        for row in csvreader:
            # Ignore blank lines and rows whose first cell is empty: joining
            # an empty name would just test the working directory itself.
            if not row or not row[0]:
                continue
            full_path = os.path.join(cwd, row[0])
            if not os.path.exists(full_path):
                missing_files.append(full_path)
    if missing_files:
        print("The following files do not exist:")
        for missing_file in missing_files:
            print(missing_file)
    else:
        print("All files listed in the CSV exist.")
    return missing_files
# --- Step 3: Checking for Duplicate Keys in ASEG Files ---
def check_repeated_keys_in_file(file_path):
    """Tell whether a tab-separated file repeats any first-column key.

    The file is loaded with its first column as the index (the first row
    is consumed as the header by ``pd.read_csv``). Any failure while
    reading or checking is printed and reported as "no duplicates".

    Returns:
        A truthy value when at least one key occurs more than once,
        otherwise a falsy value (``False`` on error).
    """
    try:
        table = pd.read_csv(file_path, sep='\t', index_col=0)
        # keep=False marks every occurrence of a repeated key, not just
        # the later ones; any() collapses that mask to a single flag.
        return table.index.duplicated(keep=False).any()
    except Exception as err:
        print(f"Error reading {file_path}: {err}")
    return False
def find_files_with_repeated_keys(directory):
    """Collect every ``.txt`` file under *directory* that has repeated keys.

    The directory tree is walked recursively; each ``.txt`` file is passed
    to ``check_repeated_keys_in_file`` and reported on stdout when it
    contains duplicates.

    Returns:
        A list of paths of the files in which repeated keys were found.
    """
    validate_directory(directory)
    flagged = []
    for dirpath, _subdirs, filenames in os.walk(directory):
        txt_paths = (
            os.path.join(dirpath, name)
            for name in filenames
            if name.endswith(".txt")
        )
        for candidate in txt_paths:
            if not check_repeated_keys_in_file(candidate):
                continue
            print(f"Repeated rows found in: {candidate}")
            flagged.append(candidate)
    if len(flagged) == 0:
        print("No files with repeated keys found.")
    return flagged
# --- Step 4: Check if ASEG Files Contain All ROIs ---
def read_first_column(file_path):
    """Read the first column of a tab-delimited file.

    The file is parsed without a header row, so its first line is
    returned as data along with the rest.

    Args:
        file_path: Path to the tab-delimited file.

    Returns:
        A ``pandas.Series`` holding the first column. If the file cannot
        be read, the error is printed and an empty object-dtype Series is
        returned instead.
    """
    try:
        df = pd.read_csv(file_path, delimiter='\t', header=None)
        return df.iloc[:, 0]
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        # An explicit dtype avoids the pandas 1.x warning about the
        # default dtype of an empty Series and keeps the fallback
        # consistent across pandas versions.
        return pd.Series(dtype=object)
def check_rois(rois_file, aseg_file):
    """Print which ROIs from *rois_file* are absent from *aseg_file*.

    Both files are read with ``read_first_column``; the comparison is a
    set difference of their first-column values.

    Args:
        rois_file: Path to the file listing the expected ROIs.
        aseg_file: Path to the ASEG file to check.

    Side effects:
        Prints either the missing ROIs or a confirmation that none are
        missing. Unexpected errors are printed rather than raised.
    """
    try:
        rois = set(read_first_column(rois_file))
        aseg = set(read_first_column(aseg_file))
        missing_rois = rois - aseg  # ROIs in rois_file but not in aseg_file
        if missing_rois:
            # Values may be parsed as numbers (or NaN) rather than strings,
            # which str.join rejects; convert first, and sort so the report
            # is stable from run to run.
            missing_labels = sorted(map(str, missing_rois))
            print(f"{aseg_file} is missing the following ROIs: {', '.join(missing_labels)}")
        else:
            print(f"{aseg_file} contains all the ROIs.")
    except Exception as e:
        print(f"An error occurred while checking ROIs: {e}")
def check_aseg_files_for_rois(rois_file, directory):
    """Check every ``.txt`` file directly inside *directory* for missing ROIs.

    Only the top level of *directory* is listed (no recursion). The ROI
    list itself is skipped if it lives in that directory.

    Args:
        rois_file: Path to the file listing the expected ROIs.
        directory: Directory containing the ASEG ``.txt`` files.

    Side effects:
        Exits via the validators if either argument is invalid; otherwise
        prints one report per file through ``check_rois``.
    """
    validate_input_file(rois_file)
    validate_directory(directory)
    # rois_file may be given with a directory prefix, so comparing it with
    # a bare listing name is not enough; compare resolved paths as well.
    rois_real_path = os.path.realpath(rois_file)
    # Sorted so the reports come out in a predictable order.
    for file in sorted(os.listdir(directory)):
        if not file.endswith('.txt'):
            continue
        aseg_file_path = os.path.join(directory, file)
        if file == rois_file or os.path.realpath(aseg_file_path) == rois_real_path:
            continue
        check_rois(rois_file, aseg_file_path)
# --- Main Execution Flow ---
if __name__ == "__main__":
    # Example usage based on command-line arguments.
    # All three positional arguments are read below, so require all of
    # them up front instead of failing later with an IndexError.
    if len(sys.argv) < 4:
        print("Usage: python validator.py <csv_filename> <rois_file> <directory>")
        sys.exit(1)
    # Extract input arguments
    csv_filename, rois_file, directory = sys.argv[1:4]
    # Validate CSV file for existence of paths
    check_files_in_csv_exist(csv_filename)
    # Check for repeated keys in .txt files in the directory
    find_files_with_repeated_keys(directory)
    # Check ASEG files for missing ROIs
    check_aseg_files_for_rois(rois_file, directory)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment