Created
April 4, 2025 10:25
-
-
Save philippmuench/b428c1c3ab0b255342011ff52b2daa96 to your computer and use it in GitHub Desktop.
check_for_exact_copies.r
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript
# Script to find exact matches between evaluation and training datasets.
# Identifies which samples from the evaluation set are identical to samples
# in the training set.

# Load necessary libraries.
# NOTE(review): nothing in this script calls a Matrix function by name;
# presumably the package is attached so that Matrix-class objects stored in the
# .rds inputs get their methods -- confirm against the data files.
if (!requireNamespace("Matrix", quietly = TRUE)) {
  local({
    # Under Rscript the session is non-interactive, so install.packages()
    # cannot prompt for a CRAN mirror. Use the configured mirror when there is
    # one and fall back to an explicit mirror otherwise.
    repos <- getOption("repos")
    if (is.null(repos) || isTRUE(unname(repos["CRAN"]) == "@CRAN@")) {
      repos <- c(CRAN = "https://cloud.r-project.org")
    }
    install.packages("Matrix", repos = repos)
  })
}
library(Matrix)
# Check whether two matrices (or arrays / vectors) hold exactly the same values.
#
# Arguments:
#   matrix1, matrix2: the two objects to compare.
#
# Returns:
#   A single TRUE or FALSE (never NA). The result is TRUE when both objects
#   have the same dimensions and length, missing values sit at the same
#   positions in both, and every non-missing element compares equal with `==`.
#   Attributes other than `dim` (e.g. dimnames) are not compared.
are_matrices_identical <- function(matrix1, matrix2) {
  # Different shapes can never be identical.
  if (!identical(dim(matrix1), dim(matrix2))) {
    return(FALSE)
  }

  # Objects without a dim attribute both report NULL above; without this check
  # `==` would silently recycle the shorter one.
  if (length(matrix1) != length(matrix2)) {
    return(FALSE)
  }

  # `NA == x` is NA, which would make all() return NA and break callers that
  # use the result in `if`. Compare the NA pattern explicitly instead.
  if (any(is.na(matrix1) != is.na(matrix2))) {
    return(FALSE)
  }

  # The remaining NAs are at matching positions, so they can be ignored here.
  isTRUE(all(matrix1 == matrix2, na.rm = TRUE))
}
# Find exact copies between evaluation and training samples.
#
# Every evaluation sample is compared against every training sample with
# are_matrices_identical(); each matching pair becomes one row of the result.
#
# Arguments:
#   eval_samples:  list of evaluation samples.
#   train_samples: list of training samples.
#   eval_indices:  identifiers reported for eval_samples (same order/length).
#   train_indices: identifiers reported for train_samples (same order/length).
#
# Returns:
#   A data frame with columns EvalSampleIndex and TrainSampleIndex, one row per
#   matching (evaluation, training) pair; zero rows when nothing matches or
#   when either list is empty.
find_exact_copies <- function(eval_samples, train_samples, eval_indices, train_indices) {
  n_eval <- length(eval_samples)
  n_train <- length(train_samples)

  cat("Searching for exact copies across", n_eval, "evaluation samples and", n_train, "training samples...\n")

  # Report progress roughly every 10% of the evaluation samples.
  report_interval <- max(1, floor(n_eval / 10))

  # One slot per evaluation sample; filled only when that sample has matches.
  # Collecting pieces and binding once avoids rbind() inside the nested loop.
  matches_per_eval <- vector("list", n_eval)

  # seq_len() keeps the loop from running at all when there is nothing to
  # check (1:0 would iterate over c(1, 0) and index out of bounds).
  for (i in seq_len(n_eval)) {
    eval_sample <- eval_samples[[i]]

    if (i %% report_interval == 0) {
      cat(" Progress:", round(i / n_eval * 100), "%\n")
    }

    # Compare this evaluation sample with every training sample.
    is_copy <- vapply(
      train_samples,
      function(train_sample) isTRUE(are_matrices_identical(eval_sample, train_sample)),
      logical(1),
      USE.NAMES = FALSE
    )
    hits <- which(is_copy)

    if (length(hits) > 0) {
      matches_per_eval[[i]] <- data.frame(
        EvalSampleIndex = rep(eval_indices[i], length(hits)),
        TrainSampleIndex = train_indices[hits],
        stringsAsFactors = FALSE
      )
    }
  }

  matches_per_eval <- Filter(Negate(is.null), matches_per_eval)

  if (length(matches_per_eval) == 0) {
    # Same empty shape regardless of the type of the supplied indices.
    return(data.frame(
      EvalSampleIndex = integer(),
      TrainSampleIndex = integer(),
      stringsAsFactors = FALSE
    ))
  }

  exact_matches <- do.call(rbind, matches_per_eval)
  rownames(exact_matches) <- NULL
  exact_matches
}
# Extract the samples at the given positions of data$X as a list.
#
# Despite the name, no label filtering happens here: the caller decides which
# indices to pass (all samples, or the positives of one GO term).
#
# Arguments:
#   data:    list-like object with an element X that is indexed as
#            X[sample, , ] (three indices).
#   indices: positions along the first dimension of data$X to extract.
#
# Returns:
#   An unnamed list with one element per entry of `indices`, in the same
#   order; an empty list when `indices` is empty.
extract_positive_samples <- function(data, indices) {
  # lapply() handles empty `indices` correctly, unlike 1:length(indices),
  # which would iterate over c(1, 0). unname() keeps the result unnamed even
  # when the indices carry names (e.g. from which() on a named vector).
  lapply(unname(indices), function(sample_idx) data$X[sample_idx, , ])
}
# Main execution.
#
# Loads the training and evaluation datasets, looks for evaluation samples that
# are exact copies of training samples (first across all samples, then per GO
# term among the positive samples), prints the findings and writes each
# non-empty result to a CSV file in the working directory.
#
# Arguments:
#   data_file:            path to the training .rds file; the object is used
#                         as `$X` (indexed X[sample, , ]) and `$Y`
#                         (indexed Y[sample, go_term]).
#   eval_file:            path to the evaluation .rds file, used the same way.
#   max_samples_to_check: upper limit on the number of evaluation samples used
#                         in the all-samples comparison; larger evaluation sets
#                         are randomly subsampled to this size.
#
# Returns (invisibly):
#   A list with `all_samples` (matches from the all-samples comparison) and
#   `by_go_term` (named list of per-GO-term match data frames).
main <- function(data_file = "training_data/pred_MF_train_v3.rds",
                 eval_file = "evaluation_data/pred_MF_expT1.rds",
                 max_samples_to_check = 1000) {
  # --- Load Data ---
  cat("Loading training data from:", data_file, "\n")
  dat <- readRDS(data_file)
  cat("Training data loaded successfully.\n")

  cat("Loading evaluation data from:", eval_file, "\n")
  eval_dat <- readRDS(eval_file)
  cat("Evaluation data loaded successfully.\n")

  cat("Dimensions of training data features (X):", dim(dat$X), "\n")
  # This line prints the training targets; it was previously labelled as
  # evaluation data.
  cat("Dimensions of training data targets (Y):", dim(dat$Y), "\n")

  stopifnot(
    length(dim(dat$Y)) == 2,
    length(dim(eval_dat$Y)) == 2
  )

  # --- Find GO terms that exist in both datasets ---
  n_go_terms <- ncol(dat$Y)
  cat("Number of GO terms in training data:", n_go_terms, "\n")

  # GO terms are matched by column position, so only columns present in both
  # label matrices can be compared.
  n_eval_go_terms <- ncol(eval_dat$Y)
  if (n_eval_go_terms != n_go_terms) {
    warning(
      "Training data has ", n_go_terms, " GO term columns but evaluation data has ",
      n_eval_go_terms, "; only the first ", min(n_go_terms, n_eval_go_terms),
      " columns are compared.",
      call. = FALSE
    )
  }
  n_shared_go_terms <- min(n_go_terms, n_eval_go_terms)

  # Number of positive (== 1) labels per GO term; missing labels are ignored
  # so that an NA cannot turn the count into NA.
  count_positives <- function(labels) {
    vapply(
      seq_len(n_shared_go_terms),
      function(term) sum(labels[, term] == 1, na.rm = TRUE),
      numeric(1)
    )
  }
  train_pos_counts <- count_positives(dat$Y)
  eval_pos_counts <- count_positives(eval_dat$Y)
  valid_go_terms <- which(train_pos_counts > 0 & eval_pos_counts > 0)

  if (length(valid_go_terms) == 0) {
    stop("No GO terms found with positive examples in both training and evaluation datasets.")
  }
  cat("Number of GO terms with positive examples in both datasets:", length(valid_go_terms), "\n")

  # Check for exact copies across all samples (regardless of GO term)
  cat("\n=== Checking for exact copies across ALL samples ===\n")

  n_eval_samples <- dim(eval_dat$X)[1]
  n_train_samples <- dim(dat$X)[1]
  all_eval_indices <- seq_len(n_eval_samples)
  all_train_indices <- seq_len(n_train_samples)

  cat("Preparing samples for comparison...\n")

  # Only the evaluation side is subsampled; every training sample is still
  # extracted and compared. When sampling kicks in, the result below covers
  # the sampled evaluation subset only, and differs between runs unless the
  # caller sets a seed.
  if (n_eval_samples > max_samples_to_check) {
    cat("Large evaluation dataset detected. Sampling", max_samples_to_check, "samples for efficiency.\n")
    eval_sample_indices <- sample(all_eval_indices, max_samples_to_check)
  } else {
    eval_sample_indices <- all_eval_indices
  }

  # Extract samples
  eval_samples_list <- extract_positive_samples(eval_dat, eval_sample_indices)
  train_samples_list <- extract_positive_samples(dat, all_train_indices)

  # Find exact copies
  exact_matches <- find_exact_copies(
    eval_samples_list, train_samples_list,
    eval_sample_indices, all_train_indices
  )

  # Print results
  if (nrow(exact_matches) > 0) {
    cat("\n=== EXACT COPIES FOUND ===\n")
    # One evaluation sample can match several training samples, so count the
    # distinct evaluation samples rather than the matching pairs.
    cat("Found", length(unique(exact_matches$EvalSampleIndex)), "evaluation samples that are exact copies of training samples.\n")
    cat("Evaluation Sample Index | Training Sample Index\n")
    cat("----------------------------------------\n")
    cat(
      sprintf("%20d | %20d\n", exact_matches$EvalSampleIndex, exact_matches$TrainSampleIndex),
      sep = ""
    )

    # Write to file
    output_file <- "exact_copies_all_samples.csv"
    write.csv(exact_matches, output_file, row.names = FALSE)
    cat("\nResults also saved to:", output_file, "\n")
  } else {
    cat("\nNo exact copies found between evaluation and training samples.\n")
  }

  # Now check for each GO term
  cat("\n=== Checking for exact copies by GO term ===\n")
  go_term_results <- list()

  for (go_term_idx in valid_go_terms) {
    cat("\nAnalyzing GO Term:", go_term_idx, "\n")

    # Positive samples for this GO term. Both sets are non-empty because
    # valid_go_terms only holds terms with positives in both datasets.
    train_positive_indices <- which(dat$Y[, go_term_idx] == 1)
    eval_positive_indices <- which(eval_dat$Y[, go_term_idx] == 1)
    cat(" Number of positive training samples:", length(train_positive_indices), "\n")
    cat(" Number of positive evaluation samples:", length(eval_positive_indices), "\n")

    # Extract samples
    eval_pos_samples_list <- extract_positive_samples(eval_dat, eval_positive_indices)
    train_pos_samples_list <- extract_positive_samples(dat, train_positive_indices)

    # Find exact copies
    go_term_matches <- find_exact_copies(
      eval_pos_samples_list, train_pos_samples_list,
      eval_positive_indices, train_positive_indices
    )

    # Store results
    go_term_results[[as.character(go_term_idx)]] <- go_term_matches

    # Print results
    if (nrow(go_term_matches) > 0) {
      cat("\n EXACT COPIES FOUND for GO Term", go_term_idx, "\n")
      cat(" Found", length(unique(go_term_matches$EvalSampleIndex)), "evaluation samples that are exact copies of training samples.\n")
      cat(" Evaluation Sample Index | Training Sample Index\n")
      cat(" ----------------------------------------\n")
      cat(
        sprintf(" %20d | %20d\n", go_term_matches$EvalSampleIndex, go_term_matches$TrainSampleIndex),
        sep = ""
      )

      # Write to file
      output_file <- paste0("exact_copies_GO_term_", go_term_idx, ".csv")
      write.csv(go_term_matches, output_file, row.names = FALSE)
      cat("\n Results also saved to:", output_file, "\n")
    } else {
      cat("\n No exact copies found for GO Term", go_term_idx, "\n")
    }
  }

  cat("\n=== Analysis completed ===\n")

  # Hand the collected results back to programmatic callers without printing
  # them when main() is invoked at top level.
  invisible(list(all_samples = exact_matches, by_go_term = go_term_results))
}
# Run the analysis when the script is executed (e.g. via Rscript).
main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment