Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save philippmuench/b428c1c3ab0b255342011ff52b2daa96 to your computer and use it in GitHub Desktop.
Save philippmuench/b428c1c3ab0b255342011ff52b2daa96 to your computer and use it in GitHub Desktop.
check_for_exact_copies.r
#!/usr/bin/env Rscript
# Script to find exact matches between evaluation and training datasets
# Identifies which samples from evaluation set are identical to samples in training set
# Load necessary libraries
# Matrix is installed on demand if it is missing, then attached.
if (!requireNamespace("Matrix", quietly = TRUE)) {
  # A non-interactive Rscript session usually has no CRAN mirror selected, in
  # which case install.packages() aborts instead of prompting. Fall back to the
  # CRAN cloud mirror only when the user has not configured one.
  repos <- getOption("repos")
  if (is.null(repos) || identical(unname(repos["CRAN"]), "@CRAN@")) {
    repos <- c(CRAN = "https://cloud.r-project.org")
  }
  install.packages("Matrix", repos = repos)
}
library(Matrix)
# Function to check if two matrices are exactly the same
#
# Two inputs count as identical when they have the same dimensions, the same
# pattern of missing values (NA/NaN), and equal values everywhere else.
#
# Args:
#   matrix1, matrix2: matrices or arrays (plain vectors also work) to compare.
#
# Returns:
#   A single TRUE or FALSE. The result is never NA, so it is safe to use
#   directly as an `if` condition.
are_matrices_identical <- function(matrix1, matrix2) {
  # Check if dimensions match
  if (!identical(dim(matrix1), dim(matrix2))) {
    return(FALSE)
  }
  # dim() is NULL for plain vectors, so also compare lengths; otherwise `==`
  # would silently recycle the shorter input.
  if (length(matrix1) != length(matrix2)) {
    return(FALSE)
  }
  # `NA == NA` is NA, which would make all() return NA. Compare the positions
  # of missing values explicitly, then compare only the non-missing entries.
  missing1 <- is.na(matrix1)
  missing2 <- is.na(matrix2)
  if (any(missing1 != missing2)) {
    return(FALSE)
  }
  all(matrix1 == matrix2, na.rm = TRUE)
}
# Function to find exact copies between evaluation and training samples
#
# Every evaluation sample is compared against every training sample with
# are_matrices_identical(); each matching pair is recorded.
#
# Args:
#   eval_samples:  list of evaluation samples (one matrix/array per element).
#   train_samples: list of training samples.
#   eval_indices:  identifiers reported for eval_samples, in the same order.
#   train_indices: identifiers reported for train_samples, in the same order.
#
# Returns:
#   A data frame with columns EvalSampleIndex and TrainSampleIndex, one row per
#   matching (evaluation, training) pair, ordered by evaluation sample and then
#   by training sample. Zero rows when nothing matches or either list is empty.
find_exact_copies <- function(eval_samples, train_samples, eval_indices, train_indices) {
  n_eval <- length(eval_samples)
  n_train <- length(train_samples)
  stopifnot(
    length(eval_indices) == n_eval,
    length(train_indices) == n_train
  )

  cat("Searching for exact copies across", n_eval, "evaluation samples and", n_train, "training samples...\n")

  # Report progress roughly every 10% of the evaluation samples
  report_interval <- max(1, floor(n_eval / 10))

  # One slot per evaluation sample; filled only when that sample has matches.
  # Collecting pieces and binding once avoids growing a data frame in the loop.
  matches_per_eval <- vector("list", n_eval)

  # seq_len() (unlike 1:n) yields an empty sequence when there are no samples
  for (i in seq_len(n_eval)) {
    eval_sample <- eval_samples[[i]]

    # Show progress
    if (i %% report_interval == 0) {
      cat(" Progress:", round(i / n_eval * 100), "%\n")
    }

    # Compare with each training sample; isTRUE() keeps an NA comparison
    # result from being treated as a match or raising an error.
    is_copy <- vapply(
      train_samples,
      function(train_sample) {
        isTRUE(are_matrices_identical(eval_sample, train_sample))
      },
      logical(1),
      USE.NAMES = FALSE
    )

    if (any(is_copy)) {
      matches_per_eval[[i]] <- data.frame(
        EvalSampleIndex = eval_indices[i],
        TrainSampleIndex = train_indices[is_copy],
        stringsAsFactors = FALSE
      )
    }
  }

  matches_per_eval <- Filter(Negate(is.null), matches_per_eval)
  if (length(matches_per_eval) == 0) {
    return(data.frame(
      EvalSampleIndex = integer(),
      TrainSampleIndex = integer(),
      stringsAsFactors = FALSE
    ))
  }

  exact_matches <- do.call(rbind, matches_per_eval)
  rownames(exact_matches) <- NULL
  exact_matches
}
# Function to extract the samples at the given indices
#
# Despite the name, no filtering by label happens here: the caller decides
# which indices to pass (all samples, or the positives for one GO term).
#
# Args:
#   data:    list-like object whose element X is a 3-dimensional array indexed
#            by sample along its first dimension.
#   indices: vector of first-dimension indices to extract.
#
# Returns:
#   An unnamed list with one element per entry of `indices` (same order), each
#   holding data$X[index, , ]. An empty list when `indices` is empty.
extract_positive_samples <- function(data, indices) {
  # lapply() handles empty `indices` correctly, unlike a 1:length(indices) loop
  lapply(unname(indices), function(sample_idx) data$X[sample_idx, , ])
}
# Main execution
#
# Loads the training and evaluation datasets, then looks for evaluation
# samples that are exact copies of training samples in two passes:
#   1. across all samples (evaluation side capped at max_samples_to_check), and
#   2. per GO term, restricted to the samples labelled positive for that term.
# Matches are printed and written to CSV files in the working directory.
#
# Returns (invisibly) a list with:
#   all_samples: data frame of matching pairs from the all-samples pass.
#   by_go_term:  named list of per-GO-term data frames of matching pairs.
main <- function() {
  # --- Configuration ---
  data_file <- "training_data/pred_MF_train_v3.rds"
  eval_file <- "evaluation_data/pred_MF_expT1.rds"
  # Upper limit on evaluation samples compared in the all-samples pass
  max_samples_to_check <- 1000

  # --- Load Data ---
  cat("Loading training data from:", data_file, "\n")
  dat <- readRDS(data_file)
  cat("Training data loaded successfully.\n")
  cat("Loading evaluation data from:", eval_file, "\n")
  eval_dat <- readRDS(eval_file)
  cat("Evaluation data loaded successfully.\n")
  cat("Dimensions of training data features (X):", dim(dat$X), "\n")
  cat("Dimensions of training data targets (Y):", dim(dat$Y), "\n")
  cat("Dimensions of evaluation data features (X):", dim(eval_dat$X), "\n")
  cat("Dimensions of evaluation data targets (Y):", dim(eval_dat$Y), "\n")

  # --- Find GO terms that exist in both datasets ---
  n_go_terms <- ncol(dat$Y)
  cat("Number of GO terms in training data:", n_go_terms, "\n")
  # NOTE(review): columns of Y are matched by position, which assumes both
  # datasets order their GO terms identically - confirm against the data.
  n_eval_go_terms <- ncol(eval_dat$Y)
  if (n_eval_go_terms != n_go_terms) {
    warning(
      "Training data has ", n_go_terms, " GO term columns but evaluation data has ",
      n_eval_go_terms, "; only the first ", min(n_go_terms, n_eval_go_terms),
      " are compared.",
      call. = FALSE
    )
  }
  n_shared_go_terms <- min(n_go_terms, n_eval_go_terms)
  # A GO term is usable when both datasets have at least one positive example.
  # na.rm = TRUE keeps missing labels from turning the condition into NA.
  has_positives <- vapply(
    seq_len(n_shared_go_terms),
    function(i) {
      sum(dat$Y[, i] == 1, na.rm = TRUE) > 0 &&
        sum(eval_dat$Y[, i] == 1, na.rm = TRUE) > 0
    },
    logical(1)
  )
  valid_go_terms <- which(has_positives)
  if (length(valid_go_terms) == 0) {
    stop("No GO terms found with positive examples in both training and evaluation datasets.")
  }
  cat("Number of GO terms with positive examples in both datasets:", length(valid_go_terms), "\n")

  # Check for exact copies across all samples (regardless of GO term)
  cat("\n=== Checking for exact copies across ALL samples ===\n")
  n_eval_samples <- dim(eval_dat$X)[1]
  n_train_samples <- dim(dat$X)[1]
  all_eval_indices <- seq_len(n_eval_samples)
  all_train_indices <- seq_len(n_train_samples)
  cat("Preparing samples for comparison...\n")
  # For very large evaluation sets only a random subset is compared. The subset
  # is drawn without a fixed seed, so it differs between runs. All training
  # samples are always used.
  if (n_eval_samples > max_samples_to_check) {
    cat("Large evaluation dataset detected. Sampling", max_samples_to_check, "samples for efficiency.\n")
    eval_sample_indices <- sample(all_eval_indices, max_samples_to_check)
  } else {
    eval_sample_indices <- all_eval_indices
  }
  # Extract samples
  eval_samples_list <- extract_positive_samples(eval_dat, eval_sample_indices)
  train_samples_list <- extract_positive_samples(dat, all_train_indices)
  # Find exact copies
  exact_matches <- find_exact_copies(eval_samples_list, train_samples_list, eval_sample_indices, all_train_indices)
  # Print results
  if (nrow(exact_matches) > 0) {
    cat("\n=== EXACT COPIES FOUND ===\n")
    # Each row is a matching pair; an evaluation sample that matches several
    # training samples appears in several rows, so count distinct samples.
    n_copied_eval <- length(unique(exact_matches$EvalSampleIndex))
    cat("Found", n_copied_eval, "evaluation samples that are exact copies of training samples.\n")
    cat("Evaluation Sample Index | Training Sample Index\n")
    cat("----------------------------------------\n")
    for (i in seq_len(nrow(exact_matches))) {
      cat(sprintf("%20d | %20d\n", exact_matches$EvalSampleIndex[i], exact_matches$TrainSampleIndex[i]))
    }
    # Write to file
    output_file <- "exact_copies_all_samples.csv"
    write.csv(exact_matches, output_file, row.names = FALSE)
    cat("\nResults also saved to:", output_file, "\n")
  } else {
    cat("\nNo exact copies found between evaluation and training samples.\n")
  }

  # Now check for each GO term
  cat("\n=== Checking for exact copies by GO term ===\n")
  go_term_results <- list()
  for (go_term_idx in valid_go_terms) {
    cat("\nAnalyzing GO Term:", go_term_idx, "\n")
    # Indices of the positive samples for this GO term
    train_positive_indices <- which(dat$Y[, go_term_idx] == 1)
    eval_positive_indices <- which(eval_dat$Y[, go_term_idx] == 1)
    n_positive_train <- length(train_positive_indices)
    n_positive_eval <- length(eval_positive_indices)
    cat(" Number of positive training samples:", n_positive_train, "\n")
    cat(" Number of positive evaluation samples:", n_positive_eval, "\n")
    if (n_positive_eval > 0 && n_positive_train > 0) {
      # Extract samples
      eval_pos_samples_list <- extract_positive_samples(eval_dat, eval_positive_indices)
      train_pos_samples_list <- extract_positive_samples(dat, train_positive_indices)
      # Find exact copies
      go_term_matches <- find_exact_copies(
        eval_pos_samples_list, train_pos_samples_list,
        eval_positive_indices, train_positive_indices
      )
      # Store results
      go_term_results[[as.character(go_term_idx)]] <- go_term_matches
      # Print results
      if (nrow(go_term_matches) > 0) {
        cat("\n EXACT COPIES FOUND for GO Term", go_term_idx, "\n")
        # Count distinct evaluation samples, not matching pairs
        n_copied_eval <- length(unique(go_term_matches$EvalSampleIndex))
        cat(" Found", n_copied_eval, "evaluation samples that are exact copies of training samples.\n")
        cat(" Evaluation Sample Index | Training Sample Index\n")
        cat(" ----------------------------------------\n")
        for (i in seq_len(nrow(go_term_matches))) {
          cat(sprintf(" %20d | %20d\n", go_term_matches$EvalSampleIndex[i], go_term_matches$TrainSampleIndex[i]))
        }
        # Write to file
        output_file <- paste0("exact_copies_GO_term_", go_term_idx, ".csv")
        write.csv(go_term_matches, output_file, row.names = FALSE)
        cat("\n Results also saved to:", output_file, "\n")
      } else {
        cat("\n No exact copies found for GO Term", go_term_idx, "\n")
      }
    }
  }
  cat("\n=== Analysis completed ===\n")
  # Returned invisibly so that running the script prints nothing extra
  invisible(list(all_samples = exact_matches, by_go_term = go_term_results))
}
# Execute the main function: the full duplicate check runs as soon as this
# file is executed with Rscript or source()d.
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment