philippmuench · April 4, 2025 10:25
diff --git a/gistfile1.txt b/gistfile1.txt
 #!/usr/bin/env Rscript

 # Script to find exact matches between evaluation and training datasets
 # Identifies which samples from evaluation set are identical to samples in training set

 # Load necessary libraries
 if (!requireNamespace("Matrix", quietly = TRUE)) {
  install.packages("Matrix")
 }
 library(Matrix)

 # Check whether two matrices (or arrays) contain exactly the same values.
 #
 # Arguments:
 #   matrix1, matrix2: objects to compare; they must support dim(), length(),
 #                     is.na() and element-wise `==`.
 # Returns:
 #   A single TRUE or FALSE, never NA. Missing values only count as equal when
 #   they sit at the same positions in both inputs.
 are_matrices_identical <- function(matrix1, matrix2) {
   # Different shapes can never be identical
   if (!identical(dim(matrix1), dim(matrix2))) {
     return(FALSE)
   }

   # dim() is NULL for plain vectors, so guard against silent recycling in `==`
   if (length(matrix1) != length(matrix2)) {
     return(FALSE)
   }

   # `==` yields NA wherever a value is missing, which would make all() return
   # NA and break `if (...)` in callers; compare the missingness pattern first
   missing1 <- is.na(matrix1)
   missing2 <- is.na(matrix2)
   if (!all(missing1 == missing2)) {
     return(FALSE)
   }

   # The remaining (non-missing) elements must all be equal
   all(matrix1 == matrix2, na.rm = TRUE)
 }

 # Find exact copies between evaluation and training samples.
 #
 # Every evaluation sample is compared against every training sample with
 # are_matrices_identical().
 #
 # Arguments:
 #   eval_samples:  list of evaluation samples
 #   train_samples: list of training samples
 #   eval_indices:  original index of each element of eval_samples
 #   train_indices: original index of each element of train_samples
 # Returns:
 #   A data frame with columns EvalSampleIndex and TrainSampleIndex, one row per
 #   matching (evaluation, training) pair; zero rows when nothing matches or
 #   when either input list is empty.
 find_exact_copies <- function(eval_samples, train_samples, eval_indices, train_indices) {
   n_eval <- length(eval_samples)
   n_train <- length(train_samples)

   cat("Searching for exact copies across", n_eval, "evaluation samples and", n_train, "training samples...\n")

   # Report progress roughly every 10% of the evaluation samples
   report_interval <- max(1, floor(n_eval / 10))

   # One slot per evaluation sample; filled only when that sample has matches.
   # Collecting pieces and binding once avoids growing a data frame in the loop.
   matches_per_eval <- vector("list", n_eval)

   # seq_len() yields an empty sequence for n_eval == 0 (1:0 would run twice)
   for (i in seq_len(n_eval)) {
     if (i %% report_interval == 0) {
       cat("  Progress:", round(i / n_eval * 100), "%\n")
     }

     eval_sample <- eval_samples[[i]]

     # Compare this evaluation sample with each training sample
     is_copy <- vapply(
       train_samples,
       function(train_sample) are_matrices_identical(eval_sample, train_sample),
       logical(1),
       USE.NAMES = FALSE
     )

     # which() ignores NA, so an undecidable comparison is not reported as a copy
     hits <- which(is_copy)
     if (length(hits) > 0) {
       matches_per_eval[[i]] <- data.frame(
         EvalSampleIndex = rep(eval_indices[i], length(hits)),
         TrainSampleIndex = train_indices[hits],
         stringsAsFactors = FALSE
       )
     }
   }

   matches_per_eval <- Filter(Negate(is.null), matches_per_eval)

   if (length(matches_per_eval) == 0) {
     return(data.frame(
       EvalSampleIndex = integer(),
       TrainSampleIndex = integer(),
       stringsAsFactors = FALSE
     ))
   }

   exact_matches <- do.call(rbind, matches_per_eval)
   rownames(exact_matches) <- NULL
   exact_matches
 }

 # Extract the samples at the given indices from a dataset.
 #
 # Arguments:
 #   data:    list-like object whose $X element is indexed as X[sample, , ]
 #   indices: vector of sample indices to extract (may be empty)
 # Returns:
 #   An unnamed list with one element per index, each holding data$X[index, , ];
 #   an empty list when indices is empty.
 extract_positive_samples <- function(data, indices) {
   # lapply() handles zero-length indices, where a 1:length(indices) loop fails
   lapply(unname(indices), function(sample_idx) data$X[sample_idx, , ])
 }

 # Main execution: load both datasets and report evaluation samples that are
 # exact copies of training samples, first across all samples and then
 # separately for each GO term.
 #
 # Reads the two .rds files configured below. Each is used as a list-like object
 # with $X (indexed as X[sample, , ]) and $Y (one column per GO term, where a
 # value of 1 marks a positive sample).
 # Writes "exact_copies_all_samples.csv" and one
 # "exact_copies_GO_term_<idx>.csv" per GO term with matches to the working
 # directory.
 #
 # Returns (invisibly) a list with the overall matches (`all_samples`) and the
 # per-GO-term matches (`by_go_term`).
 main <- function() {
   # --- Configuration ---
   data_file <- "training_data/pred_MF_train_v3.rds"
   eval_file <- "evaluation_data/pred_MF_expT1.rds"

   # --- Load Data ---
   cat("Loading training data from:", data_file, "\n")
   dat <- readRDS(data_file)
   cat("Training data loaded successfully.\n")

   cat("Loading evaluation data from:", eval_file, "\n")
   eval_dat <- readRDS(eval_file)
   cat("Evaluation data loaded successfully.\n")

   # Label each printed dimension with the dataset it actually belongs to
   cat("Dimensions of training data features (X):", dim(dat$X), "\n")
   cat("Dimensions of training data targets (Y):", dim(dat$Y), "\n")
   cat("Dimensions of evaluation data features (X):", dim(eval_dat$X), "\n")
   cat("Dimensions of evaluation data targets (Y):", dim(eval_dat$Y), "\n")

   # --- Find GO terms that exist in both datasets ---
   n_go_terms <- ncol(dat$Y)
   cat("Number of GO terms in training data:", n_go_terms, "\n")

   # GO terms are matched by column position, so the evaluation targets need at
   # least as many columns as the training targets
   if (isTRUE(ncol(eval_dat$Y) < n_go_terms)) {
     stop("Evaluation targets (Y) have fewer GO term columns than the training targets.")
   }

   # TRUE for every GO term with at least one positive sample in both datasets
   has_positives_in_both <- vapply(
     seq_len(n_go_terms),
     function(i) sum(dat$Y[, i] == 1) > 0 && sum(eval_dat$Y[, i] == 1) > 0,
     logical(1)
   )
   valid_go_terms <- which(has_positives_in_both)

   if (length(valid_go_terms) == 0) {
     stop("No GO terms found with positive examples in both training and evaluation datasets.")
   }

   cat("Number of GO terms with positive examples in both datasets:", length(valid_go_terms), "\n")

   # Check for exact copies across all samples (regardless of GO term)
   cat("\n=== Checking for exact copies across ALL samples ===\n")

   n_eval_samples <- dim(eval_dat$X)[1]
   n_train_samples <- dim(dat$X)[1]

   all_eval_indices <- seq_len(n_eval_samples)
   all_train_indices <- seq_len(n_train_samples)

   cat("Preparing samples for comparison...\n")

   # Cap the number of evaluation samples compared; larger sets are subsampled
   # at random, so results can differ between runs unless a seed is set
   max_samples_to_check <- 1000

   if (n_eval_samples > max_samples_to_check) {
     cat("Large evaluation dataset detected. Sampling", max_samples_to_check, "samples for efficiency.\n")
     eval_sample_indices <- sample(all_eval_indices, max_samples_to_check)
   } else {
     eval_sample_indices <- all_eval_indices
   }

   # Extract samples
   eval_samples_list <- extract_positive_samples(eval_dat, eval_sample_indices)
   train_samples_list <- extract_positive_samples(dat, all_train_indices)

   # Find exact copies
   exact_matches <- find_exact_copies(eval_samples_list, train_samples_list, eval_sample_indices, all_train_indices)

   # Print results
   if (nrow(exact_matches) > 0) {
     cat("\n=== EXACT COPIES FOUND ===\n")
     # One evaluation sample can match several training samples, so count the
     # distinct evaluation samples rather than the number of matching pairs
     cat("Found", length(unique(exact_matches$EvalSampleIndex)), "evaluation samples that are exact copies of training samples.\n")
     cat("Evaluation Sample Index | Training Sample Index\n")
     cat("----------------------------------------\n")
     for (i in seq_len(nrow(exact_matches))) {
       cat(sprintf("%20d | %20d\n", exact_matches$EvalSampleIndex[i], exact_matches$TrainSampleIndex[i]))
     }

     # Write to file
     output_file <- "exact_copies_all_samples.csv"
     write.csv(exact_matches, output_file, row.names = FALSE)
     cat("\nResults also saved to:", output_file, "\n")
   } else {
     cat("\nNo exact copies found between evaluation and training samples.\n")
   }

   # Now check for each GO term
   cat("\n=== Checking for exact copies by GO term ===\n")

   go_term_results <- list()

   for (go_term_idx in valid_go_terms) {
     cat("\nAnalyzing GO Term:", go_term_idx, "\n")

     # Positive samples for this GO term in each dataset
     train_positive_indices <- which(dat$Y[, go_term_idx] == 1)
     eval_positive_indices <- which(eval_dat$Y[, go_term_idx] == 1)

     n_positive_train <- length(train_positive_indices)
     n_positive_eval <- length(eval_positive_indices)

     cat("  Number of positive training samples:", n_positive_train, "\n")
     cat("  Number of positive evaluation samples:", n_positive_eval, "\n")

     if (n_positive_eval > 0 && n_positive_train > 0) {
       # Extract samples
       eval_pos_samples_list <- extract_positive_samples(eval_dat, eval_positive_indices)
       train_pos_samples_list <- extract_positive_samples(dat, train_positive_indices)

       # Find exact copies
       go_term_matches <- find_exact_copies(eval_pos_samples_list, train_pos_samples_list,
                                            eval_positive_indices, train_positive_indices)

       # Store results
       go_term_results[[as.character(go_term_idx)]] <- go_term_matches

       # Print results
       if (nrow(go_term_matches) > 0) {
         cat("\n  EXACT COPIES FOUND for GO Term", go_term_idx, "\n")
         cat("  Found", length(unique(go_term_matches$EvalSampleIndex)), "evaluation samples that are exact copies of training samples.\n")
         cat("  Evaluation Sample Index | Training Sample Index\n")
         cat("  ----------------------------------------\n")
         for (i in seq_len(nrow(go_term_matches))) {
           cat(sprintf("  %20d | %20d\n", go_term_matches$EvalSampleIndex[i], go_term_matches$TrainSampleIndex[i]))
         }

         # Write to file
         output_file <- paste0("exact_copies_GO_term_", go_term_idx, ".csv")
         write.csv(go_term_matches, output_file, row.names = FALSE)
         cat("\n  Results also saved to:", output_file, "\n")
       } else {
         cat("\n  No exact copies found for GO Term", go_term_idx, "\n")
       }
     }
   }

   cat("\n=== Analysis completed ===\n")

   # Hand the collected results back so callers can inspect them
   invisible(list(all_samples = exact_matches, by_go_term = go_term_results))
 }

 # Execute the main function.
 # This call runs unconditionally whenever the file is executed or sourced;
 # main() takes no arguments and uses the file paths hard-coded in its
 # configuration section.
 main()
	#!/usr/bin/env Rscript

	# Script to find exact matches between evaluation and training datasets
	# Identifies which samples from evaluation set are identical to samples in training set

	# Load necessary libraries
	if (!requireNamespace("Matrix", quietly = TRUE)) {
	install.packages("Matrix")
	}
	library(Matrix)

	# Check whether two matrices (or arrays) contain exactly the same values.
	#
	# Arguments:
	#   matrix1, matrix2: objects to compare; they must support dim(), length(),
	#                     is.na() and element-wise `==`.
	# Returns:
	#   A single TRUE or FALSE, never NA. Missing values only count as equal when
	#   they sit at the same positions in both inputs.
	are_matrices_identical <- function(matrix1, matrix2) {
	  # Different shapes can never be identical
	  if (!identical(dim(matrix1), dim(matrix2))) {
	    return(FALSE)
	  }

	  # dim() is NULL for plain vectors, so guard against silent recycling in `==`
	  if (length(matrix1) != length(matrix2)) {
	    return(FALSE)
	  }

	  # `==` yields NA wherever a value is missing, which would make all() return
	  # NA and break `if (...)` in callers; compare the missingness pattern first
	  missing1 <- is.na(matrix1)
	  missing2 <- is.na(matrix2)
	  if (!all(missing1 == missing2)) {
	    return(FALSE)
	  }

	  # The remaining (non-missing) elements must all be equal
	  all(matrix1 == matrix2, na.rm = TRUE)
	}

	# Find exact copies between evaluation and training samples.
	#
	# Every evaluation sample is compared against every training sample with
	# are_matrices_identical().
	#
	# Arguments:
	#   eval_samples:  list of evaluation samples
	#   train_samples: list of training samples
	#   eval_indices:  original index of each element of eval_samples
	#   train_indices: original index of each element of train_samples
	# Returns:
	#   A data frame with columns EvalSampleIndex and TrainSampleIndex, one row per
	#   matching (evaluation, training) pair; zero rows when nothing matches or
	#   when either input list is empty.
	find_exact_copies <- function(eval_samples, train_samples, eval_indices, train_indices) {
	  n_eval <- length(eval_samples)
	  n_train <- length(train_samples)

	  cat("Searching for exact copies across", n_eval, "evaluation samples and", n_train, "training samples...\n")

	  # Report progress roughly every 10% of the evaluation samples
	  report_interval <- max(1, floor(n_eval / 10))

	  # One slot per evaluation sample; filled only when that sample has matches.
	  # Collecting pieces and binding once avoids growing a data frame in the loop.
	  matches_per_eval <- vector("list", n_eval)

	  # seq_len() yields an empty sequence for n_eval == 0 (1:0 would run twice)
	  for (i in seq_len(n_eval)) {
	    if (i %% report_interval == 0) {
	      cat(" Progress:", round(i / n_eval * 100), "%\n")
	    }

	    eval_sample <- eval_samples[[i]]

	    # Compare this evaluation sample with each training sample
	    is_copy <- vapply(
	      train_samples,
	      function(train_sample) are_matrices_identical(eval_sample, train_sample),
	      logical(1),
	      USE.NAMES = FALSE
	    )

	    # which() ignores NA, so an undecidable comparison is not reported as a copy
	    hits <- which(is_copy)
	    if (length(hits) > 0) {
	      matches_per_eval[[i]] <- data.frame(
	        EvalSampleIndex = rep(eval_indices[i], length(hits)),
	        TrainSampleIndex = train_indices[hits],
	        stringsAsFactors = FALSE
	      )
	    }
	  }

	  matches_per_eval <- Filter(Negate(is.null), matches_per_eval)

	  if (length(matches_per_eval) == 0) {
	    return(data.frame(
	      EvalSampleIndex = integer(),
	      TrainSampleIndex = integer(),
	      stringsAsFactors = FALSE
	    ))
	  }

	  exact_matches <- do.call(rbind, matches_per_eval)
	  rownames(exact_matches) <- NULL
	  exact_matches
	}

	# Extract the samples at the given indices from a dataset.
	#
	# Arguments:
	#   data:    list-like object whose $X element is indexed as X[sample, , ]
	#   indices: vector of sample indices to extract (may be empty)
	# Returns:
	#   An unnamed list with one element per index, each holding data$X[index, , ];
	#   an empty list when indices is empty.
	extract_positive_samples <- function(data, indices) {
	  # lapply() handles zero-length indices, where a 1:length(indices) loop fails
	  lapply(unname(indices), function(sample_idx) data$X[sample_idx, , ])
	}

	# Main execution: load both datasets and report evaluation samples that are
	# exact copies of training samples, first across all samples and then
	# separately for each GO term.
	#
	# Reads the two .rds files configured below. Each is used as a list-like object
	# with $X (indexed as X[sample, , ]) and $Y (one column per GO term, where a
	# value of 1 marks a positive sample).
	# Writes "exact_copies_all_samples.csv" and one
	# "exact_copies_GO_term_<idx>.csv" per GO term with matches to the working
	# directory.
	#
	# Returns (invisibly) a list with the overall matches (`all_samples`) and the
	# per-GO-term matches (`by_go_term`).
	main <- function() {
	  # --- Configuration ---
	  data_file <- "training_data/pred_MF_train_v3.rds"
	  eval_file <- "evaluation_data/pred_MF_expT1.rds"

	  # --- Load Data ---
	  cat("Loading training data from:", data_file, "\n")
	  dat <- readRDS(data_file)
	  cat("Training data loaded successfully.\n")

	  cat("Loading evaluation data from:", eval_file, "\n")
	  eval_dat <- readRDS(eval_file)
	  cat("Evaluation data loaded successfully.\n")

	  # Label each printed dimension with the dataset it actually belongs to
	  cat("Dimensions of training data features (X):", dim(dat$X), "\n")
	  cat("Dimensions of training data targets (Y):", dim(dat$Y), "\n")
	  cat("Dimensions of evaluation data features (X):", dim(eval_dat$X), "\n")
	  cat("Dimensions of evaluation data targets (Y):", dim(eval_dat$Y), "\n")

	  # --- Find GO terms that exist in both datasets ---
	  n_go_terms <- ncol(dat$Y)
	  cat("Number of GO terms in training data:", n_go_terms, "\n")

	  # GO terms are matched by column position, so the evaluation targets need at
	  # least as many columns as the training targets
	  if (isTRUE(ncol(eval_dat$Y) < n_go_terms)) {
	    stop("Evaluation targets (Y) have fewer GO term columns than the training targets.")
	  }

	  # TRUE for every GO term with at least one positive sample in both datasets
	  has_positives_in_both <- vapply(
	    seq_len(n_go_terms),
	    function(i) sum(dat$Y[, i] == 1) > 0 && sum(eval_dat$Y[, i] == 1) > 0,
	    logical(1)
	  )
	  valid_go_terms <- which(has_positives_in_both)

	  if (length(valid_go_terms) == 0) {
	    stop("No GO terms found with positive examples in both training and evaluation datasets.")
	  }

	  cat("Number of GO terms with positive examples in both datasets:", length(valid_go_terms), "\n")

	  # Check for exact copies across all samples (regardless of GO term)
	  cat("\n=== Checking for exact copies across ALL samples ===\n")

	  n_eval_samples <- dim(eval_dat$X)[1]
	  n_train_samples <- dim(dat$X)[1]

	  all_eval_indices <- seq_len(n_eval_samples)
	  all_train_indices <- seq_len(n_train_samples)

	  cat("Preparing samples for comparison...\n")

	  # Cap the number of evaluation samples compared; larger sets are subsampled
	  # at random, so results can differ between runs unless a seed is set
	  max_samples_to_check <- 1000

	  if (n_eval_samples > max_samples_to_check) {
	    cat("Large evaluation dataset detected. Sampling", max_samples_to_check, "samples for efficiency.\n")
	    eval_sample_indices <- sample(all_eval_indices, max_samples_to_check)
	  } else {
	    eval_sample_indices <- all_eval_indices
	  }

	  # Extract samples
	  eval_samples_list <- extract_positive_samples(eval_dat, eval_sample_indices)
	  train_samples_list <- extract_positive_samples(dat, all_train_indices)

	  # Find exact copies
	  exact_matches <- find_exact_copies(eval_samples_list, train_samples_list, eval_sample_indices, all_train_indices)

	  # Print results
	  if (nrow(exact_matches) > 0) {
	    cat("\n=== EXACT COPIES FOUND ===\n")
	    # One evaluation sample can match several training samples, so count the
	    # distinct evaluation samples rather than the number of matching pairs
	    cat("Found", length(unique(exact_matches$EvalSampleIndex)), "evaluation samples that are exact copies of training samples.\n")
	    # "\|" is not a valid escape in an R string literal; a plain "|" is intended
	    cat("Evaluation Sample Index | Training Sample Index\n")
	    cat("----------------------------------------\n")
	    for (i in seq_len(nrow(exact_matches))) {
	      cat(sprintf("%20d | %20d\n", exact_matches$EvalSampleIndex[i], exact_matches$TrainSampleIndex[i]))
	    }

	    # Write to file
	    output_file <- "exact_copies_all_samples.csv"
	    write.csv(exact_matches, output_file, row.names = FALSE)
	    cat("\nResults also saved to:", output_file, "\n")
	  } else {
	    cat("\nNo exact copies found between evaluation and training samples.\n")
	  }

	  # Now check for each GO term
	  cat("\n=== Checking for exact copies by GO term ===\n")

	  go_term_results <- list()

	  for (go_term_idx in valid_go_terms) {
	    cat("\nAnalyzing GO Term:", go_term_idx, "\n")

	    # Positive samples for this GO term in each dataset
	    train_positive_indices <- which(dat$Y[, go_term_idx] == 1)
	    eval_positive_indices <- which(eval_dat$Y[, go_term_idx] == 1)

	    n_positive_train <- length(train_positive_indices)
	    n_positive_eval <- length(eval_positive_indices)

	    cat(" Number of positive training samples:", n_positive_train, "\n")
	    cat(" Number of positive evaluation samples:", n_positive_eval, "\n")

	    if (n_positive_eval > 0 && n_positive_train > 0) {
	      # Extract samples
	      eval_pos_samples_list <- extract_positive_samples(eval_dat, eval_positive_indices)
	      train_pos_samples_list <- extract_positive_samples(dat, train_positive_indices)

	      # Find exact copies
	      go_term_matches <- find_exact_copies(eval_pos_samples_list, train_pos_samples_list,
	                                           eval_positive_indices, train_positive_indices)

	      # Store results
	      go_term_results[[as.character(go_term_idx)]] <- go_term_matches

	      # Print results
	      if (nrow(go_term_matches) > 0) {
	        cat("\n EXACT COPIES FOUND for GO Term", go_term_idx, "\n")
	        cat(" Found", length(unique(go_term_matches$EvalSampleIndex)), "evaluation samples that are exact copies of training samples.\n")
	        cat(" Evaluation Sample Index | Training Sample Index\n")
	        cat(" ----------------------------------------\n")
	        for (i in seq_len(nrow(go_term_matches))) {
	          cat(sprintf(" %20d | %20d\n", go_term_matches$EvalSampleIndex[i], go_term_matches$TrainSampleIndex[i]))
	        }

	        # Write to file
	        output_file <- paste0("exact_copies_GO_term_", go_term_idx, ".csv")
	        write.csv(go_term_matches, output_file, row.names = FALSE)
	        cat("\n Results also saved to:", output_file, "\n")
	      } else {
	        cat("\n No exact copies found for GO Term", go_term_idx, "\n")
	      }
	    }
	  }

	  cat("\n=== Analysis completed ===\n")

	  # Hand the collected results back so callers can inspect them
	  invisible(list(all_samples = exact_matches, by_go_term = go_term_results))
	}

	# Execute the main function.
	# This call runs unconditionally whenever the file is executed or sourced;
	# main() takes no arguments and uses the file paths hard-coded in its
	# configuration section.
	main()