Last active
June 11, 2025 11:52
-
-
Save AkselA/3d3730110733cb959387e3d31e9ca9fa to your computer and use it in GitHub Desktop.
Python functions for performing simple DNA mutation simulations
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# # # Aksel A. Henriksen 10.07.2025
# # # With good suggestions by the folks at Code Review:
# # # https://codereview.stackexchange.com/questions/297264/simple-mutation-simulation-for-use-in-science-class
import random | |
import os | |
import csv | |
def diss(seq_1, seq_2):
    """
    Count the number of dissimilarities between two sequences.

    Also called the Hamming distance.

    Keyword arguments:
    seq_1 -- A list or tuple (any iterable of comparable items works)
    seq_2 -- A list or tuple (any iterable of comparable items works)

    If the input sequences aren't of equal length, then the latter
    part of the longer sequence is ignored.

    Return:
    Single integer value
    """
    # zip() stops at the end of the shorter input, which is exactly the
    # documented "ignore the tail of the longer sequence" behaviour.
    return sum(1 for a, b in zip(seq_1, seq_2) if a != b)
def mutate(dna_seq):
    """
    Mutate a DNA sequence, substituting a nucleotide at a random location
    with a random nucleotide.

    The substitution is made in place: the list passed in is modified and
    that same list object is returned. Pass a copy (e.g. dna_seq[:]) if
    the original sequence must be kept.

    The replacement is drawn from all four nucleotides, so it can be equal
    to the nucleotide already at that location; in that case the sequence
    is left unchanged.

    Keyword arguments:
    dna_seq -- A list containing single upper case characters [A, T, G, C]

    Return:
    A list containing single upper case characters [A, T, G, C],
    same length as input

    Raises:
    ValueError if dna_seq is empty (there is no location to mutate).
    """
    if len(dna_seq) == 0:
        raise ValueError("dna_seq must contain at least one nucleotide")

    # Possible nucleotides
    pos_nuc = ['A', 'T', 'G', 'C']

    # Random location for mutation (0 .. len-1)
    mut_loc = random.randrange(len(dna_seq))

    # Select one nucleotide at random and substitute it into the dna sequence
    dna_seq[mut_loc] = random.choice(pos_nuc)

    return dna_seq
def sim_once(dna_seq, generations):
    """
    Mutate a DNA sequence ('dna_seq') 'generations' times.

    After each mutation the proportional dissimilarity between the mutated
    sequence and the original sequence is calculated.

    The input sequence itself is never modified; all mutations are applied
    to an internal working copy.

    Keyword arguments:
    dna_seq -- A list containing single upper case characters [A, T, G, C]
    generations -- An integer specifying the number of generations

    Return:
    A list of floating point values. The list is 'generations' long.
    """
    seq_length = len(dna_seq)

    # Reference copy that the mutated sequence is compared to.
    dna_seq_orig = list(dna_seq)

    # Separate working copy. mutate() substitutes in place, so mutating
    # 'dna_seq' directly would silently change the caller's list.
    working_seq = list(dna_seq)

    diss_val = []

    # Mutate the sequence 'generations' times and record the
    # dissimilarity after each mutation.
    for _ in range(generations):
        # Mutate the DNA sequence
        working_seq = mutate(working_seq)

        # Calculate proportional dissimilarity
        diss_val.append(diss(working_seq, dna_seq_orig) / seq_length)

    # Return proportional dissimilarities.
    return diss_val
def sim_repeat(dna_seq, generations, reps):
    """
    Repeat the sim_once function 'reps' times.

    Every repetition starts from the same original sequence, and the
    sequence passed in by the caller is left untouched.

    Keyword arguments:
    dna_seq -- list containing single upper case characters [A, T, G, C]
    generations -- integer specifying the number of generations
    reps -- integer specifying the number of times to repeat the simulation

    Return:
    A 2D rectangular list of floating point values.
    Each of the 'reps' sub-lists is the result of an individual run of the
    function sim_once, and is therefore 'generations' long.
    """
    # Hand each run its own fresh copy. The sequence is mutated in place
    # further down the call chain, so sharing one list would make run k
    # start from the end state of run k-1 and clobber the caller's list.
    return [sim_once(list(dna_seq), generations) for _ in range(reps)]
def export_mutate(mutate_mat, filename="mutation_simulation.csv"):
    """
    Export mutation simulation results to a CSV file that can be read by
    other programs, like Google Sheets.

    The file has one column per simulation ("sim_1", "sim_2", ...) preceded
    by a "generation" column counting from 1, and one row per generation.
    'mutate_mat' itself is not modified, so the same results can be
    exported more than once.

    Keyword arguments:
    mutate_mat -- A 2D rectangular list (one sub-list per simulation)
    filename -- Text string with a .csv ending

    If 'mutate_mat' is empty, or the simulations contain zero generations,
    a file containing only the header row is written.

    Return:
    Saves a CSV file (relative file names end up in the current working
    directory) and returns a text string giving the path to the saved file.
    """
    num_sims = len(mutate_mat)
    num_gens = len(mutate_mat[0]) if num_sims > 0 else 0

    # Header: "generation" followed by one label per simulation
    header = ["generation"]
    for i in range(1, num_sims + 1):
        header.append("sim_" + str(i))

    # Count of the generations, starting at 1
    generation_count = range(1, num_gens + 1)

    # Transpose (swap rows and columns) with the generation count as the
    # first column. Building the rows with zip() instead of inserting the
    # count into 'mutate_mat' leaves the caller's list unchanged.
    rows = zip(generation_count, *mutate_mat)

    # Export simulation data as a CSV file that can be imported to
    # Google Sheets
    with open(filename, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',')
        csvwriter.writerow(header)
        csvwriter.writerows(rows)

    # Return the path to the CSV file
    return os.path.join(os.getcwd(), filename)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# # # Simulation start

# Simulation settings
num_sims = 5                # How many independent simulations to run
num_gens = 20               # How many generations each simulation lasts
dna_seq = list("ATGC" * 4)  # Starting DNA sequence (16 nucleotides)

# Fix the random seed so that every run gives the same results
random.seed(1)

# Run all simulations; one row of dissimilarities per simulation
mutate_mat = sim_repeat(dna_seq, generations=num_gens, reps=num_sims)

# Write the results to a CSV file
export_mutate(mutate_mat)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment