Skip to content

Instantly share code, notes, and snippets.

@philippmuench
Last active September 3, 2024 10:25
Show Gist options
  • Save philippmuench/63fb7333e6910054c2119c956b5fec8f to your computer and use it in GitHub Desktop.
Save philippmuench/63fb7333e6910054c2119c956b5fec8f to your computer and use it in GitHub Desktop.
Snakemake file for gene list generation for the Muench et al., 2024 manuscript
import os
import random
# Collect the sample names: one entry per file matching fasta/<name>.fasta.
# Every rule below is instantiated once for each of these names.
FASTA_FILES = glob_wildcards("fasta/{fasta_file}.fasta").fasta_file
# Default target: request every final output for every sample in FASTA_FILES.
# The intermediate reformatted_gff/ files are not listed here; they are
# produced because the shuffled and non-hypothetical outputs depend on them.
rule all:
    input:
        expand(
            [
                "gff/{fasta_file}.gff",
                "reformatted_gff_shuffled/{fasta_file}.gff",
                "reformatted_gff_shuffled_10/{fasta_file}.gff",
                "reformatted_gff_shuffled_50/{fasta_file}.gff",
                "reformatted_gff_non_hyp/{fasta_file}.txt",
            ],
            fasta_file=FASTA_FILES,
        )
# Annotate one FASTA file with Prokka and keep only the resulting GFF.
# Prokka writes into a per-sample scratch directory in the working directory;
# the GFF is moved to gff/ and the scratch directory is deleted afterwards.
rule prokka:
    input:
        fasta = "fasta/{fasta_file}.fasta"
    output:
        gff = "gff/{fasta_file}.gff"
    params:
        # Per-sample scratch directory handed to --outdir.
        outdir = "{fasta_file}_prokka"
    threads: 10
    shell:
        # All interpolated paths/names use Snakemake's ":q" format spec so
        # that spaces or shell metacharacters in a file name cannot split
        # or alter the command.
        """
        prokka --centre X --compliant --force --outdir {params.outdir:q} --prefix {wildcards.fasta_file:q} --cpus {threads} {input.fasta:q}
        mv {params.outdir:q}/{wildcards.fasta_file:q}.gff {output.gff:q}
        rm -rf {params.outdir:q}
        """
# Convert a Prokka GFF into the "reformatted" layout by calling the external
# reformat_gff.py script (expected in the working directory; its output
# format is defined by that script, not by this workflow).
rule reformat_gff:
    input:
        gff = "gff/{fasta_file}.gff"
    output:
        reformatted_gff = "reformatted_gff/{fasta_file}.gff"
    shell:
        # ":q" shell-quotes the paths so unusual file names stay one argument.
        """
        python reformat_gff.py --input {input.gff:q} --output {output.reformatted_gff:q}
        """
# Write the lines of a reformatted GFF in random order.
# The permutation is seeded with the sample name, so re-running the workflow
# reproduces the same shuffled file (and therefore the same 10% / 50% subsets).
rule shuffle_gff:
    input:
        reformatted_gff = "reformatted_gff/{fasta_file}.gff"
    output:
        shuffled_gff = "reformatted_gff_shuffled/{fasta_file}.gff"
    run:
        with open(input.reformatted_gff, 'r') as file:
            lines = file.readlines()

        # If the input does not end with a newline, its last line would be
        # glued onto whichever line follows it after shuffling.
        if lines and not lines[-1].endswith("\n"):
            lines[-1] += "\n"

        # Dedicated, per-sample seeded generator: deterministic across runs
        # and independent of the global random state.
        rng = random.Random(str(wildcards.fasta_file))
        rng.shuffle(lines)

        with open(output.shuffled_gff, 'w') as file:
            file.writelines(lines)
# Keep the leading tenth of the shuffled file (line count floor-divided by 10).
# Inputs with fewer than 10 lines therefore yield an empty output file.
rule subset_gff_10:
    input:
        shuffled_gff = "reformatted_gff_shuffled/{fasta_file}.gff"
    output:
        subset_gff = "reformatted_gff_shuffled_10/{fasta_file}.gff"
    run:
        with open(input.shuffled_gff) as src:
            records = src.readlines()
        keep = len(records) // 10
        with open(output.subset_gff, "w") as dst:
            dst.writelines(records[:keep])
# Keep the leading half of the shuffled file (line count floor-divided by 2).
rule subset_gff_50:
    input:
        shuffled_gff = "reformatted_gff_shuffled/{fasta_file}.gff"
    output:
        subset_gff = "reformatted_gff_shuffled_50/{fasta_file}.gff"
    run:
        with open(input.shuffled_gff) as src:
            records = src.readlines()
        keep = len(records) // 2
        with open(output.subset_gff, "w") as dst:
            dst.writelines(records[:keep])
# Copy the reformatted GFF, dropping every line that contains the exact
# (case-sensitive) text "hypothetical protein".
rule remove_hypothetical_proteins:
    input:
        reformatted_gff = "reformatted_gff/{fasta_file}.gff"
    output:
        non_hyp_gff = "reformatted_gff_non_hyp/{fasta_file}.txt"
    run:
        with open(input.reformatted_gff) as src, open(output.non_hyp_gff, "w") as dst:
            dst.writelines(
                row for row in src if "hypothetical protein" not in row
            )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment