philippmuench · September 3, 2024 10:25
diff --git a/Snakefile b/Snakefile
 import os
 import random

 # Collect the sample name of every FASTA file in the fasta/ folder.
 # glob_wildcards returns one list per wildcard; take the fasta_file list.
 FASTA_FILES = glob_wildcards("fasta/{fasta_file}.fasta").fasta_file

 # Default target: for every entry of FASTA_FILES request the Prokka GFF, the
 # shuffled GFF, its two subsets, and the hypothetical-protein-filtered file.
 # Indentation is spaces only; the target list previously mixed tabs and spaces.
 rule all:
    input:
        expand("gff/{fasta_file}.gff", fasta_file=FASTA_FILES),
        expand("reformatted_gff_shuffled/{fasta_file}.gff", fasta_file=FASTA_FILES),
        expand("reformatted_gff_shuffled_10/{fasta_file}.gff", fasta_file=FASTA_FILES),
        expand("reformatted_gff_shuffled_50/{fasta_file}.gff", fasta_file=FASTA_FILES),
        expand("reformatted_gff_non_hyp/{fasta_file}.txt", fasta_file=FASTA_FILES)

 # Run Prokka on one FASTA file, move the resulting GFF to the declared output
 # and delete Prokka's per-sample output directory.
 # Every interpolated path uses Snakemake's :q format so that names containing
 # spaces or shell metacharacters reach prokka, mv and rm -rf as single words.
 rule prokka:
    input:
        fasta = "fasta/{fasta_file}.fasta"
    output:
        gff = "gff/{fasta_file}.gff"
    threads: 10
    shell:
        """
        prokka --centre X --compliant --force --outdir {wildcards.fasta_file:q}_prokka --prefix {wildcards.fasta_file:q} --cpus {threads} {input.fasta:q}
        mv {wildcards.fasta_file:q}_prokka/{wildcards.fasta_file:q}.gff {output.gff:q}
        rm -rf {wildcards.fasta_file:q}_prokka
        """

 # Pass a Prokka GFF through the project script reformat_gff.py.
 # What the script changes is not visible from this file.
 rule reformat_gff:
    input:
        gff = "gff/{fasta_file}.gff"
    output:
        reformatted_gff = "reformatted_gff/{fasta_file}.gff"
    shell:
        "python reformat_gff.py"
        " --input {input.gff}"
        " --output {output.reformatted_gff}"

 # Write the lines of a reformatted GFF back out in random order.
 rule shuffle_gff:
    input:
        reformatted_gff = "reformatted_gff/{fasta_file}.gff"
    output:
        shuffled_gff = "reformatted_gff_shuffled/{fasta_file}.gff"
    run:
        with open(input.reformatted_gff, 'r') as file:
            lines = file.readlines()
        # Only the final line can lack a newline; terminate it so it is not
        # glued onto whichever line ends up after it once shuffled.
        if lines and not lines[-1].endswith("\n"):
            lines[-1] += "\n"
        random.shuffle(lines)
        with open(output.shuffled_gff, 'w') as file:
            file.writelines(lines)

 # Keep the leading tenth of the shuffled file's lines. The count is
 # len // 10, so inputs with fewer than ten lines give an empty output.
 rule subset_gff_10:
    input:
        shuffled_gff = "reformatted_gff_shuffled/{fasta_file}.gff"
    output:
        subset_gff = "reformatted_gff_shuffled_10/{fasta_file}.gff"
    run:
        with open(input.shuffled_gff, 'r') as source:
            records = list(source)
        keep = len(records) // 10
        with open(output.subset_gff, 'w') as sink:
            sink.writelines(records[:keep])

 # Keep the leading half (len // 2 lines) of the shuffled file.
 rule subset_gff_50:
    input:
        shuffled_gff = "reformatted_gff_shuffled/{fasta_file}.gff"
    output:
        subset_gff = "reformatted_gff_shuffled_50/{fasta_file}.gff"
    run:
        with open(input.shuffled_gff, 'r') as handle:
            shuffled = handle.readlines()
        half = len(shuffled) // 2
        with open(output.subset_gff, 'w') as handle:
            for record in shuffled[:half]:
                handle.write(record)

 # Copy the reformatted file, dropping every line that contains the exact
 # (case-sensitive) text "hypothetical protein".
 rule remove_hypothetical_proteins:
    input:
        reformatted_gff = "reformatted_gff/{fasta_file}.gff"
    output:
        non_hyp_gff = "reformatted_gff_non_hyp/{fasta_file}.txt"
    run:
        marker = "hypothetical protein"
        with open(input.reformatted_gff, 'r') as src, open(output.non_hyp_gff, 'w') as dst:
            dst.writelines(row for row in src if marker not in row)
	import os
	import random

	# Collect the sample name of every FASTA file in the fasta/ folder.
	# glob_wildcards returns one list per wildcard; take the fasta_file list.
	FASTA_FILES = glob_wildcards("fasta/{fasta_file}.fasta").fasta_file

	# Default target: for every entry of FASTA_FILES request the Prokka GFF, the
	# shuffled GFF, its two subsets, and the hypothetical-protein-filtered file.
	# Nesting restored so that input: and its targets sit under the rule header.
	rule all:
	    input:
	        expand("gff/{fasta_file}.gff", fasta_file=FASTA_FILES),
	        expand("reformatted_gff_shuffled/{fasta_file}.gff", fasta_file=FASTA_FILES),
	        expand("reformatted_gff_shuffled_10/{fasta_file}.gff", fasta_file=FASTA_FILES),
	        expand("reformatted_gff_shuffled_50/{fasta_file}.gff", fasta_file=FASTA_FILES),
	        expand("reformatted_gff_non_hyp/{fasta_file}.txt", fasta_file=FASTA_FILES)

	# Run Prokka on one FASTA file, move the resulting GFF to the declared output
	# and delete Prokka's per-sample output directory.
	# Every interpolated path uses Snakemake's :q format so that names containing
	# spaces or shell metacharacters reach prokka, mv and rm -rf as single words.
	rule prokka:
	    input:
	        fasta = "fasta/{fasta_file}.fasta"
	    output:
	        gff = "gff/{fasta_file}.gff"
	    threads: 10
	    shell:
	        """
	        prokka --centre X --compliant --force --outdir {wildcards.fasta_file:q}_prokka --prefix {wildcards.fasta_file:q} --cpus {threads} {input.fasta:q}
	        mv {wildcards.fasta_file:q}_prokka/{wildcards.fasta_file:q}.gff {output.gff:q}
	        rm -rf {wildcards.fasta_file:q}_prokka
	        """

	# Pass a Prokka GFF through the project script reformat_gff.py.
	# What the script changes is not visible from this file.
	rule reformat_gff:
	    input:
	        gff = "gff/{fasta_file}.gff"
	    output:
	        reformatted_gff = "reformatted_gff/{fasta_file}.gff"
	    shell:
	        """
	        python reformat_gff.py --input {input.gff} --output {output.reformatted_gff}
	        """

	# Write the lines of a reformatted GFF back out in random order.
	rule shuffle_gff:
	    input:
	        reformatted_gff = "reformatted_gff/{fasta_file}.gff"
	    output:
	        shuffled_gff = "reformatted_gff_shuffled/{fasta_file}.gff"
	    run:
	        with open(input.reformatted_gff, 'r') as file:
	            lines = file.readlines()
	        # Only the final line can lack a newline; terminate it so it is not
	        # glued onto whichever line ends up after it once shuffled.
	        if lines and not lines[-1].endswith("\n"):
	            lines[-1] += "\n"
	        random.shuffle(lines)
	        with open(output.shuffled_gff, 'w') as file:
	            file.writelines(lines)

	# Keep the leading tenth of the shuffled file's lines. The count is
	# len // 10, so inputs with fewer than ten lines give an empty output.
	rule subset_gff_10:
	    input:
	        shuffled_gff = "reformatted_gff_shuffled/{fasta_file}.gff"
	    output:
	        subset_gff = "reformatted_gff_shuffled_10/{fasta_file}.gff"
	    run:
	        with open(input.shuffled_gff, 'r') as file:
	            lines = file.readlines()
	        subset_lines = lines[:len(lines)//10]
	        with open(output.subset_gff, 'w') as file:
	            file.writelines(subset_lines)

	# Keep the leading half (len // 2 lines) of the shuffled file.
	rule subset_gff_50:
	    input:
	        shuffled_gff = "reformatted_gff_shuffled/{fasta_file}.gff"
	    output:
	        subset_gff = "reformatted_gff_shuffled_50/{fasta_file}.gff"
	    run:
	        with open(input.shuffled_gff, 'r') as file:
	            lines = file.readlines()
	        subset_lines = lines[:len(lines)//2]
	        with open(output.subset_gff, 'w') as file:
	            file.writelines(subset_lines)

	# Copy the reformatted file, dropping every line that contains the exact
	# (case-sensitive) text "hypothetical protein".
	rule remove_hypothetical_proteins:
	    input:
	        reformatted_gff = "reformatted_gff/{fasta_file}.gff"
	    output:
	        non_hyp_gff = "reformatted_gff_non_hyp/{fasta_file}.txt"
	    run:
	        with open(input.reformatted_gff, 'r') as infile, open(output.non_hyp_gff, 'w') as outfile:
	            for line in infile:
	                if "hypothetical protein" not in line:
	                    outfile.write(line)