Created
January 27, 2025 14:03
-
-
Save MartGro/1b0facf12165134851cb678503dc7567 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
from pathlib import Path | |
from Bio import SeqIO | |
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
from Bio.SeqFeature import SeqFeature, FeatureLocation | |
from datetime import datetime | |
def parse_pdraw(pdraw_content): | |
"""Parse pDRAW file content and return sequence and annotations.""" | |
lines = pdraw_content.split('\n') | |
sequence = '' | |
annotations = [] | |
current_annotation = {} | |
in_sequence = False | |
dna_name = None | |
is_circular = False | |
for line in lines: | |
line = line.strip() | |
# Skip empty lines | |
if not line: | |
continue | |
# Get DNA name | |
if line.startswith('DNAname'): | |
dna_name = line.split(None, 1)[1].strip() | |
# Get circularity | |
if line.startswith('IScircular'): | |
is_circular = line.split(None, 1)[1].strip() == 'YES' | |
# Check if we're entering sequence section | |
if line.startswith('Sequence'): | |
in_sequence = True | |
continue | |
# Handle sequence lines | |
if in_sequence and re.match(r'^\s*\d+\s+[ATCG\s]+', line): | |
# Extract sequence data, ignoring numbers and spaces | |
sequence += ''.join(re.findall(r'[ATCG]', line)) | |
continue | |
# Handle annotation lines | |
try: | |
if line.startswith('Annotation_'): | |
parts = line.split(None, 1) | |
if len(parts) == 2: | |
key, value = parts | |
if key == 'Annotation_Number': | |
if current_annotation: | |
annotations.append(current_annotation) | |
current_annotation = {} | |
current_annotation[key] = value.strip() | |
except Exception as e: | |
print(f"Warning: Failed to parse annotation line: {line}") | |
print(f"Error: {str(e)}") | |
continue | |
# Add the last annotation if exists | |
if current_annotation: | |
annotations.append(current_annotation) | |
if not sequence: | |
raise ValueError("No sequence data found in the file") | |
return sequence, annotations, dna_name, is_circular | |
def create_genbank(sequence, annotations, plasmid_name, is_circular): | |
"""Create a GenBank record from sequence and annotations.""" | |
# Clean up plasmid name for GenBank | |
clean_name = re.sub(r'[^\w.-]', '_', plasmid_name) | |
# Create the record with required GenBank fields | |
record = SeqRecord( | |
Seq(sequence), | |
id=clean_name, | |
name=clean_name, | |
description=plasmid_name, | |
annotations={ | |
"molecule_type": "DNA", | |
"topology": "circular" if is_circular else "linear", | |
"data_file_division": "SYN", | |
"date": datetime.now().strftime("%d-%b-%Y").upper(), | |
"comment": "Converted from pDRAW format" | |
} | |
) | |
# Add features | |
for annotation in annotations: | |
try: | |
if 'Annotation_Start' in annotation and 'Annotation_End' in annotation: | |
start = int(annotation['Annotation_Start']) - 1 # Convert to 0-based | |
end = int(annotation['Annotation_End']) | |
# In pDRAW, -1 is forward strand (5' to 3'), 1 is reverse strand | |
strand = 1 if annotation.get('Annotation_Orientation', '1') == '-1' else -1 | |
# Create qualifiers dictionary | |
qualifiers = { | |
"gene": [annotation.get('Annotation_Name', 'unknown')], | |
"label": [annotation.get('Annotation_Name', 'unknown')], | |
"note": [f"Color: {annotation.get('Annotation_FillColor', 'none')}"] | |
} | |
feature = SeqFeature( | |
FeatureLocation(start, end, strand), | |
type="gene", | |
qualifiers=qualifiers | |
) | |
record.features.append(feature) | |
except Exception as e: | |
print(f"Warning: Failed to add feature {annotation.get('Annotation_Name', 'unknown')}") | |
print(f"Error: {str(e)}") | |
continue | |
return record | |
def convert_pdraw_to_gb(input_file, output_file): | |
"""Convert a single pDRAW file to GenBank format.""" | |
try: | |
# Read the pDRAW file | |
with open(input_file, 'r', encoding='utf-8-sig') as f: | |
pdraw_content = f.read() | |
# Parse the pDRAW content | |
sequence, annotations, dna_name, is_circular = parse_pdraw(pdraw_content) | |
print(f"Found {len(sequence)} bp sequence and {len(annotations)} annotations") | |
print(f"Plasmid name: {dna_name}") | |
print(f"Topology: {'circular' if is_circular else 'linear'}") | |
# Create GenBank record | |
gb_record = create_genbank(sequence, annotations, dna_name or os.path.basename(input_file), is_circular) | |
# Write to file | |
with open(output_file, 'w') as handle: | |
SeqIO.write(gb_record, handle, 'genbank') | |
return True | |
except Exception as e: | |
print(f"Error converting {input_file}: {str(e)}") | |
return False | |
def process_directory(directory): | |
""" | |
Recursively search for and convert all pDRAW files in a directory and its subdirectories. | |
""" | |
# Convert directory to Path object | |
dir_path = Path(directory) | |
# Counter for converted files | |
converted = 0 | |
failed = 0 | |
# Find all files with common pDRAW extensions (case insensitive) | |
pdraw_files = [] | |
for ext in ['.pdraw', '.pd', '.pdr', '.pdw', '.PDW']: | |
pdraw_files.extend(dir_path.rglob(f'*{ext}')) | |
# Remove duplicates and sort | |
pdraw_files = sorted(set(pdraw_files)) | |
# Process each file | |
for pdraw_file in pdraw_files: | |
# Create output filename with .gb extension | |
output_file = pdraw_file.with_suffix('.gb') | |
print(f"\nProcessing: {pdraw_file}") | |
# Convert the file | |
if convert_pdraw_to_gb(str(pdraw_file), str(output_file)): | |
converted += 1 | |
print(f"Successfully converted to: {output_file}") | |
else: | |
failed += 1 | |
# Print summary | |
print(f"\nConversion complete!") | |
print(f"Files converted: {converted}") | |
print(f"Files failed: {failed}") | |
print(f"Total files processed: {converted + failed}") | |
if __name__ == "__main__": | |
import sys | |
# Get directory from command line argument or use current directory | |
directory = sys.argv[1] if len(sys.argv) > 1 else "." | |
# Process the directory | |
process_directory(directory) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment