Skip to content

Instantly share code, notes, and snippets.

@MartGro
Created January 27, 2025 14:03
Show Gist options
  • Save MartGro/1b0facf12165134851cb678503dc7567 to your computer and use it in GitHub Desktop.
Save MartGro/1b0facf12165134851cb678503dc7567 to your computer and use it in GitHub Desktop.
import os
import re
from pathlib import Path
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from datetime import datetime
def parse_pdraw(pdraw_content):
"""Parse pDRAW file content and return sequence and annotations."""
lines = pdraw_content.split('\n')
sequence = ''
annotations = []
current_annotation = {}
in_sequence = False
dna_name = None
is_circular = False
for line in lines:
line = line.strip()
# Skip empty lines
if not line:
continue
# Get DNA name
if line.startswith('DNAname'):
dna_name = line.split(None, 1)[1].strip()
# Get circularity
if line.startswith('IScircular'):
is_circular = line.split(None, 1)[1].strip() == 'YES'
# Check if we're entering sequence section
if line.startswith('Sequence'):
in_sequence = True
continue
# Handle sequence lines
if in_sequence and re.match(r'^\s*\d+\s+[ATCG\s]+', line):
# Extract sequence data, ignoring numbers and spaces
sequence += ''.join(re.findall(r'[ATCG]', line))
continue
# Handle annotation lines
try:
if line.startswith('Annotation_'):
parts = line.split(None, 1)
if len(parts) == 2:
key, value = parts
if key == 'Annotation_Number':
if current_annotation:
annotations.append(current_annotation)
current_annotation = {}
current_annotation[key] = value.strip()
except Exception as e:
print(f"Warning: Failed to parse annotation line: {line}")
print(f"Error: {str(e)}")
continue
# Add the last annotation if exists
if current_annotation:
annotations.append(current_annotation)
if not sequence:
raise ValueError("No sequence data found in the file")
return sequence, annotations, dna_name, is_circular
def create_genbank(sequence, annotations, plasmid_name, is_circular):
"""Create a GenBank record from sequence and annotations."""
# Clean up plasmid name for GenBank
clean_name = re.sub(r'[^\w.-]', '_', plasmid_name)
# Create the record with required GenBank fields
record = SeqRecord(
Seq(sequence),
id=clean_name,
name=clean_name,
description=plasmid_name,
annotations={
"molecule_type": "DNA",
"topology": "circular" if is_circular else "linear",
"data_file_division": "SYN",
"date": datetime.now().strftime("%d-%b-%Y").upper(),
"comment": "Converted from pDRAW format"
}
)
# Add features
for annotation in annotations:
try:
if 'Annotation_Start' in annotation and 'Annotation_End' in annotation:
start = int(annotation['Annotation_Start']) - 1 # Convert to 0-based
end = int(annotation['Annotation_End'])
# In pDRAW, -1 is forward strand (5' to 3'), 1 is reverse strand
strand = 1 if annotation.get('Annotation_Orientation', '1') == '-1' else -1
# Create qualifiers dictionary
qualifiers = {
"gene": [annotation.get('Annotation_Name', 'unknown')],
"label": [annotation.get('Annotation_Name', 'unknown')],
"note": [f"Color: {annotation.get('Annotation_FillColor', 'none')}"]
}
feature = SeqFeature(
FeatureLocation(start, end, strand),
type="gene",
qualifiers=qualifiers
)
record.features.append(feature)
except Exception as e:
print(f"Warning: Failed to add feature {annotation.get('Annotation_Name', 'unknown')}")
print(f"Error: {str(e)}")
continue
return record
def convert_pdraw_to_gb(input_file, output_file):
"""Convert a single pDRAW file to GenBank format."""
try:
# Read the pDRAW file
with open(input_file, 'r', encoding='utf-8-sig') as f:
pdraw_content = f.read()
# Parse the pDRAW content
sequence, annotations, dna_name, is_circular = parse_pdraw(pdraw_content)
print(f"Found {len(sequence)} bp sequence and {len(annotations)} annotations")
print(f"Plasmid name: {dna_name}")
print(f"Topology: {'circular' if is_circular else 'linear'}")
# Create GenBank record
gb_record = create_genbank(sequence, annotations, dna_name or os.path.basename(input_file), is_circular)
# Write to file
with open(output_file, 'w') as handle:
SeqIO.write(gb_record, handle, 'genbank')
return True
except Exception as e:
print(f"Error converting {input_file}: {str(e)}")
return False
def process_directory(directory):
"""
Recursively search for and convert all pDRAW files in a directory and its subdirectories.
"""
# Convert directory to Path object
dir_path = Path(directory)
# Counter for converted files
converted = 0
failed = 0
# Find all files with common pDRAW extensions (case insensitive)
pdraw_files = []
for ext in ['.pdraw', '.pd', '.pdr', '.pdw', '.PDW']:
pdraw_files.extend(dir_path.rglob(f'*{ext}'))
# Remove duplicates and sort
pdraw_files = sorted(set(pdraw_files))
# Process each file
for pdraw_file in pdraw_files:
# Create output filename with .gb extension
output_file = pdraw_file.with_suffix('.gb')
print(f"\nProcessing: {pdraw_file}")
# Convert the file
if convert_pdraw_to_gb(str(pdraw_file), str(output_file)):
converted += 1
print(f"Successfully converted to: {output_file}")
else:
failed += 1
# Print summary
print(f"\nConversion complete!")
print(f"Files converted: {converted}")
print(f"Files failed: {failed}")
print(f"Total files processed: {converted + failed}")
if __name__ == "__main__":
import sys
# Get directory from command line argument or use current directory
directory = sys.argv[1] if len(sys.argv) > 1 else "."
# Process the directory
process_directory(directory)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment