MartGro · January 27, 2025 14:03
diff --git a/Simple pDRAW to .gb converter b/Simple pDRAW to .gb converter
 import os
 import re
 from pathlib import Path
 from Bio import SeqIO
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 from Bio.SeqFeature import SeqFeature, FeatureLocation
 from datetime import datetime

 def parse_pdraw(pdraw_content):
    """Parse pDRAW file content and return sequence and annotations."""
    lines = pdraw_content.split('\n')
    sequence = ''
    annotations = []
    current_annotation = {}
    in_sequence = False
    dna_name = None
    is_circular = False
    
    for line in lines:
        line = line.strip()
        
        # Skip empty lines
        if not line:
            continue
            
        # Get DNA name
        if line.startswith('DNAname'):
            dna_name = line.split(None, 1)[1].strip()
        
        # Get circularity
        if line.startswith('IScircular'):
            is_circular = line.split(None, 1)[1].strip() == 'YES'
            
        # Check if we're entering sequence section
        if line.startswith('Sequence'):
            in_sequence = True
            continue
            
        # Handle sequence lines
        if in_sequence and re.match(r'^\s*\d+\s+[ATCG\s]+', line):
            # Extract sequence data, ignoring numbers and spaces
            sequence += ''.join(re.findall(r'[ATCG]', line))
            continue
        
        # Handle annotation lines
        try:
            if line.startswith('Annotation_'):
                parts = line.split(None, 1)
                if len(parts) == 2:
                    key, value = parts
                    if key == 'Annotation_Number':
                        if current_annotation:
                            annotations.append(current_annotation)
                        current_annotation = {}
                    current_annotation[key] = value.strip()
        except Exception as e:
            print(f"Warning: Failed to parse annotation line: {line}")
            print(f"Error: {str(e)}")
            continue
    
    # Add the last annotation if exists
    if current_annotation:
        annotations.append(current_annotation)
    
    if not sequence:
        raise ValueError("No sequence data found in the file")
        
    return sequence, annotations, dna_name, is_circular

 def create_genbank(sequence, annotations, plasmid_name, is_circular):
    """Create a GenBank record from sequence and annotations."""
    # Clean up plasmid name for GenBank
    clean_name = re.sub(r'[^\w.-]', '_', plasmid_name)
    
    # Create the record with required GenBank fields
    record = SeqRecord(
        Seq(sequence),
        id=clean_name,
        name=clean_name,
        description=plasmid_name,
        annotations={
            "molecule_type": "DNA",
            "topology": "circular" if is_circular else "linear",
            "data_file_division": "SYN",
            "date": datetime.now().strftime("%d-%b-%Y").upper(),
            "comment": "Converted from pDRAW format"
        }
    )
    
    # Add features
    for annotation in annotations:
        try:
            if 'Annotation_Start' in annotation and 'Annotation_End' in annotation:
                start = int(annotation['Annotation_Start']) - 1  # Convert to 0-based
                end = int(annotation['Annotation_End'])
                # In pDRAW, -1 is forward strand (5' to 3'), 1 is reverse strand
                strand = 1 if annotation.get('Annotation_Orientation', '1') == '-1' else -1
                
                # Create qualifiers dictionary
                qualifiers = {
                    "gene": [annotation.get('Annotation_Name', 'unknown')],
                    "label": [annotation.get('Annotation_Name', 'unknown')],
                    "note": [f"Color: {annotation.get('Annotation_FillColor', 'none')}"]
                }
                
                feature = SeqFeature(
                    FeatureLocation(start, end, strand),
                    type="gene",
                    qualifiers=qualifiers
                )
                record.features.append(feature)
        except Exception as e:
            print(f"Warning: Failed to add feature {annotation.get('Annotation_Name', 'unknown')}")
            print(f"Error: {str(e)}")
            continue
    
    return record

 def convert_pdraw_to_gb(input_file, output_file):
    """Convert a single pDRAW file to GenBank format."""
    try:
        # Read the pDRAW file
        with open(input_file, 'r', encoding='utf-8-sig') as f:
            pdraw_content = f.read()
        
        # Parse the pDRAW content
        sequence, annotations, dna_name, is_circular = parse_pdraw(pdraw_content)
        
        print(f"Found {len(sequence)} bp sequence and {len(annotations)} annotations")
        print(f"Plasmid name: {dna_name}")
        print(f"Topology: {'circular' if is_circular else 'linear'}")
        
        # Create GenBank record
        gb_record = create_genbank(sequence, annotations, dna_name or os.path.basename(input_file), is_circular)
        
        # Write to file
        with open(output_file, 'w') as handle:
            SeqIO.write(gb_record, handle, 'genbank')
        
        return True
    except Exception as e:
        print(f"Error converting {input_file}: {str(e)}")
        return False

 def process_directory(directory):
    """
    Recursively search for and convert all pDRAW files in a directory and its subdirectories.
    """
    # Convert directory to Path object
    dir_path = Path(directory)
    
    # Counter for converted files
    converted = 0
    failed = 0
    
    # Find all files with common pDRAW extensions (case insensitive)
    pdraw_files = []
    for ext in ['.pdraw', '.pd', '.pdr', '.pdw', '.PDW']:
        pdraw_files.extend(dir_path.rglob(f'*{ext}'))
    
    # Remove duplicates and sort
    pdraw_files = sorted(set(pdraw_files))
    
    # Process each file
    for pdraw_file in pdraw_files:
        # Create output filename with .gb extension
        output_file = pdraw_file.with_suffix('.gb')
        
        print(f"\nProcessing: {pdraw_file}")
        
        # Convert the file
        if convert_pdraw_to_gb(str(pdraw_file), str(output_file)):
            converted += 1
            print(f"Successfully converted to: {output_file}")
        else:
            failed += 1
    
    # Print summary
    print(f"\nConversion complete!")
    print(f"Files converted: {converted}")
    print(f"Files failed: {failed}")
    print(f"Total files processed: {converted + failed}")

 if __name__ == "__main__":
    import sys
    
    # Get directory from command line argument or use current directory
    directory = sys.argv[1] if len(sys.argv) > 1 else "."
    
    # Process the directory
    process_directory(directory)
	import os
	import re
	from pathlib import Path
	from Bio import SeqIO
	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord
	from Bio.SeqFeature import SeqFeature, FeatureLocation
	from datetime import datetime

	def parse_pdraw(pdraw_content):
	"""Parse pDRAW file content and return sequence and annotations."""
	lines = pdraw_content.split('\n')
	sequence = ''
	annotations = []
	current_annotation = {}
	in_sequence = False
	dna_name = None
	is_circular = False

	for line in lines:
	line = line.strip()

	# Skip empty lines
	if not line:
	continue

	# Get DNA name
	if line.startswith('DNAname'):
	dna_name = line.split(None, 1)[1].strip()

	# Get circularity
	if line.startswith('IScircular'):
	is_circular = line.split(None, 1)[1].strip() == 'YES'

	# Check if we're entering sequence section
	if line.startswith('Sequence'):
	in_sequence = True
	continue

	# Handle sequence lines
	if in_sequence and re.match(r'^\s*\d+\s+[ATCG\s]+', line):
	# Extract sequence data, ignoring numbers and spaces
	sequence += ''.join(re.findall(r'[ATCG]', line))
	continue

	# Handle annotation lines
	try:
	if line.startswith('Annotation_'):
	parts = line.split(None, 1)
	if len(parts) == 2:
	key, value = parts
	if key == 'Annotation_Number':
	if current_annotation:
	annotations.append(current_annotation)
	current_annotation = {}
	current_annotation[key] = value.strip()
	except Exception as e:
	print(f"Warning: Failed to parse annotation line: {line}")
	print(f"Error: {str(e)}")
	continue

	# Add the last annotation if exists
	if current_annotation:
	annotations.append(current_annotation)

	if not sequence:
	raise ValueError("No sequence data found in the file")

	return sequence, annotations, dna_name, is_circular

	def create_genbank(sequence, annotations, plasmid_name, is_circular):
	"""Create a GenBank record from sequence and annotations."""
	# Clean up plasmid name for GenBank
	clean_name = re.sub(r'[^\w.-]', '_', plasmid_name)

	# Create the record with required GenBank fields
	record = SeqRecord(
	Seq(sequence),
	id=clean_name,
	name=clean_name,
	description=plasmid_name,
	annotations={
	"molecule_type": "DNA",
	"topology": "circular" if is_circular else "linear",
	"data_file_division": "SYN",
	"date": datetime.now().strftime("%d-%b-%Y").upper(),
	"comment": "Converted from pDRAW format"
	}
	)

	# Add features
	for annotation in annotations:
	try:
	if 'Annotation_Start' in annotation and 'Annotation_End' in annotation:
	start = int(annotation['Annotation_Start']) - 1 # Convert to 0-based
	end = int(annotation['Annotation_End'])
	# In pDRAW, -1 is forward strand (5' to 3'), 1 is reverse strand
	strand = 1 if annotation.get('Annotation_Orientation', '1') == '-1' else -1

	# Create qualifiers dictionary
	qualifiers = {
	"gene": [annotation.get('Annotation_Name', 'unknown')],
	"label": [annotation.get('Annotation_Name', 'unknown')],
	"note": [f"Color: {annotation.get('Annotation_FillColor', 'none')}"]
	}

	feature = SeqFeature(
	FeatureLocation(start, end, strand),
	type="gene",
	qualifiers=qualifiers
	)
	record.features.append(feature)
	except Exception as e:
	print(f"Warning: Failed to add feature {annotation.get('Annotation_Name', 'unknown')}")
	print(f"Error: {str(e)}")
	continue

	return record

	def convert_pdraw_to_gb(input_file, output_file):
	"""Convert a single pDRAW file to GenBank format."""
	try:
	# Read the pDRAW file
	with open(input_file, 'r', encoding='utf-8-sig') as f:
	pdraw_content = f.read()

	# Parse the pDRAW content
	sequence, annotations, dna_name, is_circular = parse_pdraw(pdraw_content)

	print(f"Found {len(sequence)} bp sequence and {len(annotations)} annotations")
	print(f"Plasmid name: {dna_name}")
	print(f"Topology: {'circular' if is_circular else 'linear'}")

	# Create GenBank record
	gb_record = create_genbank(sequence, annotations, dna_name or os.path.basename(input_file), is_circular)

	# Write to file
	with open(output_file, 'w') as handle:
	SeqIO.write(gb_record, handle, 'genbank')

	return True
	except Exception as e:
	print(f"Error converting {input_file}: {str(e)}")
	return False

	def process_directory(directory):
	"""
	Recursively search for and convert all pDRAW files in a directory and its subdirectories.
	"""
	# Convert directory to Path object
	dir_path = Path(directory)

	# Counter for converted files
	converted = 0
	failed = 0

	# Find all files with common pDRAW extensions (case insensitive)
	pdraw_files = []
	for ext in ['.pdraw', '.pd', '.pdr', '.pdw', '.PDW']:
	pdraw_files.extend(dir_path.rglob(f'*{ext}'))

	# Remove duplicates and sort
	pdraw_files = sorted(set(pdraw_files))

	# Process each file
	for pdraw_file in pdraw_files:
	# Create output filename with .gb extension
	output_file = pdraw_file.with_suffix('.gb')

	print(f"\nProcessing: {pdraw_file}")

	# Convert the file
	if convert_pdraw_to_gb(str(pdraw_file), str(output_file)):
	converted += 1
	print(f"Successfully converted to: {output_file}")
	else:
	failed += 1

	# Print summary
	print(f"\nConversion complete!")
	print(f"Files converted: {converted}")
	print(f"Files failed: {failed}")
	print(f"Total files processed: {converted + failed}")

	if __name__ == "__main__":
	import sys

	# Get directory from command line argument or use current directory
	directory = sys.argv[1] if len(sys.argv) > 1 else "."

	# Process the directory
	process_directory(directory)