Created
August 1, 2025 01:52
-
-
Save eric-czech/df84a29c807e42e8f6d82b083a07a8ef to your computer and use it in GitHub Desktop.
Metaflow example using Docker SDK to invoke bedtools
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Metaflow Example: Simple bedtools sorting with Docker | |
Demonstrates using biocontainers/bedtools via Docker SDK to sort genomic intervals. | |
""" | |
import tempfile | |
import os | |
import docker | |
from metaflow import FlowSpec, step | |
# Example unsorted BED data to be sorted | |
BED_DATA = """chr2 4000 5000 region6 180 + | |
chr1 3000 4000 region3 150 - | |
chr2 500 1500 region4 300 + | |
chr1 1000 2000 region1 100 + | |
chr2 1200 2200 region5 250 - | |
chr1 1500 2500 region2 200 +""" | |
class BedtoolsFlow(FlowSpec): | |
""" | |
Simple Metaflow flow that sorts BED intervals using bedtools in Docker. | |
Shows: Docker SDK integration, file mounting, and capturing container output. | |
""" | |
@step | |
def start(self): | |
"""Create sample genomic data and start the flow.""" | |
print("Creating sample BED data for sorting...") | |
# Sample genomic intervals (unsorted) | |
self.bed_data = BED_DATA | |
print(f"Created {len(self.bed_data.strip().split('\n'))} genomic intervals") | |
self.next(self.sort_intervals) | |
@step | |
def sort_intervals(self): | |
"""Sort BED intervals using bedtools in Docker container.""" | |
print("Sorting genomic intervals with bedtools...") | |
# Initialize Docker client and create temporary file | |
client = docker.from_env() | |
with tempfile.NamedTemporaryFile(mode='w', suffix='.bed', delete=False) as tmp_file: | |
tmp_file.write(self.bed_data) | |
tmp_bed_file = tmp_file.name | |
try: | |
# Mount host directory into container at /data | |
# This allows bedtools inside container to access our temp file | |
host_dir = os.path.dirname(tmp_bed_file) | |
filename = os.path.basename(tmp_bed_file) | |
print(f"Mounting {host_dir} -> /data in container") | |
# Use bedtools sort to sort genomic intervals | |
sort_result = client.containers.run( | |
"biocontainers/bedtools:v2.27.1dfsg-4-deb_cv1", | |
command=['bedtools', 'sort', '-i', f'/data/{filename}'], | |
volumes={host_dir: {'bind': '/data', 'mode': 'ro'}}, # Read-only mount | |
remove=True # Auto-cleanup container after execution | |
) | |
self.sorted_intervals = sort_result.decode().strip() | |
finally: | |
os.unlink(tmp_bed_file) # Cleanup temp file | |
self.next(self.end) | |
@step | |
def end(self): | |
"""Display sorted results.""" | |
original_count = len(self.bed_data.strip().split('\n')) | |
print(f"\n=== Bedtools Sort Results ===") | |
print(f"Original intervals: {original_count}") | |
print(f"\nSorted intervals:") | |
print(self.sorted_intervals) | |
print(f"\n✓ Successfully used biocontainers/bedtools via Docker SDK") | |
print(f"✓ Demonstrated file mounting: host -> container") | |
print(f"✓ Captured and processed container output") | |
if __name__ == "__main__": | |
BedtoolsFlow() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment