Last active
March 25, 2025 05:05
-
-
Save pkerpedjiev/b7fb4a97df7115fc464ed06916ee349b to your computer and use it in GitHub Desktop.
juv-bam-files.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "e300e3e5", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# /// script\n", | |
"# requires-python = \">=3.10\"\n", | |
"# dependencies = [\n", | |
"# \"biopython\",\n", | |
"# \"clodius\",\n", | |
"# \"cooler\",\n", | |
"# \"higlass-python==1.3.0\",\n", | |
"# \"smart_open\",\n", | |
"# ]\n", | |
"#\n", | |
"# [tool.uv.sources]\n", | |
"# clodius = { path = \"../../resgen/rhodius\", editable = true }\n", | |
"# ///" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "c7e3cb5a-5c16-4ed2-b936-1ea687dbe604", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The autoreload extension is already loaded. To reload it, use:\n", | |
" %reload_ext autoreload\n", | |
"env: ANYWIDGET_HMR=1\n" | |
] | |
} | |
], | |
"source": [ | |
"%load_ext autoreload\n", | |
"%autoreload 2\n", | |
"%env ANYWIDGET_HMR=1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 98, | |
"id": "4926fadb-8d03-4d52-b793-f24f01f6ce6c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import Bio\n", | |
"from Bio import Align\n", | |
"from Bio.Seq import Seq\n", | |
"\n", | |
def align_sequences(seq1, seq2):
    """Align ``seq2`` against ``seq1`` and return a single alignment.

    The aligner scores a match as 1, a mismatch as -4, opening a gap as -6
    and extending a gap as -1. Every other aligner setting keeps its
    ``PairwiseAligner`` default.

    :param seq1: The sequence passed to the aligner first (the target)
    :param seq2: The sequence passed to the aligner second (the query)
    :return: The first alignment produced by the aligner
    """
    scoring = {
        "match_score": 1,
        "mismatch_score": -4,
        "open_gap_score": -6,
        "extend_gap_score": -1,
    }
    aligner = Align.PairwiseAligner(**scoring)
    return aligner.align(seq1, seq2)[0]
" \n", | |
def get_subs(a):
    """Summarise how the query of alignment ``a`` differs from its target.

    Walks the paired aligned blocks in ``a.aligned``. A stretch of target
    skipped between two blocks is recorded as a deletion, a stretch of query
    skipped before a block is recorded as an insertion, and differing bases
    inside a block are recorded as substitutions.

    :param a: An alignment exposing ``aligned`` (target blocks at index 0 and
        query blocks at index 1, each a sequence of ``(start, end)`` pairs),
        plus indexable ``target`` and ``query`` sequences
    :return: A ``(start, end, parts)`` tuple. ``start`` is the start of the
        first target block plus one and ``end`` is the end of the last
        target block plus one. ``parts`` is a list of dicts with ``pos``
        (0-based offset from the first aligned target base), ``type``
        (``'D'``, ``'I'`` or ``'X'``) and ``length``; ``'X'`` entries also
        carry ``base`` (target) and ``variant`` (query). With no aligned
        blocks the result is ``(1, 1, [])``.
    """
    blocks = [
        (int(ts), int(te), int(qs), int(qe))
        for (ts, te), (qs, qe) in zip(a.aligned[0], a.aligned[1])
    ]
    if not blocks:
        return 1, 1, []

    start = blocks[0][0]
    end = blocks[-1][1]

    parts = []
    # Target coordinate consumed so far; offsets are reported relative to
    # ``start`` so a separate running offset is not needed.
    tpos = start
    # Query coordinate consumed so far. It starts at 0 (not at the first
    # block's query start), so query bases before the first block are
    # reported as an insertion at offset 0.
    qpos = 0

    for ts, te, qs, qe in blocks:
        if ts > tpos:
            parts.append({'pos': tpos - start, 'type': 'D', 'length': ts - tpos})
        if qs > qpos:
            parts.append({'pos': ts - start, 'type': 'I', 'length': qs - qpos})

        for offset in range(te - ts):
            base = a.target[ts + offset]
            variant = a.query[qs + offset]
            if base != variant:
                parts.append({
                    'pos': ts - start + offset,
                    'type': 'X',
                    'length': 1,
                    'base': base,
                    'variant': variant,
                })

        tpos = te
        qpos = qe

    return start + 1, end + 1, parts
"\n", | |
# Substitution check: a 5-base target against a query with one mismatch.
a = align_sequences("TTTTT", "TTATT")
s = get_subs(a)

# The start is 1-based. The end is one past the last aligned target base in
# the same coordinates (6 for a 5-base target), i.e. it is not a closed end.
assert s[0] == 1
assert s[1] == 6
assert s[2][0]['pos'] == 2 # positions inside the parts list are 0-based
assert s[2][0]['base'] == 'T'
assert s[2][0]['variant'] == 'A'

# Insertion check: the query carries one extra base relative to the target.
a = align_sequences("TTTTT", "TTATTT")
s = get_subs(a)

assert s[0] == 1
assert s[1] == 6
assert s[2][0]['pos'] == 2
assert s[2][0]['type'] == 'I'
assert s[2][0]['length'] == 1

# Disabled manual check of a short query against a longer target:
# a = align_sequences("TATTTTGGACCGCGCGTTCATTTACACGTC", "ATTGA")
# print(a)
# s = get_subs(a)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 100, | |
"id": "e71c7ef3-afd2-484b-9082-162d47d4b09b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"ref TATTTTGGACCGCGCGTTCATTTACACGTC\n", | |
"seq ATTGA\n", | |
"target 0 TATTTTGGACCGCGCGTTCATTTACACGTC 30\n", | |
" 0 -------------------|||.|------ 30\n", | |
"query 0 -------------------ATTGA------ 5\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"{'type': 'local-tiles',\n", | |
" 'tilesetInfo': {'min_pos': [0],\n", | |
" 'max_pos': [30],\n", | |
" 'max_width': 30,\n", | |
" 'tile_size': 30,\n", | |
" 'chromsizes': [['a', 30]],\n", | |
" 'max_zoom': 0,\n", | |
" 'max_tile_width': 100000,\n", | |
" 'format': 'subs'},\n", | |
" 'tiles': {'0.0': [{'id': 'r0',\n", | |
" 'from': 20,\n", | |
" 'to': 25,\n", | |
" 'substitutions': [{'pos': 3,\n", | |
" 'type': 'X',\n", | |
" 'length': 1,\n", | |
" 'base': 'T',\n", | |
" 'variant': 'G'}],\n", | |
" 'color': 0}]}}" | |
] | |
}, | |
"execution_count": 100, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"\n", | |
"\n", | |
def get_pileup_alignment_data(ref: str, seqs: list[str]) -> dict:
    """Build a ``local-tiles`` payload for use in a higlass-pileup plot.

    Each entry of ``seqs`` is aligned to ``ref`` with ``align_sequences`` and
    summarised with ``get_subs``; the reference, the sequence and the
    alignment are printed along the way. All rows are placed in the single
    tile ``'0.0'``.

    :param ref: The reference to align to
    :param seqs: The sequences to align to the reference
    :return: A dict with ``type``, ``tilesetInfo`` and ``tiles`` entries
    """
    ref_length = len(ref)
    rows = []

    for index, seq in enumerate(seqs):
        print("ref", ref)
        print("seq", seq)
        alignment = align_sequences(ref, seq)
        print(alignment)
        first, last, substitutions = get_subs(alignment)

        row = {
            "id": f"r{index}",
            "from": first,
            "to": last,
            "substitutions": substitutions,
            "color": 0
        }
        rows.append(row)

    tileset_description = {
        'min_pos': [0],
        'max_pos': [ref_length],
        'max_width': ref_length,
        'tile_size': ref_length,
        'chromsizes': [['a', ref_length]],
        'max_zoom': 0,
        'max_tile_width': 100000,
        'format': 'subs'
    }

    return {
        "type": 'local-tiles',
        "tilesetInfo": tileset_description,
        "tiles": {
            '0.0': rows,
        }
    }
"\n", | |
# Example: build and display the payload for one 5-base read aligned to a
# 30-base reference.
get_pileup_alignment_data(
    "TATTTTGGACCGCGCGTTCATTTACACGTC", ["ATTGA"])
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 101, | |
"id": "abfbb8e6", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"ref TATTTTGGACCGCGCGTTCATTTACACGTC\n", | |
"seq ATTGA\n", | |
"target 0 TATTTTGGACCGCGCGTTCATTTACACGTC 30\n", | |
" 0 -------------------|||.|------ 30\n", | |
"query 0 -------------------ATTGA------ 5\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "e562dd0e8c9c4e14b36e639eab72b24c", | |
"version_major": 2, | |
"version_minor": 1 | |
}, | |
"text/plain": [ | |
"HiGlassWidget()" | |
] | |
}, | |
"execution_count": 101, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from typing import Literal, ClassVar\n", | |
"import higlass as hg\n", | |
"\n", | |
class PileupTrack(hg.PluginTrack):
    """HiGlass plugin track whose track type is ``"pileup"``."""

    type: Literal["pileup"] = "pileup"
    # Published bundle, kept for reference:
    # plugin_url: ClassVar[str] = "https://unpkg.com/higlass-pileup/dist/higlass-pileup.min.js"
    # NOTE(review): the active URL loads the plugin script from
    # localhost:8080, so it presumably requires a local server hosting that
    # file — confirm before sharing this notebook.
    plugin_url: ClassVar[str] = "http://localhost:8080/higlass-pileup.min.js"
"\n", | |
"\n", | |
# BAM-backed data description for a pileup track.
# NOTE(review): `pileup_data` is not referenced by any later statement in
# this cell; the track below takes its data from get_pileup_alignment_data.
pileup_data = {
    "type": "bam",
    "url": "https://pkerp.s3.amazonaws.com/public/bamfile_test/SRR1770413.sorted.bam",
    "chromSizesUrl": "https://pkerp.s3.amazonaws.com/public/bamfile_test/GCF_000005845.2_ASM584v2_genomic.chrom.sizes",
    "options": {"maxTileWidth": 30000},
}

# Create the custom track from locally computed alignment tiles and set its
# display options.
pileup_track = PileupTrack(data=get_pileup_alignment_data(
    'TATTTTGGACCGCGCGTTCATTTACACGTC',
    ['ATTGA'])
    , height=180).opts(
    axisPositionHorizontal="right",
    axisLabelFormatting="normal",
    showCoverage=True,
    colorScale=[
        "#2c7bb6","#92c5de","#ffffbf","#fdae61","#808080", "#DCDCDC",
    ]
)

# Stack the pileup track and a top axis in one view.
# NOTE(review): the x domain is 0-100 while the reference above is 30 bases
# long — confirm the wider domain is intended.
view = hg.view((pileup_track, "top"), (hg.track("top-axis"), 'top')).domain(x = [0, 100])
# Last expression of the cell, so the notebook displays the view.
view
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "4e191c1c-c9ea-4497-858c-db285280f8dc", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "faea5eb5-358c-4565-a5e1-514bb232afbe", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "1f934768-3271-49bf-a504-4a2acb9a27ef", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"target 0 GGA---GGGGAGG 10\n", | |
" 0 |||---||..||| 13\n", | |
"query 0 GGAAAGGGAAAGG 13\n", | |
"\n", | |
"[{'pos': 3, 'type': 'I', 'length': 3}, {'pos': 5, 'type': 'X', 'length': 1, 'base': 'G', 'variant': 'A'}, {'pos': 6, 'type': 'X', 'length': 1, 'base': 'G', 'variant': 'A'}]\n", | |
"target 0 GGAAAGGGAAAGG 13\n", | |
" 0 |||---||..||| 13\n", | |
"query 0 GGA---GGGGAGG 10\n", | |
"\n", | |
"[{'pos': 3, 'type': 'D', 'length': 3}, {'pos': 8, 'type': 'X', 'length': 1, 'base': 'A', 'variant': 'G'}, {'pos': 9, 'type': 'X', 'length': 1, 'base': 'A', 'variant': 'G'}]\n", | |
"target 0 GGAAAGAGGAAAGG 14\n", | |
" 0 ||--||.||--||| 14\n", | |
"query 0 GG--AGTGG--AGG 10\n", | |
"\n", | |
"[{'pos': 2, 'type': 'D', 'length': 2}, {'pos': 6, 'type': 'X', 'length': 1, 'base': 'A', 'variant': 'T'}, {'pos': 9, 'type': 'D', 'length': 2}]\n", | |
"target 0 GGAAAGTTAGGAAAGG 16\n", | |
" 0 -----|||||||---- 16\n", | |
"query 0 -----GTTAGGA---- 7\n", | |
"\n", | |
"[{'pos': 0, 'type': 'D', 'length': 5}]\n" | |
] | |
} | |
], | |
"source": [ | |
"\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "688831b6-6111-47af-8ce2-a285a1ff5508", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[{'pos': 0, 'type': 'D', 'length': 2},\n", | |
" {'pos': 4, 'type': 'X', 'length': 1, 'base': 'A', 'variant': 'T'},\n", | |
" {'pos': 2, 'type': 'D', 'length': 2}]" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"parts" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"id": "e5791bd2-5017-45e0-a2a6-a2f3b93af71c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'GGAAAGAGGAAAGG'" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"a.target" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "bfab5061-9695-4a32-a930-9df29e23bf7d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"id": "e620f0c5-e3a1-43d5-ba8d-78e2acb18b6c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from higlass.tilesets import ClodiusTileset\n", | |
"\n", | |
def tileset_info(a):
    """Describe alignment ``a`` as a tileset that fits into a single tile.

    :param a: An alignment whose ``target`` supports ``len()``
    :return: A dict with ``min_pos``, ``max_pos``, ``max_width``,
        ``tile_size``, ``chromsizes``, ``max_zoom``, ``max_tile_width`` and
        ``format`` entries. ``max_pos`` and the single ``'a'`` chromsizes
        entry use the target length, and ``max_width`` is the smallest power
        of two that is at least the target length (1 for an empty target).
    """
    target_length = len(a.target)
    tile_size = 1024

    # Smallest power of two covering the target. Integer arithmetic replaces
    # the previous log-based formula, which was tied to a fixed length of 33
    # and relied on a `math` import this cell never performs. Lengths 0 and 1
    # both map to 1.
    max_width = 1 << max(target_length - 1, 0).bit_length()

    # We'll fit everything into one tile
    max_zoom = 0

    return {
        "min_pos": [0],
        "max_pos": [target_length],
        "max_width": max_width,
        "tile_size": tile_size,
        "chromsizes": [['a', target_length]],
        "max_zoom": max_zoom,
        "max_tile_width": 100000,
        "format": "subs"
    }
"\n", | |
"\n", | |
def tiles(a, tile_ids):
    """Return whatever ``get_subs`` produces for alignment ``a``.

    :param a: The alignment to summarise
    :param tile_ids: Accepted but not used
    """
    summary = get_subs(a)
    return summary
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"id": "52906247-4610-49c9-bce9-543d27fdabe7", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'min_pos': [0],\n", | |
" 'max_pos': [14],\n", | |
" 'max_width': 64,\n", | |
" 'tile_size': 1024,\n", | |
" 'chromsizes': [['a', 14]],\n", | |
" 'max_zoom': 0,\n", | |
" 'max_tile_width': 100000,\n", | |
" 'format': 'subs'}" | |
] | |
}, | |
"execution_count": 34, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tileset_info(a)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"id": "2ee9e898-ff4d-40e2-b98c-f8a3182bea1f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[{'pos': 2, 'type': 'D', 'length': 2},\n", | |
" {'pos': 6, 'type': 'X', 'length': 1, 'base': 'A', 'variant': 'T'},\n", | |
" {'pos': 9, 'type': 'D', 'length': 2}]" | |
] | |
}, | |
"execution_count": 35, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tiles(a, [])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "ad9c7234-5777-4a3b-8bbd-c6ec3da32b49", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.15" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment