Skip to content

Instantly share code, notes, and snippets.

@pkerpedjiev
Last active March 25, 2025 05:05
Show Gist options
  • Save pkerpedjiev/b7fb4a97df7115fc464ed06916ee349b to your computer and use it in GitHub Desktop.
Save pkerpedjiev/b7fb4a97df7115fc464ed06916ee349b to your computer and use it in GitHub Desktop.
juv-bam-files.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "e300e3e5",
"metadata": {},
"outputs": [],
"source": [
"# /// script\n",
"# requires-python = \">=3.10\"\n",
"# dependencies = [\n",
"# \"biopython\",\n",
"# \"clodius\",\n",
"# \"cooler\",\n",
"# \"higlass-python==1.3.0\",\n",
"# \"smart_open\",\n",
"# ]\n",
"#\n",
"# [tool.uv.sources]\n",
"# clodius = { path = \"../../resgen/rhodius\", editable = true }\n",
"# ///"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c7e3cb5a-5c16-4ed2-b936-1ea687dbe604",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n",
"env: ANYWIDGET_HMR=1\n"
]
}
],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"%env ANYWIDGET_HMR=1"
]
},
{
"cell_type": "code",
"execution_count": 98,
"id": "4926fadb-8d03-4d52-b793-f24f01f6ce6c",
"metadata": {},
"outputs": [],
"source": [
"import Bio\n",
"from Bio import Align\n",
"from Bio.Seq import Seq\n",
"\n",
"def align_sequences(seq1, seq2):\n",
" aligner = Align.PairwiseAligner()\n",
"\n",
" aligner.match_score = 1\n",
" aligner.mismatch_score = -4\n",
" aligner.open_gap_score = -6\n",
" aligner.extend_gap_score = -1\n",
"\n",
" alignments = aligner.align(seq1, seq2)\n",
"\n",
" best_alignment = alignments[0]\n",
"\n",
" return best_alignment\n",
" \n",
"def get_subs(a):\n",
" parts = []\n",
" ttrue = 0\n",
" tpos = 0\n",
" qpos = 0\n",
"\n",
" start = 0\n",
" end = 0\n",
"\n",
" aligneds = list(zip(a.aligned[0], a.aligned[1]))\n",
"\n",
" for i, ((ts, te), (qs, qe)) in enumerate(aligneds):\n",
" ts,te,qs,qe = int(ts), int(te), int(qs), int(qe)\n",
" \n",
" if i == 0:\n",
" # start position\n",
" start = ts\n",
" tpos = ts\n",
" ttrue = 0\n",
" if i == len(aligneds) - 1:\n",
" # end position\n",
" end = te\n",
" \n",
" if ts > tpos:\n",
" parts += [{'pos': ttrue, 'type': 'D', 'length': ts - tpos}]\n",
" ttrue += ts - tpos\n",
" if qs > qpos:\n",
" parts += [{'pos': ttrue, 'type': 'I', 'length': qs - qpos}]\n",
" for i in range(te - ts):\n",
" if a.target[ts + i] != a.query[qs + i]:\n",
" parts += [{'pos': ttrue + i, 'type': 'X', 'length': 1, 'base': a.target[ts + i], 'variant': a.query[qs + i]}]\n",
"\n",
" ttrue += (te - ts)\n",
" tpos = te\n",
" qpos = qe\n",
" return start+1, end+1, parts\n",
"\n",
"a = align_sequences(\"TTTTT\", \"TTATT\")\n",
"s = get_subs(a)\n",
"\n",
"# assert 1-based start positions and closed intervals\n",
"assert s[0] == 1\n",
"assert s[1] == 6\n",
"assert s[2][0]['pos'] == 2 # subs are 0-based\n",
"assert s[2][0]['base'] == 'T'\n",
"assert s[2][0]['variant'] == 'A'\n",
"\n",
"a = align_sequences(\"TTTTT\", \"TTATTT\")\n",
"s = get_subs(a)\n",
"\n",
"assert s[0] == 1\n",
"assert s[1] == 6\n",
"assert s[2][0]['pos'] == 2\n",
"assert s[2][0]['type'] == 'I'\n",
"assert s[2][0]['length'] == 1\n",
"\n",
"# a = align_sequences(\"TATTTTGGACCGCGCGTTCATTTACACGTC\", \"ATTGA\")\n",
"# print(a)\n",
"# s = get_subs(a)\n"
]
},
{
"cell_type": "code",
"execution_count": 100,
"id": "e71c7ef3-afd2-484b-9082-162d47d4b09b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ref TATTTTGGACCGCGCGTTCATTTACACGTC\n",
"seq ATTGA\n",
"target 0 TATTTTGGACCGCGCGTTCATTTACACGTC 30\n",
" 0 -------------------|||.|------ 30\n",
"query 0 -------------------ATTGA------ 5\n",
"\n"
]
},
{
"data": {
"text/plain": [
"{'type': 'local-tiles',\n",
" 'tilesetInfo': {'min_pos': [0],\n",
" 'max_pos': [30],\n",
" 'max_width': 30,\n",
" 'tile_size': 30,\n",
" 'chromsizes': [['a', 30]],\n",
" 'max_zoom': 0,\n",
" 'max_tile_width': 100000,\n",
" 'format': 'subs'},\n",
" 'tiles': {'0.0': [{'id': 'r0',\n",
" 'from': 20,\n",
" 'to': 25,\n",
" 'substitutions': [{'pos': 3,\n",
" 'type': 'X',\n",
" 'length': 1,\n",
" 'base': 'T',\n",
" 'variant': 'G'}],\n",
" 'color': 0}]}}"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"\n",
"def get_pileup_alignment_data(ref: str, seqs: list[str]) -> dict:\n",
" \"\"\"Get a local tile for use in a higlass-pileup plot.\n",
"\n",
" :param ref: The reference to align to\n",
" :param seqs: The sequences to align to the reference\n",
" \"\"\"\n",
" local_data = {\n",
" \"type\": 'local-tiles',\n",
" \"tilesetInfo\": {\n",
" 'min_pos': [0],\n",
" 'max_pos': [len(ref)],\n",
" 'max_width': len(ref),\n",
" 'tile_size': len(ref),\n",
" 'chromsizes': [['a', len(ref)]],\n",
" 'max_zoom': 0,\n",
" 'max_tile_width': 100000,\n",
" 'format': 'subs'\n",
" },\n",
" \"tiles\": {\n",
" '0.0': [],\n",
" }\n",
" }\n",
" \n",
" for i,seq in enumerate(seqs):\n",
" print(\"ref\", ref)\n",
" print(\"seq\", seq)\n",
" a = align_sequences(ref, seq)\n",
" print(a)\n",
" start, end, subs = get_subs(a)\n",
"\n",
" local_data['tiles']['0.0'].append({\n",
" \"id\": f\"r{i}\",\n",
" \"from\": start,\n",
" \"to\": end,\n",
" \"substitutions\": subs,\n",
" \"color\": 0\n",
" })\n",
"\n",
" # print(local_data)\n",
" return local_data\n",
"\n",
"get_pileup_alignment_data(\n",
" \"TATTTTGGACCGCGCGTTCATTTACACGTC\", [\"ATTGA\"])\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 101,
"id": "abfbb8e6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ref TATTTTGGACCGCGCGTTCATTTACACGTC\n",
"seq ATTGA\n",
"target 0 TATTTTGGACCGCGCGTTCATTTACACGTC 30\n",
" 0 -------------------|||.|------ 30\n",
"query 0 -------------------ATTGA------ 5\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e562dd0e8c9c4e14b36e639eab72b24c",
"version_major": 2,
"version_minor": 1
},
"text/plain": [
"HiGlassWidget()"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from typing import Literal, ClassVar\n",
"import higlass as hg\n",
"\n",
"class PileupTrack(hg.PluginTrack):\n",
" type: Literal[\"pileup\"] = \"pileup\"\n",
" # plugin_url: ClassVar[str] = \"https://unpkg.com/higlass-pileup/dist/higlass-pileup.min.js\"\n",
" plugin_url: ClassVar[str] = \"http://localhost:8080/higlass-pileup.min.js\"\n",
"\n",
"\n",
"# Specify the track-specific data\n",
"pileup_data = {\n",
" \"type\": \"bam\",\n",
" \"url\": \"https://pkerp.s3.amazonaws.com/public/bamfile_test/SRR1770413.sorted.bam\",\n",
" \"chromSizesUrl\": \"https://pkerp.s3.amazonaws.com/public/bamfile_test/GCF_000005845.2_ASM584v2_genomic.chrom.sizes\",\n",
" \"options\": {\"maxTileWidth\": 30000},\n",
"}\n",
"\n",
"# Create and use the custom track\n",
"pileup_track = PileupTrack(data=get_pileup_alignment_data(\n",
" 'TATTTTGGACCGCGCGTTCATTTACACGTC',\n",
" ['ATTGA'])\n",
" , height=180).opts(\n",
" axisPositionHorizontal=\"right\",\n",
" axisLabelFormatting=\"normal\",\n",
" showCoverage=True,\n",
" colorScale=[\n",
" \"#2c7bb6\",\"#92c5de\",\"#ffffbf\",\"#fdae61\",\"#808080\", \"#DCDCDC\",\n",
" ]\n",
")\n",
"\n",
"view = hg.view((pileup_track, \"top\"), (hg.track(\"top-axis\"), 'top')).domain(x = [0, 100])\n",
"view"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "4e191c1c-c9ea-4497-858c-db285280f8dc",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 12,
"id": "faea5eb5-358c-4565-a5e1-514bb232afbe",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 15,
"id": "1f934768-3271-49bf-a504-4a2acb9a27ef",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"target 0 GGA---GGGGAGG 10\n",
" 0 |||---||..||| 13\n",
"query 0 GGAAAGGGAAAGG 13\n",
"\n",
"[{'pos': 3, 'type': 'I', 'length': 3}, {'pos': 5, 'type': 'X', 'length': 1, 'base': 'G', 'variant': 'A'}, {'pos': 6, 'type': 'X', 'length': 1, 'base': 'G', 'variant': 'A'}]\n",
"target 0 GGAAAGGGAAAGG 13\n",
" 0 |||---||..||| 13\n",
"query 0 GGA---GGGGAGG 10\n",
"\n",
"[{'pos': 3, 'type': 'D', 'length': 3}, {'pos': 8, 'type': 'X', 'length': 1, 'base': 'A', 'variant': 'G'}, {'pos': 9, 'type': 'X', 'length': 1, 'base': 'A', 'variant': 'G'}]\n",
"target 0 GGAAAGAGGAAAGG 14\n",
" 0 ||--||.||--||| 14\n",
"query 0 GG--AGTGG--AGG 10\n",
"\n",
"[{'pos': 2, 'type': 'D', 'length': 2}, {'pos': 6, 'type': 'X', 'length': 1, 'base': 'A', 'variant': 'T'}, {'pos': 9, 'type': 'D', 'length': 2}]\n",
"target 0 GGAAAGTTAGGAAAGG 16\n",
" 0 -----|||||||---- 16\n",
"query 0 -----GTTAGGA---- 7\n",
"\n",
"[{'pos': 0, 'type': 'D', 'length': 5}]\n"
]
}
],
"source": [
"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "688831b6-6111-47af-8ce2-a285a1ff5508",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'pos': 0, 'type': 'D', 'length': 2},\n",
" {'pos': 4, 'type': 'X', 'length': 1, 'base': 'A', 'variant': 'T'},\n",
" {'pos': 2, 'type': 'D', 'length': 2}]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"parts"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "e5791bd2-5017-45e0-a2a6-a2f3b93af71c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'GGAAAGAGGAAAGG'"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.target"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bfab5061-9695-4a32-a930-9df29e23bf7d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 33,
"id": "e620f0c5-e3a1-43d5-ba8d-78e2acb18b6c",
"metadata": {},
"outputs": [],
"source": [
"from higlass.tilesets import ClodiusTileset\n",
"\n",
"def tileset_info(a):\n",
" target = a.target\n",
" tile_size = 1024\n",
" max_width = 2 ** math.ceil(math.log(33) / math.log(2))\n",
" # max_zoom = math.ceil(max_width // tile_size)\n",
"\n",
" # We'll fit everything into one tile\n",
" max_zoom = 0\n",
"\n",
" chromsizes_list = [['a', len(target)]]\n",
" \n",
" return {\n",
" \"min_pos\": [0],\n",
" \"max_pos\": [len(target)],\n",
" \"max_width\": max_width,\n",
" \"tile_size\": tile_size,\n",
" \"chromsizes\": chromsizes_list,\n",
" \"max_zoom\": max_zoom,\n",
" \"max_tile_width\": 100000,\n",
" \"format\": \"subs\"\n",
" }\n",
"\n",
"\n",
"def tiles(a, tile_ids):\n",
" return get_subs(a)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "52906247-4610-49c9-bce9-543d27fdabe7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'min_pos': [0],\n",
" 'max_pos': [14],\n",
" 'max_width': 64,\n",
" 'tile_size': 1024,\n",
" 'chromsizes': [['a', 14]],\n",
" 'max_zoom': 0,\n",
" 'max_tile_width': 100000,\n",
" 'format': 'subs'}"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tileset_info(a)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "2ee9e898-ff4d-40e2-b98c-f8a3182bea1f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'pos': 2, 'type': 'D', 'length': 2},\n",
" {'pos': 6, 'type': 'X', 'length': 1, 'base': 'A', 'variant': 'T'},\n",
" {'pos': 9, 'type': 'D', 'length': 2}]"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tiles(a, [])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad9c7234-5777-4a3b-8bbd-c6ec3da32b49",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment