Last active
March 25, 2025 05:05
-
-
Save pkerpedjiev/b7fb4a97df7115fc464ed06916ee349b to your computer and use it in GitHub Desktop.
juv-bam-files.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "e300e3e5", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# /// script\n", | |
| "# requires-python = \">=3.10\"\n", | |
| "# dependencies = [\n", | |
| "# \"biopython\",\n", | |
| "# \"clodius\",\n", | |
| "# \"cooler\",\n", | |
| "# \"higlass-python==1.3.0\",\n", | |
| "# \"smart_open\",\n", | |
| "# ]\n", | |
| "#\n", | |
| "# [tool.uv.sources]\n", | |
| "# clodius = { path = \"../../resgen/rhodius\", editable = true }\n", | |
| "# ///" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "c7e3cb5a-5c16-4ed2-b936-1ea687dbe604", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "The autoreload extension is already loaded. To reload it, use:\n", | |
| " %reload_ext autoreload\n", | |
| "env: ANYWIDGET_HMR=1\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%load_ext autoreload\n", | |
| "%autoreload 2\n", | |
| "%env ANYWIDGET_HMR=1" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 98, | |
| "id": "4926fadb-8d03-4d52-b793-f24f01f6ce6c", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import Bio\n", | |
| "from Bio import Align\n", | |
| "from Bio.Seq import Seq\n", | |
| "\n", | |
| "def align_sequences(seq1, seq2):\n", | |
| " aligner = Align.PairwiseAligner()\n", | |
| "\n", | |
| " aligner.match_score = 1\n", | |
| " aligner.mismatch_score = -4\n", | |
| " aligner.open_gap_score = -6\n", | |
| " aligner.extend_gap_score = -1\n", | |
| "\n", | |
| " alignments = aligner.align(seq1, seq2)\n", | |
| "\n", | |
| " best_alignment = alignments[0]\n", | |
| "\n", | |
| " return best_alignment\n", | |
| " \n", | |
| "def get_subs(a):\n", | |
| " parts = []\n", | |
| " ttrue = 0\n", | |
| " tpos = 0\n", | |
| " qpos = 0\n", | |
| "\n", | |
| " start = 0\n", | |
| " end = 0\n", | |
| "\n", | |
| " aligneds = list(zip(a.aligned[0], a.aligned[1]))\n", | |
| "\n", | |
| " for i, ((ts, te), (qs, qe)) in enumerate(aligneds):\n", | |
| " ts,te,qs,qe = int(ts), int(te), int(qs), int(qe)\n", | |
| " \n", | |
| " if i == 0:\n", | |
| " # start position\n", | |
| " start = ts\n", | |
| " tpos = ts\n", | |
| " ttrue = 0\n", | |
| " if i == len(aligneds) - 1:\n", | |
| " # end position\n", | |
| " end = te\n", | |
| " \n", | |
| " if ts > tpos:\n", | |
| " parts += [{'pos': ttrue, 'type': 'D', 'length': ts - tpos}]\n", | |
| " ttrue += ts - tpos\n", | |
| " if qs > qpos:\n", | |
| " parts += [{'pos': ttrue, 'type': 'I', 'length': qs - qpos}]\n", | |
| " for i in range(te - ts):\n", | |
| " if a.target[ts + i] != a.query[qs + i]:\n", | |
| " parts += [{'pos': ttrue + i, 'type': 'X', 'length': 1, 'base': a.target[ts + i], 'variant': a.query[qs + i]}]\n", | |
| "\n", | |
| " ttrue += (te - ts)\n", | |
| " tpos = te\n", | |
| " qpos = qe\n", | |
| " return start+1, end+1, parts\n", | |
| "\n", | |
| "a = align_sequences(\"TTTTT\", \"TTATT\")\n", | |
| "s = get_subs(a)\n", | |
| "\n", | |
| "# assert 1-based start positions and closed intervals\n", | |
| "assert s[0] == 1\n", | |
| "assert s[1] == 6\n", | |
| "assert s[2][0]['pos'] == 2 # subs are 0-based\n", | |
| "assert s[2][0]['base'] == 'T'\n", | |
| "assert s[2][0]['variant'] == 'A'\n", | |
| "\n", | |
| "a = align_sequences(\"TTTTT\", \"TTATTT\")\n", | |
| "s = get_subs(a)\n", | |
| "\n", | |
| "assert s[0] == 1\n", | |
| "assert s[1] == 6\n", | |
| "assert s[2][0]['pos'] == 2\n", | |
| "assert s[2][0]['type'] == 'I'\n", | |
| "assert s[2][0]['length'] == 1\n", | |
| "\n", | |
| "# a = align_sequences(\"TATTTTGGACCGCGCGTTCATTTACACGTC\", \"ATTGA\")\n", | |
| "# print(a)\n", | |
| "# s = get_subs(a)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 100, | |
| "id": "e71c7ef3-afd2-484b-9082-162d47d4b09b", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "ref TATTTTGGACCGCGCGTTCATTTACACGTC\n", | |
| "seq ATTGA\n", | |
| "target 0 TATTTTGGACCGCGCGTTCATTTACACGTC 30\n", | |
| " 0 -------------------|||.|------ 30\n", | |
| "query 0 -------------------ATTGA------ 5\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'type': 'local-tiles',\n", | |
| " 'tilesetInfo': {'min_pos': [0],\n", | |
| " 'max_pos': [30],\n", | |
| " 'max_width': 30,\n", | |
| " 'tile_size': 30,\n", | |
| " 'chromsizes': [['a', 30]],\n", | |
| " 'max_zoom': 0,\n", | |
| " 'max_tile_width': 100000,\n", | |
| " 'format': 'subs'},\n", | |
| " 'tiles': {'0.0': [{'id': 'r0',\n", | |
| " 'from': 20,\n", | |
| " 'to': 25,\n", | |
| " 'substitutions': [{'pos': 3,\n", | |
| " 'type': 'X',\n", | |
| " 'length': 1,\n", | |
| " 'base': 'T',\n", | |
| " 'variant': 'G'}],\n", | |
| " 'color': 0}]}}" | |
| ] | |
| }, | |
| "execution_count": 100, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "\n", | |
| "\n", | |
| "def get_pileup_alignment_data(ref: str, seqs: list[str]) -> dict:\n", | |
| " \"\"\"Get a local tile for use in a higlass-pileup plot.\n", | |
| "\n", | |
| " :param ref: The reference to align to\n", | |
| " :param seqs: The sequences to align to the reference\n", | |
| " \"\"\"\n", | |
| " local_data = {\n", | |
| " \"type\": 'local-tiles',\n", | |
| " \"tilesetInfo\": {\n", | |
| " 'min_pos': [0],\n", | |
| " 'max_pos': [len(ref)],\n", | |
| " 'max_width': len(ref),\n", | |
| " 'tile_size': len(ref),\n", | |
| " 'chromsizes': [['a', len(ref)]],\n", | |
| " 'max_zoom': 0,\n", | |
| " 'max_tile_width': 100000,\n", | |
| " 'format': 'subs'\n", | |
| " },\n", | |
| " \"tiles\": {\n", | |
| " '0.0': [],\n", | |
| " }\n", | |
| " }\n", | |
| " \n", | |
| " for i,seq in enumerate(seqs):\n", | |
| " print(\"ref\", ref)\n", | |
| " print(\"seq\", seq)\n", | |
| " a = align_sequences(ref, seq)\n", | |
| " print(a)\n", | |
| " start, end, subs = get_subs(a)\n", | |
| "\n", | |
| " local_data['tiles']['0.0'].append({\n", | |
| " \"id\": f\"r{i}\",\n", | |
| " \"from\": start,\n", | |
| " \"to\": end,\n", | |
| " \"substitutions\": subs,\n", | |
| " \"color\": 0\n", | |
| " })\n", | |
| "\n", | |
| " # print(local_data)\n", | |
| " return local_data\n", | |
| "\n", | |
| "get_pileup_alignment_data(\n", | |
| " \"TATTTTGGACCGCGCGTTCATTTACACGTC\", [\"ATTGA\"])\n", | |
| " " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 101, | |
| "id": "abfbb8e6", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "ref TATTTTGGACCGCGCGTTCATTTACACGTC\n", | |
| "seq ATTGA\n", | |
| "target 0 TATTTTGGACCGCGCGTTCATTTACACGTC 30\n", | |
| " 0 -------------------|||.|------ 30\n", | |
| "query 0 -------------------ATTGA------ 5\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "e562dd0e8c9c4e14b36e639eab72b24c", | |
| "version_major": 2, | |
| "version_minor": 1 | |
| }, | |
| "text/plain": [ | |
| "HiGlassWidget()" | |
| ] | |
| }, | |
| "execution_count": 101, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "from typing import Literal, ClassVar\n", | |
| "import higlass as hg\n", | |
| "\n", | |
| "class PileupTrack(hg.PluginTrack):\n", | |
| " type: Literal[\"pileup\"] = \"pileup\"\n", | |
| " # plugin_url: ClassVar[str] = \"https://unpkg.com/higlass-pileup/dist/higlass-pileup.min.js\"\n", | |
| " plugin_url: ClassVar[str] = \"http://localhost:8080/higlass-pileup.min.js\"\n", | |
| "\n", | |
| "\n", | |
| "# Specify the track-specific data\n", | |
| "pileup_data = {\n", | |
| " \"type\": \"bam\",\n", | |
| " \"url\": \"https://pkerp.s3.amazonaws.com/public/bamfile_test/SRR1770413.sorted.bam\",\n", | |
| " \"chromSizesUrl\": \"https://pkerp.s3.amazonaws.com/public/bamfile_test/GCF_000005845.2_ASM584v2_genomic.chrom.sizes\",\n", | |
| " \"options\": {\"maxTileWidth\": 30000},\n", | |
| "}\n", | |
| "\n", | |
| "# Create and use the custom track\n", | |
| "pileup_track = PileupTrack(data=get_pileup_alignment_data(\n", | |
| " 'TATTTTGGACCGCGCGTTCATTTACACGTC',\n", | |
| " ['ATTGA'])\n", | |
| " , height=180).opts(\n", | |
| " axisPositionHorizontal=\"right\",\n", | |
| " axisLabelFormatting=\"normal\",\n", | |
| " showCoverage=True,\n", | |
| " colorScale=[\n", | |
| " \"#2c7bb6\",\"#92c5de\",\"#ffffbf\",\"#fdae61\",\"#808080\", \"#DCDCDC\",\n", | |
| " ]\n", | |
| ")\n", | |
| "\n", | |
| "view = hg.view((pileup_track, \"top\"), (hg.track(\"top-axis\"), 'top')).domain(x = [0, 100])\n", | |
| "view" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "4e191c1c-c9ea-4497-858c-db285280f8dc", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "id": "faea5eb5-358c-4565-a5e1-514bb232afbe", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "id": "1f934768-3271-49bf-a504-4a2acb9a27ef", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "target 0 GGA---GGGGAGG 10\n", | |
| " 0 |||---||..||| 13\n", | |
| "query 0 GGAAAGGGAAAGG 13\n", | |
| "\n", | |
| "[{'pos': 3, 'type': 'I', 'length': 3}, {'pos': 5, 'type': 'X', 'length': 1, 'base': 'G', 'variant': 'A'}, {'pos': 6, 'type': 'X', 'length': 1, 'base': 'G', 'variant': 'A'}]\n", | |
| "target 0 GGAAAGGGAAAGG 13\n", | |
| " 0 |||---||..||| 13\n", | |
| "query 0 GGA---GGGGAGG 10\n", | |
| "\n", | |
| "[{'pos': 3, 'type': 'D', 'length': 3}, {'pos': 8, 'type': 'X', 'length': 1, 'base': 'A', 'variant': 'G'}, {'pos': 9, 'type': 'X', 'length': 1, 'base': 'A', 'variant': 'G'}]\n", | |
| "target 0 GGAAAGAGGAAAGG 14\n", | |
| " 0 ||--||.||--||| 14\n", | |
| "query 0 GG--AGTGG--AGG 10\n", | |
| "\n", | |
| "[{'pos': 2, 'type': 'D', 'length': 2}, {'pos': 6, 'type': 'X', 'length': 1, 'base': 'A', 'variant': 'T'}, {'pos': 9, 'type': 'D', 'length': 2}]\n", | |
| "target 0 GGAAAGTTAGGAAAGG 16\n", | |
| " 0 -----|||||||---- 16\n", | |
| "query 0 -----GTTAGGA---- 7\n", | |
| "\n", | |
| "[{'pos': 0, 'type': 'D', 'length': 5}]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "\n", | |
| " " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "id": "688831b6-6111-47af-8ce2-a285a1ff5508", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[{'pos': 0, 'type': 'D', 'length': 2},\n", | |
| " {'pos': 4, 'type': 'X', 'length': 1, 'base': 'A', 'variant': 'T'},\n", | |
| " {'pos': 2, 'type': 'D', 'length': 2}]" | |
| ] | |
| }, | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "parts" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "id": "e5791bd2-5017-45e0-a2a6-a2f3b93af71c", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'GGAAAGAGGAAAGG'" | |
| ] | |
| }, | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "a.target" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "bfab5061-9695-4a32-a930-9df29e23bf7d", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 33, | |
| "id": "e620f0c5-e3a1-43d5-ba8d-78e2acb18b6c", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from higlass.tilesets import ClodiusTileset\n", | |
| "\n", | |
| "def tileset_info(a):\n", | |
| " target = a.target\n", | |
| " tile_size = 1024\n", | |
| " max_width = 2 ** math.ceil(math.log(33) / math.log(2))\n", | |
| " # max_zoom = math.ceil(max_width // tile_size)\n", | |
| "\n", | |
| " # We'll fit everything into one tile\n", | |
| " max_zoom = 0\n", | |
| "\n", | |
| " chromsizes_list = [['a', len(target)]]\n", | |
| " \n", | |
| " return {\n", | |
| " \"min_pos\": [0],\n", | |
| " \"max_pos\": [len(target)],\n", | |
| " \"max_width\": max_width,\n", | |
| " \"tile_size\": tile_size,\n", | |
| " \"chromsizes\": chromsizes_list,\n", | |
| " \"max_zoom\": max_zoom,\n", | |
| " \"max_tile_width\": 100000,\n", | |
| " \"format\": \"subs\"\n", | |
| " }\n", | |
| "\n", | |
| "\n", | |
| "def tiles(a, tile_ids):\n", | |
| " return get_subs(a)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 34, | |
| "id": "52906247-4610-49c9-bce9-543d27fdabe7", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'min_pos': [0],\n", | |
| " 'max_pos': [14],\n", | |
| " 'max_width': 64,\n", | |
| " 'tile_size': 1024,\n", | |
| " 'chromsizes': [['a', 14]],\n", | |
| " 'max_zoom': 0,\n", | |
| " 'max_tile_width': 100000,\n", | |
| " 'format': 'subs'}" | |
| ] | |
| }, | |
| "execution_count": 34, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "tileset_info(a)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 35, | |
| "id": "2ee9e898-ff4d-40e2-b98c-f8a3182bea1f", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[{'pos': 2, 'type': 'D', 'length': 2},\n", | |
| " {'pos': 6, 'type': 'X', 'length': 1, 'base': 'A', 'variant': 'T'},\n", | |
| " {'pos': 9, 'type': 'D', 'length': 2}]" | |
| ] | |
| }, | |
| "execution_count": 35, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "tiles(a, [])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "ad9c7234-5777-4a3b-8bbd-c6ec3da32b49", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.10.15" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment