Last active
October 29, 2024 17:37
-
-
Save alexllc/a5df4481cdc24907e5399cb816b439eb to your computer and use it in GitHub Desktop.
Simple automatic Visium QC and preprocessing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import seaborn as sns | |
from scipy.signal import find_peaks | |
import matplotlib as mpl | |
import matplotlib.pyplot as plt | |
import scanpy as sc | |
from anndata import AnnData | |
# Function cell | |
def auto_visium_preprocess(adata, leiden_iter = 2): | |
adata.var_names_make_unique() | |
adata.var["mt"] = adata.var_names.str.startswith("MT-") | |
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True) | |
fig, axs = plt.subplots(1, 4, figsize=(15, 4)) | |
sns.histplot(adata.obs["total_counts"], kde=False, ax=axs[0]) | |
sns.histplot( | |
adata.obs["total_counts"][adata.obs["total_counts"] < 10000], | |
kde=False, | |
bins=40, | |
ax=axs[1], | |
) | |
sns.histplot(adata.obs["n_genes_by_counts"], kde=False, bins=60, ax=axs[2]) | |
sns.histplot( | |
adata.obs["n_genes_by_counts"][adata.obs["n_genes_by_counts"] < 5000], | |
kde=False, | |
bins=60, | |
ax=axs[3], | |
) | |
# Automatic detection of peaks for filtering | |
slide_total_counts = adata.obs["total_counts"][adata.obs["total_counts"] < 10000] | |
# Convert to sequence | |
hist, bin_edges = np.histogram(slide_total_counts, bins=5000) | |
peaks, _ = find_peaks(hist, distance=500) | |
bin_edges = bin_edges[1:] | |
fig, axs = plt.subplots(1, 1, figsize=(5, 5)) | |
plt.plot(bin_edges, hist) | |
plt.plot(bin_edges[peaks], hist[peaks], "x") | |
plt.title("Histogram of low total count noise peaks") | |
plt.show() | |
value_of_2nd_peak = bin_edges[peaks[1]] | |
print("Before filtering, adata shape is: ", adata.shape) | |
adata.obsm["spatial"] = adata.obsm["spatial"].astype(int) | |
# Before filtering | |
# filter by in tissue | |
adata = adata[adata.obs["in_tissue"] == '1'] | |
plt.rcParams["figure.figsize"] = (8, 8) | |
sc.pl.spatial(adata, img_key="hires", color=["total_counts", "n_genes_by_counts"]) | |
adata_filtered = adata[adata.obs["total_counts"] > value_of_2nd_peak, :] | |
adata_filtered = adata_filtered[adata_filtered.obs["pct_counts_mt"] < 10] | |
print("After filtering, adata shape is: ", adata_filtered.shape) | |
sc.pl.spatial(adata_filtered, img_key="hires", color=["total_counts", "n_genes_by_counts"]) | |
# Basic preprocessing | |
sc.pp.normalize_total(adata_filtered, inplace=True) | |
sc.pp.log1p(adata_filtered) | |
sc.pp.pca(adata_filtered) | |
sc.pp.neighbors(adata_filtered) | |
sc.tl.umap(adata_filtered) | |
sc.tl.leiden(adata_filtered, key_added="clusters", directed=False, n_iterations=leiden_iter) | |
return adata_filtered |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment