Skip to content

Instantly share code, notes, and snippets.

@alexllc
Last active October 29, 2024 17:37
Show Gist options
  • Save alexllc/a5df4481cdc24907e5399cb816b439eb to your computer and use it in GitHub Desktop.
Save alexllc/a5df4481cdc24907e5399cb816b439eb to your computer and use it in GitHub Desktop.
Simple automatic Visium QC and preprocessing
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.signal import find_peaks
import matplotlib as mpl
import matplotlib.pyplot as plt
import scanpy as sc
from anndata import AnnData
# Function cell
def auto_visium_preprocess(adata, leiden_iter = 2):
adata.var_names_make_unique()
adata.var["mt"] = adata.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)
fig, axs = plt.subplots(1, 4, figsize=(15, 4))
sns.histplot(adata.obs["total_counts"], kde=False, ax=axs[0])
sns.histplot(
adata.obs["total_counts"][adata.obs["total_counts"] < 10000],
kde=False,
bins=40,
ax=axs[1],
)
sns.histplot(adata.obs["n_genes_by_counts"], kde=False, bins=60, ax=axs[2])
sns.histplot(
adata.obs["n_genes_by_counts"][adata.obs["n_genes_by_counts"] < 5000],
kde=False,
bins=60,
ax=axs[3],
)
# Automatic detection of peaks for filtering
slide_total_counts = adata.obs["total_counts"][adata.obs["total_counts"] < 10000]
# Convert to sequence
hist, bin_edges = np.histogram(slide_total_counts, bins=5000)
peaks, _ = find_peaks(hist, distance=500)
bin_edges = bin_edges[1:]
fig, axs = plt.subplots(1, 1, figsize=(5, 5))
plt.plot(bin_edges, hist)
plt.plot(bin_edges[peaks], hist[peaks], "x")
plt.title("Histogram of low total count noise peaks")
plt.show()
value_of_2nd_peak = bin_edges[peaks[1]]
print("Before filtering, adata shape is: ", adata.shape)
adata.obsm["spatial"] = adata.obsm["spatial"].astype(int)
# Before filtering
# filter by in tissue
adata = adata[adata.obs["in_tissue"] == '1']
plt.rcParams["figure.figsize"] = (8, 8)
sc.pl.spatial(adata, img_key="hires", color=["total_counts", "n_genes_by_counts"])
adata_filtered = adata[adata.obs["total_counts"] > value_of_2nd_peak, :]
adata_filtered = adata_filtered[adata_filtered.obs["pct_counts_mt"] < 10]
print("After filtering, adata shape is: ", adata_filtered.shape)
sc.pl.spatial(adata_filtered, img_key="hires", color=["total_counts", "n_genes_by_counts"])
# Basic preprocessing
sc.pp.normalize_total(adata_filtered, inplace=True)
sc.pp.log1p(adata_filtered)
sc.pp.pca(adata_filtered)
sc.pp.neighbors(adata_filtered)
sc.tl.umap(adata_filtered)
sc.tl.leiden(adata_filtered, key_added="clusters", directed=False, n_iterations=leiden_iter)
return adata_filtered
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment