alexllc · October 29, 2024 17:37
diff --git a/auto_qc_pp.py b/auto_qc_pp.py
 import numpy as np
 import pandas as pd
 import seaborn as sns

 from scipy.signal import find_peaks
 import matplotlib as mpl
 import matplotlib.pyplot as plt

 import scanpy as sc
 from anndata import AnnData

 # Function cell
 def auto_visium_preprocess(adata, leiden_iter = 2):
    adata.var_names_make_unique()
    adata.var["mt"] = adata.var_names.str.startswith("MT-")
    sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)
    
    fig, axs = plt.subplots(1, 4, figsize=(15, 4))
    sns.histplot(adata.obs["total_counts"], kde=False, ax=axs[0])
    sns.histplot(
        adata.obs["total_counts"][adata.obs["total_counts"] < 10000],
        kde=False,
        bins=40,
        ax=axs[1],
    )
    sns.histplot(adata.obs["n_genes_by_counts"], kde=False, bins=60, ax=axs[2])
    sns.histplot(
        adata.obs["n_genes_by_counts"][adata.obs["n_genes_by_counts"] < 5000],
        kde=False,
        bins=60,
        ax=axs[3],
    )
    
    # Automatic detection of peaks for filtering
    slide_total_counts = adata.obs["total_counts"][adata.obs["total_counts"] < 10000]

    # Convert to sequence
    hist, bin_edges = np.histogram(slide_total_counts, bins=5000)
    peaks, _ = find_peaks(hist, distance=500)
    bin_edges = bin_edges[1:]
    
    fig, axs = plt.subplots(1, 1, figsize=(5, 5))
    plt.plot(bin_edges, hist)
    plt.plot(bin_edges[peaks], hist[peaks], "x")
    plt.title("Histogram of low total count noise peaks")
    plt.show()
        
    value_of_2nd_peak = bin_edges[peaks[1]]
    
    print("Before filtering, adata shape is: ", adata.shape)
    adata.obsm["spatial"] = adata.obsm["spatial"].astype(int)
    # Before filtering
    # filter by in tissue
    adata = adata[adata.obs["in_tissue"] == '1']

    plt.rcParams["figure.figsize"] = (8, 8)
    sc.pl.spatial(adata, img_key="hires", color=["total_counts", "n_genes_by_counts"])

    adata_filtered = adata[adata.obs["total_counts"] > value_of_2nd_peak, :]
    adata_filtered = adata_filtered[adata_filtered.obs["pct_counts_mt"] < 10]

    print("After filtering, adata shape is: ", adata_filtered.shape)
    sc.pl.spatial(adata_filtered, img_key="hires", color=["total_counts", "n_genes_by_counts"])
    
    # Basic preprocessing
    sc.pp.normalize_total(adata_filtered, inplace=True)
    sc.pp.log1p(adata_filtered)
    
    sc.pp.pca(adata_filtered)
    sc.pp.neighbors(adata_filtered)
    sc.tl.umap(adata_filtered)
    sc.tl.leiden(adata_filtered, key_added="clusters", directed=False, n_iterations=leiden_iter)
    
    return adata_filtered
	import numpy as np
	import pandas as pd
	import seaborn as sns

	from scipy.signal import find_peaks
	import matplotlib as mpl
	import matplotlib.pyplot as plt

	import scanpy as sc
	from anndata import AnnData

	# Function cell
	def auto_visium_preprocess(adata, leiden_iter = 2):
	adata.var_names_make_unique()
	adata.var["mt"] = adata.var_names.str.startswith("MT-")
	sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)

	fig, axs = plt.subplots(1, 4, figsize=(15, 4))
	sns.histplot(adata.obs["total_counts"], kde=False, ax=axs[0])
	sns.histplot(
	adata.obs["total_counts"][adata.obs["total_counts"] < 10000],
	kde=False,
	bins=40,
	ax=axs[1],
	)
	sns.histplot(adata.obs["n_genes_by_counts"], kde=False, bins=60, ax=axs[2])
	sns.histplot(
	adata.obs["n_genes_by_counts"][adata.obs["n_genes_by_counts"] < 5000],
	kde=False,
	bins=60,
	ax=axs[3],
	)

	# Automatic detection of peaks for filtering
	slide_total_counts = adata.obs["total_counts"][adata.obs["total_counts"] < 10000]

	# Convert to sequence
	hist, bin_edges = np.histogram(slide_total_counts, bins=5000)
	peaks, _ = find_peaks(hist, distance=500)
	bin_edges = bin_edges[1:]

	fig, axs = plt.subplots(1, 1, figsize=(5, 5))
	plt.plot(bin_edges, hist)
	plt.plot(bin_edges[peaks], hist[peaks], "x")
	plt.title("Histogram of low total count noise peaks")
	plt.show()

	value_of_2nd_peak = bin_edges[peaks[1]]

	print("Before filtering, adata shape is: ", adata.shape)
	adata.obsm["spatial"] = adata.obsm["spatial"].astype(int)
	# Before filtering
	# filter by in tissue
	adata = adata[adata.obs["in_tissue"] == '1']

	plt.rcParams["figure.figsize"] = (8, 8)
	sc.pl.spatial(adata, img_key="hires", color=["total_counts", "n_genes_by_counts"])

	adata_filtered = adata[adata.obs["total_counts"] > value_of_2nd_peak, :]
	adata_filtered = adata_filtered[adata_filtered.obs["pct_counts_mt"] < 10]

	print("After filtering, adata shape is: ", adata_filtered.shape)
	sc.pl.spatial(adata_filtered, img_key="hires", color=["total_counts", "n_genes_by_counts"])

	# Basic preprocessing
	sc.pp.normalize_total(adata_filtered, inplace=True)
	sc.pp.log1p(adata_filtered)

	sc.pp.pca(adata_filtered)
	sc.pp.neighbors(adata_filtered)
	sc.tl.umap(adata_filtered)
	sc.tl.leiden(adata_filtered, key_added="clusters", directed=False, n_iterations=leiden_iter)

	return adata_filtered