Created
April 19, 2018 09:04
-
-
Save parashardhapola/e55665162867edb1bab7736d38334bd0 to your computer and use it in GitHub Desktop.
Remove mean variance trend in single cell RNA-Seq data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from statsmodels.nonparametric.smoothers_lowess import lowess | |
def fix_var(gene_stats, n_bins = 200, lowess_frac = 0.4): | |
# gene_stats is a dataframe with gene names as index (rownames) and two | |
# columns: | |
# 'm': mean value of each gene | |
# 'v': variance of each gene | |
# find out bin positions | |
bin_edges = np.histogram(gene_stats.m, bins=n_bins)[1] | |
bin_edges[-1] += 0.1 # For inclusion of last gene | |
bin_genes = [] # contains names of genes in each bin | |
for i in range(n_bins): | |
# idx is a boolean vector where each element corresponds to a gene | |
# value is set to true is gene is present in current bin (value of | |
# i in this loop) | |
idx = (gene_stats.m >= bin_edges[i]) & \ | |
(gene_stats.m < bin_edges[i + 1]) | |
if idx.sum() > 0: # making sure that the bin contains at least one gene | |
bin_genes.append(list(idx[idx].index)) | |
bin_vals = [] # holds the variance and mean value of such a gene | |
# from each bin that has lowest variance in that bin | |
for genes in bin_genes: | |
# subset the full dataframe to extract genes of current bin | |
temp_stat = gene_stats.reindex(genes) | |
# identify gene with lowest varaince in current bin | |
temp_gene = temp_stat.idxmin().v | |
bin_vals.append( | |
[temp_stat.v[temp_gene], temp_stat.m[temp_gene]]) | |
# transpose to have a shape : (2, nbins) such that first row contains | |
# variances and second contain the means | |
bin_vals = np.array(bin_vals).T | |
# Fit lowess curve and get predicted y values | |
bin_cor_fac = lowess(bin_vals[0], bin_vals[1], return_sorted=False, frac=lowess_frac, it=100).T | |
fixed_var = {} | |
for bcf, genes in zip(bin_cor_fac, bin_genes): | |
for gene in genes: | |
# correct predicted y values from all | |
# genes of the corresponding bin | |
fixed_var[gene] = gene_stats.v[gene] - bcf | |
fixed_var = pd.Series(fixed_var) | |
return fixed_var |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment