parashardhapola · April 19, 2018 09:04
diff --git a/scRNA_Seq_fix_var.py b/scRNA_Seq_fix_var.py
 import numpy as np
 import pandas as pd
 from statsmodels.nonparametric.smoothers_lowess import lowess

 def fix_var(gene_stats, n_bins = 200, lowess_frac = 0.4):

    # gene_stats is a dataframe with gene names as index (rownames) and two
    # columns:
    # 'm': mean value of each gene
    # 'v': variance of each gene

    # find out bin positions
    bin_edges = np.histogram(gene_stats.m, bins=n_bins)[1]
    bin_edges[-1] += 0.1  # For inclusion of last gene
    bin_genes = []  # contains names of genes in each bin
    for i in range(n_bins):
        # idx is a boolean vector where each element corresponds to a gene
        # value is set to true is gene is present in current bin (value of
        # i in this loop)
        idx = (gene_stats.m >= bin_edges[i]) & \
              (gene_stats.m < bin_edges[i + 1])
        if idx.sum() > 0: # making sure that the bin contains at least one gene
            bin_genes.append(list(idx[idx].index))
    bin_vals = []  # holds the variance and mean value of such a gene
                   # from each bin that has lowest variance in that bin
    for genes in bin_genes:
        # subset the full dataframe to extract genes of current bin
        temp_stat = gene_stats.reindex(genes)
        # identify gene with lowest varaince in current bin
        temp_gene = temp_stat.idxmin().v
        bin_vals.append(
            [temp_stat.v[temp_gene], temp_stat.m[temp_gene]])
    # transpose to have a shape : (2, nbins) such that first row contains
    # variances and second contain the means
    bin_vals = np.array(bin_vals).T
    # Fit lowess curve and get predicted y values
    bin_cor_fac = lowess(bin_vals[0], bin_vals[1], return_sorted=False, frac=lowess_frac, it=100).T
    fixed_var = {}
    for bcf, genes in zip(bin_cor_fac, bin_genes):
        for gene in genes:
            # correct predicted y values from all
            # genes of the corresponding bin
            fixed_var[gene] = gene_stats.v[gene] - bcf
    fixed_var = pd.Series(fixed_var)
    return fixed_var
	import numpy as np
	import pandas as pd
	from statsmodels.nonparametric.smoothers_lowess import lowess

	def fix_var(gene_stats, n_bins = 200, lowess_frac = 0.4):

	# gene_stats is a dataframe with gene names as index (rownames) and two
	# columns:
	# 'm': mean value of each gene
	# 'v': variance of each gene

	# find out bin positions
	bin_edges = np.histogram(gene_stats.m, bins=n_bins)[1]
	bin_edges[-1] += 0.1 # For inclusion of last gene
	bin_genes = [] # contains names of genes in each bin
	for i in range(n_bins):
	# idx is a boolean vector where each element corresponds to a gene
	# value is set to true is gene is present in current bin (value of
	# i in this loop)
	idx = (gene_stats.m >= bin_edges[i]) & \
	(gene_stats.m < bin_edges[i + 1])
	if idx.sum() > 0: # making sure that the bin contains at least one gene
	bin_genes.append(list(idx[idx].index))
	bin_vals = [] # holds the variance and mean value of such a gene
	# from each bin that has lowest variance in that bin
	for genes in bin_genes:
	# subset the full dataframe to extract genes of current bin
	temp_stat = gene_stats.reindex(genes)
	# identify gene with lowest varaince in current bin
	temp_gene = temp_stat.idxmin().v
	bin_vals.append(
	[temp_stat.v[temp_gene], temp_stat.m[temp_gene]])
	# transpose to have a shape : (2, nbins) such that first row contains
	# variances and second contain the means
	bin_vals = np.array(bin_vals).T
	# Fit lowess curve and get predicted y values
	bin_cor_fac = lowess(bin_vals[0], bin_vals[1], return_sorted=False, frac=lowess_frac, it=100).T
	fixed_var = {}
	for bcf, genes in zip(bin_cor_fac, bin_genes):
	for gene in genes:
	# correct predicted y values from all
	# genes of the corresponding bin
	fixed_var[gene] = gene_stats.v[gene] - bcf
	fixed_var = pd.Series(fixed_var)
	return fixed_var