# Source code for mgkit.counts.scaling
"""
Scaling functions for counts
"""
from __future__ import division
from scipy import stats
import numpy
import pandas
def scale_factor_deseq(dataframe):
    """
    .. versionadded:: 0.1.13

    Compute the per-sample size factors described in the DESeq paper. Each
    column of *dataframe* is treated as a sample and each row as a gene.

    The size factor :math:`\\hat{s}_{j}` for sample *j* is the median, over
    the genes *i*, of the ratio between the count of the gene in the sample
    and the geometric mean of the gene over all *m* samples:

    .. math::

        \\hat{s}_{j} = median_{i} (
            \\frac
                {k_{ij}}
                {
                    \\left (
                        \\prod_{v=1}^{m}
                        k_{iv}
                    \\right )^{1/m}
                }
        )

    Genes whose geometric mean is not greater than 0 are left out of the
    median.

    Arguments:
        dataframe: pandas DataFrame of counts, one column per sample

    Returns:
        pandas.Series: the scale factor of each sample, keyed by column name
    """
    # per-gene (row) geometric mean across every sample
    row_gmeans = dataframe.apply(stats.gmean, axis=1)
    # a geometric mean of 0 cannot be used as a denominator, so drop those
    positive_gmeans = row_gmeans[row_gmeans > 0]
    kept_genes = positive_gmeans.index

    # median ratio between a sample's counts and the gene geometric means
    return pandas.Series({
        sample: numpy.median(counts.loc[kept_genes] / positive_gmeans)
        for sample, counts in dataframe.items()
    })
def scale_deseq(dataframe):
    """
    .. versionadded:: 0.1.13

    Divide every sample (column) of *dataframe* by its DESeq size factor, as
    computed by :func:`scale_factor_deseq`.

    Arguments:
        dataframe: pandas DataFrame of counts, one column per sample

    Returns:
        pandas.DataFrame: the scaled counts
    """
    # the factors are indexed by sample name, so align them on the columns
    return dataframe.div(scale_factor_deseq(dataframe), axis='columns')
def scale_rpkm(dataframe, gene_len):
    """
    .. versionadded:: 0.1.14

    Apply an RPKM scaling to the pandas DataFrame/Series passed, taking the
    size of each of its elements from the *gene_len* series.

    .. math::

        RPKM =\\frac {10^{9} \\cdot C} {N \\cdot L}

    In this implementation *C* is a count in *dataframe*, *L* the matching
    value of *gene_len* and *N* the sum of all the counts in *dataframe*
    (over every column when a DataFrame is passed).

    Arguments:
        dataframe: pandas DataFrame or Series of counts, indexed by gene
        gene_len: series of gene sizes, must contain all the index labels
            of *dataframe*

    Returns:
        the scaled counts, with the same shape as *dataframe*
    """
    # restrict (and order) the lengths to the genes that are being scaled
    lengths = gene_len[dataframe.index]
    # the double sum collapses both a Series and a DataFrame to one number
    grand_total = dataframe.sum().sum()
    denominator = lengths * grand_total
    # divide row-wise, each gene by its own length times the total count
    per_base_fraction = dataframe.div(denominator, axis='index')
    return (10 ** 9) * per_base_fraction