"""
.. versionadded:: 0.3.3
GLM models with metagenomes and metatranscriptomes. Experimental
"""
from __future__ import division
from builtins import range
import statsmodels.api as sm
from scipy import interpolate, optimize
import pandas as pd
[docs]def lowess_ci_bootstrap(endog, exog, num=100, frac=.2, it=3, alpha=.05,
delta=0., min_value=10**-3, kind='slinear'):
"""
.. versionchanged: 0.4.0
the interpolation of the confidence intervals uses the passed `kind`
Bootstraps a lowess for the dependent (`endog`) and indipendent (`exog`)
arguments.
Arguments:
endog (array): indipendent variable (Y)
exog (array): indipendent variable (X)
num (int): number of iterations for the bootstrap
frac (float): fraction of the array to use when fitting
it (int): number of iterations used to fit the lowess
alpha (float): confidence intervals for the bootstrap
delta (float): passed to :func:`statsmodels.api.nonparametric.lowess`
min_value (float): minimum value for the function to avoid out of
bounds
kind (str): type of interpolation passed to
:func:`scipy.interpolate.interp1d`
Returns:
tuple: the first element is the function describing the lowest
confidence interval, the second element is for the highest confidence
interval and the last one for the mean
.. note::
Performance increase with the value of *delta*.
"""
data = pd.DataFrame(
{
'endog': endog,
'exog': exog
}
).sort_values(by=['exog', 'endog'], ascending=True)
boots = []
for idx in range(num):
sample = data.sample(n=data.index.size, replace=True)
lw = sm.nonparametric.lowess(
sample.endog,
sample.exog,
is_sorted=False,
frac=frac,
it=it,
return_sorted=True,
delta=delta
)
boots.append(pd.DataFrame({
'endog': lw[:, 1],
'exog': lw[:, 0]
}))
alpha = alpha / 2
boots = pd.concat(boots)
q1 = boots.groupby('exog').quantile(
alpha, interpolation='nearest'
).sort_index()
q1 = q1.endog[q1.endog > 0].fillna(min_value)
q1 = interpolate.interp1d(q1.index, q1, kind=kind,
fill_value='extrapolate')
q2 = boots.groupby('exog').quantile(
1 - alpha, interpolation='nearest'
).sort_index()
q2 = q2.endog[q2.endog > 0].fillna(min_value)
q2 = interpolate.interp1d(q2.index, q2, kind=kind,
fill_value='extrapolate')
m = fit_lowess_interpolate(data.endog, data.exog, frac=frac, it=it,
kind=kind)
return q1, q2, m
[docs]def fit_lowess_interpolate(endog, exog, frac=.2, it=3, kind='slinear'):
"""
Fits a lowess for the passed `endog` (Y) and `exog` (X) and returns an
interpolated function that describes it. The first 4 arguments are passed
to :func:`statsmodels.api.sm.nonparametric.lowess`, while the last one is
passed to :func:`scipy.interpolate.interp1d`
Arguments:
endog (array): array of the dependent variable (Y)
exog (array): array of the indipendent variable (X)
frac (float): fraction of the number of elements to use when fitting
(0.0-1.0)
it (int): number of iterations to fit the lowess
kind (str): type of interpolation to use
Returns:
func: interpolated function representing the lowess fitted from the
data passed
"""
data = pd.DataFrame(
{
'endog': endog,
'exog': exog
}
).sort_values(by=['exog', 'endog'], ascending=True)
lw = sm.nonparametric.lowess(
data.endog,
data.exog,
frac=frac,
it=it,
)
return interpolate.interp1d(
lw[:, 0],
lw[:, 1],
kind=kind,
fill_value='extrapolate'
)
[docs]def variance_to_alpha(mu, func, min_alpha=10**-3):
"""
Based on the variance defined in the Negative Binomial in statsmodels
var = mu + alpha * (mu ** 2)
Arguments:
mu (float): mean to calculate the alphas for
func (func): function that returns the variace of the mean
min_alpha (float): value of alpha if the `func` goes out of bounds
Returns:
float: value of alpha for the passed mean
"""
alpha = (func(mu) - mu) / (mu ** 2)
return min_alpha if alpha <= 0 else alpha
[docs]def optimise_alpha_scipy_function(args, formula, data, criterion='aic'):
"""
.. versionadded:: 0.4.0
"""
alpha, = args
model = sm.formula.glm(
formula,
data=data,
family=sm.families.NegativeBinomial(alpha=alpha)
).fit()
return getattr(model, criterion)
[docs]def optimise_alpha_scipy(formula, data, mean_func, q1_func, q2_func):
"""
.. versionadded:: 0.4.0
Used to find an optimal *alpha* parameter for the Negative Binomial
distribution used in `statsmodels`, using the lowess functions from
:func:`lowess_ci_bootstrap`.
Arguments:
formula (str): the formula used for the regression
data (DataFrame): DataFrame for regression
mean_func (func): function for the mean :func:`lowess_ci_bootstrap`
q1_func (func): function for the q1 :func:`lowess_ci_bootstrap`
q2_func (func): function for the q2 :func:`lowess_ci_bootstrap`
Returns:
float: *alpha* value for the Negative Binomial
"""
data_mean = data.mean()[0]
return optimize.minimize(
optimise_alpha_scipy_function,
[variance_to_alpha(data_mean, mean_func)],
args=(formula, data),
bounds=[(variance_to_alpha(data_mean, q1_func), variance_to_alpha(data_mean, q2_func))]
).x[0]