"""
.. versionadded:: 0.1.13
Misc functions for count data
"""
import sys
import logging
import itertools
import functools
from collections import Counter
from future.utils import viewitems
from mgkit.filter import taxon as tx_filters
from mgkit.io import open_file
import mgkit.simple_cache
import pandas
LOG = logging.getLogger(__name__)
SKIP = set(
[
'no_feature',
'ambiguous',
'too_low_aQual',
'not_aligned',
'alignment_not_unique'
]
)
[docs]def load_htseq_counts(file_handle, conv_func=int):
"""
.. versionchanged:: 0.1.15
added *conv_func* parameter
Loads an HTSeq-count result file
Arguments:
file_handle (file or str): file handle or string with file name
conv_func (func): function to convert the number from string, defaults
to *int*, but *float* can be used as well
Yields:
tuple: first element is the gene_id and the second is the count
"""
if isinstance(file_handle, str):
file_handle = open_file(file_handle, 'rb')
if getattr(file_handle, 'name', None) is not None:
LOG.info("Loading HTSeq-count file %s", file_handle.name)
for line in file_handle:
line = line.decode('ascii')
gene_id, count = line.rstrip().split('\t')
if line.startswith('__') or (gene_id in SKIP):
continue
yield gene_id, conv_func(count)
[docs]def batch_load_htseq_counts(count_files, samples=None, cut_name=None):
"""
Loads a list of htseq count result files and returns a DataFrame
(IDxSAMPLE)
The sample names are names are the file names if *samples* and *cut_name*
are *None*, supplying a list of sample names with *samples* is the
preferred way, and *cut_name* is used for backward compatibility and as an
option in cases a string replace is enough.
Arguments:
count_files (file or str): file handle or string with file name
samples (iterable): list of sample names, in the same order as
*count_files*
cut_name (str): string to delete from the the file names to get the
sample names
Returns:
pandas.DataFrame: with sample names as columns and gene_ids as index
"""
counts = {}
iterator = itertools.zip_longest(
count_files,
[] if samples is None else samples
)
for fname, sample in iterator:
if sample is None:
if cut_name is not None:
sample = fname.replace(cut_name, '')
else:
sample = fname
counts[sample] = pandas.Series(load_htseq_counts(fname))
return pandas.DataFrame.from_dict(counts)
[docs]def get_uid_info(info_dict, uid):
"""
Simple function to get a value from a dictionary of tuples
(gene_id, taxon_id)
"""
return info_dict[uid]
[docs]def get_uid_info_ann(annotations, uid):
"""
Simple function to get a value from a dictionary of annotations
"""
annotation = annotations[uid]
return annotation.gene_id, annotation.taxon_id
[docs]def filter_counts(counts_iter, info_func, gfilters=None, tfilters=None):
"""
Returns counts that pass filters for each *uid* associated *gene_id* and
*taxon_id*.
Arguments:
counts_iter (iterable): iterator that yields a tuple (uid, count)
info_func (func): function accepting a *uid* that returns a tuple
*(gene_id, taxon_id)*
gfilters (iterable): list of filters to apply to each *uid* associated
*gene_id*
tfilters (iterable): list of filters to apply to each *uid* associated
*taxon_id*
Yields:
tuple: *(uid, count)* that pass filters
"""
for uid, count in counts_iter:
try:
gene_id, taxon_id = info_func(uid)
except KeyError:
continue
if tfilters is not None:
if taxon_id is None:
continue
if not all(tfilter(taxon_id) for tfilter in tfilters):
continue
if gfilters is not None:
if not all(gfilter(taxon_id) for gfilter in gfilters):
continue
yield uid, count
[docs]def map_counts(counts_iter, info_func, gmapper=None, tmapper=None, index=None,
uid_used=None):
"""
.. versionchanged:: 0.1.14
added *index* parameter
.. versionchanged:: 0.1.15
added *uid_used* parameter
Maps counts according to the gmapper and tmapper functions. Each mapped
gene ID count is the sum of all uid that have the same ID(s). The same is
true for the taxa.
Arguments:
counts_iter (iterable): iterator that yields a tuple (uid, count)
info_func (func): function accepting a *uid* that returns a tuple
*(gene_id, taxon_id)*
gmapper (func): fucntion that accepts a *gene_id* and returns a list
of mapped IDs
tmapper (func): fucntion that accepts a *taxon_id* and returns a new
*taxon_id*
index (None, str): if None, the index of the Series if
*(gene_id, taxon_id)*, if a str, it can be either *gene* or
*taxon*, to specify a single value
uid_used (None, dict): an empty dictionary in which to store the *uid*
that were assigned to each key of the returned pandas.Series. If
*None*, no information is saved
Returns:
pandas.Series: array with MultiIndex *(gene_id, taxon_id)* with the
mapped counts
"""
mapped_counts = {}
for uid, count in counts_iter:
try:
gene_id, taxon_id = info_func(uid)
except KeyError:
continue
if gmapper is not None:
gene_ids = gmapper(gene_id)
else:
gene_ids = [gene_id]
if tmapper is not None:
taxon_id = tmapper(taxon_id)
if taxon_id is None:
continue
for map_id in gene_ids:
if index is None:
key = (map_id, taxon_id)
elif index == 'gene':
key = map_id
elif index == 'taxon':
key = taxon_id
try:
mapped_counts[key] += count
if uid_used is not None:
uid_used[key].add(uid)
except KeyError:
mapped_counts[key] = count
if uid_used is not None:
uid_used[key] = set([uid])
return pandas.Series(mapped_counts)
[docs]def map_taxon_id_to_rank(taxonomy, rank, taxon_id, include_higher=True):
"""
Maps a *taxon_id* to the request taxon rank. Returns *None* if
*include_higher* is False and the found rank is not the one requested.
Internally uses :meth:`mgkit.taxon.Taxonomy.get_ranked_taxon`
Arguments:
taxonomy: taxonomy instance
rank (str): taxonomic rank requested
taxon_id (int): taxon_id to map
include_higher (bool): if False, any rank different than the requested
one is discarded
Returns:
(int, None): if the mapping is successful, the ranked taxon_id is
returned, otherwise *None* is returned
"""
if taxon_id is None:
return None
ranked = taxonomy.get_ranked_taxon(taxon_id, rank)
if (include_higher is False) and (ranked.rank != rank):
return None
return ranked.taxon_id
[docs]def map_gene_id_to_map(gene_map, gene_id):
"""
Function that extract a list of gene mappings from a dictionary and returns
an empty list if the *gene_id* is not found.
"""
return gene_map.get(gene_id, [])
[docs]def load_sample_counts(info_dict, counts_iter, taxonomy, inc_anc=None,
rank=None, gene_map=None, ex_anc=None,
include_higher=True, cached=True, uid_used=None):
"""
.. versionchanged:: 0.1.14
added *cached* argument
.. versionchanged:: 0.1.15
added *uid_used* parameter
.. versionchanged:: 0.2.0
info_dict can be a function
Reads sample counts, filtering and mapping them if requested. It's an
example of the usage of the above functions.
Arguments:
info_dict (dict): dictionary that has *uid* as key and
*(gene_id, taxon_id)* as value. In alternative a function that
accepts a *uid* as sole argument and returns *(gene_id, taxon_id)*
counts_iter (iterable): iterable that yields a *(uid, count)*
taxonomy: taxonomy instance
inc_anc (int, list): ancestor taxa to include
rank (str): rank to which map the counts
gene_map (dict): dictionary with the gene mappings
ex_anc (int, list): ancestor taxa to exclude
include_higher (bool): if False, any rank different than the requested
one is discarded
cached (bool): if *True*, the function will use
:class:`mgkit.simple_cache.memoize` to cache some of the functions
used
uid_used (None, dict): an empty dictionary in which to store the *uid*
that were assigned to each key of the returned pandas.Series. If
*None*, no information is saved
Returns:
pandas.Series: array with MultiIndex *(gene_id, taxon_id)* with the
filtered and mapped counts
"""
if isinstance(info_dict, dict):
if isinstance(info_dict[list(info_dict.keys())[0]], tuple):
info_func = functools.partial(get_uid_info, info_dict)
else:
info_func = functools.partial(get_uid_info_ann, info_dict)
else:
info_func = info_dict
tfilters = []
if inc_anc is not None:
if not isinstance(inc_anc, (list, set, tuple)):
inc_anc = [inc_anc]
tfilters.append(
functools.partial(
tx_filters.filter_by_ancestor,
filter_list=inc_anc,
exclude=False,
taxonomy=taxonomy
)
)
if ex_anc is not None:
if not isinstance(ex_anc, (list, set, tuple)):
ex_anc = [ex_anc]
tfilters.append(
functools.partial(
tx_filters.filter_by_ancestor,
filter_list=ex_anc,
exclude=True,
taxonomy=taxonomy
)
)
if cached:
tfilters = [
mgkit.simple_cache.memoize(tfilter)
for tfilter in tfilters
]
gmapper = None
tmapper = None
if rank is not None:
tmapper = functools.partial(
map_taxon_id_to_rank,
taxonomy,
rank,
include_higher=include_higher
)
if cached:
tmapper = mgkit.simple_cache.memoize(tmapper)
if gene_map is not None:
gmapper = functools.partial(
map_gene_id_to_map,
gene_map
)
if cached:
gmapper = mgkit.simple_cache.memoize(gmapper)
series = map_counts(
filter_counts(
counts_iter,
info_func,
gfilters=None,
tfilters=tfilters
),
info_func,
gmapper=gmapper,
tmapper=tmapper,
uid_used=uid_used
)
return series
[docs]def load_sample_counts_to_taxon(info_func, counts_iter, taxonomy, inc_anc=None,
rank=None, ex_anc=None, include_higher=True,
cached=True, uid_used=None):
"""
.. versionadded:: 0.1.14
.. versionchanged:: 0.1.15
added *uid_used* parameter
Reads sample counts, filtering and mapping them if requested. It's a
variation of :func:`load_sample_counts`, with the counts being mapped only
to each specific taxon. Another difference is the absence of any assumption
on the first parameter. It is expected to return a (gene_id, taxon_id)
tuple.
Arguments:
info_func (callable): any callable that accept an *uid* as the only
parameter and and returns *(gene_id, taxon_id)* as value
counts_iter (iterable): iterable that yields a *(uid, count)*
taxonomy: taxonomy instance
inc_anc (int, list): ancestor taxa to include
rank (str): rank to which map the counts
ex_anc (int, list): ancestor taxa to exclude
include_higher (bool): if False, any rank different than the requested
one is discarded
cached (bool): if *True*, the function will use
:class:`mgkit.simple_cache.memoize` to cache some of the functions
used
uid_used (None, dict): an empty dictionary in which to store the *uid*
that were assigned to each key of the returned pandas.Series. If
*None*, no information is saved
Returns:
pandas.Series: array with Index *taxon_id* with the filtered and mapped
counts
"""
tfilters = []
if inc_anc is not None:
if not isinstance(inc_anc, (list, set, tuple)):
inc_anc = [inc_anc]
tfilters.append(
functools.partial(
tx_filters.filter_by_ancestor,
filter_list=inc_anc,
exclude=False,
taxonomy=taxonomy
)
)
if ex_anc is not None:
if not isinstance(ex_anc, (list, set, tuple)):
ex_anc = [ex_anc]
tfilters.append(
functools.partial(
tx_filters.filter_by_ancestor,
filter_list=ex_anc,
exclude=True,
taxonomy=taxonomy
)
)
if cached:
tfilters = [
mgkit.simple_cache.memoize(tfilter)
for tfilter in tfilters
]
tmapper = None
if rank is not None:
tmapper = functools.partial(
map_taxon_id_to_rank,
taxonomy,
rank,
include_higher=include_higher
)
if cached:
tmapper = mgkit.simple_cache.memoize(tmapper)
series = map_counts(
filter_counts(
counts_iter,
info_func,
gfilters=None,
tfilters=tfilters
),
info_func,
tmapper=tmapper,
index='taxon',
uid_used=uid_used
)
return series
[docs]def load_sample_counts_to_genes(info_func, counts_iter, taxonomy, inc_anc=None,
gene_map=None, ex_anc=None, cached=True,
uid_used=None):
"""
.. versionadded:: 0.1.14
.. versionchanged:: 0.1.15
added *uid_used* parameter
Reads sample counts, filtering and mapping them if requested. It's a
variation of :func:`load_sample_counts`, with the counts being mapped only
to each specific gene_id. Another difference is the absence of any
assumption on the first parameter. It is expected to return a
(gene_id, taxon_id) tuple.
Arguments:
info_func (callable): any callable that accept an *uid* as the only
parameter and and returns *(gene_id, taxon_id)* as value
counts_iter (iterable): iterable that yields a *(uid, count)*
taxonomy: taxonomy instance
inc_anc (int, list): ancestor taxa to include
rank (str): rank to which map the counts
gene_map (dict): dictionary with the gene mappings
ex_anc (int, list): ancestor taxa to exclude
cached (bool): if *True*, the function will use
:class:`mgkit.simple_cache.memoize` to cache some of the functions
used
uid_used (None, dict): an empty dictionary in which to store the *uid*
that were assigned to each key of the returned pandas.Series. If
*None*, no information is saved
Returns:
pandas.Series: array with Index *gene_id* with the filtered and mapped
counts
"""
tfilters = []
if inc_anc is not None:
if not isinstance(inc_anc, (list, set, tuple)):
inc_anc = [inc_anc]
tfilters.append(
functools.partial(
tx_filters.filter_by_ancestor,
filter_list=inc_anc,
exclude=False,
taxonomy=taxonomy
)
)
if ex_anc is not None:
if not isinstance(ex_anc, (list, set, tuple)):
ex_anc = [ex_anc]
tfilters.append(
functools.partial(
tx_filters.filter_by_ancestor,
filter_list=ex_anc,
exclude=True,
taxonomy=taxonomy
)
)
if cached:
tfilters = [
mgkit.simple_cache.memoize(tfilter)
for tfilter in tfilters
]
if gene_map is not None:
gmapper = functools.partial(
map_gene_id_to_map,
gene_map
)
if cached:
gmapper = mgkit.simple_cache.memoize(gmapper)
series = map_counts(
filter_counts(
counts_iter,
info_func,
gfilters=None,
tfilters=tfilters
),
info_func,
gmapper=gmapper,
index='gene',
uid_used=uid_used
)
return series
[docs]def load_deseq2_results(file_name, taxon_id=None):
"""
.. versionadded:: 0.1.14
Reads a CSV file output with DESeq2 results, adding a taxon_id to the index
for concatenating multiple results from different taxonomic groups.
Arguments:
file_name (str): file name of the CSV
Returns:
pandas.DataFrame: a MultiIndex DataFrame with the results
"""
dataframe = pandas.DataFrame.from_csv(file_name)
dataframe = dataframe.rename(
index=dict(
(gene_id, (gene_id, taxon_id))
for gene_id in dataframe.index
)
)
dataframe.index.names = ['gene_id', 'taxon_id']
return dataframe
[docs]def map_counts_to_category(counts, gene_map, nomap=False, nomap_id='NOMAP'):
"""
Used to map the counts from a certain gene identifier to another. Genes
with no mappings are not counted, unless *nomap=True*, in which case they
are counted as *nomap_id*.
Arguments:
counts (iterator): an iterator that yield a tuple, with the first value
being the gene_id and the second value the count for it
gene_map (dictionary): a dictionary whose keys are the gene_id yield by
*counts* and the values are iterable of mapping identifiers
nomap (bool): if False, counts for genes with no mappings in *gene_map*
are discarded, if True, they a counted as *nomap_id*
nomap_id (str): name of the mapping for genes with no mappings
Returns:
pandas.Series: mapped counts
"""
newcounts = {}
for gene_id, count in counts:
try:
map_ids = gene_map[gene_id]
except KeyError:
continue
if nomap and (not map_ids):
map_ids = [nomap_id]
for map_id in map_ids:
try:
newcounts[map_id] += count
except KeyError:
newcounts[map_id] = count
return pandas.Series(newcounts)
[docs]def load_counts_from_gff(annotations, elem_func=lambda x: x.uid,
sample_func=None, nozero=True):
"""
.. versionadded:: 0.2.5
Loads counts for each annotations that are stored into the annotation
*counts_* attributes. Annotations with a total of 0 counts are skipped by
default (nozero=True), the row index is set to the *uid* of the annotation
and the column to the sample name. The functions used to transform the
indices expect the annotation (for the row, *elem_func*) and the sample
name (for the column, *sample_func*).
Arguments:
annotations (iter): iterable of annotations
elem_func (func): function that accepts an annotation and return a
str/int for a Index or a tuple for a MultiIndex, defaults to
returning the *uid* of the annotation
sample_func (func, None): function that accepts the sample name and
returns tuple for a MultiIndex. Defaults to *None* so no
transformation is performed
nozero (bool): if *True*, annotations with no counts are skipped
"""
count_dict = {}
for annotation in annotations:
counts = pandas.Series(annotation.counts)
if nozero and (counts.sum() == 0):
continue
count_dict[elem_func(annotation)] = counts
# LOG.debug('Returning DataFrame')
count_dict = pandas.DataFrame.from_dict(count_dict, orient='index')
if sample_func is not None:
count_dict.columns = pandas.MultiIndex.from_tuples(
[
sample_func(column)
for column in count_dict.columns
]
)
return count_dict
[docs]def from_gff(annotations, samples, ann_func=None, sample_func=None):
"""
.. versionadded:: 0.3.1
Loads count data from a GFF file, only for the requested samples. By
default the function returns a DataFrame where the index is the *uid* of
each annotation and the columns the requested samples.
This can be customised by supplying *ann_func* and *sample_func*.
*sample_func* is a function that accept a sample name and is expected to
return a string or a tuple. This will be used to change the columns in the
DataFrame. *ann_func* must accept an :class:`mgkit.io.gff.Annotation`
instance and return an iterable, with each iteration yielding either a
single element or a tuple (for a MultiIndex DataFrame), each element
yielded will have the count of that annotation added to.
Arguments:
annotation (iterable): iterable yielding annotations
samples (iterable): list of samples to keep
ann_func (func): function used to customise the output
sample_func (func): function to customise the column elements
Returns:
DataFrame: dataframe with the count data, columns are the samples and
rows the annotation counts (unless mapped with *ann_func*)
Exmples:
Assuming we have a list of *annotations* and sample SAMPLE1 and SAMPLE2
we can obtain the count table for all annotations with this
>>> from_gff(annotations, ['SAMPLE1', 'SAMPLE2'])
Assuming we want to group the samples, for example treatment1,
treatment2 and control1, control2 into a MultiIndex DataFrame column
>>> sample_func = lambda x: ('T' if x.startswith('t') else 'C', x)
>>> from_gff(annotations, ['treatment1', 'treatment2', 'control1',
'control2'], sample_func=sample_func)
Annotations can be mapped to other levels for example instead of using
the *uid* that is the default, it can be mapped to the gene_id,
taxon_id information that is included in the annotation, resulting in a
MultiIndex index for the rows, with (gene_id, taxon_id) as key.
>>> ann_func = lambda x: [(x.gene_id, x.taxon_id)]
>>> from_gff(annotations, ['SAMPLE1', 'SAMPLE2'], ann_func=ann_func)
"""
if ann_func is None:
def ann_func(x): return (x.uid, )
if sample_func is None:
def sample_func(x): return x
counters = {
sample_func(sample): Counter()
for sample in samples
}
for annotation in annotations:
for sample, count in viewitems(annotation.counts):
sample = sample_func(sample)
if sample not in counters:
continue
for ann_id in ann_func(annotation):
counters[sample][ann_id] += count
return pandas.DataFrame(counters)