Source code for mgkit.filter.reads
"""
Some test functions to filter sequences
"""
from __future__ import division
from builtins import range
import numpy
# from numpy import sum
# import pandas
# from scipy.stats import itemfreq
# import numexpr as ne
# ne.set_num_threads(8)
# #direct use: fourth (1.6ms loop)
# #in trim_by_ee: fourth
# def expected_error_rate_1(qualities):
# qualities = pandas.DataFrame(itemfreq(qualities))
# return qualities.prod(axis=1).sum()
# direct use: first (0.033ms loop)
# in trim_by_ee: first (15.9s)
[docs]def expected_error_rate(qualities):
"""
Calculate the expected error rate for an array of qualities (converted to
probabilities).
"""
unique, inv = numpy.unique(qualities, return_inverse=True)
return sum(unique * numpy.bincount(inv))
# #direct use: third (0.463ms loop)
# #in trim_by_ee: third (65.7s)
# def expected_error_rate_3(qualities):
# return sum(
# sum(1 for value in qualities if value == unique) * unique
# for unique in numpy.unique(qualities)
# )
# #direct use: second (0.109ms loop)
# #in trim_by_ee: second (27.7s)
# def expected_error_rate_4(qualities):
# return sum(
# len(qualities[qualities == unique]) * unique
# for unique in numpy.unique(qualities)
# )
[docs]def trim_by_ee(qualities, min_length=50, threshold=0.5, chars=True, base=33):
"""
Trim a sequence based on the expected error rate.
"""
if chars:
qualities = numpy.fromiter(
(ord(score) for score in qualities),
dtype=int
) - base
# qualities = ne.evaluate("10 ** ((-qualities) / 10)")
qualities = 10 ** ((-qualities) / 10)
# print qualities
for idx in range(min_length, qualities.size):
# print expected_error_rate(qualities[:idx])
if expected_error_rate(qualities[:idx]) > threshold:
return idx
return qualities.size