"""
Fastq utility functions
"""
import re
import numpy
import logging
from .utils import open_file
LOG = logging.getLogger(__name__)
CASAVA_HEADER_OLD = r"""(?P<machine>\w+-\w+):
(?P<lane>\d):
(?P<tile>\d+):
(?P<xcoord>\d+):
(?P<ycoord>\d+)\#
(?P<index>(\d|[ACTGN]{6}))/
(?P<mate>(1|2))"""
"Old casava header regex"
CASAVA_HEADER_NEW = r"""(?P<machine>[\w-]+):
(?P<runid>\d+):
(?P<cellid>\w+):
(?P<lane>\d):
(?P<tile>\d+):
(?P<xcoord>\d+):
(?P<ycoord>\d+)
[_ ](?P<mate>\d): # underscore for data from from www.ebi.ac.uk/ena/
(?P<filter>[YN]):
(?P<bits>\d+):
(?P<index>[ACTGN+]+)"""
"New casava header regex, including indices for both forward and reverse"
CASAVA_KHMER = r"""
(?P<machine>[\w-]+):
(?P<runid>\d+):
(?P<cellid>\w+):
(?P<lane>\d):
(?P<tile>\d+):
(?P<xcoord>\d+):
(?P<ycoord>\d+)
/(?P<mate>\d)
"""
[docs]def check_fastq_type(qualities):
"""
Trys to guess the type of quality string used in a Fastq file
:param str qualities: string with the quality scores as in the Fastq file
:return str: a string with the guessed quality score
.. note::
Possible values are the following, classified but the values usually
used in other softwares:
* ASCII33: sanger, illumina-1.8
* ASCII64: illumina-1.3, illumina-1.5, solexa-old
"""
qualities = [ord(char) for char in qualities]
max_qual = max(qualities)
min_qual = min(qualities)
if (min_qual >= 33) and (max_qual <= 73):
return 'sanger'
elif (min_qual >= 33) and (max_qual <= 74):
return 'illumina-1.8'
elif (min_qual >= 64) and (max_qual <= 104):
return 'illumina-1.5'
elif (min_qual >= 67) and (max_qual <= 104):
return 'illumina-1.3'
elif (min_qual >= 59) and (max_qual <= 104):
return 'solexa-old'
[docs]def convert_seqid_to_new(seq_id):
"""
Convert old seq_id format for Illumina reads to the new found in Casava
1.8+
:param str seq_id: seq_id of the sequence (stripped of '@')
:return str: the new format seq_id
.. note::
Example from Wikipedia::
old casava seq_id:
@HWUSI-EAS100R:6:73:941:1973#0/1
new casava seq_id:
@EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCAC
"""
machine_id, cell_lane, tile_num, x_coord, other = seq_id.split(':')
y_coord, other = other.split('#')
idx_multiplex, mate = other.split('/')
return "{0}:0:FC706VJ:{1}:{2}:{3}:{4} {5}:N:0:{6}".format(
machine_id,
cell_lane,
tile_num,
x_coord,
y_coord,
mate,
idx_multiplex
)
[docs]def convert_seqid_to_old(seq_id, index_as_seq=True):
"""
.. deprecated:: 0.3.3
Convert old seq_id format for Illumina reads to the new found in Casava
until 1.8, which marks the new format.
:param str seq_id: seq_id of the sequence (stripped of '@')
:param bool index_as_seq: if True, the index for the multiplex we'll be
the sequence found at the end of the new format seq_id. Otherwise, 0
we'll be used
:return str: the new format seq_id
"""
left_part, right_part = seq_id.split(' ')
machine_id, run_id, cell_id, cell_lane, tile_num, x_coord, \
y_coord = left_part.split(':')
mate, fail_filter, control_bits, index_seq = right_part.split(':')
return "HWUSI-{0}:{1}:{2}:{3}:{4}#{5}/{6}".format(
machine_id,
cell_lane,
tile_num,
x_coord,
y_coord,
index_seq if index_as_seq else '0',
mate
)
[docs]def write_fastq_sequence(file_handle, name, seq, qual, write_mode='a'):
"""
.. versionchanged:: 0.3.3
if *qual* is not a string it's converted to chars (phred33)
Write a fastq sequence to file. If the *file_handle* is a string, the file
will be opened using *write_mode*.
:param file_handle: file handle or string.
:param str name: header to write for the sequence
:param str seq: sequence to write
:param str qual: quality string
"""
file_handle = open_file(file_handle, write_mode)
if isinstance(qual[0], (int, numpy.integer)):
qual = ''.join(chr(q + 33) for q in qual)
file_handle.write(
"@{name}\n{seq}\n+\n{qual}\n".format(
name=name,
seq=seq,
qual=qual
).encode('ascii')
)
[docs]def load_fastq(file_handle, num_qual=False):
"""
.. versionadded:: 0.3.1
Loads a fastq file and returns a generator of tuples in which the first
element is the name of the sequence, the second the sequence and the third
the quality scores (converted in a numpy array if *num_qual* is True).
.. note::
this is a simple parser that assumes each sequence is on 4 lines,
1st and 3rd for the headers, 2nd for the sequence and 4th the quality
scores
Arguments:
file_handle (str, file): fastq file to open, can be a file name or a
file handle
num_qual (bool): if *False* (default), the quality score will be
returned as ASCII character, if *True* a numpy array.
Yields:
tuple: first element is the sequence name/header, the second element is
the sequence, the third is the quality score. The quality scores are
kept as a string if *num_qual* is False (default) and converted to a
numpy array with correct values (0-41) if *num_qual* is True
Raises:
ValueError: if the headers in both sequence and quality scores are not
valid. This implies that the sequence/qualities have carriage returns
or the file is truncated.
TypeError: if the qualities are in a format different than sanger
(min 0, max 40) or illumina-1.8 (0, 41)
"""
file_handle = open_file(file_handle, 'r')
if getattr(file_handle, 'name', None) is not None:
LOG.info("Reading fastq file (%s)", file_handle.name)
check_qual = True
sequence_count = 0
while True:
header1 = file_handle.readline().decode('ascii').strip()
# Reached the end of the file
if not header1:
break
seq = file_handle.readline().decode('ascii').strip()
header2 = file_handle.readline().decode('ascii').strip()
qualities = file_handle.readline().decode('ascii').strip()
if (header1[0] != '@') or (header2[0] != '+'):
raise ValueError(
"The sequence and quality headers are not valid '{}' != '{}'".format(header1, header2)
)
header1 = header1[1:]
header2 = header2[1:]
if check_qual:
qual_type = check_fastq_type(qualities)
if qual_type not in ('sanger', 'illumina-1.8'):
raise TypeError(
'Quality scores are in "{}" format. You must convert them to Sanger (phred33)'.format(qual_type)
)
check_qual = False
if num_qual:
qualities = numpy.fromiter((ord(x) for x in qualities), numpy.int) - 33
sequence_count += 1
yield header1, seq, qualities
LOG.info("Read %d fastq sequences", sequence_count)
[docs]def load_fastq_rename(file_handle, num_qual=False, name_func=None):
"""
.. versionadded:: 0.3.3
Mirrors the same functionality in :func:`mgkit.io.fasta.load_fasta_rename`.
Renames the header of the sequences using *name_func*, which is called on
each header. By default, the behaviour is to keep the header to the left of
the first space (BLAST behaviour).
"""
if name_func is None:
def name_func(header): return header.split(' ')[0]
for header, seq, qual in load_fastq(file_handle, num_qual=num_qual):
yield name_func(header), seq, qual