Source code for mgkit.workflow.fastq_utils

"""
Commands
--------

* Interleave/deinterleave paired-end fastq files.
* Converts to FASTA
* sort 2 files to sync the headers

Changes
-------

.. versionchanged:: 0.3.4
    moved to use click, internal fastq parsing, removed *rand* command

.. versionchanged:: 0.3.1
    added stdin/stdout defaults for some commands

.. versionchanged:: 0.3.0
    added *convert* command to FASTA

"""
from __future__ import division
from builtins import zip
import logging
import click
import mgkit
from mgkit.io.fastq import choose_header_type, write_fastq_sequence, load_fastq
from mgkit.io import fasta
from . import utils

LOG = logging.getLogger(__name__)


@click.group()
@click.version_option()
@utils.cite_option
def main():
    "Main function"
    pass


@main.command('convert', help="""Convert FastQ file [fastq-file] to FASTA file
              [fasta-file]""")
@click.option('-v', '--verbose', is_flag=True)
@click.argument('fastq-file', type=click.File('rb'), default='-')
@click.argument('fasta-file', type=click.File('wb'), default='-')
def convert_command(verbose, fastq_file, fasta_file):
    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)
    LOG.info(
        "Writing FASTA file (%s)",
        getattr(fasta_file, 'name', repr(fasta_file))
    )

    for seq_id, seq, qual in load_fastq(fastq_file):
        fasta.write_fasta_sequence(fasta_file, seq_id, seq)


[docs]def report_counts(count, wcount, counter=None): "Logs the status" if (counter is None) or (counter % 100000 == 0): LOG.info( "Read %-9d sequences, Wrote %-9d (%.2f%%)", count, wcount, wcount / count * 100 )
@main.command('sort', help="""Sort paired-end sequences from [mate1-input] and [mate2-input] into files [mate1-output] and [mate2-output]""") @click.option('-v', '--verbose', is_flag=True) @click.argument('mate1-input', type=click.File('rb')) @click.argument('mate2-input', type=click.File('rb')) @click.argument('mate1-output', type=click.File('wb')) @click.argument('mate2-output', type=click.File('wb')) def sort(verbose, mate1_input, mate2_input, mate1_output, mate2_output): "Sort two fastq files" mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO) LOG.info( 'Writing [mate1-output] to file (%s)', getattr(mate1_output, 'name', repr(mate1_output)) ) LOG.info( 'Writing [mate2-output] to file (%s)', getattr(mate2_output, 'name', repr(mate2_output)) ) regex = None simple_header = False mate1 = {} mate2 = {} count = 0 wcount = 0 for (seq_id1, seq1, qual1), (seq_id2, seq2, qual2) in zip(load_fastq(mate1_input), load_fastq(mate2_input)): count += 1 if (regex is None) and (not simple_header): regex = choose_header_type(seq_id1) if regex is None: simple_header = True LOG.info("Using a simple header structure") if simple_header: key1 = seq_id1[:-1] key2 = seq_id2[:-1] else: match1 = regex.search(seq_id1) match2 = regex.search(seq_id2) key1 = ( match1.group('lane'), match1.group('tile'), match1.group('xcoord'), match1.group('ycoord') ) key2 = ( match2.group('lane'), match2.group('tile'), match2.group('xcoord'), match2.group('ycoord') ) seq1 = (seq_id1, seq1, qual1) seq2 = (seq_id2, seq2, qual2) if key1 == key2: # if the 2 write_fastq_sequence(mate1_output, *seq1) write_fastq_sequence(mate2_output, *seq2) wcount += 1 report_counts(count, wcount, count) continue mate1[key1] = seq1 mate2[key2] = seq2 if key1 in mate2: write_fastq_sequence(mate1_output, *mate1[key1]) write_fastq_sequence(mate2_output, *mate2[key1]) del mate1[key1] del mate2[key1] wcount += 1 if key2 in mate1: write_fastq_sequence(mate1_output, *mate1[key2]) write_fastq_sequence(mate2_output, *mate2[key2]) del mate1[key2] del mate2[key2] wcount += 1 report_counts(count, wcount, count) report_counts(count, wcount, None) @main.command('di', help="""Deinterleave sequences from [fastq-file], into [mate1-file] and [mate2-file]""") @click.option('-v', '--verbose', is_flag=True) @click.option('-s', '--strip', is_flag=True, default=False, help="Strip additional info") @click.argument('fastq-file', type=click.File('rb'), default='-') @click.argument('mate1-file', type=click.File('wb')) @click.argument('mate2-file', type=click.File('wb')) def deinterleave(verbose, strip, fastq_file, mate1_file, mate2_file): "Deinterleave a fastq file" mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO) LOG.info( 'Writing [mate1-file] to file (%s)', getattr(mate1_file, 'name', repr(mate1_file)) ) LOG.info( 'Writing [mate2-file] to file (%s)', getattr(mate2_file, 'name', repr(mate2_file)) ) regex = None simple_header = False mate1 = {} mate2 = {} count = 0 wcount = 0 for seq_id, seq, qual in load_fastq(fastq_file): count += 1 if (regex is None) and (not simple_header): regex = choose_header_type(seq_id) if regex is None: LOG.info("Using a simple header structure") simple_header = True if simple_header: key = seq_id[:-1] mate = int(seq_id[-1]) else: match = regex.search(seq_id) key = ( match.group('lane'), match.group('tile'), match.group('xcoord'), match.group('ycoord') ) mate = int(match.group('mate')) if strip: sequence_name = seq_id.split('\t')[0] else: sequence_name = seq_id if mate == 1: mate1[key] = (sequence_name, seq, qual) else: mate2[key] = (sequence_name, seq, qual) try: # if sequence header in both seq1 = mate1[key] seq2 = mate2[key] write_fastq_sequence(mate1_file, *seq1) write_fastq_sequence(mate2_file, *seq2) wcount += 2 del mate1[key] del mate2[key] except KeyError: pass report_counts(count, wcount, count) report_counts(count, wcount, None) @main.command('il', help="""Interleave sequences from [mate1-file] and [mate2-file] into [fastq-file]""") @click.option('-v', '--verbose', is_flag=True) @click.argument('mate1-file', type=click.File('rb')) @click.argument('mate2-file', type=click.File('rb')) @click.argument('fastq-file', type=click.File('wb'), default='-') def interleave(verbose, mate1_file, mate2_file, fastq_file): "Interleave two fastq files" mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO) LOG.info( 'Writing interleaved [fastq-file] to file (%s)', getattr(fastq_file, 'name', repr(fastq_file)) ) regex = None simple_header = False mate1 = {} mate2 = {} count = 0 wcount = 0 for (seq_id1, seq1, qual1), (seq_id2, seq2, qual2) in zip(load_fastq(mate1_file), load_fastq(mate2_file)): count += 1 if (regex is None) and (not simple_header): regex = choose_header_type(seq_id1) if regex is None: simple_header = True LOG.info("Using a simple header structure") if simple_header: key1 = seq_id1[:-1] key2 = seq_id2[:-1] else: match1 = regex.search(seq_id1) match2 = regex.search(seq_id2) key1 = ( match1.group('lane'), match1.group('tile'), match1.group('xcoord'), match1.group('ycoord') ) key2 = ( match2.group('lane'), match2.group('tile'), match2.group('xcoord'), match2.group('ycoord') ) seq1 = (seq_id1, seq1, qual1) seq2 = (seq_id2, seq2, qual2) if key1 == key2: # if the 2 write_fastq_sequence(fastq_file, *seq1) write_fastq_sequence(fastq_file, *seq2) wcount += 1 report_counts(count, wcount, wcount) continue mate1[key1] = seq1 mate2[key2] = seq2 if key1 in mate2: write_fastq_sequence(fastq_file, *mate1[key1]) write_fastq_sequence(fastq_file, *mate2[key1]) del mate1[key1] del mate2[key1] wcount += 1 if key2 in mate1: write_fastq_sequence(fastq_file, *mate1[key2]) write_fastq_sequence(fastq_file, *mate2[key2]) del mate1[key2] del mate2[key2] wcount += 1 report_counts(count, wcount, count) report_counts(count, wcount, None)