"""
.. deprecated:: 0.5.7
This script is deprecated now, use `pnps-gen vcf` instead
.. note::
if you need to use the script, install HTSeq
This script parses results of SNPs analysis from any tool for SNP calling [#]_
and integrates them into a format that can be later used for other scripts in
the pipeline.
It integrates coverage and expected number of syn/nonsyn change and taxonomy
from a GFF file, SNP data from a VCF file.
.. note::
The script accept gzipped VCF files
.. [#] GATK pipeline was tested, but it is possible to use samtools and
bcftools
Changes
*******
.. versionchanged:: 0.2.1
added *-s* option for VCF files generated using bcftools
.. versionchanged:: 0.1.16
reworkked internals and removed SNPDat, syn/nonsyn evaluation is internal
.. versionchanged:: 0.1.13
reworked the internals and the classes used, including options -m and -s
"""
from __future__ import division
from builtins import zip
import logging
import argparse
import pickle
import HTSeq
from . import utils
from ..io import gff, compressed_handle, fasta
from .. import logger
from ..snps.classes import GeneSNP, SNPType
LOG = logging.getLogger(__name__)
[docs]def set_parser():
"""
Sets command line arguments parser
"""
parser = argparse.ArgumentParser(
description='DEPRECATED, use `pnps-gen vcf` SNPs analysis, requires a vcf file',
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
'-o',
'--output-file',
default='snp_data.pickle',
type=argparse.FileType('wb'),
help='Ouput file'
)
parser.add_argument(
'-q',
'--min-qual',
default=30,
type=int,
action='store',
help='Minimum SNP quality (Phred score)'
)
parser.add_argument(
'-f',
'--min-freq',
default=0.01,
type=float,
action='store',
help='Minimum allele frequency'
)
parser.add_argument(
'-r',
'--min-reads',
default=4,
type=int,
action='store',
help='Minimum number of reads to accept the SNP'
)
parser.add_argument(
'-g',
'--gff-file',
required=True,
type=argparse.FileType('rb'),
action='store',
help='GFF file with annotations'
)
parser.add_argument(
'-p',
'--vcf-file',
required=True,
type=argparse.FileType('r'),
action='store',
help='Merged VCF file'
)
parser.add_argument(
'-a',
'--reference',
required=True,
type=argparse.FileType('rb'),
help='Fasta file with the GFF Reference'
)
parser.add_argument(
'-m',
'--samples-id',
action='append',
required=True,
type=str,
help='the ids of the samples used in the analysis'
)
parser.add_argument(
'-c',
'--cov-suff',
action='store',
default='_cov',
help="Per sample coverage suffix in the GFF"
)
parser.add_argument(
'-s',
'--bcftools-vcf',
action='store_true',
default=False,
help="bcftools call was used to produce the VCF file"
)
utils.add_basic_options(parser, manual=__doc__)
return parser
[docs]def init_count_set(annotations):
LOG.info("Init data structures")
samples = list(annotations[0].sample_coverage.keys())
snp_data = dict(
(sample, {}) for sample in samples
)
for annotation in annotations:
taxon_id = annotation.taxon_id
uid = annotation.uid
sample_coverage = annotation.sample_coverage
for sample in sample_coverage:
snp_data[sample][uid] = GeneSNP(
uid=uid,
gene_id=annotation.gene_id,
taxon_id=taxon_id,
exp_syn=annotation.exp_syn,
exp_nonsyn=annotation.exp_nonsyn,
coverage=sample_coverage[sample],
)
return snp_data
[docs]def check_snp_in_set(samples, snp_data, pos, change, annotations, seq):
"""
Used by :func:`parse_vcf` to check if a SNP
:param iterable samples: list of samples that contain the SNP
:param dict snp_data: dictionary from :func:`init_count_set` with per
sample SNPs information
"""
for annotation in annotations:
if pos not in annotation:
continue
if annotation.is_syn(seq, pos, change, strict=False):
snp_type = SNPType.syn
else:
snp_type = SNPType.nonsyn
uid = annotation.uid
rel_pos = annotation.get_relative_pos(pos)
for sample in samples:
snp_data[sample][uid].add_snp(rel_pos, change, snp_type=snp_type)
[docs]def parse_vcf(vcf_file, snp_data, min_reads, min_af, min_qual, annotations,
seqs, options, line_num=100000):
"""
Parse VCF file counts synonymous and non-synonymous SNPs
:param file vcf_file: file handle to a VCF file
:param dict snp_data: dictionary from :func:`init_count_set` with per
sample SNPs information
:param int min_reads: minimum number of reads to accept a SNP
:param float min_af: minimum allele frequency to accept a SNP
:param int min_qual: minimum quality (Phred score) to accept a SNP
:param dict annotations: annotations grouped by their reference sequence
:param dict seqs: reference sequences
:param int line_num: the interval in number of lines at which progress
will be printed
"""
vcf_handle = HTSeq.VCF_Reader(compressed_handle(vcf_file))
vcf_handle.parse_meta()
vcf_handle.make_info_dict()
# total number of SNPs accepted
count_tot = 0
# number of SNPs skipped for low depth
skip_dp = 0
# number of SNPs skipped for low allele frequency
skip_af = 0
# number of SNPs skipped for low quality
skip_qual = 0
# indels
skip_indels = 0
for vcf_record in vcf_handle:
# the SNP is a sequence with no annotations
if vcf_record.chrom not in annotations:
continue
if float(vcf_record.qual) < min_qual:
# low quality SNP
skip_qual += 1
continue
# unpack info records (needed for vcf_record.info to be a dictionary)
vcf_record.unpack_info(vcf_handle.infodict)
if vcf_record.info['INDEL']:
skip_indels += 1
continue
if not isinstance(vcf_record.info['DP'], int):
LOG.warning(vcf_record.info['DP'])
if vcf_record.info['DP'] < min_reads:
# not enough reads (depth) for the SNP
skip_dp += 1
continue
# Samtools mpileup -> bcftools call doesn't output the allele freq.
# it can be calculated with AC/AN for each ALT nucleotide
# checked on bfctools (roh command) manual
# https://samtools.github.io/bcftools/bcftools.html
try:
allele_freqs = vcf_record.info['AF']
except KeyError:
if isinstance(vcf_record.info['AC'], list):
allele_freqs = [
AC / vcf_record.info['AN'] for AC in vcf_record.info['AC']
]
else:
allele_freqs = vcf_record.info['AC'] / vcf_record.info['AN']
# if the allele frequency is a single value, make it a list, so
# the iteration below works anyway
if isinstance(allele_freqs, float):
allele_freqs = [allele_freqs]
# alt is the nucleotidic change
iter_data = zip(allele_freqs, vcf_record.alt)
for alt_index, (allele_freq, change) in enumerate(iter_data):
if allele_freq < min_af:
# the allele frequency for the SNP is too low, it'll be
# skipped
skip_af += 1
continue
# the samples that contain the SNP is a string separated by '-'
if options.bcftools_vcf:
samples = set()
for sample_id, sample_info in vcf_record.samples.items():
# prepare the genotype list, to make the comparison easier
# the genotype separator to '/' only, to use only one
# type of split
sample_info_gt = sample_info['GT'].replace('|', '/')
sample_info_gt = sample_info_gt.split('/')
for genotype in sample_info_gt:
if genotype == '.':
continue
if int(genotype) == (alt_index + 1):
samples.add(sample_id)
else:
samples = [
sample
for sample in vcf_record.info['set'].split('-')
]
check_snp_in_set(
samples,
snp_data,
vcf_record.pos.start,
change,
annotations[vcf_record.chrom],
seqs[vcf_record.chrom]
)
# increase the total number of snps available
count_tot += 1
if vcf_handle.line_no % line_num == 0:
LOG.info(
"Line %d, SNPs passed %d; skipped for: qual %d, " +
"depth %d, freq %d, indels %d",
vcf_handle.line_no, count_tot, skip_qual, skip_dp, skip_af,
skip_indels
)
[docs]def save_data(output_file, snp_data):
"""
Pickle data structures to the disk.
:param str output_file: base name for pickle files
:param dict snp_data: dictionary from :func:`init_count_set` with per
sample SNPs information
"""
LOG.info("Saving sample SNPs to %s", output_file)
pickle.dump(snp_data, output_file, -1)
[docs]def main():
"Main function"
options = set_parser().parse_args()
# configs log and set log level
logger.config_log(options.verbose)
seqs = dict(fasta.load_fasta(options.reference))
# Loads them as list because it's easier to init the data structure
annotations = list(gff.parse_gff(options.gff_file))
if len(annotations[0].sample_coverage) != len(options.samples_id):
utils.exit_script(
"Coverage information was not found for all samples", 2
)
snp_data = init_count_set(annotations)
# Group annotations by their reference sequence
annotations = gff.group_annotations(
annotations,
key_func=lambda x: x.seq_id
)
parse_vcf(
options.vcf_file,
snp_data,
options.min_reads,
options.min_freq,
options.min_qual,
annotations,
seqs,
options
)
save_data(options.output_file, snp_data)