Source code for mgkit.net.embl

"Access EMBL Services"

from builtins import range
import logging
import re
import gzip
from . import url_read
import requests
import mgkit
try:
    from cStringIO import StringIO
except ImportError:
    from io import BytesIO as StringIO


[docs]class NoEntryFound(Exception):
    """
    Raised if no sequences where found by :func:`get_sequences_by_ids`, the
    check is based on the :data:`NONE_FOUND` variable.
    """
    pass


[docs]class EntryNotFound(Exception):
    """
    Raised if at least one entry was not found by :func:`get_sequences_by_ids`.
    :data:`NOT_FOUND` is used to check if any entry wasn't downloaded.
    """
    pass


URL_REST = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/"
"Base URL for EMBL DBFetch REST API"

URL_DATAWAREHOUSE = "http://www.ebi.ac.uk/ena/data/warehouse/search?"

EMBL_DBID = 'embl_cds'
"Default database id"

NOT_FOUND = r"Entry: .+? not found.\n"
"""
Appears in the resulting fasta (not tried on other formats) in the case that at
least one entry wasn't found.
"""

SUPPRESSED = "suppressed at the submitter's request on"

NONE_FOUND = r"ERROR 12.+?.\n?"
"""
Regular expression to check if no entry was found, used by :exc:`NoEntryFound`
"""

LOG = logging.getLogger(__name__)
"Log instance for this module"


[docs]def get_sequences_by_ids(embl_ids, contact=None, out_format='fasta',
                         num_req=10, embl_db=EMBL_DBID, strict=False):
    """
    .. versionchanged:: 0.3.4
        removed *compress* as it's bases on the `requests` package

    Downloads entries using EBI REST API. It can download one entry at a
    time or accept an iterable and all sequences will be downloaded in batches
    of at most num_req.

    It's fairly general, so can be customised, from the DB used to the output
    format: all batches are simply concatenate.

    .. note::

        There are some checks on the some errors reported by the EMBL api, but
        not documented, in particular two errors, which are just reported as
        text lines in the fasta file (the only one tested at this time).

        The are two possible cases:

        * if no entry was found :exc:`NoEntryFound` will be raised.
        * if at least one entry wasn't found:

          * if strict is False (the default) the error will be just logged as a
            debug message
          * if strict is True :exc:`EntryNotFound` is raised

    Args:
        embl_ids (iterable, str): list of ids to download
        contact (str): email address to be passed in the query
        format (str): format of the entry
        num_req (int): number of entries to download with each request
        embl_db (str): db to which the ids refer to
        strict (bool): if True, a check on the number of entries retrieved is
            performed

    Returns:
        str: the entries requested

    Raises:
        EntryNotFound: if at least an entry was not found
        NoEntryFound: if NO entry were found

    .. warning::

        The number of sequences that can be downloaded at a time is 11, it
        seems, since the returned sequences for each request was at most 11. I
        didn't find any mention of this in the API docs, but it may be a
        restriction that's temporary.

    """
    if isinstance(embl_ids, str):
        embl_ids = [embl_ids]
    elif isinstance(embl_ids, (set, dict)):
        embl_ids = list(embl_ids)

    splitter = re.compile(NOT_FOUND)
    error_re = re.compile(NONE_FOUND)

    entries = []

    for idx in range(0, len(embl_ids), num_req):

        if len(embl_ids) > num_req:
            LOG.debug("Downloading ids - range %d-%d", idx + 1, idx + num_req)

        request_params = "{db_id}/{entry_id}/{format}".format(
            db_id=embl_db,
            entry_id=','.join(embl_ids[idx:idx+num_req]),
            format=out_format
        )

        request_url = URL_REST + request_params

        seqs = url_read(request_url, agent=contact)

        # it seems that they add at the end of each request two newlines
        # (should be just one)
        seqs = seqs.replace('\n\n', '\n')

        if splitter.search(seqs):
            if strict:
                raise EntryNotFound("At least one Entry was not found")
            LOG.debug("At least one Entry was not found")
            entries.extend(splitter.split(seqs))
        else:
            entries.append(seqs)

    entries = ''.join(entries)

    # check for suppressed entries
    entries = '\n'.join(
        line for line in entries.split('\n')
        if line.find(SUPPRESSED) == -1
    )

    if error_re.search(entries):
        # if there's no sequences or we want all sequences to be downloaded
        if entries.find('>') == -1:
            raise NoEntryFound("No entry found")
        elif strict:
            raise EntryNotFound("At least one Entry was not found")
        # in case there's at least one sequence that was retrieved
        else:
            entries = ''.join(error_re.split(entries))
            LOG.debug("At least one Entry was not found")

    return entries


[docs]def dbfetch(embl_ids, db='embl', contact=None, out_format='seqxml',
            num_req=10):
    """
    .. versionadded:: 0.1.12

    Function that allows to use dbfetch service (REST). More information on the
    output formats and the database available at the
    `service page <http://www.ebi.ac.uk/Tools/dbfetch/syntax.jsp>`_

    Arguments:
        embl_ids (str, iterable): list or single sequence id to retrieve
        db (str): database from which retrieve the sequence data
        contact (str): email contact to use as per EMBL guidlines
        out_format (str): output format, depends on database
        num_req (int): number of ids per request

    Returns:
        list: a list with the results from each request sent. Each request sent
        has a maximum number *num_req* of ids, so the number of items in the
        list depends by the number of ids in *embl_ids* and the value of
        *num_req*.
    """
    if isinstance(embl_ids, str):
        embl_ids = [embl_ids]
    elif isinstance(embl_ids, (set, dict)):
        embl_ids = list(embl_ids)

    entries = []

    for idx in range(0, len(embl_ids), num_req):

        if len(embl_ids) > num_req:
            LOG.info("Downloading ids - range %d-%d", idx + 1, idx + num_req)

        request_params = "{db_id}/{entry_id}/{format}?style=raw".format(
            db_id=db,
            entry_id=','.join(embl_ids[idx:idx+num_req]),
            format=out_format
        )

        request_url = URL_REST + request_params

        if mgkit.DEBUG:
            LOG.debug(request_url)

        entries.append(url_read(request_url, agent=contact))

    return entries


[docs]def datawarehouse_search(query, domain='sequence', result='sequence_release',
                         display='fasta', offset=0, length=100000,
                         contact=None, download='gzip', url=URL_DATAWAREHOUSE,
                         fields=None):
    """
    .. versionchanged:: 0.2.3
        added *fields* parameter to retrieve tab separated information

    .. versionadded:: 0.1.13

    Perform a datawarehouse search on EMBL dbs. Instructions on the query
    language used to query the datawarehouse are available at `this page
    <http://www.ebi.ac.uk/ena/about/browser#data_warehouse>`_ with more details
    about the databases domains `at this page
    <http://www.ebi.ac.uk/ena/data/warehouse/usage>`_

    Arguments:
        query (str): query for the search enging
        domain (str): database domain to search
        result (str): domain result requested
        display (str): display option (format to retrieve the entries)
        offset (int): the offset of the search results, defaults to the first
        length (int): number of results to retrieve at the specified offset
            and the limit is automatically set a 100,000 records for query
        contact (str): email of the user
        download (str): type of response. Gzip responses are automatically
            decompressed
        url (str): base URL for the resource
        fields (None, iterable): must be an iterable of fields to be returned
            if display is set to *report*

    Returns:
        str: the raw request

    Examples:
        Querying EMBL for all sequences of type rRNA of the  *Clostridium*
        genus. Only from the EMBL release database in fasta format:

        >>> query = 'tax_tree(1485) AND mol_type="rRNA"'
        >>> result = 'sequence_release'
        >>> display = 'fasta'
        >>> data = embl.datawarehouse_search(query, result=result,
        ... display=display)
        >>> len(data)
        35919

        Each entry taxon_id from the same data can be retrieved using *report*
        as the *display* option and *fields* an iterable of fields to just
        ('accession', tax_id'):

        >>> query = 'tax_tree(1485) AND mol_type="rRNA"'
        >>> result = 'sequence_release'
        >>> display = 'report'
        >>> fields = ('accession', 'tax_id')
        >>> data = embl.datawarehouse_search(query, result=result,
            display=display, fields=fields)

    """

    params = {
        'query': query,
        'domain': domain,
        'result': result,
        'display': display,
        'offset': offset,
        'length': length,
        'download': download
    }

    # if display='report', fields can be used to display, but download needs to
    # be 'txt'
    if display == 'report':
        params['download'] = download = 'txt'
        try:
            params['fields'] = ','.join(fields)
        except TypeError:
            raise ValueError(
                "the 'fields' parameter must be an iteratble if display is "
                "equal to 'report'"
                )

    stream = True if download == 'gzip' else False
    request = requests.get(url, params, headers={'user-agent': contact}, stream=stream)
    if download == 'gzip':
        data = StringIO(request.raw.read())
        data = gzip.GzipFile(mode='rb', fileobj=data).read()
    else:
        data = request.text

    return data
Source code for mgkit.net.embl

MGKit: Metagenomic framework

Navigation

Related Topics