"""
Contains functions and constants for Uniprot access
"""
from __future__ import division
from builtins import range, zip
import mgkit
import logging
from . import url_read
UNIPROT_MAP = 'http://www.uniprot.org/mapping/'
"URL to Uniprot mapping REST API"
UNIPROT_GET = 'http://www.uniprot.org/uniprot/'
"URL to Uniprot REST API"
UNIPROT_TAXONOMY = 'http://www.uniprot.org/taxonomy/'
"URL to Uniprot REST API - Taxonomy"
# Column identifiers for Uniprot tabular results. The same string values are
# passed as *columns* by the query helpers in this module (e.g. 'ec',
# 'database(KO)'); presumably these constants are meant to be used for that
# argument - TODO confirm against callers.
# Taxon id of the organism
COLS_TAXON = 'organism-id'
# Cross-reference to KO ids
COLS_KO = 'database(KO)'
# Cross-reference to eggNOG ids
COLS_EGGNOG = 'database(EGGNOG)'
# EC numbers
COLS_EC = 'ec'
# Module-level logger, named after this module
LOG = logging.getLogger(__name__)
def get_sequences_by_ko(ko_id, taxonomy, contact=None, reviewed=True):
    """
    Downloads from Uniprot the sequences cross-referenced to a KO id,
    restricted to the taxon id passed.

    The request asks for the 'fasta' format, with a 'limit' of 200 and sorted
    by 'score'.

    :param str ko_id: KO id of the sequences to download
    :param int taxonomy: id of the taxon
    :param str contact: email address to be passed in the query (requested by
        Uniprot API)
    :param bool reviewed: if the sequences requested must be reviewed

    :return: string with the fasta file downloaded
    """
    # optional restriction to reviewed entries, appended to the query
    reviewed_filter = ' reviewed:yes' if reviewed else ''
    query = 'database:(type:ko {0}) AND taxonomy:{1}{2}'.format(
        ko_id, taxonomy, reviewed_filter)

    params = {
        'query': query,
        'format': 'fasta',
        'limit': 200,
        'sort': 'score'
    }

    if mgkit.DEBUG:
        LOG.debug("query: %s?%s", UNIPROT_GET, params)
        LOG.debug("request length %d", len(params))

    return url_read(UNIPROT_GET, data=params, agent=contact)
def get_mappings(entry_ids, db_from='ID', db_to='EMBL', out_format='tab',
                 contact=None):
    """
    Gets mapping of genes using Uniprot REST API. The db_from and db_to values
    are the ones accepted by Uniprot API. The same applies to out_format, the
    only processed formats are 'list', which returns a list of the mappings
    (should be used with one gene only) and 'tab', which returns a dictionary
    with the mapping. All other values return the response as a string with
    the surrounding whitespace stripped.

    :param iterable entry_ids: iterable of ids to be mapped (there's a limit
        to the maximum length of a HTTP request, so it should be less than
        50); a single id can be passed as a string
    :param str db_from: string that identify the DB for elements in entry_ids
    :param str db_to: string that identify the DB to which map entry_ids
    :param str out_format: format of the mapping; 'list' and 'tab' are
        processed
    :param str contact: email address to be passed in the query (requested
        Uniprot API)

    :return: list, dict or str depending on out_format value; for 'tab' the
        dictionary has the form id_from -> [id_to, ..]

    :raises ValueError: if a non-blank line of a 'tab' response does not
        contain exactly two tab separated values
    """
    if isinstance(entry_ids, str):
        entry_ids = [entry_ids]

    data = {
        'from': db_from,
        'to': db_to,
        'query': ' '.join(entry_ids),
        'format': out_format
    }

    mappings = url_read(UNIPROT_MAP, data=data, agent=contact).strip()

    if out_format == 'list':
        return mappings.split('\n')
    if out_format != 'tab':
        return mappings

    mapping_dict = {}
    # splitlines() also copes with '\r\n' line endings; the first row is the
    # header ('From', 'To') and is skipped
    for mapping in mappings.splitlines()[1:]:
        # blank lines carry no mapping and would fail the unpacking below
        if not mapping.strip():
            continue
        id_from, id_to = mapping.split('\t')
        # 'null' marks an id without a mapping
        if id_to == 'null':
            continue
        mapping_dict.setdefault(id_from, []).append(id_to)

    return mapping_dict
def ko_to_mapping(ko_id, query, columns, contact=None):
    """
    Returns the mappings to the supplied KO. Can be used for any id, the
    query format is free as well as the columns returned. The only
    restriction is using a tab format, that is parsed.

    :param str ko_id: id used in the query
    :param str query: query passed to the Uniprot API, ko_id is replaced
        using :func:`str.format`
    :param str columns: column used in the results table used to map the ids
    :param str contact: email address to be passed in the query (requested
        Uniprot API)

    :return: set with all the non-empty values found in the response

    .. note::

        each mapping in the column is separated by a ;

    """
    data = {
        'query': query.format(ko_id),
        'format': 'tab',
        'columns': columns
    }

    # logged before the request, so the query is visible even if it fails
    if mgkit.DEBUG:
        LOG.debug("query: %s?%s", UNIPROT_GET, data)
        LOG.debug("request length %d", len(data))

    response = url_read(UNIPROT_GET, data=data, agent=contact)

    # the first line is the header of the table and is skipped
    map_lines = response.split('\n')[1:]

    categories = set()
    for map_line in map_lines:
        for mapping in map_line.split(';'):
            mapping = mapping.strip()
            # empty values come from blank lines and trailing separators
            if mapping:
                categories.add(mapping)

    return categories
def get_gene_info(gene_ids, columns, max_req=50, contact=None):
    """
    .. versionadded:: 0.1.12

    Get informations about a list of genes. it uses :func:`query_uniprot` to
    send the request and format the response in a dictionary.

    Arguments:
        gene_ids (iterable, str): gene id(s) to get informations for; any
            iterable is accepted and a single id can be passed as a string
        columns (iterable, str): column(s) to request
        max_req (int): number of maximum *gene_ids* per request
        contact (str): email address to be passed in the query (requested
            Uniprot API)

    Returns:
        dict: dictionary where the keys are the ids found in the first column
        of the response and the values are dictionaries with the names of the
        *columns* requested as keys and the corresponding values, which can
        be lists if the values are semicolon separated strings.

    Example:
        To get the taxonomy ids for some genes:

        >>> uniprot.get_gene_info(['Q09575', 'Q8DQI6'], ['organism-id'])
        {'Q09575': {'organism-id': '6239'}, 'Q8DQI6': {'organism-id': '171101'}}

    """
    # the ids are sliced in chunks, so an indexable sequence is needed
    if isinstance(gene_ids, str):
        gene_ids = [gene_ids]
    elif not isinstance(gene_ids, list):
        gene_ids = list(gene_ids)

    # a list is needed to prepend the 'id' column below
    if isinstance(columns, str):
        columns = [columns]
    else:
        columns = list(columns)

    total = len(gene_ids)
    infos = {}

    for index in range(0, total, max_req):
        chunk = gene_ids[index:index + max_req]

        # the last chunk can be shorter than max_req
        LOG.info(
            "Querying uniprot ids (%d/%d)",
            min(index + max_req, total),
            total
        )

        response = query_uniprot(
            ' OR '.join('id:{}'.format(gene_id) for gene_id in chunk),
            columns=['id'] + columns,
            contact=contact
        )

        # the first line is the header of the table and is skipped
        for info_line in response.split('\n')[1:]:
            info_line = info_line.strip()
            if not info_line:
                continue

            values = info_line.split('\t')
            gene_info = {}
            for column, value in zip(columns, values[1:]):
                # multiple values are detected by a trailing ';' or by the
                # '; ' separator and are returned as a list
                if value.endswith(';') or '; ' in value:
                    value = [x.strip() for x in value.split(';') if x.strip()]
                gene_info[column] = value
            # the first value is always the entry id
            infos[values[0]] = gene_info

    return infos
def query_uniprot(query, columns=None, format='tab', limit=None, contact=None,
                  baseurl=UNIPROT_GET):
    """
    .. versionadded:: 0.1.12

    .. versionchanged:: 0.1.13
        added *baseurl* and made *columns* a default argument

    Queries Uniprot, returning the raw response in the format specified. More
    informations at the `page <http://www.uniprot.org/faq/28>`_

    Arguments:
        query (str): query to submit, as put in the input box
        columns (None, iterable, str): list of columns to return; a string is
            passed to the API as it is
        format (str): response format
        limit (int, None): number of entries to return or *None* to request all
            entries
        contact (str): email address to be passed in the query (requested
            Uniprot API)
        baseurl (str): base url for the REST API, can be either
            :data:`UNIPROT_GET` or :data:`UNIPROT_TAXONOMY`

    Returns:
        str: raw response from the query

    Example:
        To get the taxonomy ids for some genes:

        >>> uniprot.query_uniprot('Q09575 OR Q8DQI6', ['id', 'organism-id'])
        'Entry\\tOrganism ID\\nQ8DQI6\\t171101\\nQ09575\\t6239\\n'

    .. warning::

        because of limits in the length of URLs, it's advised to limit the
        length of the query string.

    """
    data = {
        'query': query,
        'format': format
    }

    if limit is not None:
        data['limit'] = limit

    if columns is not None:
        # joining a string would separate its characters with commas, so a
        # string is assumed to be already formatted
        if isinstance(columns, str):
            data['columns'] = columns
        else:
            data['columns'] = ','.join(columns)

    if mgkit.DEBUG:
        LOG.debug("query: %s?%s", baseurl, data)
        LOG.debug("request length %d", len(data))

    return url_read(baseurl, data, agent=contact)
def parse_uniprot_response(data, simple=True):
    """
    .. versionadded:: 0.1.12

    Parses raw response from a Uniprot query (tab format only) from functions
    like :func:`query_uniprot` into a dictionary. It requires that the first
    column is the entry id (or any other unique id).

    Arguments:
        data (str): string response from Uniprot
        simple (bool): if True and the number of columns is 1, the dictionary
            returned has a simplified structure

    Returns:
        dict: The format of the resulting dictionary is
        entry_id -> {column1 -> value, column2 -> value, ..} unless there's
        only one column and *simple* is True, in which case the value is
        equal to the value of the only column. The column names are the ones
        in the header of the response, in lower case. An empty response
        gives an empty dictionary and blank lines are skipped.
    """
    lines = data.splitlines()
    # an empty response has no header to read the column names from
    if not lines:
        return {}

    # the first column of the header is the one of the entry id
    columns = [x.lower() for x in lines[0].split('\t')[1:]]
    single_value = simple and len(columns) == 1

    parsed_data = {}

    for line in lines[1:]:
        if not line.strip():
            continue
        fields = line.split('\t')
        entry_id = fields[0]
        if single_value:
            # the value is missing if the trailing tab was stripped from
            # the line, which is equivalent to an empty value
            parsed_data[entry_id] = fields[1] if len(fields) > 1 else ''
        else:
            parsed_data[entry_id] = dict(zip(columns, fields[1:]))

    return parsed_data
def get_ko_to_eggnog_mappings(ko_ids, contact=None):
    """
    .. versionadded:: 0.1.14

    It's not possible to map in one go KO IDs to eggNOG IDs via the API in
    Uniprot. This function uses :func:`query_uniprot` to get all Uniprot IDs
    requested and then returns a dictionary with all the eggNOG IDs they map
    to.

    Arguments:
        ko_ids (iterable, str): an iterable of KO IDs; a single ID can be
            passed as a string
        contact (str): email address to be passed in the query (requested
            Uniprot API)

    Returns:
        dict: The format of the resulting dictionary is
        ko_id -> {eggnog_id1, ..}; it is empty if no *ko_ids* are passed

    Raises:
        ValueError: if a non-blank line of the response does not contain
            exactly two tab separated values
    """
    # a string would otherwise be split into its characters
    if isinstance(ko_ids, str):
        ko_ids = [ko_ids]
    else:
        ko_ids = list(ko_ids)

    # without ids the query would be malformed, so no request is made
    if not ko_ids:
        return {}

    response = query_uniprot(
        "database:(type:ko AND ({}))".format(' OR '.join(ko_ids)),
        columns=['database(KO)', 'database(EGGNOG)'],
        contact=contact
    )

    parsed_data = {}

    # the first line is the header of the table and is skipped
    for line in response.splitlines()[1:]:
        if not line.strip():
            continue

        line_ko_ids, line_eggnog_ids = line.split('\t')

        # IDs are separated by ';', a trailing separator gives an empty ID
        eggnog_ids = [
            eggnog_id
            for eggnog_id in line_eggnog_ids.split(';')
            if eggnog_id
        ]
        # entries without eggNOG IDs don't contribute to the mapping
        if not eggnog_ids:
            continue

        for ko_id in line_ko_ids.split(';'):
            if not ko_id:
                continue
            parsed_data.setdefault(ko_id, set()).update(eggnog_ids)

    return parsed_data
def get_uniprot_ec_mappings(gene_ids, contact=None):
    """
    .. versionadded:: 0.1.14

    Shortcut to download EC mapping of Uniprot IDs. It calls
    :func:`get_gene_info` requesting only the *ec* column, with a maximum of
    100 ids per request.

    Arguments:
        gene_ids: gene id(s), passed unchanged to :func:`get_gene_info`
        contact (str): email address to be passed in the query

    Returns:
        the value returned by :func:`get_gene_info`
    """
    ec_columns = ['ec']
    return get_gene_info(
        gene_ids, columns=ec_columns, max_req=100, contact=contact
    )
def get_gene_info_iter(gene_ids, columns, contact=None, max_req=50):
    """
    .. versionadded:: 0.3.3

    Alternative function to :func:`get_gene_info`, returning an iterator to
    avoid connections timeouts when updating a dictionary

    This function's parameters are the same as :func:`get_gene_info`

    Yields:
        the value returned by :func:`get_gene_info` for each chunk of at
        most *max_req* ids
    """
    # a single id passed as a string must not be split into its characters
    if isinstance(gene_ids, str):
        gene_ids = [gene_ids]
    else:
        gene_ids = list(gene_ids)

    for index in range(0, len(gene_ids), max_req):
        yield get_gene_info(
            gene_ids[index:index + max_req],
            columns,
            contact=contact,
            max_req=max_req
        )