Source code for gepyto.structures.genes

# Structures to handle genes and their associated transcripts and their
# proteins.

# This file is part of gepyto.
#
# This work is licensed under the Creative Commons Attribution-NonCommercial
# 4.0 International License. To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative
# Commons, PO Box 1866, Mountain View, CA 94042, USA.

__author__ = "Marc-Andre Legault"
__copyright__ = ("Copyright 2014 Marc-Andre Legault and Louis-Philippe "
                 "Lemieux Perreault. All rights reserved.")
__license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"


import sys
import contextlib
import json
import logging
import re

try:
    # Python 2 support
    from urllib2 import urlopen, Request
except ImportError:
    # Python 3 support
    from urllib.request import urlopen, Request

from .. import settings
from ..db.ensembl import query_ensembl, get_url_prefix
from ..db.appris import get_category_for_transcript
from .region import Region
from . import sequences


__all__ = ["Gene", "Transcript"]


class Gene(object):
[docs] """Python object representing a gene. Store the following information: Required - build: The genome build. - chrom: The chromosome. - start and end: The genomic positions for the gene. - strand: The strand (either 1 or -1). - xrefs: A dict of id mappings to multiple databases. - transcripts: A list of Transcript objects. Optional - symbol: An HGNC symbol. - desc: A short description. - exons: A list of pairs of positions for exons. You can only pass kwargs to build the genes. This makes for more eloquent code and avoids mistakes. """ def __init__(self, **kwargs): _PARAMETERS = { "build": str, "chrom": str, "start": int, "end": int, "strand": int, "xrefs": dict, } _OPTIONAL_PARAMETERS = { "symbol": str, "desc": str, "transcripts": list, "exons": list, "biotype": str, } _ALL_PARAMS = dict( list(_PARAMETERS.items()) + list(_OPTIONAL_PARAMETERS.items()) ) # Store the passed parameters. for arg, val in kwargs.items(): if arg not in _ALL_PARAMS: raise Exception("Unknown parameter {}.".format(arg)) try: setattr(self, arg, _ALL_PARAMS[arg](val)) except TypeError: raise TypeError("Invalid type for argument {}. Expected " "a {}.".format(arg, _ALL_PARAMS[arg])) # Check that all required arguments were passed. for p in _PARAMETERS: if getattr(self, p, "-1") == "-1": raise Exception("Missing parameter {}.".format(p)) # Some basic assertions. assert self.build in ("GRCh37", "GRCh38") assert re.match(r"([0-9]{1,2}|MT|X|Y)", self.chrom) assert self.start < self.end def __repr__(self): # Getting the best name name_repr = None if hasattr(self, "symbol"): name_repr = self.symbol else: for ref in ("ensembl_gene_id", "entrez_id", "ucsc_id"): if ref in self.xrefs: name_repr = "{} ({})".format(self.xrefs[ref], ref) break if name_repr is None: name_repr = "Unknown" return "<Gene:{} (chr{}:{}-{})>".format( name_repr, self.chrom, self.start, self.end ) @property def region(self): """Lazily loads the Region object for this Gene.""" if hasattr(self, "_region"): return self._region else: self._region = Region(self.chrom, self.start, self.end) return self.region def __contains__(self, o): """Test if an object is in the gene. This compares the chrom, start and end (or pos) attributes of the given object with the gene. You can use the syntax: `rs123 in gene`. Where the `rs123` is a SNP object (or other) and `gene` is a `Gene` object. """ return o in self.region def get_ortholog_sequences(self):
[docs] """Queries Ensembl to get Sequence objects representing orthologs. :returns: A list of :py:class:`gepyto.structures.sequences.Sequence` :rtype: list """ return self._homology("orthologs") def get_paralog_sequences(self):
[docs] """Queries Ensembl to get Sequence objects representing paralogs. :returns: A list of :py:class:`gepyto.structures.sequences.Sequence` :rtype: list """ return self._homology("paralogs") def _homology(self, homo_type="orthologs"):
"""Retrieve homology information from Ensembl. :param homo_type: The type of homologuous sequence to query. Can be either 'orthologs' or 'paralogs'. :type homo_type: str """ homo_types = { "orthologs": "orthologues", "paralogs": "paralogues" } if homo_type not in homo_types.keys(): raise Exception("Invalid homology type. Valid parameters are: " "{}".foramt(homo_types.keys())) if "ensembl_id" not in self.xrefs: raise Exception("Can't retrieve homology information from Ensembl " "without an 'ensembl_id' in the cross references " "(xrefs).") url = get_url_prefix(settings.BUILD) url += "homology/id/{}?content-type=application/json&type={}" url = url.format(self.xrefs["ensembl_id"], homo_types[homo_type]) res = query_ensembl(url) homolog_sequences = [] for hit in res["data"][0]["homologies"]: hit = hit["target"] seq_id = hit["protein_id"] seq = hit["align_seq"] seq_obj = sequences.Sequence( seq_id, "".join([c for c in seq if c != '-']), "AA", { "species": hit["species"], "species_ncbi_tax_id": hit["taxon_id"], "perc_id": hit["perc_id"] } ) homolog_sequences.append(seq_obj) return homolog_sequences @classmethod def factory_symbol(cls, symbol, build=settings.BUILD):
[docs] """Builds a gene object from it's HGNC symbol. :param symbol: The HGNC symbol. :type symbol: str :returns: The Gene object. :rtype: :py:class:`Gene` """ xrefs = Gene.get_xrefs_from_symbol(symbol, build=build) ensembl_id = xrefs.get("ensembl_id") if ensembl_id is None: raise Exception("Could not initialize gene from symbol {} because " "the Ensembl ID (ENSG) could not be " "found.".format(symbol)) return Gene.factory_ensembl_id(ensembl_id, xrefs=xrefs, build=build) @classmethod
def factory_ensembl_id(cls, ensembl_id, xrefs=None, build=settings.BUILD):
[docs] """Builds a gene object from it's Ensembl ID. :param ensembl_id: The Ensembl ID. :type ensembl_id: str :returns: The Gene object. :rtype: :py:class:`Gene` """ url = get_url_prefix(build) url += ("overlap/id/{}" "?content-type=application/json" "&feature=gene" "&feature=transcript" "&feature=exon") response = query_ensembl(url.format(ensembl_id)) # The response contains both gene information, the underlying # transcripts and the exons. We need to parse all that. transcripts = [] exons = [] gene_info = None for elem in response: # We skip irrelevent entries (not on this build). if elem.get("assembly_name") and elem["assembly_name"] != build: continue if elem["feature_type"] == "gene" and elem["id"] == ensembl_id: gene_info = _parse_gene(elem) elif elem["feature_type"] == "transcript": tr = _parse_transcript(elem) # Sometimes, ensembl returns odd things that are unrelated, # we filter here for those. if tr.parent == ensembl_id: transcripts.append(tr) elif elem["feature_type"] == "exon": exons.append(_parse_exon(elem)) if gene_info is None: raise Exception("Could not find gene {} in Ensembl.".format( ensembl_id, )) if xrefs is None: xrefs = Gene.get_xrefs_from_ensembl_id(ensembl_id, build=build) if "symbol" in xrefs: gene_info["symbol"] = xrefs.pop("symbol") gene_info["xrefs"] = xrefs gene_info["transcripts"] = transcripts gene_info["exons"] = exons g = Gene(**gene_info) for tr in transcripts: tr.parent = g return g @classmethod
def get_xrefs_from_ensembl_id(cls, ensembl_id, build=settings.BUILD):
[docs] """Fetches the HGNC (HUGO Gene Nomenclature Commitee) service to get a gene ID for other databases. :param ensembl_id: The gene Ensembl ID to query. :type ensembl_id: str :returns: A dict representing information on the gene. :rtype: dict If no gene with this Ensembl ID can be found, `None` is returned. """ return Gene.get_xrefs("ensembl_gene_id", ensembl_id) @classmethod
def get_xrefs_from_symbol(cls, symbol, build=settings.BUILD):
[docs] """Fetches the HGNC (HUGO Gene Nomenclature Commitee) service to get a gene ID for other databases. :param symbol: The gene symbol to query. :type symbol: str :returns: A dict representing information on the gene. :rtype: dict If no gene with this symbol can be found, `None` is returned. """ return Gene.get_xrefs("symbol", symbol) @classmethod
def get_xrefs(cls, field, query, build=settings.BUILD):
[docs] """Fetches the HGNC (HUGO Gene Nomenclature Commitee) service to get a gene ID for other databases. :param field: A searchable fields. :type field: str :param query: The query. :type query: str :returns: A dict representing information on the gene. :rtype: dict If no gene with this symbol can be found, `None` is returned. """ # Checks if the field is valid assert field in {"ccds_id", "ensembl_gene_id", "entrez_id", "hgnc_id", "name", "symbol", "ucsc_id", "uniprot_ids", "vega_id"} # The url and header url = "http://rest.genenames.org/search/{field}:{query}" headers = {"Accept": "application/json"} req = Request(url.format(field=field, query=query), headers=headers) with contextlib.closing(urlopen(req)) as stream: res = json.loads(stream.read().decode()) # We take the top search hit and run a fetch. if res["response"]["numFound"] > 0: doc = res["response"]["docs"][0] assert doc["score"] == res["response"]["maxScore"] # Use the HGNC Fetch. url = "http://rest.genenames.org/fetch/{field}/{query}" req = Request(url.format(field=field, query=query), headers=headers) with contextlib.closing(urlopen(req)) as stream: res = json.loads(stream.read().decode()) # Parse the cross references. if res["response"]["numFound"] > 0: doc = res["response"]["docs"][0] # Check the right symbol was found assert doc.get(field) == query id_dict = { "name": doc.get("name"), "symbol": doc.get("symbol"), "ncbi_id": doc.get("entrez_id"), "cosmic_id": doc.get("cosmic"), "refseq_ids": doc.get("refseq_accession"), "ensembl_id": doc.get("ensembl_gene_id"), "omim_ids": doc.get("omim_id"), "uniprot_ids": doc.get("uniprot_ids"), "ucsc_id": doc.get("ucsc_id"), } return id_dict else: raise Exception("No gene returned by HGNC fetch on " "{field} {query}.".format(field=field, query=query)) elif field == "ensembl_gene_id": # Get from Ensembl url = get_url_prefix(build) url += "xrefs/id/{}?content-type=application/json" url = url.format(query) response = query_ensembl(url) id_dict = {"ensembl_gene_id": query} for db_info in response: # Entrez if db_info["dbname"] == "EntrezGene": id_dict["entrez_id"] = db_info["primary_id"] continue # UniPROT if db_info["dbname"] == "Uniprot_gn": id_dict["uniprot_ids"] = db_info["primary_id"] return id_dict else: return {} class Transcript(object):
[docs] """Python object representing a transcript. Store the following information: Required - build: The genome build. - chrom: The chromosome. - start and end: The genomic positions for the gene. - enst: The corresponding Ensembl transcript id. Optional - appris_cat: The APPRIS category. - parent: The corresponding Gene object. - biotype: The biotype as given by Ensembl. """ def __init__(self, **kwargs): dummy = lambda x: x _PARAMETERS = { "build": str, "chrom": str, "start": int, "end": int, "enst": str, } _OPTIONAL_PARAMETERS = { "appris_cat": str, "parent": dummy, "biotype": str, } _ALL_PARAMS = dict( list(_PARAMETERS.items()) + list(_OPTIONAL_PARAMETERS.items()) ) # Store the passed parameters. for arg, val in kwargs.items(): if arg not in _ALL_PARAMS: raise Exception("Unknown parameter {}.".format(arg)) setattr(self, arg, _ALL_PARAMS[arg](val)) # Check that all required arguments were passed. for p in _PARAMETERS: if getattr(self, p, "-1") == "-1": raise Exception("Missing parameter {}.".format(p)) # Some basic assertions. assert self.build in ("GRCh37", "GRCh38") assert re.match(r"([0-9]{1,2}|MT|X|Y)", self.chrom) assert self.start < self.end @property def region(self): """Lazily loads the Region object for this Transcript.""" if hasattr(self, "_region"): return self._region else: self._region = Region(self.chrom, self.start, self.end) return self.region def get_sequence(self, seq_type="genomic"):
[docs] """Build a Sequence object representing the transcript. :param seq_type: This can be either genomic, cds, cdna or protein. :type seq_type: str :returns: A Sequence object representing the feature. :rtype: :py:class:`gepyto.structures.sequences.Sequence` """ # Get the sequence from Ensembl. seq_types = ("genomic", "cds", "cdna", "protein") if seq_type not in seq_types: raise Exception("Invalid sequence type ({}). Known types are: " "{}".format(seq_type, ", ".join(seq_types))) url = get_url_prefix(self.build) url += "sequence/id/{}?content-type=application/json&type={}" url = url.format(self.enst, seq_type) res = query_ensembl(url) # Build the Sequence. seq = sequences.Sequence( res["id"], res["seq"], "AA" if seq_type == "protein" else "DNA", ) return seq @classmethod
def factory_position(cls, region, build=settings.BUILD):
[docs] """Gets a list of transcripts overlapping with the given position. :param pos: A genomic position of the form `chr2:12345-12347`. :type pos: str :returns: A list of :py:class:`Transcript` :rtype: list This method uses the Ensembl API. """ assert re.match(r"^chr([0-9]{1,2}|MT|X|Y):[0-9]+-[0-9]+$", region) region = region.lstrip("chr") region = region.replace("-", "..") url = get_url_prefix(build) url += ("overlap/region/homo_sapiens/{}" "?feature=transcript" "&content-type=application/json") url = url.format(region) response = query_ensembl(url) transcripts = [] for tr in response: transcripts.append(_parse_transcript(tr)) return transcripts def __repr__(self):
return "<Transcript:{} (chr{}:{}-{})>".format( self.enst, self.chrom, self.start, self.end, ) def _parse_gene(o):
"""Parse gene information from an Ensembl `overlap` query. :param o: A Python dict representing an entry from the response. :type o: dict :returns: A standardised dict of gene information. :rtype: dict """ assert o["feature_type"] == "gene" d = { "desc": o.get("description"), "build": o.get("assembly_name"), "chrom": o.get("seq_region_name"), "start": o.get("start"), "end": o.get("end"), "strand": o.get("strand"), "biotype": o.get("biotype"), } return d def _parse_transcript(o): """Parse transcript information from an Ensembl `overlap` query. :param o: A Python dict representing an entry from the response. :type o: dict :returns: A Transcript object representing the transcript. :rtype: :py:class:`Transcript` """ assert o["feature_type"] == "transcript" d = { "build": o.get("assembly_name"), "chrom": o.get("seq_region_name"), "start": o.get("start"), "end": o.get("end"), "enst": o.get("id"), "biotype": o.get("biotype"), "parent": o.get("Parent"), } # Get the APPRIS annotation. try: d["appris_cat"] = get_category_for_transcript(d["enst"]) except Exception: pass return Transcript(**d) def _parse_exon(o): """Parse exon information from an Ensembl `overlap` query. :param o: A Python dict representing an entry from the response. :type o: dict :returns: A tuple of (start, end) positions representing the exon. :rtype: tuple """ assert o["feature_type"] == "exon" return (o["start"], o["end"])