Source code for gepyto.structures.genes

# Structures to handle genes and their associated transcripts and their
# proteins.

# This file is part of gepyto.
#
# This work is licensed under the Creative Commons Attribution-NonCommercial
# 4.0 International License. To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative
# Commons, PO Box 1866, Mountain View, CA 94042, USA.

__author__ = "Marc-Andre Legault"
__copyright__ = ("Copyright 2014 Marc-Andre Legault and Louis-Philippe "
                 "Lemieux Perreault. All rights reserved.")
__license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"


import sys
import contextlib
import json
import logging
import re

try:
    # Python 2 support
    from urllib2 import urlopen, Request
except ImportError:
    # Python 3 support
    from urllib.request import urlopen, Request

from .. import settings
from ..db.ensembl import query_ensembl, get_url_prefix
from ..db.appris import get_category_for_transcript
from .region import Region
from . import sequences


__all__ = ["Gene", "Transcript"]


class Gene(object):
[docs]    """Python object representing a gene.

    Store the following information:

    Required

        - build: The genome build.
        - chrom: The chromosome.
        - start and end: The genomic positions for the gene.
        - strand: The strand (either 1 or -1).
        - xrefs: A dict of id mappings to multiple databases.
        - transcripts: A list of Transcript objects.

    Optional

        - symbol: An HGNC symbol.
        - desc: A short description.
        - exons: A list of pairs of positions for exons.

    You can only pass kwargs to build the genes. This makes for more
    eloquent code and avoids mistakes.

    """

    def __init__(self, **kwargs):

        _PARAMETERS = {
            "build": str,
            "chrom": str,
            "start": int,
            "end": int,
            "strand": int,
            "xrefs": dict,
        }

        _OPTIONAL_PARAMETERS = {
            "symbol": str,
            "desc": str,
            "transcripts": list,
            "exons": list,
            "biotype": str,
        }

        _ALL_PARAMS = dict(
            list(_PARAMETERS.items()) + list(_OPTIONAL_PARAMETERS.items())
        )

        # Store the passed parameters.
        for arg, val in kwargs.items():
            if arg not in _ALL_PARAMS:
                raise Exception("Unknown parameter {}.".format(arg))
            try:
                setattr(self, arg, _ALL_PARAMS[arg](val))
            except TypeError:
                raise TypeError("Invalid type for argument {}. Expected "
                                "a {}.".format(arg, _ALL_PARAMS[arg]))

        # Check that all required arguments were passed.
        for p in _PARAMETERS:
            if getattr(self, p, "-1") == "-1":
                raise Exception("Missing parameter {}.".format(p))

        # Some basic assertions.
        assert self.build in ("GRCh37", "GRCh38")
        assert re.match(r"([0-9]{1,2}|MT|X|Y)", self.chrom)
        assert self.start < self.end

    def __repr__(self):
        # Getting the best name
        name_repr = None
        if hasattr(self, "symbol"):
            name_repr = self.symbol
        else:
            for ref in ("ensembl_gene_id", "entrez_id", "ucsc_id"):
                if ref in self.xrefs:
                    name_repr = "{} ({})".format(self.xrefs[ref], ref)
                    break

        if name_repr is None:
            name_repr = "Unknown"

        return "<Gene:{} (chr{}:{}-{})>".format(
            name_repr,
            self.chrom,
            self.start,
            self.end
        )

    @property
    def region(self):
        """Lazily loads the Region object for this Gene."""
        if hasattr(self, "_region"):
            return self._region
        else:
            self._region = Region(self.chrom, self.start, self.end)
            return self.region

    def __contains__(self, o):
        """Test if an object is in the gene.

        This compares the chrom, start and end (or pos) attributes of the
        given object with the gene.

        You can use the syntax: `rs123 in gene`.
        Where the `rs123` is a SNP object (or other) and `gene` is a `Gene`
        object.

        """
        return o in self.region 

    def get_ortholog_sequences(self):
[docs]        """Queries Ensembl to get Sequence objects representing orthologs.

        :returns: A list of :py:class:`gepyto.structures.sequences.Sequence`
        :rtype: list

        """
        return self._homology("orthologs")

    def get_paralog_sequences(self):
[docs]        """Queries Ensembl to get Sequence objects representing paralogs.

        :returns: A list of :py:class:`gepyto.structures.sequences.Sequence`
        :rtype: list

        """
        return self._homology("paralogs")

    def _homology(self, homo_type="orthologs"):
        """Retrieve homology information from Ensembl.

        :param homo_type: The type of homologuous sequence to query. Can be
                          either 'orthologs' or 'paralogs'.
        :type homo_type: str

        """

        homo_types = {
            "orthologs": "orthologues",
            "paralogs": "paralogues"
        }
        if homo_type not in homo_types.keys():
            raise Exception("Invalid homology type. Valid parameters are: "
                            "{}".foramt(homo_types.keys()))

        if "ensembl_id" not in self.xrefs:
            raise Exception("Can't retrieve homology information from Ensembl "
                            "without an 'ensembl_id' in the cross references "
                            "(xrefs).")

        url = get_url_prefix(settings.BUILD)
        url += "homology/id/{}?content-type=application/json&type={}"
        url = url.format(self.xrefs["ensembl_id"], homo_types[homo_type])

        res = query_ensembl(url)

        homolog_sequences = []
        for hit in res["data"][0]["homologies"]:
            hit = hit["target"]

            seq_id = hit["protein_id"]
            seq = hit["align_seq"]

            seq_obj = sequences.Sequence(
                seq_id,
                "".join([c for c in seq if c != '-']),
                "AA",
                {
                    "species": hit["species"],
                    "species_ncbi_tax_id": hit["taxon_id"],
                    "perc_id": hit["perc_id"]
                }
            )
            homolog_sequences.append(seq_obj)

        return homolog_sequences

    @classmethod
    def factory_symbol(cls, symbol, build=settings.BUILD):
[docs]        """Builds a gene object from it's HGNC symbol.

        :param symbol: The HGNC symbol.
        :type symbol: str

        :returns: The Gene object.
        :rtype: :py:class:`Gene`

        """
        xrefs = Gene.get_xrefs_from_symbol(symbol, build=build)
        ensembl_id = xrefs.get("ensembl_id")

        if ensembl_id is None:
            raise Exception("Could not initialize gene from symbol {} because "
                            "the Ensembl ID (ENSG) could not be "
                            "found.".format(symbol))

        return Gene.factory_ensembl_id(ensembl_id, xrefs=xrefs, build=build)

    @classmethod
    def factory_ensembl_id(cls, ensembl_id, xrefs=None, build=settings.BUILD):
[docs]        """Builds a gene object from it's Ensembl ID.

        :param ensembl_id: The Ensembl ID.
        :type ensembl_id: str

        :returns: The Gene object.
        :rtype: :py:class:`Gene`

        """
        url = get_url_prefix(build)
        url += ("overlap/id/{}"
                "?content-type=application/json"
                "&feature=gene"
                "&feature=transcript"
                "&feature=exon")

        response = query_ensembl(url.format(ensembl_id))

        # The response contains both gene information, the underlying
        # transcripts and the exons. We need to parse all that.
        transcripts = []
        exons = []
        gene_info = None
        for elem in response:
            # We skip irrelevent entries (not on this build).
            if elem.get("assembly_name") and elem["assembly_name"] != build:
                continue

            if elem["feature_type"] == "gene" and elem["id"] == ensembl_id:
                gene_info = _parse_gene(elem)
            elif elem["feature_type"] == "transcript":
                tr = _parse_transcript(elem)
                # Sometimes, ensembl returns odd things that are unrelated,
                # we filter here for those.
                if tr.parent == ensembl_id:
                    transcripts.append(tr)
            elif elem["feature_type"] == "exon":
                exons.append(_parse_exon(elem))

        if gene_info is None:
            raise Exception("Could not find gene {} in Ensembl.".format(
                ensembl_id,
            ))

        if xrefs is None:
            xrefs = Gene.get_xrefs_from_ensembl_id(ensembl_id, build=build)

        if "symbol" in xrefs:
            gene_info["symbol"] = xrefs.pop("symbol")
        gene_info["xrefs"] = xrefs
        gene_info["transcripts"] = transcripts
        gene_info["exons"] = exons

        g = Gene(**gene_info)
        for tr in transcripts:
            tr.parent = g

        return g

    @classmethod
    def get_xrefs_from_ensembl_id(cls, ensembl_id, build=settings.BUILD):
[docs]        """Fetches the HGNC (HUGO Gene Nomenclature Commitee) service to get a
           gene ID for other databases.

        :param ensembl_id: The gene Ensembl ID to query.
        :type ensembl_id: str

        :returns: A dict representing information on the gene.
        :rtype: dict

        If no gene with this Ensembl ID can be found, `None` is returned.

        """
        return Gene.get_xrefs("ensembl_gene_id", ensembl_id)

    @classmethod
    def get_xrefs_from_symbol(cls, symbol, build=settings.BUILD):
[docs]        """Fetches the HGNC (HUGO Gene Nomenclature Commitee) service to get a
           gene ID for other databases.

        :param symbol: The gene symbol to query.
        :type symbol: str

        :returns: A dict representing information on the gene.
        :rtype: dict

        If no gene with this symbol can be found, `None` is returned.

        """
        return Gene.get_xrefs("symbol", symbol)

    @classmethod
    def get_xrefs(cls, field, query, build=settings.BUILD):
[docs]        """Fetches the HGNC (HUGO Gene Nomenclature Commitee) service to get a
           gene ID for other databases.

        :param field: A searchable fields.
        :type field: str

        :param query: The query.
        :type query: str

        :returns: A dict representing information on the gene.
        :rtype: dict

        If no gene with this symbol can be found, `None` is returned.

        """
        # Checks if the field is valid
        assert field in {"ccds_id", "ensembl_gene_id", "entrez_id", "hgnc_id",
                         "name", "symbol", "ucsc_id", "uniprot_ids", "vega_id"}

        # The url and header
        url = "http://rest.genenames.org/search/{field}:{query}"
        headers = {"Accept": "application/json"}

        req = Request(url.format(field=field, query=query), headers=headers)
        with contextlib.closing(urlopen(req)) as stream:
            res = json.loads(stream.read().decode())

        # We take the top search hit and run a fetch.
        if res["response"]["numFound"] > 0:
            doc = res["response"]["docs"][0]
            assert doc["score"] == res["response"]["maxScore"]

            # Use the HGNC Fetch.
            url = "http://rest.genenames.org/fetch/{field}/{query}"
            req = Request(url.format(field=field, query=query),
                          headers=headers)
            with contextlib.closing(urlopen(req)) as stream:
                res = json.loads(stream.read().decode())

            # Parse the cross references.
            if res["response"]["numFound"] > 0:
                doc = res["response"]["docs"][0]

                # Check the right symbol was found
                assert doc.get(field) == query

                id_dict = {
                    "name": doc.get("name"),
                    "symbol": doc.get("symbol"),
                    "ncbi_id": doc.get("entrez_id"),
                    "cosmic_id": doc.get("cosmic"),
                    "refseq_ids": doc.get("refseq_accession"),
                    "ensembl_id": doc.get("ensembl_gene_id"),
                    "omim_ids": doc.get("omim_id"),
                    "uniprot_ids": doc.get("uniprot_ids"),
                    "ucsc_id": doc.get("ucsc_id"),
                }

                return id_dict
            else:
                raise Exception("No gene returned by HGNC fetch on "
                                "{field} {query}.".format(field=field,
                                                          query=query))

        elif field == "ensembl_gene_id":
            # Get from Ensembl
            url = get_url_prefix(build)
            url += "xrefs/id/{}?content-type=application/json"
            url = url.format(query)

            response = query_ensembl(url)

            id_dict = {"ensembl_gene_id": query}
            for db_info in response:
                # Entrez
                if db_info["dbname"] == "EntrezGene":
                    id_dict["entrez_id"] = db_info["primary_id"]
                    continue

                # UniPROT
                if db_info["dbname"] == "Uniprot_gn":
                    id_dict["uniprot_ids"] = db_info["primary_id"]

            return id_dict

        else:
            return {}


class Transcript(object):
[docs]    """Python object representing a transcript.

    Store the following information:

    Required

        - build: The genome build.
        - chrom: The chromosome.
        - start and end: The genomic positions for the gene.
        - enst: The corresponding Ensembl transcript id.

    Optional

        - appris_cat: The APPRIS category.
        - parent: The corresponding Gene object.
        - biotype: The biotype as given by Ensembl.

    """

    def __init__(self, **kwargs):
        dummy = lambda x: x

        _PARAMETERS = {
            "build": str,
            "chrom": str,
            "start": int,
            "end": int,
            "enst": str,
        }

        _OPTIONAL_PARAMETERS = {
            "appris_cat": str,
            "parent": dummy,
            "biotype": str,
        }

        _ALL_PARAMS = dict(
            list(_PARAMETERS.items()) + list(_OPTIONAL_PARAMETERS.items())
        )

        # Store the passed parameters.
        for arg, val in kwargs.items():
            if arg not in _ALL_PARAMS:
                raise Exception("Unknown parameter {}.".format(arg))
            setattr(self, arg, _ALL_PARAMS[arg](val))

        # Check that all required arguments were passed.
        for p in _PARAMETERS:
            if getattr(self, p, "-1") == "-1":
                raise Exception("Missing parameter {}.".format(p))

        # Some basic assertions.
        assert self.build in ("GRCh37", "GRCh38")
        assert re.match(r"([0-9]{1,2}|MT|X|Y)", self.chrom)
        assert self.start < self.end

    @property
    def region(self):
        """Lazily loads the Region object for this Transcript."""
        if hasattr(self, "_region"):
            return self._region
        else:
            self._region = Region(self.chrom, self.start, self.end)
            return self.region

    def get_sequence(self, seq_type="genomic"):
[docs]        """Build a Sequence object representing the transcript.

        :param seq_type: This can be either genomic, cds, cdna or protein.
        :type seq_type: str

        :returns: A Sequence object representing the feature.
        :rtype: :py:class:`gepyto.structures.sequences.Sequence`

        """

        # Get the sequence from Ensembl.
        seq_types = ("genomic", "cds", "cdna", "protein")
        if seq_type not in seq_types:
            raise Exception("Invalid sequence type ({}). Known types are: "
                            "{}".format(seq_type, ", ".join(seq_types)))

        url = get_url_prefix(self.build)
        url += "sequence/id/{}?content-type=application/json&type={}"
        url = url.format(self.enst, seq_type)

        res = query_ensembl(url)

        # Build the Sequence.
        seq = sequences.Sequence(
            res["id"],
            res["seq"],
            "AA" if seq_type == "protein" else "DNA",
        )

        return seq

    @classmethod
    def factory_position(cls, region, build=settings.BUILD):
[docs]        """Gets a list of transcripts overlapping with the given position.

        :param pos: A genomic position of the form `chr2:12345-12347`.
        :type pos: str

        :returns: A list of :py:class:`Transcript`
        :rtype: list

        This method uses the Ensembl API.

        """

        assert re.match(r"^chr([0-9]{1,2}|MT|X|Y):[0-9]+-[0-9]+$", region)
        region = region.lstrip("chr")
        region = region.replace("-", "..")

        url = get_url_prefix(build)
        url += ("overlap/region/homo_sapiens/{}"
                "?feature=transcript"
                "&content-type=application/json")
        url = url.format(region)

        response = query_ensembl(url)
        transcripts = []
        for tr in response:
            transcripts.append(_parse_transcript(tr))

        return transcripts

    def __repr__(self):
        return "<Transcript:{} (chr{}:{}-{})>".format(
            self.enst,
            self.chrom,
            self.start,
            self.end,
        )


def _parse_gene(o):
    """Parse gene information from an Ensembl `overlap` query.

    :param o: A Python dict representing an entry from the response.
    :type o: dict

    :returns: A standardised dict of gene information.
    :rtype: dict

    """
    assert o["feature_type"] == "gene"

    d = {
        "desc": o.get("description"),
        "build": o.get("assembly_name"),
        "chrom": o.get("seq_region_name"),
        "start": o.get("start"),
        "end": o.get("end"),
        "strand": o.get("strand"),
        "biotype": o.get("biotype"),
    }

    return d


def _parse_transcript(o):
    """Parse transcript information from an Ensembl `overlap` query.

    :param o: A Python dict representing an entry from the response.
    :type o: dict

    :returns: A Transcript object representing the transcript.
    :rtype: :py:class:`Transcript`

    """
    assert o["feature_type"] == "transcript"

    d = {
        "build": o.get("assembly_name"),
        "chrom": o.get("seq_region_name"),
        "start": o.get("start"),
        "end": o.get("end"),
        "enst": o.get("id"),
        "biotype": o.get("biotype"),
        "parent": o.get("Parent"),
    }

    # Get the APPRIS annotation.
    try:
        d["appris_cat"] = get_category_for_transcript(d["enst"])
    except Exception:
        pass

    return Transcript(**d)


def _parse_exon(o):
    """Parse exon information from an Ensembl `overlap` query.

    :param o: A Python dict representing an entry from the response.
    :type o: dict

    :returns: A tuple of (start, end) positions representing the exon.
    :rtype: tuple

    """
    assert o["feature_type"] == "exon"
    return (o["start"], o["end"])