Source code for gepyto.utils.genes

# Utilities to handle gene data.

# This file is part of gepyto.
#
# This work is licensed under the Creative Commons Attribution-NonCommercial
# 4.0 International License. To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative
# Commons, PO Box 1866, Mountain View, CA 94042, USA.


__author__ = "Marc-Andre Legault"
__copyright__ = ("Copyright 2014 Marc-Andre Legault and Louis-Philippe "
                 "Lemieux Perreault. All rights reserved.")
__license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"


import logging

from ..settings import BUILD
from ..structures.genes import Gene
from ..db.ensembl import query_ensembl
from ..structures.genes import _parse_gene


def ensembl_genes_in_region(region, bare=False, build=BUILD):
[docs] """Queries a genome region of the form chr3:123-456 for genes using Ensembl API. :param region: The region to query. :type region: str :param bare: If `True`, no information about transcript will be fetched :type bare: boolean :param build: The genome build to use (GRCh37 or GRCh38). :type build: str :returns: A list of :py:class:`gepyto.structures.genes.Gene`. :rtype: list """ url = ("rest.ensembl.org/overlap/region/homo_sapiens/{region}" "?feature=gene" "&content-type=application/json") if build == "GRCh37": url = "http://grch37." + url elif build == "GRCh38": url = "http://" + url else: raise Exception("Unknown build '{}'.".format(build)) if region.startswith("chr"): region = region.lstrip("chr") url = url.format(region=region) res = query_ensembl(url) genes = [] for gene in res: # Check some stuff assert gene["feature_type"] == "gene" assert gene["assembly_name"] == build # Building the gene g_obj = None if not bare: g_obj = Gene.factory_ensembl_id(gene["id"], build=build) else: # Only the gene information is required gene_info = _parse_gene(gene) gene_info["symbol"] = gene["external_name"] gene_info["xrefs"] = {"ensembl_gene_id": gene["id"]} g_obj = Gene(**gene_info) genes.append(g_obj) if len(genes) == 0: logging.warning("No gene detected in region {}.".format(region)) return genes