# Source code for gepyto.db.appris

# Module to get principal transcripts from the APPRIS database.
# APPRIS annotates transcripts and provides a main isoform for proteins.
# http://appris.bioinfo.cnio.es/
# The data included in this module was fetched from the APPRIS website on
# 2014-10-30.
# URL: http://appris.bioinfo.cnio.es/download/data/homo_sapiens/appris_data.principal.txt

# This file is part of gepyto.
#
# This work is licensed under the Creative Commons Attribution-NonCommercial
# 4.0 International License. To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative
# Commons, PO Box 1866, Mountain View, CA 94042, USA.


__author__ = "Marc-Andre Legault"
__copyright__ = ("Copyright 2014 Marc-Andre Legault and Louis-Philippe "
                 "Lemieux Perreault. All rights reserved.")
__license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"


import gzip
import collections
import sqlite3

from pkg_resources import resource_filename


APPRIS_CUR = None


def _load_appris():
    """Load the APPRIS database in memory as a sqlite3 database.

    The bundled gzip-compressed, tab-separated data file is read line by line
    and every record is inserted in an ``appris`` table with the columns
    ``symbol``, ``ensembl_gene``, ``ensembl_transcript``, ``ccds`` and
    ``category``.

    :returns: The database cursor.
    :rtype: sqlite3.Cursor

    """
    con = sqlite3.connect(":memory:")
    cur = con.cursor()

    cur.execute(
        "CREATE TABLE appris ("
        "  symbol TEXT,"
        "  ensembl_gene TEXT,"
        "  ensembl_transcript TEXT,"
        "  ccds TEXT,"
        "  category TEXT"
        ")"
    )

    fn = resource_filename(__name__, "data/appris_data.principal.txt.gz")
    with gzip.open(fn) as f:
        for line in f:
            # The file is read in binary mode, hence the explicit decoding.
            line = line.rstrip().decode("UTF-8")
            if not line:
                # A blank line carries no record and would not match the
                # five placeholders of the INSERT statement.
                continue
            cur.execute(
                "INSERT INTO appris VALUES (?, ?, ?, ?, ?)",
                tuple(line.split("\t"))
            )

    con.commit()
    return cur


def init_db():
    """This is an initialization method for the database.

    The database is built only the first time this function is called, that
    is while the module-level ``APPRIS_CUR`` is still ``None``. Later calls
    do nothing. We use this to load the database only if a function is
    called.

    """
    global APPRIS_CUR

    if APPRIS_CUR is None:
        APPRIS_CUR = _load_appris()


def get_transcripts_for_gene(ensg):
    """Fetches the transcripts and their annotation for a given gene (ENSG).

    :param ensg: The Ensembl gene id.
    :type ensg: str

    :returns: A list of ``(transcript id, category)`` tuples. The list is
              empty if no row matches the gene.
    :rtype: list

    """
    # Make sure the in-memory database is loaded before querying it.
    init_db()
    APPRIS_CUR.execute(
        "SELECT ensembl_transcript, category FROM appris WHERE ensembl_gene=?",
        (ensg, )
    )
    return APPRIS_CUR.fetchall()


def get_category_for_transcript(enst):
    """Fetches the annotation for a transcript (ENST).

    :param enst: The Ensembl transcript id.
    :type enst: str

    :returns: The APPRIS category for this transcript, or ``None`` if no row
              matches the transcript.
    :rtype: str

    """
    # Make sure the in-memory database is loaded before querying it.
    init_db()
    APPRIS_CUR.execute(
        "SELECT category FROM appris WHERE ensembl_transcript=?",
        (enst, )
    )
    row = APPRIS_CUR.fetchone()
    if row is None:
        # fetchone() gives None when the query matched nothing; indexing it
        # would raise an unhelpful TypeError.
        return None
    return row[0]


def get_main_transcripts(ensg):
    """Gets the main Ensembl transcript id for the provided gene based on the
       APPRIS annotation.

    :param ensg: The Ensembl gene number (ENSG000000).
    :type ensg: str

    :returns: The "main" transcript (ENST). If there is an `appris_principal`
              annotation, this will be returned. If it is not the case, the
              order of priority is the following:
              `appris_candidate_longest_ccds`, `appris_candidate_ccds`,
              `appris_candidate_longest_seq`, `appris_candidate`.
              ``None`` is returned if the gene has no transcript in any of
              these categories.
    :rtype: str

    """
    # Make sure the in-memory database is loaded before querying it.
    init_db()
    APPRIS_CUR.execute(
        "SELECT ensembl_transcript, category FROM appris WHERE ensembl_gene=?",
        (ensg, )
    )
    li = APPRIS_CUR.fetchall()

    categories = {cat for _, cat in li}

    # Categories from the most to the least preferred.
    ordered_cats = (
        "appris_principal", "appris_candidate_longest_ccds",
        "appris_candidate_ccds", "appris_candidate_longest_seq",
        "appris_candidate"
    )

    top_category = None
    for cat in ordered_cats:
        if cat in categories:
            top_category = cat
            break

    if top_category is None:
        # Unknown gene, or none of its transcripts is in a known category.
        return None

    top_transcripts = [enst for enst, cat in li if cat == top_category]

    # Several transcripts may share the top category. The last one in row
    # order is returned, which is what the previous implementation returned
    # whenever its result happened to be in the top category.
    return top_transcripts[-1]