# Source code for gepyto.db.appris

# Module to get principal transcripts from the appris database.
# appris annotates transcripts and provides a main isoform for proteins.
# http://appris.bioinfo.cnio.es/
# The data included in this module was fetched from the APPRIS website on
# 2014-10-30.
# URL: http://appris.bioinfo.cnio.es/download/data/homo_sapiens/appris_data.principal.txt

# This file is part of gepyto.
#
# This work is licensed under the Creative Commons Attribution-NonCommercial
# 4.0 International License. To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative
# Commons, PO Box 1866, Mountain View, CA 94042, USA.


# Module metadata: authorship, copyright and license strings.
__author__ = "Marc-Andre Legault"
__copyright__ = ("Copyright 2014 Marc-Andre Legault and Louis-Philippe "
                 "Lemieux Perreault. All rights reserved.")
__license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"


import gzip
import collections
import sqlite3

from pkg_resources import resource_filename


# Module-level sqlite3 cursor on the in-memory APPRIS database. It stays None
# until init_db() is called, which fills it with the result of _load_appris().
APPRIS_CUR = None


def _load_appris():
    """Load the APPRIS database in memory as a sqlite3 database.

    The packaged, gzip-compressed, tab-delimited APPRIS file is read line by
    line and every row is inserted into a freshly created ``appris`` table
    with the columns ``symbol``, ``ensembl_gene``, ``ensembl_transcript``,
    ``ccds`` and ``category``.

    :returns: The database cursor.

    """
    con = sqlite3.connect(":memory:")
    cur = con.cursor()

    cur.execute(
        "CREATE TABLE appris ("
        "  symbol TEXT,"
        "  ensembl_gene TEXT,"
        "  ensembl_transcript TEXT,"
        "  ccds TEXT,"
        "  category TEXT"
        ")"
    )

    fn = resource_filename(__name__, "data/appris_data.principal.txt.gz")
    with gzip.open(fn) as f:
        # gzip.open() defaults to binary mode, so every line is decoded
        # before being split into its tab-separated fields.
        rows = (
            tuple(line.rstrip().decode("UTF-8").split("\t")) for line in f
        )
        # The generator is consumed while the file is still open.
        cur.executemany("INSERT INTO appris VALUES (?, ?, ?, ?, ?)", rows)

    con.commit()
    return cur

def init_db():
    """Lazily set up the module-level APPRIS cursor.

    The database is loaded the first time this function runs; later calls
    do nothing because the cursor is kept in ``APPRIS_CUR``.

    """
    global APPRIS_CUR

    # Already initialized: nothing to do.
    if APPRIS_CUR is not None:
        return

    APPRIS_CUR = _load_appris()
def get_transcripts_for_gene(ensg):
    """Look up every APPRIS-annotated transcript of a gene (ENSG).

    :param ensg: The Ensembl gene id.
    :type ensg: str

    :returns: One ``(transcript id, category)`` tuple per row found for the
              gene.
    :rtype: list

    """
    init_db()

    query = (
        "SELECT ensembl_transcript, category FROM appris WHERE ensembl_gene=?"
    )
    params = (ensg, )

    APPRIS_CUR.execute(query, params)
    rows = APPRIS_CUR.fetchall()
    return rows
def get_category_for_transcript(enst):
    """Fetches the annotation for a transcript (ENST).

    :param enst: The Ensembl transcript id.
    :type enst: str

    :returns: The APPRIS category for this transcript, or ``None`` if the
              transcript is not in the database.
    :rtype: str or None

    """
    init_db()
    APPRIS_CUR.execute(
        "SELECT category FROM appris WHERE ensembl_transcript=?",
        (enst, )
    )

    row = APPRIS_CUR.fetchone()
    # fetchone() gives None when no row matched; indexing it directly would
    # raise an unhelpful TypeError for unknown transcripts.
    if row is None:
        return None

    return row[0]
def get_main_transcripts(ensg):
    """Gets the main Ensembl transcript id for the provided gene based on the
    APPRIS annotation.

    :param ensg: The Ensembl gene number (ENSG000000).
    :type ensg: str

    :returns: The "main" transcript (ENST). If there is an `appris_principal`
              annotation, this will be returned. If it is not the case, the
              order of priority is the following:
              `appris_candidate_longest_ccds`, `appris_candidate_ccds`,
              `appris_candidate_longest_seq`, `appris_candidate`.
              When several transcripts share the best category, the last
              one returned by the database query is used. ``None`` is
              returned when the gene has no transcript in any of these
              categories (including when the gene is unknown).
    :rtype: str or None

    """
    init_db()
    APPRIS_CUR.execute(
        "SELECT ensembl_transcript, category FROM appris WHERE ensembl_gene=?",
        (ensg, )
    )
    rows = APPRIS_CUR.fetchall()

    # Categories from the most to the least trusted annotation.
    ordered_cats = (
        "appris_principal",
        "appris_candidate_longest_ccds",
        "appris_candidate_ccds",
        "appris_candidate_longest_seq",
        "appris_candidate",
    )

    categories = set(cat for _, cat in rows)
    top_category = next(
        (cat for cat in ordered_cats if cat in categories), None
    )
    if top_category is None:
        # Unknown gene, or no transcript with a recognized category.
        return None

    top_transcripts = [enst for enst, cat in rows if cat == top_category]

    # Return a transcript that actually belongs to the best category; the
    # loop variable of the filtering step must not leak out as the result.
    return top_transcripts[-1]