Source code for gepyto.db.ucsc

#
# Utilities to interact with the UCSC database.
#
# This file is part of gepyto.
#
# This work is licensed under the Creative Commons Attribution-NonCommercial
# 4.0 International License. To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative
# Commons, PO Box 1866, Mountain View, CA 94042, USA.


from __future__ import division


__author__ = "Marc-Andre Legault"
__copyright__ = ("Copyright 2014 Marc-Andre Legault and Louis-Philippe "
                 "Lemieux Perreault. All rights reserved.")
__license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"


import collections

import numpy as np

from .. import settings
from ..structures.region import Region


[docs]class UCSC(object): """Provides raw access to the UCSC MySQL database. The database will be set to the `db` parameter or to the `BUILD` as defined in `gepyto`'s settings. Later versions could implement features like throttling, but for now this is a very simple interface. """ def __init__(self, db=None): import pymysql if db is None: db = settings.BUILD if db.lower() == "grch37": db = "hg19" elif db.lower() == "grch38": db = "hg38" else: raise Exception("Invalid genome reference '{}'".format(db)) self.con = pymysql.connect(user="genome", host="genome-mysql.cse.ucsc.edu", database=db) self.cur = self.con.cursor()
[docs] def raw_sql(self, sql, params): """Execute a raw SQL query.""" self.cur.execute(sql, params) return self.cur.fetchall()
[docs] def query_gap_table(self, chromosome, ucsc_type): """Get either the "telomere" or "centromere" of a given chromosome. """ if not chromosome.startswith("chr"): chromosome = "chr" + chromosome valid_types = ("telomere", "centromere") if ucsc_type not in valid_types: msg = "'{}' is not a valid type: use {}.".format( ucsc_type, valid_types ) raise TypeError(msg) return self.raw_sql( ("SELECT chromStart + 1, chromEnd + 1 " "FROM gap " "WHERE chrom=%s AND type=%s"), (chromosome, ucsc_type), )
def close(self): self.con.close() def __enter__(self): return self def __exit__(self, *args): self.close()
[docs]def get_telomere(chromosome): """Returns a Noncontiguous region representing the telomeres of a chromosome. :param chromosome: The chromosome, _e.g._ "3" :type chromosome: str :returns: A region corresponding to the telomeres. :rtype: :py:class:`Region` This is done by connecting to the UCSC MySQL server. """ chromosome = str(chromosome) with UCSC() as ucsc: telomeres = ucsc.query_gap_table(chromosome, "telomere") assert len(telomeres) == 2, ("UCSC did not return two telomeres (chrom={}" ").".format(chromosome)) # Create a region for both telomeres and use a union to return the full # region. telo1 = Region(chromosome.lstrip("chr"), telomeres[0][0], telomeres[0][1]) telo2 = Region(chromosome.lstrip("chr"), telomeres[1][0], telomeres[1][1]) return telo1.union(telo2)
[docs]def get_centromere(chromosome): """Returns a contiguous region representing the centromere of a chromosome. :param chromosome: The chromosome, _e.g._ "3" :type chromosome: str :returns: A region corresponding to the centromere. :rtype: :py:class:`Region` This is done by connecting to the UCSC MySQL server. """ chromosome = str(chromosome) with UCSC() as ucsc: centromere = ucsc.query_gap_table(chromosome, "centromere") assert len(centromere) == 1, "UCSC returned {} centromere(s).".format( len(centromere) ) centromere = centromere[0] return Region(chromosome.lstrip("chr"), centromere[0], centromere[1])
[docs]def get_phylop_100_way(region): """Get a vector of phyloP conservation scores for a given region. Scores represent the -log(p-value) under a H0 of neutral evolution. Positive values represent conservation and negative values represent fast-evolving bases. The UCSC MySQL database only contains aggregate scores for chunks of 1024 bases. We return the results for the subset of the required region that is fully contained in the UCSC bins. Because UCSC uses 0-based indexing, we adjust the gepyto region before querying. This means that the user should use 1-based indexing, as usual when creating the Region object. .. warning:: This function has a fairly low resolution. You should download the raw data (e.g. from `goldenpath <http://hgdownload.cse.ucsc.edu/goldenpath/hg19/phyloP100way/>`_ ) if you need scores for each base. Also note that gepyto can't parse bigWig, but it can parse Wig files. .. warning:: This function is **untested**. """ with UCSC() as ucsc: sql = ( "SELECT * FROM phyloP100wayAll WHERE " " chrom=%s AND " " chromStart>=%s AND " " chromEnd<=%s " ) chrom = region.chrom if not chrom.startswith("chr"): chrom = "chr{}".format(chrom) ucsc.cur.execute(sql, (chrom, region.start - 1, region.end - 1)) n = ucsc.cur.rowcount if not n: return # No results. results = iter(ucsc.cur) phylop = np.empty(n) Row = collections.namedtuple( "Row", ("bin", "chrom", "chromStart", "chromEnd", "name", "span", "count", "offset", "file", "lowerLimit", "dataRange", "validCount", "sumData", "sumSquares") ) start = end = None for i, row in enumerate(results): row = Row(*row) # Bind column names. # Adjust types so we can do integer operations. row_start = int(row.chromStart) row_end = int(row.chromEnd) sum_data = int(row.sumData) valid_count = int(row.validCount) end = row_end if start is None: start = row_start + 1 phylop[i] = sum_data / valid_count chrom = chrom[3:] return { "phylop_scores": phylop, "n_bins": n, "actual_region": Region(chrom, start, end) }