Source code for gepyto.structures.region

# Structure to handle genomic regions or ranges.
#
# This file is part of gepyto.
#
# This work is licensed under the Creative Commons Attribution-NonCommercial
# 4.0 International License. To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative
# Commons, PO Box 1866, Mountain View, CA 94042, USA.

__author__ = "Marc-Andre Legault"
__copyright__ = ("Copyright 2014 Marc-Andre Legault and Louis-Philippe "
                 "Lemieux Perreault. All rights reserved.")
__license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"


import re

from .. import settings
from . import sequences

import numpy as np
from six.moves import range


class _Segment(object):
    def __init__(self, chrom, start, end):
        chrom = str(chrom)
        assert re.match(settings.CHROM_REGEX, chrom)
        self.chrom = chrom
        self.start = int(start)
        self.end = int(end)
        assert self.start < self.end

    def overlaps_with(self, segment):
        return (self.chrom == segment.chrom and
                self.start <= segment.end and
                self.end >= segment.start)

    def distance_to(self, segment):
        if self.chrom != segment.chrom:
            raise Exception("Comparing segments from different chromosomes.")
        if self.overlaps_with(segment):
            return 0
        if self.end < segment.start:
            return segment.start - self.end
        else:
            return self.start - segment.end

    def as_range(self, iterator=False):
        if iterator:
            return range(self.start, self.end + 1)
        return np.arange(self.start, self.end + 1)

    def __eq__(self, seg):
        return (self.chrom == seg.chrom and self.start == seg.start and
                self.end == seg.end)

    def __ne__(self, seg):
        return not self.__eq__(seg)

    def __contains__(self, o):
        if hasattr(o, "chrom") and hasattr(o, "pos"):
            return o.chrom == self.chrom and (self.start <= o.pos <= self.end)
        elif hasattr(o, "chrom") and hasattr(o, "start") and hasattr(o, "end"):
            return (o.chrom == self.chrom and
                    (self.start <= o.start <= o.end <= self.end))
        else:
            raise TypeError("Object needs to have either (chrom, start, end) "
                            "or (chrom, pos) attributes to test if it is in "
                            "a _Segment or Region.")

    def to_sequence(self):
        return sequences.Sequence.from_reference(self.chrom, self.start,
                                                 self.end)

    @staticmethod
    def merge_segments(li):
        """Merge overlapping segments in a sorted list."""

        for i in range(len(li) - 1):
            cur = li[i]
            nxt = li[i + 1]
            if nxt.start < cur.start:
                raise Exception("Only sorted lists of segments can be "
                                "merged. Sort using the position first.")

            if cur.chrom != nxt.chrom:
                raise Exception("Can't merge segments from different "
                                "chromosomes.")

        merged_segments = []
        i = 0
        while i < len(li) - 1:
            j = i

            # Walk as long as the segments are overlapping.
            cur = li[i]
            nxt = li[i + 1]
            block = [cur.start, cur.end]
            if nxt.start <= cur.end:
                block[1] = max(block[1], nxt.end)

            while nxt.start <= block[1] and j + 1 < len(li) - 1:
                block[1] = max(block[1], nxt.end)
                j += 1
                cur = li[j]
                nxt = li[j + 1]

            merged_segments.append(
                _Segment(cur.chrom, block[0], block[1])
            )
            i = j + 1

        if li[-1].start > li[-2].end:
            merged_segments.append(li[-1])

        return merged_segments

    def __repr__(self):
        return "<_Segment object chr{}:{}-{}>".format(self.chrom, self.start,
                                                      self.end)


class Region(object):
[docs] """Region object to represent a part of the genome. :param chrom: The chromosome. :type chrom: str :param start: The start of the region. :type start: int :param end: The end of the region. :type end: int This can either represent a contiguous region or a fragmented region with multiple non-overlaping segments. This object can easily be converted to a Sequence using the :py:func:`Region.sequence` property. It is also easy to test overlap with another region or to test if an object is contained within this region using the `in` operator. """ def __init__(self, chrom, start, end): self.segments = [] self.segments.append(_Segment(chrom, start, end)) def union(self, region):
[docs] """Primary method to create non contiguous regions. This will create a region represented by the union of the current Region and the provided Region. This means that overlapping segments will be merged to avoid redundancy. """ segments = self.segments + region.segments segments = sorted(segments, key=lambda x: x.start) segments = _Segment.merge_segments(segments) return Region._from_segments(segments) def overlaps_with(self, region):
[docs] """Tests overlap with another region.""" for seg1 in self.segments: for seg2 in region.segments: if seg1.overlaps_with(seg2): return True return False def distance_to(self, region):
[docs] """Computes the distance to the given Region.""" min_dist = float("infinity") for seg1 in self.segments: for seg2 in region.segments: d = seg1.distance_to(seg2) if d < min_dist: min_dist = d return min_dist def as_range(self, iterator=False):
[docs] """Get a range corresponding to all the nucleotide positions.""" if self.is_contiguous: return self.segments[0].as_range(iterator) else: return set([seg.as_range(iterator) for seg in self.segments]) @property
def chrom(self): if self.is_contiguous: return self.segments[0].chrom else: chrom = {seg.chrom for seg in self.segments} if len(chrom) > 1: raise Exception("Ambiguous chromosome for non contiguous " "region.") return chrom[0] @property def start(self): if self.is_contiguous: return self.segments[0].start return min([seg.start for seg in self.segments]) @property def end(self): if self.is_contiguous: return self.segments[0].end return max([seg.end for seg in self.segments]) @property def is_contiguous(self): return len(self.segments) == 1 @property def sequence(self): """Builds a Sequence object representing the region. If the region is non contiguous, a tuple of sequences is returned. """ sequences = [] for seg in self.segments: sequences.append(seg.to_sequence()) return sequences[0] if len(sequences) == 1 else tuple(sequences) @staticmethod def _from_segments(segments): region = Region(1, 1, 2) region.segments = segments return region @staticmethod def from_str(s):
[docs] """Parses the region object (contiguous only) from a string. The expected format is `chrX:START-END`. """ regex = ( r"^chr" + settings.CHROM_REGEX.pattern + ":([0-9]+)-([0-9]+)$" ) match = re.match(regex, s) if match is None: raise TypeError("Invalid format for Region string. Expected the " "form 'chrXX:START-END' and got '{}'." "".format(s)) chrom, start, end = match.groups() start = int(start) end = int(end) assert end >= start seg = _Segment(chrom, start, end) region = Region._from_segments([seg, ]) return region def __contains__(self, o):
"""Tests if an object is in the region. This is valid for any object with (`chrom` and `pos`) or (`chrom`, `start` and `end`) attributes. """ for seg in self.segments: if o in seg: return True return False def __repr__(self): return "<{}Region: {}>".format( "Contiguous" if self.is_contiguous else "NonContiguous", self.segments )