Source code for gepyto.formats.impute2

#
# Implementation of the IMPUTE2 format into Python objects.
#
# This file is part of gepyto.
#
# This work is licensed under the Creative Commons Attribution-NonCommercial
# 4.0 International License. To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative
# Commons, PO Box 1866, Mountain View, CA 94042, USA.

from __future__ import division, print_function

__author__ = "Louis-Philippe Lemieux Perreault"
__copyright__ = ("Copyright 2014 Marc-Andre Legault and Louis-Philippe "
                 "Lemieux Perreault. All rights reserved.")
__license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"

from collections import Counter, namedtuple
from multiprocessing import Process, Queue
import functools

import numpy as np
import pandas as pd

import gzip

_Line = namedtuple(
    "Line",
    ["name", "chrom", "pos", "a1", "a2", "probabilities"]
)

# Two possible ways of iterating over Impute2File
DOSAGE = "dosage"
LINE = "line"
HARD_CALL = "hard_call"


[docs]class Impute2File(object):
    """Class representing an Impute2File.

    This is used to either generate a dosage matrix where columns represent
    variants and rows represent samples or to read the file line by line
    using the generator syntax.

    This also implements the context manager interface.

    Usage: ::

        # Read as probabilities (Line tuples).
        with open(Impute2File(fn)) as f:
            for line in f:
                # line has name, chrom, pos, a1, a2, probabilities
                print(line)

        # Read as dosage.
        with open(Impute2File(fn), "dosage") as f:
            for dosage_vector, info in f:
                pass

        # Read as a matrix.
        with open(Impute2File(fn)) as f:
            # 1 row per sample and 1 column per variant. Values between 0 and 2
            m = f.as_matrix()

    If you use the ``dosage`` mode, you can also add additional arguments:

        - prob_threshold: Genotype probability cutoff for no call values (NaN).
        - is_chr23: Not implemented yet, but dosage is computed differently
                    for sexual chromosomes for men (hemizygote).
        - sex_vector: Not implemented yet, but this is a vector representing
                      the gender of every sample (for dosage computation on
                      sexual chromosomes).

    .. warning::

        Be careful with the :py:func:`Impute2File.as_matrix()` function as it
        will  try to load the WHOLE Impute2 file in memory.

    """

    def __init__(self, fn, mode=LINE, **kwargs):
        self._filename = fn

        if fn.endswith(".gz"):
            opener = gzip.open
        else:
            opener = functools.partial(open, mode="r")

        self._file = opener(fn)

        assert mode in (DOSAGE, LINE, HARD_CALL)
        self._mode = mode

        # The special function arguments
        self.dosage_arguments = {}
        self.hard_calls_arguments = {}

        if self._mode is DOSAGE:
            # Parse kwargs that can be passed to the _compute_dosage function.
            kw = ("prob_threshold", "is_chr23", "sex_vector")
            for keyword in kwargs:
                if keyword in kw:
                    self.dosage_arguments[keyword] = kwargs[keyword]
                else:
                    self._file.close()
                    raise TypeError("__init__() got an unexpected keyword "
                                    "argument '{}'".format(keyword))

        elif self._mode is HARD_CALL:
            # Parse kwargs that can be passed to the _compute_hard_calls
            # function.
            kw = ("prob_threshold",)
            for keyword in kwargs:
                if keyword in kw:
                    self.hard_calls_arguments[keyword] = kwargs[keyword]
                else:
                    self._file.close()
                    raise TypeError("__init__() got an unexpected keyword "
                                    "argument '{}'".format(keyword))
        else:
            if len(kwargs) > 0:
                self._file.close()
                raise TypeError("__init__() got an unexpected keyword "
                                "argument '{}'".format(kwargs.keys()[0]))

[docs]    def as_matrix(self):
        """Creates a numpy dosage matrix from this file.

        :returns: A numpy matrix where columns represent variant dosage
                  between 0 and 2 and a dataframe describing the variants
                  (major, minor, maf).
        :type: tuple

        .. warning::

            This will attempt to load the whole file in memory.

        """
        prev_pos = self._file.tell()
        self._file.seek(0)

        prev_mode = self._mode
        self._mode = DOSAGE

        snp_vector_list = []
        snp_info_list = []
        for v, info in self:
            information_fields = info.keys()
            snp_vector_list.append(v)
            snp_info_list.append([info[k] for k in information_fields])

        m = np.array(snp_vector_list)  # snp x sample
        m = m.T  # We transpose to get sample x snp matrix (standard for stats)

        # Make the information df.
        df = pd.DataFrame(snp_info_list, columns=information_fields)

        # Put the file like it was.
        self._file.seek(prev_pos)
        self._mode = prev_mode

        return m, df

    def __next__(self):
        line = next(self._file)
        if line is None:
            # Done with the file.
            raise StopIteration()

        if self._mode is DOSAGE:
            return _compute_dosage(_read_impute2_line(line),
                                   **self.dosage_arguments)

        elif self._mode is HARD_CALL:
            return _compute_hard_calls(_read_impute2_line(line),
                                       **self.hard_calls_arguments)

        elif self._mode is LINE:
            return _read_impute2_line(line)

    next = __next__

[docs]    def readline(self):
        """Read a single line from the Impute2File.

        This will return either a ``Line`` including the genotype probabilities
        or a dosage vector. This depends on the `mode` (the second argument
        given to the file when it was opened).

        Available modes are ``dosage`` and ``line``.

        """
        return self.next()

    def __iter__(self):
        return self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        self._file.close()


def _compute_dosage(line, prob_threshold=0, is_chr23=False,
                    sex_vector=None):
    """Computes dosage from probabilities (IMPUTE2)."""
    # Type check
    assert type(line) is _Line

    # We set the dosage as being the expected number of "a2" alleles for
    # all samples.
    # Given that the rows represent p_aa, p_ab, p_bb, we have that
    # E(#b) = 2 * p_bb + p_ab, the dosage.
    dosage = 2 * line.probabilities[:, 2] + line.probabilities[:, 1]

    # Probability filtering. We mask samples with low probabilities.
    if prob_threshold > 0:
        dosage[~np.any(line.probabilities > prob_threshold, axis=1)] = np.nan

    # Compute the maf.
    mac = np.nansum(dosage)
    maf = mac / (2 * np.sum(~np.isnan(dosage)))

    # If maf > 0.5, we need to flip.
    major, minor = (line.a1, line.a2)
    if maf > 0.5:
        dosage = 2 - dosage  # 0 -> 2, 1 -> 1, 2 -> 0.
        mac = np.nansum(dosage)
        maf = 1 - maf
        major, minor = minor, major

    if is_chr23:
        # TODO: Implement this, please
        raise NotImplementedError("dosage for chromosome 23 is not yet "
                                  "supported (because dosage is computed "
                                  "differently for males on chromosome 23)")

    return (dosage, {
        "major": major,
        "minor": minor,
        "maf": maf,
        "minor_allele_count": mac,
        "name": line.name,
        "chrom": line.chrom,
        "pos": line.pos,
    })


def _compute_hard_calls(line, prob_threshold=0):
    """Computes hard calls from probabilities (IMPUTE2)."""
    # Getting the possible genotypes
    possible_geno = np.array([" ".join([line.a1] * 2),
                              " ".join([line.a1, line.a2]),
                              " ".join([line.a2] * 2)])

    # The final genotype
    final_geno = possible_geno[np.argmax(line.probabilities, axis=1)]

    # The threshold
    low_quality = np.max(line.probabilities, axis=1) < prob_threshold
    final_geno[low_quality] = "0 0"

    return (
        final_geno,
        {"name": line.name,
         "chrom": line.chrom,
         "pos": line.pos},
    )


def _read_impute2_line(line):
    """Parse an IMPUTE2 line (a single marker).

    :param line: a line from an impute 2 file.
    :type line: str

    :returns: A namedtuple with the name of the variant, chromosome, position,
              allele1, allele2 and probability matrix.

    The matrix of shape ``samples x 3`` represents the probability of the
    the following genotypes ``(aa, ab, bb)``.

    """
    # Splitting the line
    row = line.rstrip("\n").split(" ")

    # Getting marker information
    name = row[1]
    chrom = row[0]
    pos = int(row[2])
    a1 = row[3]
    a2 = row[4]

    # Constructing the genotype
    prob = np.array(row[5:], dtype=float)
    prob.shape = (prob.shape[0] // 3, 3)

    return _Line(name, chrom, pos, a1, a2, prob)