Source code for gepyto.formats.impute2

#
# Implementation of the IMPUTE2 format into Python objects.
#
# This file is part of gepyto.
#
# This work is licensed under the Creative Commons Attribution-NonCommercial
# 4.0 International License. To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative
# Commons, PO Box 1866, Mountain View, CA 94042, USA.

from __future__ import division, print_function

__author__ = "Louis-Philippe Lemieux Perreault"
__copyright__ = ("Copyright 2014 Marc-Andre Legault and Louis-Philippe "
                 "Lemieux Perreault. All rights reserved.")
__license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"

from collections import Counter, namedtuple
from multiprocessing import Process, Queue
import functools

import numpy as np
import pandas as pd

import gzip

_Line = namedtuple(
    "Line",
    ["name", "chrom", "pos", "a1", "a2", "probabilities"]
)

# Two possible ways of iterating over Impute2File
DOSAGE = "dosage"
LINE = "line"
HARD_CALL = "hard_call"


[docs]class Impute2File(object): """Class representing an Impute2File. This is used to either generate a dosage matrix where columns represent variants and rows represent samples or to read the file line by line using the generator syntax. This also implements the context manager interface. Usage: :: # Read as probabilities (Line tuples). with open(Impute2File(fn)) as f: for line in f: # line has name, chrom, pos, a1, a2, probabilities print(line) # Read as dosage. with open(Impute2File(fn), "dosage") as f: for dosage_vector, info in f: pass # Read as a matrix. with open(Impute2File(fn)) as f: # 1 row per sample and 1 column per variant. Values between 0 and 2 m = f.as_matrix() If you use the ``dosage`` mode, you can also add additional arguments: - prob_threshold: Genotype probability cutoff for no call values (NaN). - is_chr23: Not implemented yet, but dosage is computed differently for sexual chromosomes for men (hemizygote). - sex_vector: Not implemented yet, but this is a vector representing the gender of every sample (for dosage computation on sexual chromosomes). .. warning:: Be careful with the :py:func:`Impute2File.as_matrix()` function as it will try to load the WHOLE Impute2 file in memory. """ def __init__(self, fn, mode=LINE, **kwargs): self._filename = fn if fn.endswith(".gz"): opener = gzip.open else: opener = functools.partial(open, mode="r") self._file = opener(fn) assert mode in (DOSAGE, LINE, HARD_CALL) self._mode = mode # The special function arguments self.dosage_arguments = {} self.hard_calls_arguments = {} if self._mode is DOSAGE: # Parse kwargs that can be passed to the _compute_dosage function. kw = ("prob_threshold", "is_chr23", "sex_vector") for keyword in kwargs: if keyword in kw: self.dosage_arguments[keyword] = kwargs[keyword] else: self._file.close() raise TypeError("__init__() got an unexpected keyword " "argument '{}'".format(keyword)) elif self._mode is HARD_CALL: # Parse kwargs that can be passed to the _compute_hard_calls # function. kw = ("prob_threshold",) for keyword in kwargs: if keyword in kw: self.hard_calls_arguments[keyword] = kwargs[keyword] else: self._file.close() raise TypeError("__init__() got an unexpected keyword " "argument '{}'".format(keyword)) else: if len(kwargs) > 0: self._file.close() raise TypeError("__init__() got an unexpected keyword " "argument '{}'".format(kwargs.keys()[0]))
[docs] def as_matrix(self): """Creates a numpy dosage matrix from this file. :returns: A numpy matrix where columns represent variant dosage between 0 and 2 and a dataframe describing the variants (major, minor, maf). :type: tuple .. warning:: This will attempt to load the whole file in memory. """ prev_pos = self._file.tell() self._file.seek(0) prev_mode = self._mode self._mode = DOSAGE snp_vector_list = [] snp_info_list = [] for v, info in self: information_fields = info.keys() snp_vector_list.append(v) snp_info_list.append([info[k] for k in information_fields]) m = np.array(snp_vector_list) # snp x sample m = m.T # We transpose to get sample x snp matrix (standard for stats) # Make the information df. df = pd.DataFrame(snp_info_list, columns=information_fields) # Put the file like it was. self._file.seek(prev_pos) self._mode = prev_mode return m, df
def __next__(self): line = next(self._file) if line is None: # Done with the file. raise StopIteration() if self._mode is DOSAGE: return _compute_dosage(_read_impute2_line(line), **self.dosage_arguments) elif self._mode is HARD_CALL: return _compute_hard_calls(_read_impute2_line(line), **self.hard_calls_arguments) elif self._mode is LINE: return _read_impute2_line(line) next = __next__
[docs] def readline(self): """Read a single line from the Impute2File. This will return either a ``Line`` including the genotype probabilities or a dosage vector. This depends on the `mode` (the second argument given to the file when it was opened). Available modes are ``dosage`` and ``line``. """ return self.next()
def __iter__(self): return self def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.close() def close(self): self._file.close()
def _compute_dosage(line, prob_threshold=0, is_chr23=False, sex_vector=None): """Computes dosage from probabilities (IMPUTE2).""" # Type check assert type(line) is _Line # We set the dosage as being the expected number of "a2" alleles for # all samples. # Given that the rows represent p_aa, p_ab, p_bb, we have that # E(#b) = 2 * p_bb + p_ab, the dosage. dosage = 2 * line.probabilities[:, 2] + line.probabilities[:, 1] # Probability filtering. We mask samples with low probabilities. if prob_threshold > 0: dosage[~np.any(line.probabilities > prob_threshold, axis=1)] = np.nan # Compute the maf. mac = np.nansum(dosage) maf = mac / (2 * np.sum(~np.isnan(dosage))) # If maf > 0.5, we need to flip. major, minor = (line.a1, line.a2) if maf > 0.5: dosage = 2 - dosage # 0 -> 2, 1 -> 1, 2 -> 0. mac = np.nansum(dosage) maf = 1 - maf major, minor = minor, major if is_chr23: # TODO: Implement this, please raise NotImplementedError("dosage for chromosome 23 is not yet " "supported (because dosage is computed " "differently for males on chromosome 23)") return (dosage, { "major": major, "minor": minor, "maf": maf, "minor_allele_count": mac, "name": line.name, "chrom": line.chrom, "pos": line.pos, }) def _compute_hard_calls(line, prob_threshold=0): """Computes hard calls from probabilities (IMPUTE2).""" # Getting the possible genotypes possible_geno = np.array([" ".join([line.a1] * 2), " ".join([line.a1, line.a2]), " ".join([line.a2] * 2)]) # The final genotype final_geno = possible_geno[np.argmax(line.probabilities, axis=1)] # The threshold low_quality = np.max(line.probabilities, axis=1) < prob_threshold final_geno[low_quality] = "0 0" return ( final_geno, {"name": line.name, "chrom": line.chrom, "pos": line.pos}, ) def _read_impute2_line(line): """Parse an IMPUTE2 line (a single marker). :param line: a line from an impute 2 file. :type line: str :returns: A namedtuple with the name of the variant, chromosome, position, allele1, allele2 and probability matrix. The matrix of shape ``samples x 3`` represents the probability of the the following genotypes ``(aa, ab, bb)``. """ # Splitting the line row = line.rstrip("\n").split(" ") # Getting marker information name = row[1] chrom = row[0] pos = int(row[2]) a1 = row[3] a2 = row[4] # Constructing the genotype prob = np.array(row[5:], dtype=float) prob.shape = (prob.shape[0] // 3, 3) return _Line(name, chrom, pos, a1, a2, prob)