Source code for gepyto.formats.seqxml

#
# Implementation of the SeqXML format into Python objects with
# some elementary operations.
# See http://orthoxml.org/xml/Main.html for more information on the format.
#
# This file is part of gepyto.
#
# This work is licensed under the Creative Commons Attribution-NonCommercial
# 4.0 International License. To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative
# Commons, PO Box 1866, Mountain View, CA 94042, USA.


# Module metadata: authorship, copyright notice and license identifier.
__author__ = "Marc-Andre Legault"
__copyright__ = ("Copyright 2014 Marc-Andre Legault and Louis-Philippe "
                 "Lemieux Perreault. All rights reserved.")
__license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"


import collections
import gzip
import xml.etree.ElementTree as etree

from ..structures.sequences import Sequence


class SeqXML(object):
    """Parses the SeqXML format representing sequence data.

    :param fn: The filename of the SeqXML file. A name ending in ``.gz`` is
               read through :py:func:`gzip.open`, anything else through the
               builtin :py:func:`open`.
    :type fn: str

    The format description is available at
    `seqxml.org <http://seqxml.org/0.4/seqxml_doc_v0.4.html>`_
    (visited Nov. 2014).

    After construction the object exposes:

    - ``root``: the root element of the parsed XML tree.
    - ``entries``: a list of :py:class:`Sequence` objects, one per child
      element of the root, in document order.
    - ``id_index``: a dict mapping each sequence's ``uid`` to its
      :py:class:`Sequence` (the last entry wins on duplicate ids).

    """

    # Maps the SeqXML element tag holding the residues to the sequence type
    # string handed to the Sequence constructor.
    seq_xml_seqtypes = {
        "DNAseq": "DNA",
        "RNAseq": "RNA",
        "AAseq": "AA",
    }

    def __init__(self, fn):
        opener = gzip.open if fn.endswith(".gz") else open

        # Both openers are used in binary mode so that the XML parser sees
        # the raw bytes and can pick the encoding from the document itself,
        # instead of having plain files pre-decoded with the platform's
        # default text encoding.
        with opener(fn, "rb") as f:
            tree = etree.parse(f)

        self.root = tree.getroot()
        self.entries = []
        self.id_index = {}

        for entry in self.root:
            sequence = self._parse_entry(entry)
            self.entries.append(sequence)
            # The index allows fast Sequence lookup by id.
            self.id_index[sequence.uid] = sequence

    @staticmethod
    def _parse_entry(entry):
        """Build a :py:class:`Sequence` from a single entry element.

        :param entry: A child element of the SeqXML root.

        :returns: The Sequence built from the entry's ``id`` attribute, its
                  sequence text, its sequence type and a dict of additional
                  information.

        :raises KeyError: If a ``property``, ``species`` or ``DBRef`` child
                          lacks one of the attributes read from it.

        """
        # Mandatory fields.
        uid = entry.attrib.get("id")
        seq = None
        seq_type = None

        # Additional information.
        info = {}

        seq_types = SeqXML.seq_xml_seqtypes
        for elem in entry:
            tag = elem.tag

            if tag in seq_types:
                # This is the biological sequence.
                seq_type = seq_types[tag]
                seq = elem.text

            # Everything below is an "info" field.
            elif tag == "property":
                # A property without a value is stored as 1.
                info[elem.attrib["name"]] = elem.attrib.get("value", 1)

            elif tag == "species":
                info["species"] = elem.attrib["name"]
                info["species_ncbi_tax_id"] = elem.attrib["ncbiTaxID"]

            elif tag == "description":
                info["description"] = elem.text

            elif tag == "DBRef":
                # Only one reference is kept: a later DBRef overwrites an
                # earlier one.
                info["db_name"] = elem.attrib["source"]
                info["db_acc"] = elem.attrib["id"]

        return Sequence(uid, seq, seq_type, info)

    def get_seq(self, uid):
        """Get a sequence from its unique identifier.

        :param uid: The sequence id.
        :type uid: str

        :returns: The matching entry from ``id_index``, or ``None`` if the
                  id is unknown.

        """
        return self.id_index.get(uid)