Source code for gepyto.formats.seqxml
#
# Implementation of the SeqXML format into Python objects with
# some elementary operations.
# See http://orthoxml.org/xml/Main.html for more information on the format.
#
# This file is part of gepyto.
#
# This work is licensed under the Creative Commons Attribution-NonCommercial
# 4.0 International License. To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative
# Commons, PO Box 1866, Mountain View, CA 94042, USA.
# Module metadata: author, copyright notice and license identifier.
__author__ = "Marc-Andre Legault"
__copyright__ = ("Copyright 2014 Marc-Andre Legault and Louis-Philippe "
"Lemieux Perreault. All rights reserved.")
__license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"
import collections
import gzip
import xml.etree.ElementTree as etree
from ..structures.sequences import Sequence
class SeqXML(object):
    """Parses the SeqXML format representing sequence data.

    :param fn: The filename of the SeqXML file. Filenames ending in ``.gz``
               are read through :py:mod:`gzip`. The format description is
               available at
               `seqxml.org <http://seqxml.org/0.4/seqxml_doc_v0.4.html>`_
               (visited Nov. 2014).
    :type fn: str

    The returned object will have a list of entries which are
    :py:class:`Sequence` objects.

    After construction the following attributes are set:

    - ``root``: the root element of the parsed XML tree.
    - ``entries``: one :py:class:`Sequence` per child of the root element,
      in document order.
    - ``id_index``: a dict mapping every entry ``uid`` to its
      :py:class:`Sequence`. If an id occurs more than once, the last entry
      with that id is the one kept in the index.

    """

    # Maps the SeqXML sequence element tags to the sequence type label that
    # is passed to Sequence.
    seq_xml_seqtypes = {
        "DNAseq": "DNA",
        "RNAseq": "RNA",
        "AAseq": "AA",
    }

    def __init__(self, fn):
        opener = gzip.open if fn.endswith(".gz") else open

        # Always read bytes: the XML parser then decodes the document using
        # its own encoding declaration instead of the platform's default
        # text encoding (which is what a text-mode open() would apply).
        with opener(fn, "rb") as f:
            tree = etree.parse(f)

        self.root = tree.getroot()
        self.entries = []
        self.id_index = {}

        for entry in self.root:
            sequence = self._parse_entry(entry)
            self.entries.append(sequence)
            # The id_index allows fast Sequence lookup by id.
            self.id_index[sequence.uid] = sequence

    @staticmethod
    def _parse_entry(entry):
        """Build a Sequence object from a single entry element.

        :param entry: A child element of the SeqXML root.
        :type entry: xml.etree.ElementTree.Element

        :returns: ``Sequence(uid, seq, seq_type, info)`` where ``uid`` is the
                  entry's ``id`` attribute (``None`` if absent), ``seq`` and
                  ``seq_type`` come from the sequence child element (both
                  ``None`` if there is none) and ``info`` is a dict of the
                  additional fields.

        :raises KeyError: If a ``property``, ``species`` or ``DBRef`` child
                          lacks one of the attributes read below.

        """
        # Mandatory fields
        uid = entry.attrib.get("id")
        seq_text = None
        seq_type = None

        # Additional information
        info = {}

        seq_types = SeqXML.seq_xml_seqtypes
        for elem in entry:
            tag = elem.tag

            # This is the biological sequence.
            if tag in seq_types:
                seq_type = seq_types[tag]
                seq_text = elem.text

            # Those are all "info" fields.
            elif tag == "property":
                # A property without a "value" attribute is stored as 1.
                info[elem.attrib["name"]] = elem.attrib.get("value", 1)

            elif tag == "species":
                info["species"] = elem.attrib["name"]
                info["species_ncbi_tax_id"] = elem.attrib["ncbiTaxID"]

            elif tag == "description":
                info["description"] = elem.text

            elif tag == "DBRef":
                info["db_name"] = elem.attrib["source"]
                info["db_acc"] = elem.attrib["id"]

        return Sequence(uid, seq_text, seq_type, info)

    def get_seq(self, uid):
        """Get a sequence from its unique identifier.

        :param uid: The sequence id.
        :type uid: str

        :returns: The matching :py:class:`Sequence`, or ``None`` if no entry
                  has this id.

        """
        return self.id_index.get(uid)