Source code for galaxy.datatypes.xml

"""
XML format classes
"""
import re
import data
import logging
from galaxy.datatypes.sniff import *
import dataproviders

# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)

@dataproviders.decorators.has_dataproviders
class GenericXml(data.Text):
    """Base format class for any XML file."""
    file_ext = "xml"

    def set_peek(self, dataset, is_multi_byte=False):
        """
        Set the peek and blurb text

        When the underlying dataset has not been purged, the peek is taken
        from the dataset's file via ``data.get_file_peek``; otherwise
        placeholder text describing the purged state is stored instead.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'XML data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """
        Determines whether the file is XML or not

        Only the first line is inspected: the file is treated as XML when
        that line starts with the ``<?xml `` declaration.

        >>> fname = get_test_fname( 'megablast_xml_parser_test1.blastxml' )
        >>> GenericXml().sniff( fname )
        True
        >>> fname = get_test_fname( 'interval.interval' )
        >>> GenericXml().sniff( fname )
        False
        """
        # The context manager guarantees the handle is closed even when
        # readline() raises (e.g. on an I/O error).
        with open(filename) as handle:
            line = handle.readline()
        # TODO - Is there a more robust way to do this?
        return line.startswith('<?xml ')

    @staticmethod
    def merge(split_files, output_file):
        """
        Merging multiple XML files is non-trivial and must be done in subclasses.

        Raises NotImplementedError when more than one split file is given;
        otherwise the work is delegated to ``data.Text.merge``.
        """
        if len(split_files) > 1:
            raise NotImplementedError("Merging multiple XML files is non-trivial and must be implemented for each XML type")
        # For one file only, use base class method (move/copy)
        data.Text.merge(split_files, output_file)

    @dataproviders.decorators.dataprovider_factory('xml', dataproviders.hierarchy.XMLDataProvider.settings)
    def xml_dataprovider(self, dataset, **settings):
        """
        Return an XMLDataProvider reading from ``dataset``.

        The dataset is wrapped in a DatasetDataProvider and ``settings`` are
        forwarded unchanged to the XMLDataProvider constructor.
        """
        dataset_source = dataproviders.dataset.DatasetDataProvider(dataset)
        return dataproviders.hierarchy.XMLDataProvider(dataset_source, **settings)
class MEMEXml(GenericXml):
    """MEME XML Output data"""
    file_ext = "memexml"

    def set_peek(self, dataset, is_multi_byte=False):
        """
        Set the peek and blurb text

        A purged dataset gets placeholder text; any other dataset gets a
        peek taken from its file plus the MEME-specific blurb.
        """
        # Handle the purged case first so the normal path reads straight through.
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
        dataset.blurb = 'MEME XML data'

    def sniff(self, filename):
        """Always returns False; the file content is not inspected."""
        return False
class CisML(GenericXml):
    """CisML XML data"""
    # see: http://www.ncbi.nlm.nih.gov/pubmed/15001475
    file_ext = "cisml"

    def set_peek(self, dataset, is_multi_byte=False):
        """
        Set the peek and blurb text

        Placeholder text is stored when the dataset has been purged;
        otherwise the peek comes from the dataset's file.
        """
        if dataset.dataset.purged:
            # Nothing left on disk to peek at.
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        source_path = dataset.file_name
        dataset.peek = data.get_file_peek(source_path, is_multi_byte=is_multi_byte)
        dataset.blurb = 'CisML data'

    def sniff(self, filename):
        """Always returns False; the file content is not inspected."""
        return False
class Phyloxml(GenericXml):
    """Format for defining phyloxml data http://www.phyloxml.org/"""
    file_ext = "phyloxml"

    def set_peek(self, dataset, is_multi_byte=False):
        """
        Set the peek and blurb text

        When the underlying dataset has not been purged, the peek is taken
        from the dataset's file; otherwise placeholder text is stored.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'Phyloxml data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """
        Checking for keyword - 'phyloxml' always in lowercase in the first few lines

        Returns True when one of the first five lines of the file contains
        the keyword, False otherwise (including for an empty file).
        """
        max_lines = 5
        # readlines(5) would treat 5 as a size hint in bytes, not a line
        # count, so count real lines instead. The context manager closes the
        # file even if reading fails.
        with open(filename, "r") as handle:
            for line_number, line in enumerate(handle):
                if line_number >= max_lines:
                    break
                if "phyloxml" in line:
                    return True
        return False

    def get_visualizations(self, dataset):
        """
        Returns a list of visualizations for datatype.
        """
        return ['phyloviz']
class Owl(GenericXml):
    """
    Web Ontology Language OWL format description
    http://www.w3.org/TR/owl-ref/
    """
    file_ext = "owl"

    def set_peek(self, dataset, is_multi_byte=False):
        """
        Set the peek and blurb text

        When the underlying dataset has not been purged, the peek is taken
        from the dataset's file; otherwise placeholder text is stored.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = "Web Ontology Language OWL"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disc'

    def sniff(self, filename):
        """
        Checking for keyword - '<owl:' in the first 200 lines.

        Returns True as soon as a line containing the marker is found,
        False when none of the first 200 lines contains it.
        """
        max_lines = 200
        owl_marker = '<owl:'
        with open(filename) as handle:
            # readlines(200) would treat 200 as a size hint in bytes, not a
            # line count, so iterate and count real lines instead.
            for line_number, line in enumerate(handle):
                if line_number >= max_lines:
                    break
                if owl_marker in line:
                    return True
        return False