Source code for galaxy.visualization.genomes

import os, re, sys, glob, logging
from bx.seq.twobit import TwoBitFile
from galaxy.util.json import loads
from galaxy import model, util
from galaxy.util.bunch import Bunch

log = logging.getLogger( __name__ )

# FIXME: copied from tracks.py
# Message strings returned to browser
messages = Bunch(
    PENDING = "pending",
    NO_DATA = "no data",
    NO_CHROMOSOME = "no chromosome",
    NO_CONVERTER = "no converter",
    NO_TOOL = "no tool",
    DATA = "data",
    ERROR = "error",
    OK = "ok"
)

[docs]def decode_dbkey( dbkey ): """ Decodes dbkey and returns tuple ( username, dbkey )""" if ':' in dbkey: return dbkey.split( ':' ) else: return None, dbkey
[docs]class GenomeRegion( object ): """ A genomic region on an individual chromosome. """ def __init__( self, chrom = None, start = 0, end = 0, sequence=None ): self.chrom = chrom self.start = int( start ) self.end = int( end ) self.sequence = sequence def __str__( self ): return self.chrom + ":" + str( self.start ) + "-" + str( self.end ) @staticmethod
[docs] def from_dict( obj_dict ): return GenomeRegion( chrom = obj_dict[ 'chrom' ], start = obj_dict[ 'start' ], end = obj_dict[ 'end' ] )
@staticmethod
[docs] def from_str( obj_str ): # check for gene region gene_region = obj_str.split(':') # split gene region into components if (len(gene_region) == 2): gene_interval = gene_region[1].split('-') # check length if (len(gene_interval) == 2): return GenomeRegion(chrom = gene_region[0], start = gene_interval[0], end = gene_interval[1]) # return genome region instance return GenomeRegion()
[docs]class Genome( object ): """ Encapsulates information about a known genome/dbkey. """ def __init__( self, key, description, len_file=None, twobit_file=None ): self.key = key self.description = description self.len_file = len_file self.twobit_file = twobit_file
[docs] def to_dict( self, num=None, chrom=None, low=None ): """ Returns representation of self as a dictionary. """ def check_int(s): if s.isdigit(): return int(s) else: return s def split_by_number(s): return [ check_int(c) for c in re.split('([0-9]+)', s) ] # # Parameter check, setting. # if num: num = int( num ) else: num = sys.maxint if low: low = int( low ) if low < 0: low = 0 else: low = 0 # # Get chroms data: # (a) chrom name, len; # (b) whether there are previous, next chroms; # (c) index of start chrom. # len_file_enumerate = enumerate( open( self.len_file ) ) chroms = {} prev_chroms = False start_index = 0 if chrom: # Use starting chrom to start list. found = False count = 0 for line_num, line in len_file_enumerate: if line.startswith("#"): continue name, len = line.split("\t") if found: chroms[ name ] = int( len ) count += 1 elif name == chrom: # Found starting chrom. chroms[ name ] = int ( len ) count += 1 found = True start_index = line_num if line_num != 0: prev_chroms = True if count >= num: break else: # Use low to start list. high = low + int( num ) prev_chroms = ( low != 0 ) start_index = low # Read chrom data from len file. for line_num, line in len_file_enumerate: if line_num < low: continue if line_num >= high: break if line.startswith("#"): continue # LEN files have format: # <chrom_name><tab><chrom_length> fields = line.split("\t") chroms[ fields[0] ] = int( fields[1] ) # Set flag to indicate whether there are more chroms after list. next_chroms = False try: len_file_enumerate.next() next_chroms = True except: # No more chroms to read. pass to_sort = [{ 'chrom': chrm, 'len': length } for chrm, length in chroms.iteritems()] to_sort.sort(lambda a,b: cmp( split_by_number(a['chrom']), split_by_number(b['chrom']) )) return { 'id': self.key, 'reference': self.twobit_file is not None, 'chrom_info': to_sort, 'prev_chroms' : prev_chroms, 'next_chroms' : next_chroms, 'start_index' : start_index }
[docs]class Genomes( object ): """ Provides information about available genome data and methods for manipulating that data. """ def __init__( self, app ): self.app = app # Create list of genomes from app.genome_builds self.genomes = {} # Store internal versions of data tables for twobit and __dbkey__ self._table_versions = { 'twobit': None, '__dbkeys__': None } self.reload_genomes()
[docs] def reload_genomes( self ): self.genomes = {} # Store table versions for later for table_name in self._table_versions.keys(): table = self.app.tool_data_tables.get( table_name, None ) if table is not None: self._table_versions[ table_name ] = table._loaded_content_version twobit_table = self.app.tool_data_tables.get( 'twobit', None ) twobit_fields = {} if twobit_table is None: # Add genome data (twobit files) to genomes, directly from twobit.loc try: for line in open( os.path.join( self.app.config.tool_data_path, "twobit.loc" ) ): if line.startswith("#"): continue val = line.split() if len( val ) == 2: key, path = val twobit_fields[ key ] = path except IOError, e: # Thrown if twobit.loc does not exist. log.exception( "Error reading twobit.loc: %s", e ) for key, description in self.app.genome_builds.get_genome_build_names(): self.genomes[ key ] = Genome( key, description ) # Add len files to genomes. self.genomes[ key ].len_file = self.app.genome_builds.get_chrom_info( key )[0] if self.genomes[ key ].len_file: if not os.path.exists( self.genomes[ key ].len_file ): self.genomes[ key ].len_file = None # Add genome data (twobit files) to genomes. if twobit_table is not None: self.genomes[ key ].twobit_file = twobit_table.get_entry( 'value', key, 'path', default=None ) elif key in twobit_fields: self.genomes[ key ].twobit_file = twobit_fields[ key ]
[docs] def check_and_reload( self ): # Check if tables have been modified, if so reload for table_name, table_version in self._table_versions.iteritems(): table = self.app.tool_data_tables.get( table_name, None ) if table is not None and not table.is_current_version( table_version ): return self.reload_genomes()
[docs] def get_build( self, dbkey ): """ Returns build for the given key. """ self.check_and_reload() rval = None if dbkey in self.genomes: rval = self.genomes[ dbkey ] return rval
[docs] def get_dbkeys( self, trans, chrom_info=False, **kwd ): """ Returns all known dbkeys. If chrom_info is True, only dbkeys with chromosome lengths are returned. """ self.check_and_reload() dbkeys = [] # Add user's custom keys to dbkeys. user_keys_dict = {} user = trans.get_user() if user: if 'dbkeys' in user.preferences: user_keys_dict = loads( user.preferences[ 'dbkeys' ] ) dbkeys.extend( [ (attributes[ 'name' ], key ) for key, attributes in user_keys_dict.items() ] ) # Add app keys to dbkeys. # If chrom_info is True, only include keys with len files (which contain chromosome info). filter_fn = lambda b: True if chrom_info: filter_fn = lambda b: b.len_file is not None dbkeys.extend( [ ( genome.description, genome.key ) for key, genome in self.genomes.items() if filter_fn( genome ) ] ) return dbkeys
[docs] def chroms( self, trans, dbkey=None, num=None, chrom=None, low=None ): """ Returns a naturally sorted list of chroms/contigs for a given dbkey. Use either chrom or low to specify the starting chrom in the return list. """ self.check_and_reload() # If there is no dbkey owner, default to current user. dbkey_owner, dbkey = decode_dbkey( dbkey ) if dbkey_owner: dbkey_user = trans.sa_session.query( trans.app.model.User ).filter_by( username=dbkey_owner ).first() else: dbkey_user = trans.user # # Get/create genome object. # genome = None twobit_file = None # Look first in user's custom builds. if dbkey_user and 'dbkeys' in dbkey_user.preferences: user_keys = loads( dbkey_user.preferences['dbkeys'] ) if dbkey in user_keys: dbkey_attributes = user_keys[ dbkey ] dbkey_name = dbkey_attributes[ 'name' ] # If there's a fasta for genome, convert to 2bit for later use. if 'fasta' in dbkey_attributes: build_fasta = trans.sa_session.query( trans.app.model.HistoryDatasetAssociation ).get( dbkey_attributes[ 'fasta' ] ) len_file = build_fasta.get_converted_dataset( trans, 'len' ).file_name build_fasta.get_converted_dataset( trans, 'twobit' ) # HACK: set twobit_file to True rather than a file name because # get_converted_dataset returns null during conversion even though # there will eventually be a twobit file available for genome. twobit_file = True # Backwards compatibility: look for len file directly. elif 'len' in dbkey_attributes: len_file = trans.sa_session.query( trans.app.model.HistoryDatasetAssociation ).get( user_keys[ dbkey ][ 'len' ] ).file_name if len_file: genome = Genome( dbkey, dbkey_name, len_file=len_file, twobit_file=twobit_file ) # Look in history and system builds. if not genome: # Look in history for chromosome len file. len_ds = trans.db_dataset_for( dbkey ) if len_ds: genome = Genome( dbkey, dbkey_name, len_file=len_ds.file_name ) # Look in system builds. elif dbkey in self.genomes: genome = self.genomes[ dbkey ] # Set up return value or log exception if genome not found for key. rval = None if genome: rval = genome.to_dict( num=num, chrom=chrom, low=low ) else: log.exception( 'genome not found for key %s' % dbkey ) return rval
[docs] def has_reference_data( self, dbkey, dbkey_owner=None ): """ Returns true if there is reference data for the specified dbkey. If dbkey is custom, dbkey_owner is needed to determine if there is reference data. """ self.check_and_reload() # Look for key in built-in builds. if dbkey in self.genomes and self.genomes[ dbkey ].twobit_file: # There is built-in reference data. return True # Look for key in owner's custom builds. if dbkey_owner and 'dbkeys' in dbkey_owner.preferences: user_keys = loads( dbkey_owner.preferences[ 'dbkeys' ] ) if dbkey in user_keys: dbkey_attributes = user_keys[ dbkey ] if 'fasta' in dbkey_attributes: # Fasta + converted datasets can provide reference data. return True return False
[docs] def reference( self, trans, dbkey, chrom, low, high ): """ Return reference data for a build. """ self.check_and_reload() # If there is no dbkey owner, default to current user. dbkey_owner, dbkey = decode_dbkey( dbkey ) if dbkey_owner: dbkey_user = trans.sa_session.query( trans.app.model.User ).filter_by( username=dbkey_owner ).first() else: dbkey_user = trans.user if not self.has_reference_data( dbkey, dbkey_user ): return None # # Get twobit file with reference data. # twobit_file_name = None if dbkey in self.genomes: # Built-in twobit. twobit_file_name = self.genomes[dbkey].twobit_file else: user_keys = loads( dbkey_user.preferences['dbkeys'] ) dbkey_attributes = user_keys[ dbkey ] fasta_dataset = trans.sa_session.query( trans.app.model.HistoryDatasetAssociation ).get( dbkey_attributes[ 'fasta' ] ) msg = fasta_dataset.convert_dataset( trans, 'twobit' ) if msg: return msg else: twobit_dataset = fasta_dataset.get_converted_dataset( trans, 'twobit' ) twobit_file_name = twobit_dataset.file_name # Read and return reference data. try: twobit = TwoBitFile( open( twobit_file_name ) ) if chrom in twobit: seq_data = twobit[chrom].get( int(low), int(high) ) return GenomeRegion( chrom=chrom, start=low, end=high, sequence=seq_data ) except IOError: return None