import os, gzip, re, gzip, zipfile, binascii, bz2, imghdr
from galaxy import util
from StringIO import StringIO
# check_html() stops scanning once it has read more than this many lines;
# a falsy value disables the limit.
HTML_CHECK_LINES = 100

# PIL is an optional dependency.  Try the legacy top-level ``Image`` module
# first, then the packaged ``PIL.Image``; when neither can be imported PIL is
# set to None and check_image() falls back to ``imghdr``.
try:
    import Image as PIL
except ImportError:
    try:
        from PIL import Image as PIL
    except Exception:
        # Best effort: any failure to load the optional library disables it,
        # but KeyboardInterrupt / SystemExit are no longer swallowed.
        PIL = None
def check_image(file_path):
    """Report whether ``file_path`` looks like an image.

    When PIL could be imported the file is opened with ``PIL.open``;
    otherwise ``imghdr.what`` is used to recognise the file type.

    :param file_path: path of the file to inspect.
    :returns: the opened PIL image object when PIL is available and can open
        the file; ``True`` when PIL is unavailable and ``imghdr`` recognises
        the file; ``False`` in every other case.  Because the positive result
        is not always a bool, callers should rely on truthiness only.
    """
    if PIL is not None:
        try:
            im = PIL.open(file_path)
        except Exception:
            # Any failure to open the file is treated as "not an image".
            return False
        if im:
            return im
        return False
    return imghdr.what(file_path) is not None
# Tags whose presence marks content as HTML.  Compiled once at import time and
# matched case-insensitively.
_HTML_TAG_PATTERNS = tuple(
    re.compile(pattern, re.I)
    for pattern in (
        r"<A\s+[^>]*HREF[^>]+>",
        r"<IFRAME[^>]*>",
        r"<FRAMESET[^>]*>",
        r"<META[\W][^>]*>",
        r"<SCRIPT[^>]*>",
    )
)


def check_html(file_path, chunk=None):
    """Return True when the inspected content contains an HTML tag.

    The content is scanned line by line for an ``<a href>``, ``<iframe>``,
    ``<frameset>``, ``<meta>`` or ``<script>`` tag.  Scanning stops after the
    first match, or once more than ``HTML_CHECK_LINES`` lines have been read
    (when that limit is truthy).

    :param file_path: path of the file to read; only used when ``chunk`` is
        None.
    :param chunk: optional content to scan instead of the file.  Either a
        single string / bytes blob, which is split into lines, or an iterable
        of lines.
    :returns: ``True`` if any line matches one of the tag patterns, otherwise
        ``False``.
    """
    # Local import so this block does not depend on the module import list.
    import io

    if chunk is None:
        # io.open gives universal-newline text mode on Python 2 and 3 (the
        # old "U" mode was removed in Python 3.11).  latin-1 maps every byte
        # to a character, so arbitrary file content cannot fail to decode.
        handle = io.open(file_path, "r", encoding="latin-1")
        lines = handle
    else:
        handle = None
        # Iterating a string yields single characters, which can never match
        # a tag pattern, so a string/bytes chunk must be split into lines.
        if hasattr(chunk, "splitlines"):
            lines = chunk.splitlines()
        else:
            lines = chunk
    try:
        # TODO: Potentially reading huge lines into string here, this should
        # be reworked.
        for lineno, line in enumerate(lines, 1):
            if isinstance(line, bytes) and not isinstance(line, str):
                # Python 3 bytes line: the patterns are text patterns.
                line = line.decode("latin-1")
            if any(pattern.search(line) for pattern in _HTML_TAG_PATTERNS):
                return True
            if HTML_CHECK_LINES and lineno > HTML_CHECK_LINES:
                break
        return False
    finally:
        # Only close what this function opened, even if scanning raised.
        if handle is not None:
            handle.close()
def check_binary(name, file_path=True):
    """Return True when the first 100 characters contain binary data.

    :param name: a filesystem path when ``file_path`` is true, otherwise the
        text to inspect itself.
    :param file_path: selects how ``name`` is interpreted (see above).
    :returns: ``True`` if ``util.is_binary`` flags any of the first 100
        characters, otherwise ``False``.
    """
    # Handles files if file_path is True or text if file_path is False
    if file_path:
        # NOTE(review): mode "U" is kept as in the original; it was removed
        # in Python 3.11, so this needs revisiting if the module is ported.
        temp = open(name, "U")
    else:
        temp = StringIO(name)
    try:
        return any(util.is_binary(char) for char in temp.read(100))
    finally:
        temp.close()
def check_gzip(file_path):
    """Classify ``file_path`` as gzipped and as acceptable gzipped content.

    :param file_path: path of the file to inspect.
    :returns: a tuple of booleans ``(is_gzipped, is_valid)``:

        * ``(False, False)`` - the file cannot be read, does not start with
          ``util.gzip_magic``, or its first decompressed bytes cannot be read;
        * ``(True, True)`` - the decompressed data starts with ``.sff``, or no
          HTML was reported for the first decompressed chunk;
        * ``(True, False)`` - ``check_html`` reported HTML in the first
          decompressed chunk.

    An error while decompressing the 32Kb chunk itself is not caught and
    propagates to the caller.
    """
    # Make sure we have a gzipped file.  Binary mode so the magic bytes are
    # compared untranslated.
    try:
        with open(file_path, "rb") as handle:
            magic_check = handle.read(2)
        if magic_check != util.gzip_magic:
            return (False, False)
    except Exception:
        return (False, False)
    # We support some binary data types, so check if the compressed binary
    # file is valid.  If the file is Bam, it should already have been detected
    # as such, so we'll just check for sff format.
    try:
        sff_probe = gzip.open(file_path)
        try:
            header = sff_probe.read(4)
        finally:
            sff_probe.close()
        if header == b'.sff':
            return (True, True)
    except Exception:
        # Decompression of the first bytes failed: not a usable gzip file.
        return (False, False)
    CHUNK_SIZE = 2 ** 15  # 32Kb
    gzipped_file = gzip.GzipFile(file_path, mode='rb')
    try:
        chunk = gzipped_file.read(CHUNK_SIZE)
    finally:
        gzipped_file.close()
    # See if we have a compressed HTML file
    if check_html(file_path, chunk=chunk):
        return (True, False)
    return (True, True)
def check_bz2(file_path):
    """Classify ``file_path`` as bzip2-compressed and as acceptable content.

    :param file_path: path of the file to inspect.
    :returns: a tuple of booleans ``(is_bz2, is_valid)``:

        * ``(False, False)`` - the file cannot be read or does not start with
          ``util.bz2_magic``;
        * ``(True, False)`` - ``check_html`` reported HTML in the first
          decompressed chunk;
        * ``(True, True)`` - otherwise.

    An error while decompressing the 32Kb chunk is not caught and propagates
    to the caller.
    """
    # Binary mode so the magic bytes are compared untranslated.
    try:
        with open(file_path, "rb") as handle:
            magic_check = handle.read(3)
        if magic_check != util.bz2_magic:
            return (False, False)
    except Exception:
        return (False, False)
    CHUNK_SIZE = 2 ** 15  # 32Kb
    bzipped_file = bz2.BZ2File(file_path, mode='rb')
    try:
        chunk = bzipped_file.read(CHUNK_SIZE)
    finally:
        bzipped_file.close()
    # See if we have a compressed HTML file
    if check_html(file_path, chunk=chunk):
        return (True, False)
    return (True, True)
def check_zip(file_path):
    """Return True when ``zipfile.is_zipfile`` recognises ``file_path``.

    :param file_path: path of the file to inspect.
    :returns: ``True`` for a zip archive, otherwise ``False``.
    """
    return bool(zipfile.is_zipfile(file_path))
def is_bz2(file_path):
    """Return the first element (``is_bz2``) of ``check_bz2(file_path)``.

    The validity flag computed by ``check_bz2`` is ignored.
    """
    # Index instead of unpacking: avoids a local that shadows this function's
    # own name and an unused ``is_valid`` variable.
    return check_bz2(file_path)[0]
def is_gzip(file_path):
    """Return the first element (``is_gzipped``) of ``check_gzip(file_path)``.

    The validity flag computed by ``check_gzip`` is ignored.
    """
    return check_gzip(file_path)[0]