Package biana :: Package BianaParser :: Module bianaParser
[hide private]
[frames | no frames]

Source Code for Module biana.BianaParser.bianaParser

  1  """ 
  2      BIANA: Biologic Interactions and Network Analysis 
  3      Copyright (C) 2009  Javier Garcia-Garcia, Emre Guney, Baldo Oliva 
  4   
  5      This program is free software: you can redistribute it and/or modify 
  6      it under the terms of the GNU General Public License as published by 
  7      the Free Software Foundation, either version 3 of the License, or 
  8      (at your option) any later version. 
  9   
 10      This program is distributed in the hope that it will be useful, 
 11      but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 13      GNU General Public License for more details. 
 14   
 15      You should have received a copy of the GNU General Public License 
 16      along with this program.  If not, see <http://www.gnu.org/licenses/>. 
 17   
 18  """ 
 19   
 20   
 21  """ 
 22  File        : bianaParser.py 
 23  Author      : Javier Garcia 
 24  Creation    : October 2007 
 25  Contents    : General parser to introduce information into biana 
 26  Called from : command line 
 27   
 28  ======================================================================================================= 
 29   
 30  This file implements a program that fills up tables in database biana with information from distinct databases 
 31   
 32  """ 
 33   
 34  ## STEP 1: IMPORT NECESSARY MODULES 
 35   
 36  import sys 
 37  import getopt 
 38  import re 
 39  import time 
 40  import gzip 
 41  import traceback 
 42  import os 
 43  #import tarfile 
 44   
 45   
 46  from biana.BianaDB import BianaDBaccess 
 47  from biana.BianaObjects import * 
 48   
 49   
class BianaParser(object):
    """
    General Parser Class to biana

    Base class for the command-line parsers that load an external database
    into a biana database.  The constructor declares the command-line
    options and parses them; start() registers the external database, calls
    parse_database() and closes the database connection, rolling the
    changes back if anything fails.

    Specific parsers are expected to overwrite parse_database().
    """
55 - def __init__(self, default_db_description = None, 56 default_script_name = "bianaParser.py", 57 default_script_description = "This file implements a program that fills up tables in database biana with information from distinct databases", 58 #content_type_list = [], 59 additional_compulsory_arguments = [], 60 additional_optional_arguments = []):
61 62 """ 63 Starts the bianaParser Object 64 """ 65 66 print "Parser object started" 67 68 self.compulsory_arguments = [ ("input-identifier=",None,"path or file name of input file(s) containing database data. Path names must end with \"/\"."), 69 ("biana-dbname=",None,"name of database biana to be used"), 70 ("biana-dbhost=",None,"name of host where database biana to be used is placed"), 71 ("database-name=",None,"internal identifier name to this database (it must be unique in the database)"), 72 ("database-version=",None,"version of the database to be inserted") ] 73 74 self.compulsory_arguments.extend(additional_compulsory_arguments) 75 76 77 self.optional_arguments = [ ("biana-dbuser=",None,"username accessing the database (not required in most systems)"), 78 ("biana-dbpass=",None,"password of username accessing the database (not required in most systems"), 79 ("help",None,"prints this message and exits"), 80 ("verbose",0,"prints process info to stdout"), 81 ("log-file=",None,"Prints a log file of the parsing result (number of inserted proteins, references...)"), 82 ("time-control",None,"prints to stderr a control of the timing of the parser"), 83 ("database-description=",default_db_description,"Description of the database to be inserted."), 84 ("optimize-for-parsing",None,"Optimizes database for parsing"), 85 ("promiscuous",False,"sets the database to be parsed as promiscuous (whose entities can be included in multi user entities)") ] 86 #("mode=","scratch","sets mode to be used by parser. 
Valid modes are: \"scratch\" (biana database is empty, create it from scratch) or \"tables\" (fill only tables indicated in tables_to_fill (see code)")] 87 88 self.optional_arguments.extend(additional_optional_arguments) 89 90 self.script_name = default_script_name 91 self.script_description = default_script_description 92 93 #Parse general methods 94 self.arguments_dic = self.parseArguments() 95 self.input_file = self.arguments_dic["input-identifier"] 96 self.biana_dbname = self.arguments_dic["biana-dbname"] 97 self.biana_dbhost = self.arguments_dic["biana-dbhost"] 98 self.sourcedb_name = self.arguments_dic["database-name"] 99 self.sourcedb_version = self.arguments_dic["database-version"] 100 self.biana_dbuser = self.arguments_dic["biana-dbuser"] 101 self.biana_dbpass = self.arguments_dic["biana-dbpass"] 102 self.help = self.arguments_dic["help"] 103 self.verbose = self.arguments_dic["verbose"] 104 self.time_control = self.arguments_dic["time-control"] 105 self.log_file = self.arguments_dic["log-file"] 106 self.optimize_for_parsing = self.arguments_dic["optimize-for-parsing"] 107 #self.mode = self.arguments_dic["mode"] 108 self.is_promiscuous = self.arguments_dic["promiscuous"] # Flag deciding whether database gives information that is going to be added to more than one user entiries 109 110 self.database = None 111 if self.arguments_dic.has_key("default-attribute"): 112 self.default_eE_attribute = self.arguments_dic["default-attribute"] # default externalEntityAttribute specified by the particular database parser (it will be overwritten in the parser if not given as argument) 113 else: 114 self.default_eE_attribute = ""
115 116 #self.content_type_list = content_types 117 118
119 - def start(self):
120 121 122 print "Parser started" 123 if isinstance(self.sourcedb_name,int) or isinstance(self.sourcedb_version,int): 124 sys.stderr.write("You must insert correctly the database name and database version\n") 125 sys.exit(1) 126 127 #if( self.mode=="scratch" ): 128 self.database_description = self.arguments_dic["database-description"] 129 130 # Log dictionary where all log information will be stored 131 self.log = {} 132 if self.log_file: 133 self.log_file_fd = file(self.log_file, 'w') 134 135 self.biana_access = BianaDBaccess(dbname=self.biana_dbname, dbhost=self.biana_dbhost, dbuser=self.biana_dbuser, use_buffer=True, dbpassword=self.biana_dbpass, lock_tables=True, check_integrity=True ) 136 137 138 # check data consistency 139 140 # Time related 141 self.initial_time = time.time() 142 143 # Insert the information associated to the parsed database 144 # Introduce database info into biana database 145 #if( self.mode=="scratch" ): 146 147 self.database = ExternalDatabase( databaseName = self.sourcedb_name, 148 databaseVersion = self.sourcedb_version, 149 databaseFile = self.input_file.split(os.sep)[-1], 150 databaseDescription = self.database_description, 151 defaultExternalEntityAttribute = self.default_eE_attribute, 152 isPromiscuous = self.is_promiscuous ) 153 #content_type_list = self.content_type_list) 154 155 self.biana_access.insert_new_external_database( externalDatabase = self.database ) 156 157 # Open the input file descriptor 158 # This is a responsability of subclasses method 159 160 try: 161 if self.optimize_for_parsing: 162 self.biana_access.optimize_database_for(mode="parsing") 163 164 self.parse_database() 165 166 # set the parsing time 167 self.database.set_parsing_time( int(time.time() - self.initial_time) ) 168 169 # Updates the information that this external database has inserted 170 self.biana_access.update_external_database_external_entity_attributes( self.database ) 171 172 self.close() 173 174 175 except: 176 traceback.print_exc() 177 
sys.stderr.write("ERROR WHILE PARSING. ALL MODIFICATIONS ARE GOING TO BE DELETED\n") 178 self.biana_access._rollback() 179 sys.exit(1)
180 181 182 # METHODS 183
184 - def close(self):
185 ## LAST STEP: CLOSE DATABASE CONNECTION IMPORTANT !!!! 186 ## As bianaDBaccess uses an internal buffer, it is necessary to close the connection to sure that all inserts are correctly done, as well as unlock tables 187 188 189 self.biana_access.close() 190 191 if self.time_control: 192 sys.stderr.write("Total time: %s seconds\n" %(time.time()-self.initial_time)) 193 194 if self.log_file: 195 self.log_file_fd.write(self.get_log_string()) 196 self.log_file_fd.close() 197 198 if self.verbose: 199 sys.stderr.write("\n Total time: %s \n" %(time.time()-self.initial_time) ) 200 sys.stderr.write(self.get_log_string())
201 202 203 204 ## GENERAL PARSER METHODS ##
205 - def parseArguments(self):
206 """ 207 Method that returns a dictionary with the values of the arguments 208 209 """ 210 211 arguments = self.compulsory_arguments+self.optional_arguments 212 213 # Set all default values 214 #return_values = [i[1] for i in arguments] 215 return_dict = {} 216 for i in arguments: 217 return_dict[i[0].replace("=","")] = i[1] 218 219 # Obtain a list with the names of all arguments 220 list_arguments = [argument[0] for argument in arguments] 221 # It can be of the following way because it contains "=" digit 222 #list_arguments = return_dict.keys() 223 224 225 # Parse arguments 226 try: 227 opts, args = getopt.getopt(sys.argv[2:], "", list_arguments) 228 229 except getopt.GetoptError, bad_opt: 230 # return error in parsing parameters, and return void list 231 raise ValueError("%s\n" %(bad_opt.__str__()) ) 232 233 # If there is no error, continue with the parsing 234 for option,value in opts: 235 if option=="--help": 236 self.print_help() 237 sys.exit(2) 238 for actual_argument in list_arguments: 239 # Delete the "=" value if it has 240 temp_arg = actual_argument.replace("=","") 241 if option=="--"+temp_arg: 242 if value=="": 243 return_dict[temp_arg]=1 244 else: 245 return_dict[temp_arg]=value 246 247 248 # Check for all compulsory arguments: 249 for comp_arg in self.compulsory_arguments: 250 if return_dict[comp_arg[0].replace("=","")] is None: 251 sys.stderr.write("%s argument is not defined!\n" %(comp_arg[0].replace("=",""))) 252 self.print_help() 253 sys.exit(2) 254 255 return return_dict
256 257 258 ## LOG RELATED METHODS ##
259 - def add_to_log(self,key):
260 """ 261 Increment the counter of a log dictionary for a given key 262 263 Used in parsers 264 265 """ 266 267 try: 268 self.log[key] += 1 269 except KeyError: 270 self.log[key] = 1
271 272
273 - def get_log_string(self):
274 """ 275 Returns a string with the content of the log dictionary 276 277 Format: key: value 278 """ 279 280 string_list = [] 281 282 for log_element in self.log.keys(): 283 string_list.append("%s: %s" %(log_element,self.log[log_element])) 284 285 return "\n".join(string_list)
286 287
288 - def print_help(self):
289 290 print "--------------------------------------------------------------------------------------------------------------" 291 print "DESCRIPTION:" 292 print "\t"+self.script_description 293 294 usage = "\tpython %s " %(self.script_name) 295 296 for argument in self.compulsory_arguments: 297 if re.search("=",argument[0]): 298 usage = usage + "--%s%s " %(argument[0],argument[0].rstrip("=")) 299 else: 300 usage = usage + "--%s " %(argument[0]) 301 302 for argument in self.optional_arguments: 303 if re.search("=",argument[0]): 304 usage = usage + "[--%s=%s] " %(argument[0],argument[0].rstrip("=")) 305 else: 306 usage = usage + "[--%s] " %(argument[0]) 307 308 print "\n" 309 print "USAGE:" 310 print usage 311 312 print "\nWHERE:\n" 313 314 if len(self.compulsory_arguments)>0: 315 print "COMPULSORY ARGUMENTS:" 316 317 for argument in self.compulsory_arguments: 318 sys.stdout.write("\t%s:" %(argument[0].rstrip("="))) 319 sys.stdout.write("%s" %(self._indent(3,len(argument[0])-1))) 320 argument_description = self._splitsize(string=argument[2],size=80) 321 if len(argument_description)==1: 322 sys.stdout.write("%s\n" %(argument[2])) 323 else: 324 sys.stdout.write("%s\n" %(argument_description[0])) 325 for i in range(1,len(argument_description)): 326 sys.stdout.write("\t\t\t\t%s\n" %(argument_description[i])) 327 328 if len(self.optional_arguments)>0: 329 print 330 print "OPTIONAL ARGUMENTS:" 331 332 for argument in self.optional_arguments: 333 sys.stdout.write("\t%s:" %(argument[0].rstrip("="))) 334 sys.stdout.write("%s" %(self._indent(3,len(argument[0])-1))) 335 argument_description = self._splitsize(string=argument[2]+" [default: %s]" %(argument[1]),size=80) 336 if len(argument_description)==1: 337 sys.stdout.write("%s [default: %s]\n" %(argument[2],argument[1])) 338 else: 339 sys.stdout.write("%s\n" %(argument_description[0])) 340 for i in range(1,len(argument_description)): 341 sys.stdout.write("\t\t\t\t%s\n" %(argument_description[i])) 342 343 print 
"--------------------------------------------------------------------------------------------------------------"
344 345 346
347 - def _indent(self,max_num_tabulators, initial_length):
348 349 num_tabulators = max_num_tabulators - (initial_length+1)/8 350 351 values_to_return = [] 352 353 #print "num tabulators: %s" %(num_tabulators) 354 355 for i in xrange(num_tabulators): 356 values_to_return.append("\t") 357 358 return "".join(values_to_return)
359
360 - def _splitsize(self, string, size):
361 """ 362 Split a string in substrings with a determined size 363 """ 364 365 list_return = [] 366 367 final_position=0 368 369 if len(string)<=size: 370 list_return = [string] 371 else: 372 for i in xrange(len(string)/size): 373 initial_position = i*size + final_position - i*size 374 final_position = (i+1)*size 375 while final_position<len(string) and string[final_position] != " " and string[final_position] != "\t": 376 final_position += 1 377 list_return.append(string[initial_position:final_position]) 378 379 return list_return
380 381
383 """ 384 Create the input file descriptor given input database file name. Handles gzipped data as well. 385 """ 386 387 self.input_file_fd = None 388 389 if self.input_file != "None": 390 391 if( os.path.isfile(self.input_file) ): 392 #if( self.input_file.endswith("tar.gz") ): 393 # self.input_file_fd = tarfile.open(self.input_file,'r') 394 if( self.input_file.endswith(".gz") ): 395 self.input_file_fd = gzip.open(self.input_file,'r') 396 else: 397 self.input_file_fd = file(self.input_file, 'r') 398 elif( os.path.isdir(self.input_file) ): 399 self.input_file_fd = None
400 401 402 403
404 - def parse_database(self):
405 """ 406 Method to be overwritten by specific parsers 407 408 The method must include the calls to control lock and unlock database procedures 409 """ 410 return
411