1 """
2 BIANA: Biologic Interactions and Network Analysis
3 Copyright (C) 2009 Javier Garcia-Garcia, Emre Guney, Baldo Oliva
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18 """
19
20
21 """
22 File : bianaParser.py
23 Author : Javier Garcia
24 Creation : October 2007
25 Contents : General parser to introduce information into biana
26 Called from : command line
27
28 =======================================================================================================
29
30 This file implements a program that fills up tables in database biana with information from distinct databases
31
32 """
33
34
35
36 import sys
37 import getopt
38 import re
39 import time
40 import gzip
41 import traceback
42 import os
43
44
45
46 from biana.BianaDB import BianaDBaccess
47 from biana.BianaObjects import *
48
49
51 """
52 General Parser Class to biana
53 """
54
def __init__(self, default_db_description = None,
             default_script_name = "bianaParser.py",
             default_script_description = "This file implements a program that fills up tables in database biana with information from distinct databases",
             additional_compulsory_arguments = None,
             additional_optional_arguments = None):
    """
    Starts the bianaParser Object

    Builds the tables of compulsory and optional command line arguments,
    parses the command line with self.parseArguments() and copies the
    parsed values onto instance attributes.

    Every argument is described by a tuple:
        (getopt long option name, default value, help text)
    Option names ending in "=" take a value.

    "default_db_description" is used as default value of --database-description
    "default_script_name" and "default_script_description" are stored for the help message
    "additional_compulsory_arguments" and "additional_optional_arguments" are
    extra argument tuples appended to the built-in ones (None or empty: nothing is added)
    """

    # Parenthesised single-argument form: prints the same text whether
    # print is a statement or a function
    print("Parser object started")

    self.compulsory_arguments = [
        ("input-identifier=",None,"path or file name of input file(s) containing database data. Path names must end with \"/\"."),
        ("biana-dbname=",None,"name of database biana to be used"),
        ("biana-dbhost=",None,"name of host where database biana to be used is placed"),
        ("database-name=",None,"internal identifier name to this database (it must be unique in the database)"),
        ("database-version=",None,"version of the database to be inserted"),
    ]

    # None replaces the former mutable [] default; both mean "no extra arguments"
    self.compulsory_arguments.extend(additional_compulsory_arguments or [])

    self.optional_arguments = [
        ("biana-dbuser=",None,"username accessing the database (not required in most systems)"),
        ("biana-dbpass=",None,"password of username accessing the database (not required in most systems"),
        ("help",None,"prints this message and exits"),
        ("verbose",0,"prints process info to stdout"),
        ("log-file=",None,"Prints a log file of the parsing result (number of inserted proteins, references...)"),
        ("time-control",None,"prints to stderr a control of the timing of the parser"),
        ("database-description=",default_db_description,"Description of the database to be inserted."),
        ("optimize-for-parsing",None,"Optimizes database for parsing"),
        ("promiscuous",False,"sets the database to be parsed as promiscuous (whose entities can be included in multi user entities)"),
    ]

    self.optional_arguments.extend(additional_optional_arguments or [])

    self.script_name = default_script_name
    self.script_description = default_script_description

    # The argument tables, script name and description must be set before
    # this call: parseArguments() and print_help() read them
    self.arguments_dic = self.parseArguments()

    parsed = self.arguments_dic
    self.input_file = parsed["input-identifier"]
    self.biana_dbname = parsed["biana-dbname"]
    self.biana_dbhost = parsed["biana-dbhost"]
    self.sourcedb_name = parsed["database-name"]
    self.sourcedb_version = parsed["database-version"]
    self.biana_dbuser = parsed["biana-dbuser"]
    self.biana_dbpass = parsed["biana-dbpass"]
    self.help = parsed["help"]
    self.verbose = parsed["verbose"]
    self.time_control = parsed["time-control"]
    self.log_file = parsed["log-file"]
    self.optimize_for_parsing = parsed["optimize-for-parsing"]

    self.is_promiscuous = parsed["promiscuous"]

    self.database = None

    # "default-attribute" is not a built-in option: it only exists when it
    # was supplied through the additional argument lists
    self.default_eE_attribute = parsed.get("default-attribute", "")
115
116
117
118
120
121
122 print "Parser started"
123 if isinstance(self.sourcedb_name,int) or isinstance(self.sourcedb_version,int):
124 sys.stderr.write("You must insert correctly the database name and database version\n")
125 sys.exit(1)
126
127
128 self.database_description = self.arguments_dic["database-description"]
129
130
131 self.log = {}
132 if self.log_file:
133 self.log_file_fd = file(self.log_file, 'w')
134
135 self.biana_access = BianaDBaccess(dbname=self.biana_dbname, dbhost=self.biana_dbhost, dbuser=self.biana_dbuser, use_buffer=True, dbpassword=self.biana_dbpass, lock_tables=True, check_integrity=True )
136
137
138
139
140
141 self.initial_time = time.time()
142
143
144
145
146
147 self.database = ExternalDatabase( databaseName = self.sourcedb_name,
148 databaseVersion = self.sourcedb_version,
149 databaseFile = self.input_file.split(os.sep)[-1],
150 databaseDescription = self.database_description,
151 defaultExternalEntityAttribute = self.default_eE_attribute,
152 isPromiscuous = self.is_promiscuous )
153
154
155 self.biana_access.insert_new_external_database( externalDatabase = self.database )
156
157
158
159
160 try:
161 if self.optimize_for_parsing:
162 self.biana_access.optimize_database_for(mode="parsing")
163
164 self.parse_database()
165
166
167 self.database.set_parsing_time( int(time.time() - self.initial_time) )
168
169
170 self.biana_access.update_external_database_external_entity_attributes( self.database )
171
172 self.close()
173
174
175 except:
176 traceback.print_exc()
177 sys.stderr.write("ERROR WHILE PARSING. ALL MODIFICATIONS ARE GOING TO BE DELETED\n")
178 self.biana_access._rollback()
179 sys.exit(1)
180
181
182
183
185
186
187
188
def close(self):
    """
    Closes the connection to the biana database and writes the final reports

    - if time control is active, the total elapsed time is written to stderr
    - if a log file was requested, the log string is written to it and the
      file is closed
    - if verbose is active, the elapsed time and the log string are written
      to stderr
    """

    self.biana_access.close()

    if self.time_control:
        sys.stderr.write("Total time: %s seconds\n" %(time.time()-self.initial_time))

    if self.log_file:
        # Release the file handle even if building or writing the log fails
        try:
            self.log_file_fd.write(self.get_log_string())
        finally:
            self.log_file_fd.close()

    if self.verbose:
        sys.stderr.write("\n Total time: %s \n" %(time.time()-self.initial_time) )
        sys.stderr.write(self.get_log_string())
201
202
203
204
def parseArguments(self):
    """
    Method that returns a dictionary with the values of the arguments

    Keys are the option names declared in self.compulsory_arguments and
    self.optional_arguments without the trailing "=". Values are the
    declared defaults, replaced by what was given in the command line.
    An option given without value (or with an empty value) is stored as 1.

    Raises ValueError if getopt rejects the command line.
    Prints the help and exits with status 2 if --help is given or if a
    compulsory argument is still None after parsing.
    """

    arguments = self.compulsory_arguments+self.optional_arguments

    # Start from the declared defaults
    return_dict = dict((argument[0].replace("=",""), argument[1]) for argument in arguments)

    list_arguments = [argument[0] for argument in arguments]

    try:
        # NOTE(review): parsing starts at sys.argv[2], so sys.argv[1] is skipped.
        # Presumably it holds the parser name given to a launcher script - confirm
        opts, _ = getopt.getopt(sys.argv[2:], "", list_arguments)
    except getopt.GetoptError as bad_opt:
        raise ValueError("%s\n" %(bad_opt.__str__()) )

    for option, value in opts:
        if option=="--help":
            self.print_help()
            sys.exit(2)

        # getopt returns long options as "--" + declared name (without "="),
        # so the dictionary key is obtained directly
        name = option[2:]
        if name in return_dict:
            if value=="":
                return_dict[name] = 1
            else:
                return_dict[name] = value

    for comp_arg in self.compulsory_arguments:
        name = comp_arg[0].replace("=","")
        if return_dict[name] is None:
            sys.stderr.write("%s argument is not defined!\n" %(name,))
            self.print_help()
            sys.exit(2)

    return return_dict
256
257
258
260 """
261 Increment the counter of a log dictionary for a given key
262
263 Used in parsers
264
265 """
266
267 try:
268 self.log[key] += 1
269 except KeyError:
270 self.log[key] = 1
271
272
def get_log_string(self):
    """
    Returns a string with the content of the log dictionary

    Format: key: value

    One line per entry of self.log, joined by newlines (empty string if
    the log is empty)
    """

    return "\n".join(["%s: %s" %(log_element, counter) for log_element, counter in self.log.items()])
286
287
289
def print_help(self):
    """
    Prints to stdout the description of the script, its usage line and the
    list of compulsory and optional arguments with their descriptions

    Argument descriptions are split with self._splitsize (80 characters)
    and aligned with self._indent
    """

    separator = "--------------------------------------------------------------------------------------------------------------"

    def write_argument(option_name, text):
        # Writes "<tab>name:<tabs>text", continuation lines go after four tabs
        sys.stdout.write("\t%s:" %(option_name.rstrip("="),))
        sys.stdout.write("%s" %(self._indent(3,len(option_name)-1),))
        text_lines = self._splitsize(string=text,size=80)
        if len(text_lines)==1:
            sys.stdout.write("%s\n" %(text,))
        else:
            sys.stdout.write("%s\n" %(text_lines[0],))
            for continuation in text_lines[1:]:
                sys.stdout.write("\t\t\t\t%s\n" %(continuation,))

    # print is always called with a single parenthesised argument, which
    # gives the same output as a print statement or as a print function
    print(separator)
    print("DESCRIPTION:")
    print("\t"+self.script_description)

    usage_parts = ["\tpython %s " %(self.script_name,)]

    for argument in self.compulsory_arguments:
        if "=" in argument[0]:
            usage_parts.append("--%s%s " %(argument[0],argument[0].rstrip("=")))
        else:
            usage_parts.append("--%s " %(argument[0],))

    for argument in self.optional_arguments:
        if "=" in argument[0]:
            # The option name already ends with "=": adding another one
            # produced "[--name==name]"
            usage_parts.append("[--%s%s] " %(argument[0],argument[0].rstrip("=")))
        else:
            usage_parts.append("[--%s] " %(argument[0],))

    print("\n")
    print("USAGE:")
    print("".join(usage_parts))

    print("\nWHERE:\n")

    if len(self.compulsory_arguments)>0:
        print("COMPULSORY ARGUMENTS:")

        for argument in self.compulsory_arguments:
            write_argument(argument[0], argument[2])

    if len(self.optional_arguments)>0:
        print("")
        print("OPTIONAL ARGUMENTS:")

        for argument in self.optional_arguments:
            write_argument(argument[0], argument[2]+" [default: %s]" %(argument[1],))

    print(separator)
344
345
346
def _indent(self,max_num_tabulators, initial_length):
    """
    Returns the tabulators needed to align a text written after a label

    "max_num_tabulators" is the number of tabulators used for an empty label
    "initial_length" is the length of the label already written

    One tabulator is removed for every 8 characters of (initial_length+1).
    An empty string is returned when nothing is left.
    """

    # Explicit floor division: same result as "/" between integers in
    # Python 2, and still an integer where "/" is true division
    num_tabulators = max_num_tabulators - (initial_length+1)//8

    # Repeating a string zero or a negative number of times gives ""
    return "\t" * num_tabulators
359
def _splitsize(self, string, size):
    """
    Split a string in substrings with a determined size

    A cut is searched at every multiple of "size" and moved forward to the
    next space or tabulator, so words are not broken. The substrings are
    consecutive slices of "string": joining them gives back the original
    text (the whitespace where a cut is done starts the next substring).

    Returns a list of substrings. A string not longer than "size" (or a
    non positive "size") is returned as a single element.
    """

    if size <= 0 or len(string) <= size:
        return [string]

    list_return = []
    initial_position = 0

    for i in range(len(string)//size):
        final_position = (i+1)*size

        # A word longer than "size" made the previous substring run past
        # this boundary: nothing to cut here (avoids empty substrings)
        if final_position <= initial_position:
            continue

        while final_position<len(string) and string[final_position] not in " \t":
            final_position += 1

        list_return.append(string[initial_position:final_position])
        initial_position = final_position

    # Text after the last boundary was lost before: keep it
    if initial_position < len(string):
        list_return.append(string[initial_position:])

    return list_return
380
381
400
401
402
403
def parse_database(self):
    """
    Method to be overwritten by specific parsers

    The method must include the calls to control lock and unlock database procedures

    This base implementation does nothing and returns None
    """
    return
411