1 """
2 File : keggligandParser.py
3 Author : Javier Garcia Garcia
4 Creation : January 2008
5 Contents : fills up tables in database biana with information from kegg ligand database
6 Called from :
7
8 =======================================================================================================
9
10 This file implements a program that fills up tables in database biana with information of kegg ligand databases
11
12 """
13
14 from bianaParser import *
15 from biana.BianaObjects.Sequence import ProteinSequence
16
"""
KEGG Ligand Parser Class
"""

# Class-level metadata of the parser. `description` is read back as
# KeggLigandParser.description when the base parser is initialised.
name = "kegg_ligand"
description = "This file implements a program that fills up tables in database biana with information of kegg Ligand database"
# Left empty in this parser.
# NOTE(review): presumably consumed by the BIANA parser framework - confirm.
external_entity_definition = ""
external_entity_relations = ""
26
27
29
30
31
# Initialise the generic BIANA parser with the KEGG Ligand defaults; this
# parser declares no additional compulsory or optional arguments.
BianaParser.__init__(self, default_db_description = "KEGG Ligand database",
                     default_script_name = "keggligandParser.py",
                     default_script_description = KeggLigandParser.description,
                     additional_compulsory_arguments = [],
                     additional_optional_arguments = [])
# NOTE(review): presumably the attribute BIANA uses by default to identify
# external entities coming from this source - confirm against BianaParser.
self.default_eE_attribute = "keggCode"
38
39
"""
Parse the KEGG Ligand flat files and load their contents into BIANA.

self.input_file is used as the directory that holds the "compound",
"drug", "glycan", "enzyme" and "reaction" files. Every entry read from
those files is inserted through
self.biana_access.insert_new_external_entity, and one "pathway"
relation is created for each pathway code referenced by the entries.
"""

kegg_ligand_path = self.input_file

# File names are appended to the path by plain string concatenation, so the
# directory has to end with a path separator.
if kegg_ligand_path[-1] != os.sep:
    kegg_ligand_path += os.sep

# Regular expressions shared by all the files.
# The value group is non-greedy so that the optional trailing ";" is really
# left out of the captured value (a greedy group swallows it and the ";*"
# part never strips anything).
continue_field_regex = re.compile(r"^\s{3,}(.+?);*$")   # indented continuation line
field_regex = re.compile(r"^(\w+)\s+(.+?);*$")          # "FIELDNAME   value" line
# Pathway reference: group 2 is the numeric pathway code, group 3 its text.
pathway_regex = re.compile(r"PATH\:\s+(map|rn)(\d+)\s+(.+)$")

space_regex = re.compile(r"\s+")
parenthesis_regex = re.compile(r"\(.+\)")

# pathway code -> pathway description
pathway_dict_desc = {}
# pathway code -> list of external entity ids that take part in the pathway
pathway_dict_components = {}

# KEGG code (or EC number) -> external entity id of the inserted element
kegg_elements_dict = {}

# Code of the entry currently being parsed
temp_code = None
69
70
71
72
# --- "compound" file ---------------------------------------------------------
# Every ENTRY becomes one external entity: type "protein" when the ENTRY line
# is tagged as a peptide, type "compound" otherwise. Both are identified by
# their keggCode.

entry_regex = re.compile(r"^ENTRY\s+(\w+)\s+.*\s+Compound")
remark_regex = re.compile(r"^REMARK\s+Same\sas\:\s+(.+)$")
peptide_regex = re.compile(r"^ENTRY.+Peptide.+Compound")


def _store_compound_field(entity, field_name, field_lines, peptide):
    """Attach one completed field (name + accumulated value lines) to entity.

    Fields other than NAME, FORMULA, COMMENT, SEQUENCE and REMARK are ignored.
    Nothing is done when there is no entity or no pending field.
    """
    if entity is None or field_name is None:
        return

    if field_name == "NAME":
        for actual_name in field_lines:
            entity.add_attribute(ExternalEntityAttribute(attribute_identifier="name", value=actual_name, type="unique"))

    elif field_name == "FORMULA":
        entity.add_attribute(ExternalEntityAttribute(attribute_identifier="formula", value=" ".join(field_lines)))

    elif field_name == "COMMENT":
        entity.add_attribute(ExternalEntityAttribute(attribute_identifier="description", value=" ".join(field_lines)))

    elif field_name == "SEQUENCE":
        # Sequences are only stored for peptide entries.
        if peptide:
            without_parenthesis = parenthesis_regex.sub('', " ".join(field_lines))
            sequence_list = space_regex.split(without_parenthesis.strip())
            if "(Disulfide" in sequence_list:
                # Diagnostic output kept from the original implementation
                # (written so that it is valid in Python 2 and Python 3).
                print("")
                print(without_parenthesis)
            # Residues are three-letter codes; known modification tags are
            # removed before translating them to one-letter codes.
            sequence = [ProteinSequence.get_aminoacid_code_3to1(code=actual_residue.replace("-NH2", "").replace("Acetyl-", "").replace("6-Bromo-", "").replace("N-Formyl-Met", ""))
                        for actual_residue in sequence_list]
            entity.add_attribute(ExternalEntityAttribute(attribute_identifier="ProteinSequence", value=ProteinSequence("".join(sequence))))

    elif field_name == "REMARK":
        for current_remark_line in field_lines:
            # The regex expects the field name followed by the line content.
            # (The original matched the field name against itself, so the
            # cross-references were never stored.)
            remark = remark_regex.match(field_name + " " + current_remark_line)
            if remark:
                for same_as_code in remark.group(1).split():
                    entity.add_attribute(ExternalEntityAttribute(attribute_identifier="keggCode", value=same_as_code, type="cross-reference"))


def _register_compound(entity, code, pathway_codes):
    """Insert a finished entry and record its id by code and by pathway."""
    self.biana_access.insert_new_external_entity(externalEntity=entity)
    kegg_elements_dict[code] = entity.get_id()
    for actual_pathway_code in pathway_codes:
        pathway_dict_components[actual_pathway_code].append(entity.get_id())


kegg_object = None

temp_value = []
current_field = None

temp_pathway_codes = []

is_peptide = None

compound_f = open(kegg_ligand_path + "compound", "r")
try:
    for line in compound_f:

        m = entry_regex.match(line)

        if m:
            # A new entry starts: finish the previous one first. Its last
            # field has no following field line to trigger its storage, so it
            # is stored here, on the entry it belongs to.
            if kegg_object is not None:
                _store_compound_field(kegg_object, current_field, temp_value, is_peptide)
                _register_compound(kegg_object, temp_code, temp_pathway_codes)

            if peptide_regex.match(line):
                is_peptide = 1
            else:
                is_peptide = None

            if is_peptide:
                kegg_object = ExternalEntity(source_database=self.database, type="protein")
            else:
                kegg_object = ExternalEntity(source_database=self.database, type="compound")

            kegg_object.add_attribute(ExternalEntityAttribute(attribute_identifier="keggCode", value=m.group(1), type="unique"))
            temp_code = m.group(1)
            temp_pathway_codes = []
            # Do not carry a pending field over to the new entry.
            current_field = None
            temp_value = []
            continue

        m = pathway_regex.search(line)
        if m:
            temp_pathway_codes.append(m.group(2))
            if m.group(2) not in pathway_dict_desc:
                pathway_dict_desc[m.group(2)] = m.group(3)
                pathway_dict_components[m.group(2)] = []
            continue

        new_field = field_regex.match(line)

        if new_field:
            # A new field starts, so the previous one is complete.
            _store_compound_field(kegg_object, current_field, temp_value, is_peptide)
            current_field = new_field.group(1)
            temp_value = [new_field.group(2).strip()]
        else:
            cont_value = continue_field_regex.match(line)
            if cont_value:
                temp_value.append(cont_value.group(1).strip())
finally:
    compound_f.close()

# Finish the last entry of the file (including its pathway membership).
if kegg_object is not None:
    _store_compound_field(kegg_object, current_field, temp_value, is_peptide)
    _register_compound(kegg_object, temp_code, temp_pathway_codes)
178
179
180
181
182
# --- "drug" file -------------------------------------------------------------
# Every ENTRY becomes one external entity of type "drug" identified by its
# keggCode.

entry_regex = re.compile(r"^ENTRY\s+(\w+)\s+.*\s+Drug")
remark_regex = re.compile(r"^REMARK\s+Same\sas\:\s+(.+)$")


def _store_drug_field(entity, field_name, field_lines):
    """Attach one completed field (name + accumulated value lines) to entity.

    Fields other than NAME, FORMULA, COMMENT and REMARK are ignored. Nothing
    is done when there is no entity or no pending field.
    """
    if entity is None or field_name is None:
        return

    if field_name == "NAME":
        for actual_name in field_lines:
            entity.add_attribute(ExternalEntityAttribute(attribute_identifier="name", value=actual_name, type="unique"))

    elif field_name == "FORMULA":
        entity.add_attribute(ExternalEntityAttribute(attribute_identifier="formula", value=" ".join(field_lines)))

    elif field_name == "COMMENT":
        entity.add_attribute(ExternalEntityAttribute(attribute_identifier="description", value=" ".join(field_lines)))

    elif field_name == "REMARK":
        for current_remark_line in field_lines:
            # The regex expects the field name followed by the line content.
            # (The original matched the field name against itself, so the
            # cross-references were never stored.)
            remark = remark_regex.match(field_name + " " + current_remark_line)
            if remark:
                for same_as_code in remark.group(1).split():
                    entity.add_attribute(ExternalEntityAttribute(attribute_identifier="keggCode", value=same_as_code, type="cross-reference"))


def _register_drug(entity, code, pathway_codes):
    """Insert a finished entry and record its id by code and by pathway."""
    self.biana_access.insert_new_external_entity(externalEntity=entity)
    kegg_elements_dict[code] = entity.get_id()
    for actual_pathway_code in pathway_codes:
        pathway_dict_components[actual_pathway_code].append(entity.get_id())


kegg_object = None

temp_value = []
current_field = None

temp_pathway_codes = []

drug_f = open(kegg_ligand_path + "drug", "r")
try:
    for line in drug_f:

        m = entry_regex.match(line)

        if m:
            # A new entry starts: finish the previous one first. Its last
            # field has no following field line to trigger its storage, so it
            # is stored here, on the entry it belongs to.
            if kegg_object is not None:
                _store_drug_field(kegg_object, current_field, temp_value)
                _register_drug(kegg_object, temp_code, temp_pathway_codes)

            kegg_object = ExternalEntity(source_database=self.database, type="drug")
            kegg_object.add_attribute(ExternalEntityAttribute(attribute_identifier="keggCode", value=m.group(1), type="unique"))
            temp_code = m.group(1)
            temp_pathway_codes = []
            # Do not carry a pending field over to the new entry.
            current_field = None
            temp_value = []
            continue

        m = pathway_regex.search(line)
        if m:
            temp_pathway_codes.append(m.group(2))
            if m.group(2) not in pathway_dict_desc:
                pathway_dict_desc[m.group(2)] = m.group(3)
                pathway_dict_components[m.group(2)] = []
            continue

        new_field = field_regex.match(line)
        if new_field:
            # A new field starts, so the previous one is complete.
            _store_drug_field(kegg_object, current_field, temp_value)
            current_field = new_field.group(1)
            temp_value = [new_field.group(2).strip()]
        else:
            cont_value = continue_field_regex.match(line)
            if cont_value:
                temp_value.append(cont_value.group(1).strip())
finally:
    drug_f.close()

# Finish the last entry of the file (including its pathway membership).
if kegg_object is not None:
    _store_drug_field(kegg_object, current_field, temp_value)
    _register_drug(kegg_object, temp_code, temp_pathway_codes)
256
257
258
259
260
# --- "glycan" file -----------------------------------------------------------
# Every ENTRY becomes one external entity of type "glycan" identified by its
# keggCode. The COMPOSITION field is stored as the "formula" attribute.

entry_regex = re.compile(r"^ENTRY\s+(\w+)\s+.*\s+Glycan")
remark_regex = re.compile(r"^REMARK\s+Same\sas\:\s+(.+)$")


def _store_glycan_field(entity, field_name, field_lines):
    """Attach one completed field (name + accumulated value lines) to entity.

    Fields other than NAME, COMPOSITION, COMMENT and REMARK are ignored.
    Nothing is done when there is no entity or no pending field.
    """
    if entity is None or field_name is None:
        return

    if field_name == "NAME":
        for actual_name in field_lines:
            entity.add_attribute(ExternalEntityAttribute(attribute_identifier="name", value=actual_name, type="unique"))

    elif field_name == "COMPOSITION":
        entity.add_attribute(ExternalEntityAttribute(attribute_identifier="formula", value=" ".join(field_lines)))

    elif field_name == "COMMENT":
        entity.add_attribute(ExternalEntityAttribute(attribute_identifier="description", value=" ".join(field_lines)))

    elif field_name == "REMARK":
        for current_remark_line in field_lines:
            # The regex expects the field name followed by the line content.
            # (The original matched the field name against itself, so the
            # cross-references were never stored.)
            remark = remark_regex.match(field_name + " " + current_remark_line)
            if remark:
                for same_as_code in remark.group(1).split():
                    entity.add_attribute(ExternalEntityAttribute(attribute_identifier="keggCode", value=same_as_code, type="cross-reference"))


def _register_glycan(entity, code, pathway_codes):
    """Insert a finished entry and record its id by code and by pathway."""
    self.biana_access.insert_new_external_entity(externalEntity=entity)
    kegg_elements_dict[code] = entity.get_id()
    for actual_pathway_code in pathway_codes:
        pathway_dict_components[actual_pathway_code].append(entity.get_id())


kegg_object = None

temp_value = []
current_field = None

temp_pathway_codes = []

glycan_f = open(kegg_ligand_path + "glycan", "r")
try:
    for line in glycan_f:

        m = entry_regex.match(line)

        if m:
            # A new entry starts: finish the previous one first. Its last
            # field has no following field line to trigger its storage, so it
            # is stored here, on the entry it belongs to.
            if kegg_object is not None:
                _store_glycan_field(kegg_object, current_field, temp_value)
                _register_glycan(kegg_object, temp_code, temp_pathway_codes)

            kegg_object = ExternalEntity(source_database=self.database, type="glycan")
            kegg_object.add_attribute(ExternalEntityAttribute(attribute_identifier="keggCode", value=m.group(1), type="unique"))
            temp_code = m.group(1)
            temp_pathway_codes = []
            # Do not carry a pending field over to the new entry.
            current_field = None
            temp_value = []
            continue

        m = pathway_regex.search(line)
        if m:
            temp_pathway_codes.append(m.group(2))
            if m.group(2) not in pathway_dict_desc:
                pathway_dict_desc[m.group(2)] = m.group(3)
                pathway_dict_components[m.group(2)] = []
            continue

        new_field = field_regex.match(line)
        if new_field:
            # A new field starts, so the previous one is complete.
            _store_glycan_field(kegg_object, current_field, temp_value)
            current_field = new_field.group(1)
            temp_value = [new_field.group(2).strip()]
        else:
            cont_value = continue_field_regex.match(line)
            if cont_value:
                temp_value.append(cont_value.group(1).strip())
finally:
    glycan_f.close()

# Finish the last entry of the file (including its pathway membership).
if kegg_object is not None:
    _store_glycan_field(kegg_object, current_field, temp_value)
    _register_glycan(kegg_object, temp_code, temp_pathway_codes)
333
334
335
336
337
338
339
# --- "enzyme" file -----------------------------------------------------------
# Every ENTRY becomes one external entity of type "enzyme" identified by its
# EC number; the EC number is also the key used in kegg_elements_dict.

entry_regex = re.compile(r"^ENTRY\s+EC\s*([\d\.]+)\s+.*\s+Enzyme")
sysname_regex = re.compile(r"^SYSNAME\s+(.+)$")
structure_regex = re.compile(r"PDB\:\s+(.+)$")


def _store_enzyme_field(entity, field_name, field_lines):
    """Attach one completed field (name + accumulated value lines) to entity.

    Fields other than NAME, COMMENT and STRUCTURES are ignored. Nothing is
    done when there is no entity or no pending field.
    """
    if entity is None or field_name is None:
        return

    if field_name == "NAME":
        for actual_name in field_lines:
            entity.add_attribute(ExternalEntityAttribute(attribute_identifier="name", value=actual_name, type="unique"))

    elif field_name == "COMMENT":
        entity.add_attribute(ExternalEntityAttribute(attribute_identifier="description", value=" ".join(field_lines)))

    elif field_name == "STRUCTURES":
        # Everything after "PDB:" is taken as a whitespace separated list of
        # PDB codes.
        all_str = " ".join(field_lines).strip()
        structures = structure_regex.search(all_str)
        if structures:
            for actual_pdb in space_regex.split(structures.group(1)):
                entity.add_attribute(ExternalEntityAttribute(attribute_identifier="pdb", value=actual_pdb))


def _register_enzyme(entity, code, pathway_codes):
    """Insert a finished entry and record its id by code and by pathway."""
    self.biana_access.insert_new_external_entity(externalEntity=entity)
    kegg_elements_dict[code] = entity.get_id()
    for actual_pathway_code in pathway_codes:
        pathway_dict_components[actual_pathway_code].append(entity.get_id())


kegg_object = None

temp_value = []
current_field = None

temp_pathway_codes = []

enzyme_f = open(kegg_ligand_path + "enzyme", "r")
try:
    for line in enzyme_f:

        m = entry_regex.match(line)

        if m:
            # A new entry starts: finish the previous one first. Its last
            # field has no following field line to trigger its storage, so it
            # is stored here, on the entry it belongs to.
            if kegg_object is not None:
                _store_enzyme_field(kegg_object, current_field, temp_value)
                _register_enzyme(kegg_object, temp_code, temp_pathway_codes)

            kegg_object = ExternalEntity(source_database=self.database, type="enzyme")
            kegg_object.add_attribute(ExternalEntityAttribute(attribute_identifier="EC", value=m.group(1), type="unique"))
            temp_code = m.group(1)
            temp_pathway_codes = []
            # Do not carry a pending field over to the new entry.
            current_field = None
            temp_value = []
            continue

        m = pathway_regex.search(line)
        if m:
            temp_pathway_codes.append(m.group(2))
            if m.group(2) not in pathway_dict_desc:
                pathway_dict_desc[m.group(2)] = m.group(3)
                pathway_dict_components[m.group(2)] = []
            continue

        # The systematic name is stored straight from its line as an extra
        # "name"; the line is still processed below as the start of a field.
        m = sysname_regex.match(line)
        if m:
            kegg_object.add_attribute(ExternalEntityAttribute(attribute_identifier="name", value=m.group(1), type="unique"))

        new_field = field_regex.match(line)
        if new_field:
            # A new field starts, so the previous one is complete.
            _store_enzyme_field(kegg_object, current_field, temp_value)
            current_field = new_field.group(1)
            temp_value = [new_field.group(2).strip()]
        else:
            cont_value = continue_field_regex.match(line)
            if cont_value:
                temp_value.append(cont_value.group(1).strip())
finally:
    enzyme_f.close()

# Finish the last entry of the file (including its pathway membership).
if kegg_object is not None:
    _store_enzyme_field(kegg_object, current_field, temp_value)
    _register_enzyme(kegg_object, temp_code, temp_pathway_codes)
415
416
417
418
419
420
# --- "reaction" file ---------------------------------------------------------
# Every ENTRY becomes one external entity relation of type "reaction". Its
# participants are looked up in kegg_elements_dict, so the elements they refer
# to must have been inserted before this point.

entry_regex = re.compile(r"^ENTRY\s+(\w+)\s+.*\s+Reaction")
enzyme_regex = re.compile(r"^ENZYME\s+([\d\.\s]+)$")
equation_regex = re.compile(r"^EQUATION\s+(.+)\s*\<\=\>\s+(.+)\s*$")

parenthesis_regex = re.compile(r"\(.+\)")


def _split_equation_term(term):
    """Split one equation term into (coefficient, code).

    A term is either "CODE" (coefficient 1) or "COEFFICIENT CODE". The
    coefficient is returned as text with "n", "(" and ")" removed, so it can
    be empty. Returns None when the term has more than two parts.
    """
    parts = term.split(" ")
    if len(parts) == 1:
        return 1, parenthesis_regex.sub('', parts[0])
    if len(parts) == 2:
        coefficient = parts[0].replace('n', '').replace('(', '').replace(')', '')
        return coefficient, parenthesis_regex.sub('', parts[1])
    return None


def _add_reaction_participant(relation, code, coefficient, role):
    """Add the element known as code to relation with the given role.

    Unknown elements are reported on stderr and skipped. A "cardinality"
    attribute is added only when the coefficient is an integer above 1.
    """
    if code not in kegg_elements_dict:
        sys.stderr.write("Kegg element %s is not defined in kegg database\n" % code)
        return

    participant_id = kegg_elements_dict[code]
    relation.add_participant(externalEntityID=participant_id)
    relation.add_participant_attribute(externalEntityID=participant_id,
                                       participantAttribute=ExternalEntityRelationParticipantAttribute(attribute_identifier="role",
                                                                                                       value=role))
    if coefficient == '':
        return
    try:
        multiplier = int(coefficient)
    except ValueError:
        # Symbolic coefficient that cannot be stored as a number.
        return
    if multiplier > 1:
        relation.add_participant_attribute(externalEntityID=participant_id,
                                           participantAttribute=ExternalEntityRelationParticipantAttribute(attribute_identifier="cardinality",
                                                                                                           value=coefficient))


def _store_reaction_field(relation, field_name, field_lines):
    """Attach one completed field (name + accumulated value lines) to relation.

    Fields other than NAME, COMMENT, ENZYME and EQUATION are ignored. Nothing
    is done when there is no relation or no pending field.

    Raises ValueError when an equation term has more than two parts.
    """
    if relation is None or field_name is None:
        return

    if field_name == "NAME":
        for actual_name in field_lines:
            relation.add_attribute(ExternalEntityAttribute(attribute_identifier="name", value=actual_name, type="unique"))

    elif field_name == "COMMENT":
        relation.add_attribute(ExternalEntityAttribute(attribute_identifier="description", value=" ".join(field_lines)))

    elif field_name == "ENZYME":
        enzymes = enzyme_regex.match(field_name + " " + " ".join(field_lines).strip())
        if enzymes:
            for actual_enzyme in space_regex.split(enzymes.group(1)):
                # Unknown enzymes are reported like unknown substrates and
                # products instead of aborting the parsing with a KeyError.
                _add_reaction_participant(relation, actual_enzyme, 1, "catalyst")

    elif field_name == "EQUATION":
        equation = equation_regex.match(field_name + " " + " ".join(field_lines))
        if equation:
            substrates = equation.group(1)
            products = equation.group(2)

            for actual_substrat in [x.strip() for x in substrates.split(" + ")]:
                term = _split_equation_term(actual_substrat)
                if term is None:
                    raise ValueError("How is possible to have more than 2 elements?")
                _add_reaction_participant(relation, term[1], term[0], "substrate")

            for actual_product in [x.strip() for x in products.split(" + ")]:
                term = _split_equation_term(actual_product)
                if term is None:
                    raise ValueError("How is possible to have more than 2 elements? [ %s ]\nPRODUCTS: %s" % (actual_product, products))
                _add_reaction_participant(relation, term[1], term[0], "product")


def _register_reaction(relation, code, pathway_codes):
    """Insert a finished reaction and record its id by code and by pathway."""
    self.biana_access.insert_new_external_entity(externalEntity=relation)
    kegg_elements_dict[code] = relation.get_id()
    for actual_pathway_code in pathway_codes:
        pathway_dict_components[actual_pathway_code].append(relation.get_id())


kegg_object = None

temp_value = []
current_field = None

temp_pathway_codes = []

reaction_f = open(kegg_ligand_path + "reaction", "r")
try:
    for line in reaction_f:

        m = entry_regex.match(line)

        if m:
            # A new entry starts: finish the previous one first. Its last
            # field has no following field line to trigger its storage, so it
            # is stored here; otherwise it would end up in the next reaction.
            if kegg_object is not None:
                _store_reaction_field(kegg_object, current_field, temp_value)
                _register_reaction(kegg_object, temp_code, temp_pathway_codes)

            kegg_object = ExternalEntityRelation(source_database=self.database, relation_type="reaction")
            kegg_object.add_attribute(ExternalEntityAttribute(attribute_identifier="keggCode", value=m.group(1), type="unique"))
            temp_code = m.group(1)
            temp_pathway_codes = []
            # Do not carry a pending field over to the new entry.
            current_field = None
            temp_value = []
            continue

        m = pathway_regex.search(line)
        if m:
            temp_pathway_codes.append(m.group(2))
            if m.group(2) not in pathway_dict_desc:
                pathway_dict_desc[m.group(2)] = m.group(3)
                pathway_dict_components[m.group(2)] = []
            continue

        new_field = field_regex.match(line)
        if new_field:
            # A new field starts, so the previous one is complete.
            _store_reaction_field(kegg_object, current_field, temp_value)
            current_field = new_field.group(1)
            temp_value = [new_field.group(2).strip()]
        else:
            cont_value = continue_field_regex.match(line)
            if cont_value:
                temp_value.append(cont_value.group(1).strip())
finally:
    reaction_f.close()

# Finish the last reaction of the file (including its pathway membership).
if kegg_object is not None:
    _store_reaction_field(kegg_object, current_field, temp_value)
    _register_reaction(kegg_object, temp_code, temp_pathway_codes)
556
557
558
559
560
561
562
563
564
565
566
567
# Create one "pathway" relation for every pathway code collected while the
# files were read, with all the elements registered for it as participants.
for pathway_code, pathway_description in pathway_dict_desc.items():
    pathway_relation = ExternalEntityRelation(source_database=self.database, relation_type="pathway")

    code_attribute = ExternalEntityAttribute(attribute_identifier="keggCode", value=pathway_code, type="unique")
    pathway_relation.add_attribute(code_attribute)

    description_attribute = ExternalEntityAttribute(attribute_identifier="description", value=pathway_description)
    pathway_relation.add_attribute(description_attribute)

    for member_id in pathway_dict_components[pathway_code]:
        pathway_relation.add_participant(externalEntityID=member_id)

    self.biana_access.insert_new_external_entity(externalEntity=pathway_relation)
577