Package biana :: Package BianaParser :: Module keggligandParser
[hide private]
[frames | no frames]

Source Code for Module biana.BianaParser.keggligandParser

  1  """ 
  2  File        : keggligandParser.py 
  3  Author      : Javier Garcia Garcia 
  4  Creation    : January 2008 
  5  Contents    : fills up tables in database biana with information from kegg ligand database 
  6  Called from :  
  7   
  8  ======================================================================================================= 
  9   
 10  This file implements a program that fills up tables in database biana with information of kegg ligand databases 
 11   
 12  """ 
 13   
 14  from bianaParser import * 
 15  from biana.BianaObjects.Sequence import ProteinSequence 
 16   
class KeggLigandParser(BianaParser):
    """
    KEGG Ligand Parser Class

    Reads the "compound", "drug", "glycan", "enzyme" and "reaction" flat files
    found in the input directory and inserts their entries, the reactions and
    the pathways they are mapped to through self.biana_access
    """

    name = "kegg_ligand"
    description = "This file implements a program that fills up tables in database biana with information of kegg Ligand database"
    external_entity_definition = ""
    external_entity_relations = ""

    # Regular expressions shared by all the KEGG flat files.
    # The value is captured lazily so that the optional ";" ending a line
    # (used to separate names) is left out of the captured value.
    _FIELD_REGEX = re.compile(r"^(\w+)\s+(.+?);*\s*$")
    _CONTINUE_FIELD_REGEX = re.compile(r"^\s{3,}(.+?);*\s*$")
    _PATHWAY_REGEX = re.compile(r"PATH\:\s+(map|rn)(\d+)\s+(.+)$")
    _SPACE_REGEX = re.compile(r"\s+")
    _PARENTHESIS_REGEX = re.compile(r"\(.+\)")   # used to eliminate extra information between parenthesis
    _REMARK_REGEX = re.compile(r"^REMARK\s+Same\sas\:\s+(.+)$")
    _PEPTIDE_REGEX = re.compile(r"^ENTRY.+Peptide.+Compound")
    _STRUCTURE_REGEX = re.compile(r"PDB\:\s+(.+)$")
    _ENZYME_REGEX = re.compile(r"^ENZYME\s+([\d\.\s]+)$")
    _EQUATION_REGEX = re.compile(r"^EQUATION\s+(.+)\s*\<\=\>\s+(.+)\s*$")

    def __init__(self):
        """
        Starts the parser with the default values for the KEGG Ligand database
        """

        # No additional arguments are declared: parse_database uses the
        # generic input file argument as the directory holding the files
        BianaParser.__init__(self, default_db_description = "KEGG Ligand database",
                             default_script_name = "keggligandParser.py",
                             default_script_description = KeggLigandParser.description,
                             additional_compulsory_arguments = [],
                             additional_optional_arguments = [])
        self.default_eE_attribute = "keggCode"

    def parse_database(self):
        """
        Parses all the KEGG Ligand files and inserts their content

        self.input_file is used as the directory where the files are. The
        reaction file is parsed after the other ones because its participants
        are looked up among the entries already inserted.
        """

        kegg_ligand_path = self.input_file

        self._kegg_pathway_dict_desc = {}         # pathway kegg code -> description
        self._kegg_pathway_dict_components = {}   # pathway kegg code -> list of the identifiers of its participants
        self._kegg_elements_dict = {}             # kegg code (EC number for enzymes) -> external entity identifier
                                                  # This is used later when inserting relations

        # PARSE COMPOUND FILE
        self._parse_kegg_file( file_path = os.path.join(kegg_ligand_path, "compound"),
                               entry_regex = re.compile(r"^ENTRY\s+(\w+)\s+.*\s+Compound"),
                               create_entry = self._create_compound,
                               apply_field = self._apply_compound_field )

        # PARSE DRUG FILE
        self._parse_kegg_file( file_path = os.path.join(kegg_ligand_path, "drug"),
                               entry_regex = re.compile(r"^ENTRY\s+(\w+)\s+.*\s+Drug"),
                               create_entry = lambda code, entry_line: self._create_entity("drug", "keggCode", code),
                               apply_field = self._apply_drug_field )

        # PARSE GLYCAN FILE
        self._parse_kegg_file( file_path = os.path.join(kegg_ligand_path, "glycan"),
                               entry_regex = re.compile(r"^ENTRY\s+(\w+)\s+.*\s+Glycan"),
                               create_entry = lambda code, entry_line: self._create_entity("glycan", "keggCode", code),
                               apply_field = self._apply_glycan_field )

        # PARSE ENZYME FILE
        self._parse_kegg_file( file_path = os.path.join(kegg_ligand_path, "enzyme"),
                               entry_regex = re.compile(r"^ENTRY\s+EC\s*([\d\.]+)\s+.*\s+Enzyme"),
                               create_entry = lambda code, entry_line: self._create_entity("enzyme", "EC", code),
                               apply_field = self._apply_enzyme_field )

        # PARSE REACTION FILE
        self._parse_kegg_file( file_path = os.path.join(kegg_ligand_path, "reaction"),
                               entry_regex = re.compile(r"^ENTRY\s+(\w+)\s+.*\s+Reaction"),
                               create_entry = self._create_reaction,
                               apply_field = self._apply_reaction_field )

        # INSERT PATHWAYS INFORMATION
        self._insert_pathways()

    def _parse_kegg_file(self, file_path, entry_regex, create_entry, apply_field):
        """
        Reads a KEGG flat file and inserts one object for each of its entries

        "entry_regex" recognizes the ENTRY lines, its first group is the entry code
        "create_entry" is called as create_entry(code, entry_line) and returns the new object
        "apply_field" is called as apply_field(entry, field, values, entry_line)
        with the name of a complete field and the list of its lines
        """

        entry = None
        entry_code = None
        entry_line = None
        pathway_codes = []

        current_field = None
        temp_value = []     # List used to store the information of those fields that can have more than a single line

        kegg_file = open(file_path, "r")
        try:
            for line in kegg_file:

                m = entry_regex.match(line)
                if m:
                    # A new entry starts: close the previous one, with its pending field
                    self._store_entry(entry, entry_code, entry_line, pathway_codes, current_field, temp_value, apply_field)
                    entry_code = m.group(1)
                    entry_line = line
                    entry = create_entry(entry_code, entry_line)
                    pathway_codes = []
                    current_field = None
                    temp_value = []
                    continue

                m = self._PATHWAY_REGEX.search(line)
                if m:
                    pathway_codes.append(m.group(2))
                    if m.group(2) not in self._kegg_pathway_dict_desc:
                        self._kegg_pathway_dict_desc[m.group(2)] = m.group(3)
                        self._kegg_pathway_dict_components[m.group(2)] = []
                    continue

                new_field = self._FIELD_REGEX.match(line)
                if new_field:
                    # The previous field is complete only when the next one starts
                    if entry is not None and current_field is not None:
                        apply_field(entry, current_field, temp_value, entry_line)
                    current_field = new_field.group(1)
                    temp_value = [new_field.group(2).strip()]
                else:
                    cont_value = self._CONTINUE_FIELD_REGEX.match(line)
                    if cont_value:
                        temp_value.append(cont_value.group(1).strip())

            # Insert the last one
            self._store_entry(entry, entry_code, entry_line, pathway_codes, current_field, temp_value, apply_field)
        finally:
            kegg_file.close()

    def _store_entry(self, entry, entry_code, entry_line, pathway_codes, current_field, temp_value, apply_field):
        """
        Applies the pending field of a finished entry, inserts the entry and
        registers its identifier by kegg code and in all its pathways

        Nothing is done if "entry" is None (no entry has been read yet)
        """

        if entry is None:
            return

        if current_field is not None:
            apply_field(entry, current_field, temp_value, entry_line)

        self.biana_access.insert_new_external_entity( externalEntity = entry )

        entry_id = entry.get_id()
        self._kegg_elements_dict[entry_code] = entry_id
        for actual_pathway_code in pathway_codes:
            self._kegg_pathway_dict_components[actual_pathway_code].append(entry_id)

    def _create_entity(self, entity_type, id_attribute, code):
        """
        Returns a new external entity of type "entity_type" having "code" as
        unique value of the attribute "id_attribute"
        """

        kegg_object = ExternalEntity( source_database = self.database, type = entity_type )
        kegg_object.add_attribute( ExternalEntityAttribute( attribute_identifier = id_attribute, value = code, type = "unique" ) )
        return kegg_object

    def _create_compound(self, code, entry_line):
        """
        Returns the external entity for a compound entry: a "protein" when the
        ENTRY line describes a peptide, a "compound" otherwise
        """

        if self._PEPTIDE_REGEX.match(entry_line):
            return self._create_entity("protein", "keggCode", code)
        return self._create_entity("compound", "keggCode", code)

    def _create_reaction(self, code, entry_line):
        """
        Returns a new "reaction" relation having "code" as unique keggCode
        """

        kegg_object = ExternalEntityRelation( source_database = self.database, relation_type = "reaction" )
        kegg_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "keggCode", value = code, type = "unique" ) )
        return kegg_object

    def _apply_name_or_comment(self, entry, field, values):
        """
        Adds the NAME lines as names and the COMMENT as description

        Returns True if "field" was one of them, False otherwise
        """

        if field == "NAME":
            for actual_name in values:
                entry.add_attribute( ExternalEntityAttribute( attribute_identifier = "name", value = actual_name, type = "unique" ) )
            return True

        if field == "COMMENT":
            entry.add_attribute( ExternalEntityAttribute( attribute_identifier = "description", value = " ".join(values) ) )
            return True

        return False

    def _apply_ligand_field(self, entry, field, values, formula_field):
        """
        Adds to "entry" the information of a field common to the compound, drug
        and glycan files. "formula_field" is the name of the field stored as formula
        """

        if self._apply_name_or_comment(entry, field, values):
            return

        if field == formula_field:
            entry.add_attribute( ExternalEntityAttribute( attribute_identifier = "formula", value = " ".join(values) ) )

        elif field == "REMARK":
            for current_remark_line in values:
                # The field name is put back because the regex expects the complete line
                m = self._REMARK_REGEX.match(field+" "+current_remark_line)
                if m:
                    for actual_code in m.group(1).split():
                        entry.add_attribute( ExternalEntityAttribute( attribute_identifier = "keggCode", value = actual_code, type = "cross-reference" ) )

    def _apply_compound_field(self, entry, field, values, entry_line):
        """
        Adds to "entry" the information of a field of the compound file
        """

        if field == "SEQUENCE":
            # Only the peptides have a sequence that can be translated
            if self._PEPTIDE_REGEX.match(entry_line):
                self._add_peptide_sequence(entry, values)
        else:
            self._apply_ligand_field(entry, field, values, "FORMULA")

    def _apply_drug_field(self, entry, field, values, entry_line):
        """
        Adds to "entry" the information of a field of the drug file
        """

        self._apply_ligand_field(entry, field, values, "FORMULA")

    def _apply_glycan_field(self, entry, field, values, entry_line):
        """
        Adds to "entry" the information of a field of the glycan file, where
        the COMPOSITION is stored as formula
        """

        self._apply_ligand_field(entry, field, values, "COMPOSITION")

    def _apply_enzyme_field(self, entry, field, values, entry_line):
        """
        Adds to "entry" the information of a field of the enzyme file
        """

        if self._apply_name_or_comment(entry, field, values):
            return

        if field == "SYSNAME":
            # Only the first line is used as name
            entry.add_attribute( ExternalEntityAttribute( attribute_identifier = "name", value = values[0], type = "unique" ) )

        elif field == "STRUCTURES":
            all_str = " ".join(values).strip()
            m = self._STRUCTURE_REGEX.search(all_str)
            if m:
                for actual_pdb in self._SPACE_REGEX.split(m.group(1)):
                    entry.add_attribute( ExternalEntityAttribute( attribute_identifier = "pdb", value = actual_pdb ) )

    def _apply_reaction_field(self, entry, field, values, entry_line):
        """
        Adds to the reaction "entry" the information of a field of the reaction file

        Raises ValueError if a term of the equation has more than 2 elements
        """

        if self._apply_name_or_comment(entry, field, values):
            return

        if field == "ENZYME":
            m = self._ENZYME_REGEX.match(field+" "+" ".join(values).strip())
            if m:
                # get the externalEntityID for this enzymeID (stored in memory) and add as a participant
                for actual_enzyme in self._SPACE_REGEX.split(m.group(1)):
                    self._add_participant(entry, actual_enzyme, "catalyst")

        elif field == "EQUATION":
            m = self._EQUATION_REGEX.match(field+" "+" ".join(values))
            if m:
                self._add_equation_side(entry, m.group(1), "substrate")
                self._add_equation_side(entry, m.group(2), "product")

    def _add_peptide_sequence(self, entry, values):
        """
        Translates the three letter residues of a SEQUENCE field and adds the
        resulting protein sequence to "entry"
        """

        # eliminate all between parenthesis and split by spaces
        sequence_text = self._PARENTHESIS_REGEX.sub('', " ".join(values))
        sequence_list = self._SPACE_REGEX.split( sequence_text.strip() )

        if "(Disulfide" in sequence_list:
            # Annotation that could not be eliminated: show the sequence
            sys.stdout.write("\n%s\n" %sequence_text)

        sequence = [ ProteinSequence.get_aminoacid_code_3to1( code = actual_residue.replace("-NH2","").replace("Acetyl-","").replace("6-Bromo-","").replace("N-Formyl-Met","") )
                     for actual_residue in sequence_list ]

        entry.add_attribute( ExternalEntityAttribute( attribute_identifier = "ProteinSequence", value = ProteinSequence("".join(sequence)) ) )

    def _add_participant(self, relation, code, role):
        """
        Adds the element with kegg code "code" as participant of "relation"
        with the given role

        Returns the identifier of the participant, or None (after reporting
        it in the standard error) if the code has not been inserted before
        """

        try:
            participant_id = self._kegg_elements_dict[code]
        except KeyError:
            sys.stderr.write("Kegg element %s is not defined in kegg database\n" %code)
            return None

        relation.add_participant( externalEntityID = participant_id )
        relation.add_participant_attribute( externalEntityID = participant_id,
                                            participantAttribute = ExternalEntityRelationParticipantAttribute( attribute_identifier = "role",
                                                                                                               value = role ) )
        return participant_id

    def _add_equation_side(self, relation, side, role):
        """
        Adds as participants of "relation" all the elements of one side of an
        equation ("2 C00001 + C00002"), with the given role and, when it is
        greater than 1, their cardinality

        Raises ValueError if a term has more than 2 elements
        """

        for actual_term in [ x.strip() for x in side.split(" + ") ]:
            splitted = actual_term.split(" ")
            if len(splitted)==1:
                num = 1
                code = self._PARENTHESIS_REGEX.sub('', splitted[0])
            elif len(splitted)==2:
                # Coefficients as "2n" or "(n+1)" are reduced to their numeric part
                num = splitted[0].replace('n','').replace('(','').replace(')','')
                code = self._PARENTHESIS_REGEX.sub('', splitted[1])
            else:
                raise ValueError("How is possible to have more than 2 elements? [ %s ]\n%sS: %s" %(actual_term,role.upper(),side))

            participant_id = self._add_participant(relation, code, role)
            if participant_id is None:
                continue

            try:
                cardinality = int(num)
            except ValueError:
                # Empty or symbolic coefficient ("n", "m+"): no cardinality is stored
                continue

            if cardinality>1:
                relation.add_participant_attribute( externalEntityID = participant_id,
                                                    participantAttribute = ExternalEntityRelationParticipantAttribute( attribute_identifier = "cardinality",
                                                                                                                       value = num ) )

    def _insert_pathways(self):
        """
        Inserts each distinct pathway as a relation having as participants
        all the elements mapped to it
        """

        # Maybe it would be sufficient to map the reactions, as the components are mapped to the reactions...
        for actual_pathway_code in self._kegg_pathway_dict_desc:
            kegg_pathway_object = ExternalEntityRelation( source_database = self.database, relation_type = "pathway" )
            kegg_pathway_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "keggCode", value = actual_pathway_code, type = "unique" ) )
            kegg_pathway_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "description", value = self._kegg_pathway_dict_desc[actual_pathway_code] ) )

            for actual_participant_external_entity_id in self._kegg_pathway_dict_components[actual_pathway_code]:
                kegg_pathway_object.add_participant( externalEntityID = actual_participant_external_entity_id )

            self.biana_access.insert_new_external_entity( externalEntity = kegg_pathway_object )