1 """
2 BIANA: Biologic Interactions and Network Analysis
3 Copyright (C) 2009 Javier Garcia-Garcia, Emre Guney, Baldo Oliva
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18 """
19
20 """
21 File : biopaxLevel2Parser.py
22 Author : Javier Garcia Garcia
23 Creation : June 2008
24 Contents : fills up tables in database biana with information from a BIOPAX formatted database
25 Called from :
26
27 =======================================================================================================
28 """
29
30 from bianaParser import *
31 from xml.sax import saxutils, handler, make_parser
32 from XMLNode import XMLNode
33 import copy
34 import sets
35
36
38
39 resources = None
40 controlled_relations = None
41
42 database = None
43 dbaccess = None
44 _not_recognized = sets.Set()
45
46 datatype_to_biana_type = { "uniprot": "uniprotaccession",
47 "ncbi_taxonomy": "taxID",
48 "tigr": "TIGR",
49 "tigr cna": "TIGR",
50 "tigr eha": "TIGR",
51 "tigr osa": "TIGR",
52 "reactome": "Reactome",
53 "chebi": "CHEBI",
54 "go": "GO",
55 "pubchem compound" : "PubchemCompound",
56 "glycan" : "keggCode",
57 "compound" : "keggCode",
58
59 "entrez": "refseq",
60 "pubmed": "pubmed",
61 "embl": "AccessionNumber",
62 "ensembl": "ensembl",
63 "wormbase": "wormbasesequencename",
64 "sgd": "sgd",
65 "flybase": "flybase" }
66
70
71
72 return str(a).split(".")
74 if a.startswith("REACT_"):
75 return a[6:]
76 else:
77 return a
78
# Database name -> function applied to the raw identifier before it is stored.
# Every known database starts with the pass-through function ...
datatype_operations = dict.fromkeys(datatype_to_biana_type, _identity)

# ... and the two databases that need dedicated handling override it.
datatype_operations.update({"entrez": _entrez_funct,
                            "reactome": _reactome_funct})
85
87
# Identifier of this element inside the BioPAX document.
self.rdf_id = XMLNode.attrs["rdf:ID"]

# Generic descriptive fields; set_attributes() fills them from the child nodes.
self.name = None
self.short_name = None
self.synonyms = []
self.comments = []
self.xrefs = []
self.organism = None

# Initialised here but not assigned anywhere else in the visible code.
self.data_source = None
self.availability = None

# set_attributes() may be overridden by a subclass, so it runs only after
# every default above exists.
self.set_attributes(XMLNode)

# Slot for the BIANA object that _get_biana_object() builds on demand.
self.biana_object = None
103
105 """
106 """
# Copy the generic BioPAX properties found among the children of XMLNode into
# this object. Children with any other tag name are ignored.
single_valued = {"bp:NAME": "name", "bp:SHORT_NAME": "short_name"}
multi_valued = {"bp:SYNONYMS": "synonyms", "bp:COMMENT": "comments"}

for child in XMLNode.getChilds():
    tag = child.name
    if tag in single_valued:
        # A later child with the same tag overwrites the earlier value.
        setattr(self, single_valued[tag], child.getValue())
    elif tag in multi_valued:
        getattr(self, multi_valued[tag]).append(child.getValue())
    elif tag == "bp:ORGANISM":
        self.organism = child.attrs["rdf:resource"]
    elif tag == "bp:XREF":
        self.xrefs.append(child.attrs["rdf:resource"])
120
121
123 raise ValueError("%s has not implemented _get_biana_object" %self)
124
125
127 """
128 Add general attributes to external entity and adds it to the database
129
130 returns the external entity id assigned to it
131 """
132
133
134 externalEntity = self._get_biana_object()
135
136 if externalEntity is None:
137 return
138
139 if externalEntity.get_id() is not None:
140 return externalEntity.get_id()
141
142 externalEntity = self._get_biana_object()
143
144 if externalEntity is None:
145 return
146
147
148
149
150 if self.name is not None:
151 externalEntity.add_attribute( ExternalEntityAttribute( attribute_identifier = "description", value = self.name ) )
152
153
154 if self.short_name is not None:
155 externalEntity.add_attribute( ExternalEntityAttribute( attribute_identifier = "name", value = self.short_name, type = "synonym" ) )
156
157
158
159
160 [ externalEntity.add_attribute( ExternalEntityAttribute( attribute_identifier = "name", value = x, type = "synonym" ) ) for x in self.synonyms ]
161
162
163 if self.organism:
164 organism_obj = BiopaxEntity.resources[self.organism]
165 if organism_obj.tax_ref is not None:
166 externalEntity.add_attribute( ExternalEntityAttribute( attribute_identifier = "taxID", value = BiopaxEntity.resources[organism_obj.tax_ref].id, type = "unique" ) )
167
168
169 [ externalEntity.add_attribute( ExternalEntityAttribute( attribute_identifier = "description", value = x ) ) for x in self.comments ]
170
171
172 split_regex = re.compile("(FUNCTION|CATALYTIC ACTIVITY|DISEASE|SUBCELLULAR LOCATION|SIMILARITY|DATABASE)")
173
174 for current_comment in self.comments:
175 t = split_regex.split(current_comment)
176 for x in xrange(len(t)):
177 if current_comment[x] == "FUNCTION":
178 externalEntity.add_attribute( ExternalEntityAttribute( attribute_identifier = "function", value = current_comment[x+1] ) )
179 x+=1
180 elif current_comment[x] == "DISEASE":
181 externalEntity.add_attribute( ExternalEntityAttribute( attribute_identifier = "disease", value = current_comment[x+1] ) )
182 x+=1
183 elif current_comment[x] == "SUBCELLULAR LOCATION":
184 externalEntity.add_attribute( ExternalEntityAttribute( attribute_identifier = "SubcellularLocation", value = current_comment[x+1] ) )
185 x+=1
186 elif current_comment[x] == "CATALYTIC ACTIVITY":
187
188 x+=1
189
190
191
192 ec_regex = re.compile("EC\s+(\d*\.\d*\.\d*.\d*)")
193 mim_regex = re.compile("\[MIM\:(\d+)\]")
194
195 if self.name is not None:
196 for current_ec in ec_regex.findall(self.name):
197 externalEntity.add_attribute( ExternalEntityAttribute( attribute_identifier = "ec", value = current_ec, type = "cross-reference" ) )
198
199 if self.short_name is not None:
200 for current_ec in ec_regex.findall(self.short_name):
201 externalEntity.add_attribute( ExternalEntityAttribute( attribute_identifier = "ec", value = current_ec, type = "cross-reference" ) )
202
203 for current_comment in self.comments:
204 for current_ec in ec_regex.findall(current_comment):
205 externalEntity.add_attribute( ExternalEntityAttribute( attribute_identifier = "ec", value = current_ec, type = "cross-reference" ) )
206 for current_mim in mim_regex.findall(current_comment):
207 externalEntity.add_attribute( ExternalEntityAttribute( attribute_identifier = "mim", value = current_mim, type = "cross-reference" ) )
208
209
210 for current_xref in self.xrefs:
211 xref_object = BiopaxEntity.resources[current_xref]
212 if xref_object.id is not None and xref_object.db is not None:
213 if xref_object.db.lower() not in BiopaxEntity.datatype_to_biana_type:
214 if xref_object.db not in BiopaxEntity._not_recognized:
215 print xref_object.db, " not recognized"
216 BiopaxEntity._not_recognized.add(xref_object.db)
217 else:
218 value = BiopaxEntity.datatype_operations[xref_object.db.lower()](xref_object.id)
219 if not isinstance(value,list):
220 externalEntity.add_attribute( ExternalEntityAttribute( attribute_identifier = BiopaxEntity.datatype_to_biana_type[xref_object.db.lower()],
221 value = value,
222 type = "cross-reference" ) )
223 else:
224 externalEntity.add_attribute( ExternalEntityAttribute( attribute_identifier = BiopaxEntity.datatype_to_biana_type[xref_object.db.lower()],
225 value = value[0],
226 version = value[1],
227 type = "cross-reference" ) )
228
229
230
231 if BiopaxEntity.controlled_relations.has_key('#'+self.rdf_id):
232 for current_controller in BiopaxEntity.controlled_relations['#'+self.rdf_id]:
233 control_obj = BiopaxEntity.resources['#'+current_controller.rdf_id]
234
235
236
237 if control_obj.controller_xref is None:
238 print "control object %s has no contoller_xref" %control_obj.rdf_id
239 continue
240
241 controller = BiopaxEntity.resources[control_obj.controller_xref]
242
243 participant_eEid = controller.toBiana()
244
245 if participant_eEid is None:
246 raise ValueError("In BiopaxEntity. %s" %controller)
247 externalEntity.add_participant( externalEntityID = participant_eEid )
248 if control_obj.control_type is None:
249 raise ValueError("How is it possible to not have a controller role?")
250 externalEntity.add_participant_attribute( externalEntityID = participant_eEid,
251 participantAttribute = ExternalEntityRelationParticipantAttribute( attribute_identifier = "role", value = control_obj.control_type ) )
252
253 BiopaxEntity.dbaccess.insert_new_external_entity( externalEntity = externalEntity )
254
255
256
257
258
259
260
261
262
263 return externalEntity.get_id()
264
265
266
268 """
269 Biopax definition: any additional special characteristics of a physical entity in the context of an interaction or complex. These currently include stoichiometric coefficient and cellular location, but this list may be expanded in later levels.
270 """
271
273
274 self.cellular_location_xref = None
275 self.stoichiometric_coefficient = None
276 self.physical_entity_xref = None
277 self.sequence_features_list = []
278 BiopaxEntity.__init__(self, XMLNode)
279
# Read the participant-specific properties, then let BiopaxEntity read the
# generic ones from the same node.
resource_fields = {"bp:CELLULAR-LOCATION": "cellular_location_xref",
                   "bp:PHYSICAL-ENTITY": "physical_entity_xref"}

for child in XMLNode.getChilds():
    tag = child.name
    if tag in resource_fields:
        setattr(self, resource_fields[tag], child.attrs["rdf:resource"])
    elif tag == "bp:STOICHIOMETRIC-COEFFICIENT":
        self.stoichiometric_coefficient = child.getValue()
    # "bp:SEQUENCE-FEATURE-LIST" children are not stored.

BiopaxEntity.set_attributes(self,XMLNode)
292
298
316
317
355
356
358 """
359 A step in a pathway
360 Multiple interactions may occur in a pathway step, each should be listed in the STEP-INTERACTIONS property.
361 """
362
369
371
372 for current_child in XMLNode.getChilds():
373 if current_child.name == "bp:NEXT-STEP":
374 self.next_xref = current_child.attrs["rdf:resource"]
375 elif current_child.name == "bp:STEP-INTERACTIONS":
376 self.step_interactions_xref.append(current_child.attrs["rdf:resource"])
377
380
381
389
390
393
394
395
396
397
398
416
417
425
426
428
430 print "CONVERSION NOT WELL IMPLEMENTED YET!!!"
431 if self.biana_object is None:
432 self.biana_object = ExternalEntityRelation( source_database = BiopaxEntity.database, relation_type = "reaction" )
433 return self.biana_object
434
435
446
447
457
458
460
468
# Collect the left and right participant references and the EC numbers, then
# let BiopaxEntity read the generic properties from the same node.
side_fields = {"bp:LEFT": "left_xrefs", "bp:RIGHT": "right_xrefs"}

for child in XMLNode.getChilds():
    tag = child.name
    if tag in side_fields:
        getattr(self, side_fields[tag]).append(child.attrs["rdf:resource"])
    elif tag == "bp:EC-NUMBER":
        self.ec_number.append(child.getValue())

BiopaxEntity.set_attributes(self,XMLNode)
478
512
513
515 """
516 An interaction in which one entity regulates, modifies, or otherwise influences another. Two types of control interactions are defined: activation and inhibition
517 """
518
519
520
531
533
# BioPAX CONTROL-TYPE value -> role name stored for the controller.
control_type_names = { "ACTIVATION": "activates",
                       "INHIBITION": "inhibits",
                       "INHIBITION-ALLOSTERIC": "allosteric_inhibition",
                       "INHIBITION-COMPETITIVE": "competitive_inhibition",
                       "INHIBITION-IRREVERSIBLE": "irreversible_inhibition",
                       "INHIBITION-NONCOMPETITIVE": "non_competitive_inhibition",
                       "INHIBITION-OTHER": "inhibits",
                       "INHIBITION-UNCOMPETITIVE": "uncompetitive_inhibition",
                       "ACTIVATION-NONALLOSTERIC": "nonallosteric_activation",
                       "ACTIVATION-ALLOSTERIC": "allosteric_activation" }

for child in XMLNode.getChilds():
    tag = child.name
    if tag == "bp:CONTROLLER":
        self.controller_xref = child.attrs["rdf:resource"]
    elif tag == "bp:CONTROLLED":
        self.controlled_xref = child.attrs["rdf:resource"]
    elif tag == "bp:CONTROL-TYPE":
        control_type = child.getValue()
        # An unknown control type is an error rather than a silent default.
        if control_type not in control_type_names:
            raise ValueError("Control type %s not recognized" %control_type)
        self.control_type = control_type_names[control_type]

# The remaining properties are read by the physical-interaction implementation.
BiopaxPhysicalInteraction.set_attributes(self,XMLNode)
565
568
569
571 """
572 A control interaction in which a physical entity (a catalyst) increases the rate of a conversion interaction by lowering its activation energy. Instances of this class describe a pairing between a catalyzing entity and a catalyzed conversion
573 """
574
580
590
591
595
596
603
604
634
635
645
652
659
666
667
669
674
676 for current_child in XMLNode.getChilds():
677 if current_child.name == "bp:DB":
678 self.db = current_child.getValue()
679 elif current_child.name == "bp:ID":
680 self.id = current_child.getValue()
681
684
685
687
694
696 for current_child in XMLNode.getChilds():
697 if current_child.name == "bp:NAME":
698 self.name = current_child.getValue()
699 elif current_child.name == "bp:TAXON-XREF":
700 self.tax_ref = current_child.attrs["rdf:resource"]
701 else:
702 print current_child.name," not recognized"
703
706
711
712
714
719
721 for current_child in XMLNode.getChilds():
722 if current_child.name == "bp:TERM":
723 self.term = current_child.getValue()
724 elif current_child.name == "bp:XREF":
725 self.xref = current_child.attrs["rdf:resource"]
726
729
730
732 """
733
734 """
735
736 name = "biopax_level_2"
737 description = "This file implements a program that fills up tables in database biana with information of a BIOPAX Level 2 formatted database"
738 external_entity_definition = ""
739 external_entity_relations = ""
740
742
743
744
745 BianaParser.__init__(self, default_db_description = "Biopax formatted database",
746 default_script_name = "biopaxLevel2Parser.py",
747 default_script_description = BiopaxLevel2Parser.description,
748 additional_compulsory_arguments = [("default-attribute=",None,"Name of the default identifier that this database gives (such as reactome)")])
749
750
752 """
753 Class to handle content in Biopax Level2 XML files
754 """
755
759 return str(a).replace("gi|","")
760
761 datatype_operations = { "uniprot": _identity,
762 "ncbi_taxonomy": _identity,
763 "tigr": _identity,
764 "reactome": _identity,
765 "chebi": _identity,
766 "go": _identity,
767 "pubchem compound" : _identity,
768 "glycan" : _identity,
769 "compound" : _identity,
770 "entrez": _entrez_funct }
771
# Lower-case BioPAX element name -> class that wraps such an element.
biopax_objects_dict = { "bp:unificationxref": BiopaxXREF,
                        "bp:relationshipxref": BiopaxXREF,
                        "bp:publicationxref": BiopaxXREF,
                        "bp:opencontrolledvocabulary": BiopaxOpenControlledVocabulary,
                        "bp:biosource": BiopaxBioSource,
                        "bp:protein": BiopaxProtein,
                        "bp:complex": BiopaxComplex,
                        "bp:dna": BiopaxDNA,
                        "bp:rna": BiopaxRNA,
                        "bp:smallmolecule": BiopaxSmallMolecule,
                        # NOTE(review): generic physical entities are wrapped as small molecules - confirm this is intended.
                        "bp:physicalentity": BiopaxSmallMolecule,
                        "bp:pathway": BiopaxPathway,
                        "bp:interaction": BiopaxInteraction,
                        "bp:physicalinteraction": BiopaxPhysicalInteraction,
                        # Misspelled form of the key above; it never matched a BioPAX
                        # element name and is kept only for code that may refer to it.
                        "bp:physcialinteraction": BiopaxPhysicalInteraction,
                        "bp:conversion": BiopaxConversion,
                        "bp:control": BiopaxControl,
                        "bp:biochemicalreaction": BiopaxBiochemicalReaction,
                        "bp:complexassembly": BiopaxComplexAssembly,
                        "bp:transport": BiopaxTransport,
                        "bp:catalysis": BiopaxCatalysis,
                        "bp:modulation": BiopaxModulation,
                        "bp:sequenceparticipant": BiopaxPhysicalEntityParticipant,
                        "bp:physicalentityparticipant": BiopaxPhysicalEntityParticipant,
                        "bp:pathwaystep": BiopaxPathwayStep }
796
797
798
799
800
801
802
803
806
808
809 print "initalizing BiopaxLevel2Handler"
810
811 self.current_XMLNode = None
812 self.step = 0
813 self.xmlnode_hierarchylist = []
814 self.biopaxElements = {}
815
816 BiopaxEntity.resources = self.biopaxElements
817 BiopaxEntity.controlled_relations = {}
818
819 handler.ContentHandler.__init__(self)
820
822
823 if xref_id[0] == '#':
824 xref_id = xref_id[1:]
825
826 try:
827 return self.unification_xrefs[xref_id]
828 except:
829 return self._get_cross_ref( xref_id = self.recursive_xref[xref_id] )
830
832
833 if xref_id[0] == '#':
834 xref_id = xref_id[1:]
835
836 return self.links[ xref_id ]
837
838
839
842
845
# Open a node for the element that starts here and make it the current one.
opened_node = XMLNode(name = name, attrs = attrs)

if self.current_XMLNode is not None:
    # Nested element: attach it to its parent and remember the parent on the
    # hierarchy list.
    self.current_XMLNode.addChild(opened_node)
    self.xmlnode_hierarchylist.append(self.current_XMLNode)

self.current_XMLNode = opened_node
854
872
873
875
876 self.current_XMLNode.addValue(text.replace('.<','<').replace('br>','>').encode("ascii","ignore"))
877
878
879
881 for current_element in self.biopaxElements.values():
882 current_element.toBiana()
883
884
885
887 """
888 Class for parsing individual XML files obeying BIOPAX Level 2 standards
889 """
890
892 self.fileName = None
893 self.file = None
894 self.listEntry = []
895 self.handler = BiopaxLevel2Parser.BiopaxLevel2Handler()
896 self.saxParser = make_parser()
897 self.saxParser.setContentHandler(self.handler)
898 return
899
901 if self.file is not None and not self.file.closed:
902 self.file.close()
903 return
904
907
909 self.__init__()
910 if fileName is not None:
911 self.fileName = fileName
912 self.file = open(fileName)
913 self.saxParser.parse(self.fileName)
914 self.handler.toBiana()
915 if not self.file.closed:
916 self.file.close()
917 return
918
919
921 """
922 """
# Hand the target database and its access object to the element wrappers.
BiopaxEntity.database = self.database
BiopaxEntity.dbaccess = self.biana_access

self.biana_access.store_relations_hierarchy = True

parser = self.BiopaxLevel2XMLParser(self.verbose)

# The input is either one file or a directory whose entries are all candidates.
if os.path.isdir(self.input_file):
    directory_entries = os.listdir(self.input_file)
    if not self.input_file.endswith(os.sep):
        self.input_file += os.sep
    files_list = [ self.input_file+entry for entry in directory_entries ]
else:
    files_list = [self.input_file]

for current_file in files_list:
    # Only ".owl" files are parsed; anything else is skipped without a message.
    if not current_file.endswith(".owl"):
        continue
    print("Parsing file %s" % current_file)
    it = time.time()
    parser.parseFile(current_file)
    if self.time_control:
        print("Done in %s seconds" % (time.time()-it))
947