1
2 """
3 File : uniprot2piana.py
4 Author : Ramon Aragues, Javier Garcia Garcia
5 Creation : 16.3.2004
6 Modified : Javier Garcia Garcia December 2007
7 Contents : fills up tables in database piana with information from uniprot
8 Called from :
9
10 =======================================================================================================
11
12 This file implements a program that fills up tables in database piana with information from UniProt databases
13
14 This parser uses biopython libraries and methods
15
16 Command line option '--help' describes usage of this program
17
18 For more details on how to use it, read piana/README.populate_piana_db
19 """
20
21
22 from bianaParser import *
23 import re, sys, sets
24
class GenericParser(BianaParser):
    """
    Generic Parser Class

    Loads external entities and external entity relations from a tabulated
    text file. The file is read line by line; blank lines are ignored and
    two marker lines switch the current section:

        @EXTERNAL_ENTITY_DATA            rows describe external entities
        @EXTERNAL_ENTITY_RELATION_DATA   rows describe relations

    The first non-blank line of each section is its header: column titles
    separated by one or more tabs, matched case-insensitively. Every later
    line of the section is a data row split the same way. A cell holding
    "-" means "no value"; a cell may hold several values separated by "|".

    NOTE(review): the class and method header lines were missing from the
    reviewed text. The class name and base class are taken from the
    references to GenericParser.description and BianaParser.__init__ in the
    constructor; the name of the parsing method (parse_database) is assumed
    and must be confirmed against BianaParser.
    """

    name = "generic"
    description = "This file implements a program that fills up tables in BIANA database from data in a tabulated file"
    external_entity_definition = ""
    external_entity_relations = ""

    # Columns that the header of each section must declare. They are consumed
    # by the parser itself and never stored as attributes.
    mandatory_columns = sets.Set(["id", "type"])
    mandatory_relation_columns = sets.Set(["id", "interactor_id_list", "type"])

    # Separator between several values stored in a single cell.
    _VALUE_SEPARATOR = "|"
    # "<participant id>: <value>" item of a "participants:<attribute>" cell.
    # The first group is greedy, so the split happens at the last colon.
    _PARTICIPANT_RE = re.compile(r"(.+):\s*(.+)")

    def __init__(self):
        """
        Initializes the parser through BianaParser.__init__, declaring the
        compulsory command line argument "default-attribute".
        """
        BianaParser.__init__(self, default_db_description = "Generic Tabulated parser",
                             default_script_name = "GenericParser.py",
                             default_script_description = GenericParser.description,
                             additional_compulsory_arguments = [("default-attribute=",None,"Name of the default identifier that this database gives (such as uniprotentry)")])

    def parse_database(self):
        """
        Method that implements the specific operations of a general tabulated file

        Reads self.input_file_fd line by line and inserts every entity and
        relation row through self.biana_access. The value returned by each
        insertion is remembered under the row's id so that later relation
        rows can refer to it.

        Raises Exception when a section header lacks a mandatory column.
        A relation row that refers to an id not seen before raises KeyError.
        Lines found before any section marker are reported on stderr.
        """
        self.biana_access.store_relations_hierarchy = True

        self.initialize_input_file_descriptor()

        self.in_external_entities = False
        self.external_entity_fields = None
        self.external_entity_ids_dict = {}

        self.in_external_entity_relations = False
        self.external_entity_relation_fields = None

        for line in self.input_file_fd:

            line = line.strip()

            if line == "":
                continue

            # Section markers only switch state; they carry no data.
            if line.startswith("@EXTERNAL_ENTITY_DATA"):
                self.in_external_entities = True
                self.in_external_entity_relations = False
                continue
            if line.startswith("@EXTERNAL_ENTITY_RELATION_DATA"):
                self.in_external_entities = False
                self.in_external_entity_relations = True
                continue

            if self.in_external_entities:
                if self.external_entity_fields is None:
                    # The first line of the section is its header.
                    self.external_entity_fields = self._parse_header(
                        line, self.mandatory_columns, "External Entity")
                else:
                    self._insert_external_entity(re.split("\t+", line))
            elif self.in_external_entity_relations:
                if self.external_entity_relation_fields is None:
                    self.external_entity_relation_fields = self._parse_header(
                        line, self.mandatory_relation_columns, "External Entity Relation")
                else:
                    self._insert_external_entity_relation(re.split("\t+", line))
            else:
                sys.stderr.write("Format error, check file format!\n")

    def _parse_header(self, line, mandatory, label):
        """
        Returns a dictionary mapping each lower-cased column title of a
        header line to its position. Raises Exception when one of the
        mandatory titles is absent.
        """
        titles = re.split("\t+", line)
        column_to_index = dict([ (title.lower(), position) for position, title in enumerate(titles) ])
        for column in mandatory:
            if column not in column_to_index:
                raise Exception("%s %s column not found" % (label, column))
        return column_to_index

    def _insert_external_entity(self, values):
        """
        Builds an ExternalEntity from one data row, inserts it and records
        the value returned by the insertion under the row's id.
        """
        fields = self.external_entity_fields

        new_external_entity = ExternalEntity( source_database = self.database,
                                              type = values[fields["type"]].strip() )

        for attribute_identifier, index in fields.items():
            if attribute_identifier in self.mandatory_columns:
                continue
            if values[index].strip() == "-":
                continue
            for current_value in values[index].split(self._VALUE_SEPARATOR):
                current_value = current_value.strip()
                if attribute_identifier.lower() == "proteinsequence":
                    current_value = ProteinSequence(current_value)
                new_external_entity.add_attribute( ExternalEntityAttribute( attribute_identifier = attribute_identifier,
                                                                            value = current_value,
                                                                            type = "cross-reference" ) )

        # Relation rows look ids up stripped, so the key is stripped as well.
        entity_key = values[fields["id"]].strip()
        self.external_entity_ids_dict[entity_key] = self.biana_access.insert_new_external_entity( externalEntity = new_external_entity )

    def _insert_external_entity_relation(self, values):
        """
        Builds an ExternalEntityRelation from one data row, inserts it and
        records the value returned by the insertion under the row's id.
        Participants must have been inserted by an earlier row.
        """
        fields = self.external_entity_relation_fields

        new_external_entity_relation = ExternalEntityRelation( source_database = self.database,
                                                               relation_type = values[fields["type"]].strip() )

        for participant in values[fields["interactor_id_list"]].split(self._VALUE_SEPARATOR):
            new_external_entity_relation.add_participant( externalEntityID = self.external_entity_ids_dict[participant.strip()] )

        for current_attribute, index in fields.items():
            if current_attribute in self.mandatory_relation_columns:
                continue
            cell = values[index]
            if cell.strip() == "-":
                continue

            if current_attribute.startswith("participants:"):
                # Column "participants:<attribute>" holds "<id>: <value>" items
                # that become attributes of single participants.
                current_attribute = current_attribute.replace("participants:", '')
                for current_value in cell.split(self._VALUE_SEPARATOR):
                    match = self._PARTICIPANT_RE.search(current_value.strip())
                    if not match:
                        # Skip the malformed item: there is no participant id
                        # or value to attach for it.
                        sys.stderr.write("Format error, check file format!\n")
                        continue
                    participant_id = match.group(1).strip()
                    attribute_value = match.group(2)
                    new_external_entity_relation.add_participant_attribute( externalEntityID = self.external_entity_ids_dict[participant_id],
                                                                            participantAttribute = ExternalEntityRelationParticipantAttribute( attribute_identifier = current_attribute,
                                                                                                                                                value = attribute_value ) )
            else:
                for current_value in cell.split(self._VALUE_SEPARATOR):
                    new_external_entity_relation.add_attribute( ExternalEntityRelationAttribute( attribute_identifier = current_attribute,
                                                                                                 value = current_value.strip() ) )

        # Relations are stored in the same id dictionary as entities, so a
        # later relation may list this one as a participant.
        relation_key = values[fields["id"]].strip()
        self.external_entity_ids_dict[relation_key] = self.biana_access.insert_new_external_entity( externalEntity = new_external_entity_relation )
164