1 """
2 BIANA: Biologic Interactions and Network Analysis
3 Copyright (C) 2009 Javier Garcia-Garcia, Emre Guney, Baldo Oliva
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18 """
19
20 from bianaParser import *
21
22
24 """
25 Uniprot Parser Class
26 """
27
# Parser metadata: short parser name, human-readable description, and the
# meaning of the external entities / relations this parser produces.
name = "uniprot"
description = (
    "This file implements a program that fills up tables in database biana "
    "with information of uniprot databases"
)
external_entity_definition = "A external entity represents a protein"
# This parser declares no relations between external entities.
external_entity_relations = ""
32
43
45 """
46 Method that implements the specific operations of uniprot parser
47
48 If executing in self.mode "tables", it is necessary to insert the tables here
49 """
50
# Number of entries completed so far (used only for progress reporting).
protein_number = 0

# ---------------------------------------------------------------------
# Regular expressions, one per line type handled below.  They are raw
# strings so the backslashes reach the regex engine unchanged; the
# patterns themselves are identical to the non-raw originals.
# ---------------------------------------------------------------------

# "//" line: terminates the current entry.
new_regex = re.compile(r"^\/\/\s*$")

# Identification, accession, description, taxonomy and keyword lines.
id_regex = re.compile(r"^ID\s+(\S+)\s*")
ac_regex = re.compile(r"^AC\s+(.+);\s*$")
ac_version_regex = re.compile(r"sequence version (\d+)")
de_regex = re.compile(r"^DE\s+(.+)\s*$")
taxID_regex = re.compile(r"^OX\s+NCBI_TaxID=(\d+);")
keyword_regex = re.compile(r"^KW\s+(.+);$")

# EC numbers embedded in the joined description text.
ec_regex = re.compile(r"EC=(.+\..+\..+\..+)\;")

# Gene name line and the fields searched for inside it.
geneName_regex = re.compile(r"^GN")
gene_name_regex = re.compile(r"Name=([^;]+);")
gene_orf_name_regex = re.compile(r"ORFNames=([^;]+);")
gene_synonyms_regex = re.compile(r"Synonyms=([^;]+);")
gene_orderedLocusNames = re.compile(r"OrderedLocusNames=([^;]+);")

# Cross-reference (DR) lines: a cheap prefix test plus one pattern per
# referenced database.
cross_regex = re.compile(r"^DR")

pfam_regex = re.compile(r"^DR\s+Pfam;\s*(\S+);")
kegg_regex = re.compile(r"^DR\s+KEGG;\s*(\S+);")
interpro_regex = re.compile(r"^DR\s+InterPro;\s*(\S+);")
prosite_regex = re.compile(r"^DR\s+PROSITE;\s*(\S+);")
prodom_regex = re.compile(r"^DR\s+ProDom;\s*(\S+);")
mim_regex = re.compile(r"^DR\s+MIM;\s*(\S+);")
pir_regex = re.compile(r"^DR\s+PIR;\s*(\S+);")
prints_regex = re.compile(r"^DR\s+PRINTS;\s*(\S+);")
ensembl_regex = re.compile(r"^DR\s+Ensembl;\s*(\S+);")
embl_regex = re.compile(r"^DR\s+EMBL;\s*(\S+);")
geneID_regex = re.compile(r"^DR\s+GeneID;\s*(\S+);")
go_regex = re.compile(r"^DR\s+GO;\s*GO\:(\d+);")
refseq_regex = re.compile(r"^DR\s+RefSeq;\s*(\S+);")
unigene_regex = re.compile(r"^DR\s+UniGene;\s*(\S+);")
hgnc_regex = re.compile(r"^DR\s+HGNC;\s*HGNC\:(\d+);")
pdb_regex = re.compile(r"^DR\s+PDB;\s*(\S+);.+;.+;(.+).")
flybase_regex = re.compile(r"^DR\s+FlyBase;\s*(\S+);")
mgi_regex = re.compile(r"^DR\s+MGI;\s*MGI:(\d+);")
reactome_regex = re.compile(r"^DR\s*Reactome;\s*REACT_(\d+);")
sgd_regex = re.compile(r"^DR\s+SGD;\s*(\w+);")
tigr_regex = re.compile(r"^DR\s+TIGR\;\s+(.+)\;")
dip_regex = re.compile(r"^DR\s+DIP\;\s+DIP\:(.+)\;")
cygd_regex = re.compile(r"^DR\s+CYGD\;\s+(.+)\;")
WormPep_regex = re.compile(r"^DR\s+WormPep\;\s+(.+)\;\s*CE(\d+)\.\s*$")
WormBase_regex = re.compile(r"^DR\s+WormBase\;\s*WBGene(\d+)\;\s*(.+)\.\s*$")
rgd_regex = re.compile(r"^DR\s+RGD\;\s+(\d+)\;")

# Pieces of the last field of a PDB cross-reference: "<chains>=<range>".
pdb_fragment_regex = re.compile(r"\s*(.+)=(.+)\s*")
pdb_range_regex = re.compile(r"(\d+)-(\d+)")

# Sequence lines are the ones that start with whitespace.
sequence_regex = re.compile(r"^\s+(.+)$")

# Comment (CC) lines; "-!-" opens a new comment topic.
new_comment_regex = re.compile(r"^CC\s+\-\!\-")
general_comment_regex = re.compile(r"^CC\s+(.+)$")
subcellular_location_regex = re.compile(r"SUBCELLULAR LOCATION:\s*(.*)$")
function_regex = re.compile(r"FUNCTION:\s*(.*)$")
disease_regex = re.compile(r"DISEASE:\s*(.*)$")

# Comment topics that are stored, in the order they are tested.
comment_topics = [("SubcellularLocation", subcellular_location_regex),
                  ("Function", function_regex),
                  ("Disease", disease_regex)]

# Attribute identifier used to store each collected comment topic, in the
# order the attributes are added to the entry.
comment_attributes = [("Function", "function"),
                      ("Disease", "disease"),
                      ("SubcellularLocation", "subcellularLocation")]

# Cross-references that are stored as "first captured group -> attribute".
# Each tuple is (pattern, attribute identifier, extra keyword arguments).
# Every pattern is anchored on a different database name, so a DR line
# matches at most one of them.
cross_reference = {"type": "cross-reference"}
simple_cross_references = [
    (WormBase_regex, "WormBaseGeneID", cross_reference),
    (WormPep_regex, "WormBaseSequenceName", cross_reference),
    (dip_regex, "DIP", cross_reference),
    (tigr_regex, "tigr", cross_reference),
    (cygd_regex, "cygd", cross_reference),
    (rgd_regex, "rgd", cross_reference),
    (pfam_regex, "pfam", cross_reference),
    (kegg_regex, "kegggene", cross_reference),
    (interpro_regex, "interpro", cross_reference),
    (prosite_regex, "prosite", cross_reference),
    (prodom_regex, "prodom", cross_reference),
    # MIM is the only one stored without an explicit type.
    (mim_regex, "mim", {}),
    (pir_regex, "pir", cross_reference),
    (prints_regex, "prints", cross_reference),
    (ensembl_regex, "ensembl", cross_reference),
    (embl_regex, "accessionNumber", cross_reference),
    (geneID_regex, "geneID", cross_reference),
    (go_regex, "go", cross_reference),
    (unigene_regex, "unigene", cross_reference),
    (hgnc_regex, "hgnc", cross_reference),
    (flybase_regex, "flybase", cross_reference),
    (mgi_regex, "MGI", cross_reference),
    (reactome_regex, "reactome", cross_reference),
    (sgd_regex, "SGD", cross_reference),
]

# ---------------------------------------------------------------------
# Per-entry state.  It is filled while reading an entry and flushed /
# reset when the "//" terminator is found.
# ---------------------------------------------------------------------
uniprotObject = ExternalEntity( source_database = self.database, type="protein" )

description = []
sequence = []
comments = { "SubcellularLocation": [],
             "Disease": [],
             "Function": [] }

# Topic of the comment currently being read (None when it is not stored).
actual_comment = None

self.initialize_input_file_descriptor()

# Accessions read from AC lines, kept until the sequence version is known.
uniprot_accession_list = []

for line in self.input_file_fd:

    # ----- End of entry: store what was collected and start a new one -----
    if new_regex.match(line):

        uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="proteinSequence", value=ProteinSequence("".join(sequence))))

        if description:
            desc_str = " ".join(description)
            uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="description", value=desc_str))

            # EC numbers are only looked for in the description built for
            # THIS entry, so an entry without DE lines can neither fail on
            # an unbound name nor inherit the previous entry's EC numbers.
            for enzyme in ec_regex.findall(desc_str):
                uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="EC", value=enzyme, type="cross-reference"))

        for topic, identifier in comment_attributes:
            if comments[topic]:
                uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier=identifier, value=" ".join(comments[topic])))

        self.biana_access.insert_new_external_entity( externalEntity = uniprotObject )

        # Reset the per-entry state for the next entry.
        description = []
        sequence = []
        comments = { "SubcellularLocation": [],
                     "Disease": [],
                     "Function": [] }
        actual_comment = None

        uniprotObject = ExternalEntity( source_database = self.database, type="protein" )
        protein_number += 1

        if self.time_control:
            if protein_number%20000==0:
                sys.stderr.write("%s proteins done in %s seconds\n" %(protein_number,time.time()-self.initial_time))

        # None of the patterns below can match the terminator line.
        continue

    # ----- ID line -----
    m = id_regex.match(line)
    if m:
        uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="uniprotentry", value=m.group(1), type="unique"))
        continue

    # ----- AC line(s): only collected here, stored once the version is known -----
    m = ac_regex.match(line)
    if m:
        uniprot_accession_list.extend([ x.strip() for x in m.group(1).split(";") ])
        continue

    # ----- Line carrying the sequence version -----
    m = ac_version_regex.search(line)
    if m:
        # The first collected accession is stored as "unique", the rest
        # as "previous"; all of them carry the sequence version.
        for position, accession in enumerate(uniprot_accession_list):
            if position == 0:
                accession_type = "unique"
            else:
                accession_type = "previous"
            uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="uniprotaccession", value=accession, version=m.group(1), type=accession_type))
        uniprot_accession_list = []
        continue

    # ----- DE line(s): joined into a single description at end of entry -----
    m = de_regex.match(line)
    if m:
        description.append( m.group(1) )
        continue

    # ----- OX line -----
    m = taxID_regex.match(line)
    if m:
        uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="taxID", value=m.group(1)))
        continue

    # ----- KW line: one attribute per ";"-separated keyword -----
    m = keyword_regex.match(line)
    if m:
        for keyword in m.group(1).split(";"):
            uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="keyword", value=keyword))
        continue

    # ----- Sequence line: spaces between residue groups are removed -----
    m = sequence_regex.match(line)
    if m:
        sequence.append( m.group(1).replace(" ","") )

    # ----- GN line -----
    if geneName_regex.match(line):
        m = gene_name_regex.search(line)
        if m:
            uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="geneSymbol", value=m.group(1),type="unique"))

        m = gene_orf_name_regex.search(line)
        if m:
            for orf_name in m.group(1).split(","):
                uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="ORFName", value=orf_name,type="alias"))

        m = gene_synonyms_regex.search(line)
        if m:
            for synonym in m.group(1).split(","):
                uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="geneSymbol", value=synonym, type="synonym"))

        m = gene_orderedLocusNames.search(line)
        if m:
            for locus_name in m.group(1).split(","):
                uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="OrderedLocusName", value=locus_name, type="alias"))

        continue

    # ----- CC line -----
    m = general_comment_regex.match(line)
    if m:
        if new_comment_regex.match(line):
            # A new comment starts: keep it only if it is one of the
            # stored topics, in which case m becomes the topic match and
            # group(1) is the text following the topic label.
            actual_comment = None
            for topic, topic_regex in comment_topics:
                m = topic_regex.search(line)
                if m:
                    actual_comment = topic
                    break

        # For continuation lines m is still the general CC match, so the
        # whole comment text of the line is appended to the open topic.
        if actual_comment is not None:
            comments[actual_comment].append(m.group(1))

    # ----- DR line -----
    if cross_regex.match(line):

        stored = False
        for regex, identifier, extra_arguments in simple_cross_references:
            m = regex.match(line)
            if m:
                uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier=identifier, value=m.group(1), **extra_arguments))
                stored = True
                break
        if stored:
            continue

        # RefSeq identifiers are stored split into "<id>.<version>".
        m = refseq_regex.match(line)
        if m:
            rs = m.group(1).split('.')
            if len(rs)==2:
                uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="refseq", value=rs[0], version=rs[1], type="cross-reference"))
            else:
                # Parenthesised so the line is valid on Python 2 and 3 and
                # prints exactly the same text.
                print("Refseq %s has no version?" %m.group(1))
            continue

        # PDB: one attribute per chain, with the residue range when given.
        m = pdb_regex.match(line)
        if m:
            pdb_code = m.group(1)

            for actual_frag in m.group(2).split(","):
                fragment_match = pdb_fragment_regex.search(actual_frag)
                if not fragment_match:
                    continue

                chains = fragment_match.group(1).split("/")
                range_match = pdb_range_regex.search(fragment_match.group(2))
                if range_match:
                    # Not called "range": that would shadow the builtin.
                    pdb_range = "%s-%s" %(range_match.group(1),range_match.group(2))
                    for chain in chains:
                        uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="pdb", value=pdb_code, type = "cross-reference",
                                                                            additional_fields = {"chain": chain,
                                                                                                 "pdb_range": pdb_range }))
                else:
                    for chain in chains:
                        uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="pdb", value=pdb_code, type="cross-reference",
                                                                            additional_fields = {"chain": chain}))

            continue