1 """
2 BIANA: Biologic Interactions and Network Analysis
3 Copyright (C) 2009 Javier Garcia-Garcia, Emre Guney, Baldo Oliva
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18 """
19
20
21 """
22 File : nr2piana.py
23 Author : Ramon Aragues & Javier Garcia
24 Modified : Javier Garcia October 2007
25 Creation : 10.2004
26 Contents : fills up tables in database piana with information from nr
27 Called from :
28 =======================================================================================================
29
30 This file implements a program that fills up tables in database piana with information from nr
31
32 nr can be downloaded from ftp://ftp.ncbi.nih.gov/blast/db/FASTA/nr.gz
33
34 Before running, taxonomy table of piana must be populated (use taxonomy2piana for that)
35 """
36
37
38
39
40
41 from bianaParser import *
42
43
45 """
46 NCBI nr Parser Class
47 """
48
49 name = "nr"
50 description = "This file implements a program that fills up tables in BIANA database with information from nr"
51 external_entity_definition = "A external entity represents a protein"
52 external_entity_relations = ""
53
64
65
67 """
68 Method that implements the specific operations of nr parser
69 """
70
71
72
73 verbose_all = 0
74
75 self.initialize_input_file_descriptor()
76
77
78
79 self.protein_number=0
80 self.total_entries=0
81 self.total_inconsistencies=0
82
83
84
85
86
87
88
89
90 self.initialize_input_file_descriptor()
91
92
93 if self.verbose:
94 print "Processing file"
95
96
97 if self.verbose:
98 print "Reading gi/taxonomy information..."
99
100 dict_name_tax = self.biana_access.get_taxonomy_names_taxID_dict()
101
102
103
104 inconsistencies = {}
105
106
107
108 sequence = []
109 protein_title_line = None
110
111 for line in self.input_file_fd:
112
113 if line[0]==">":
114 if len(sequence)>0:
115 self.parse_nr_record(header_line = protein_title_line, sequence = "".join(sequence))
116 protein_title_line = line[1:]
117 sequence = []
118 else:
119 sequence.append(line.strip())
120
121 if len(sequence)>0:
122 self.parse_nr_record(header_line = protein_title_line, sequence = "".join(sequence))
123
124
126
127 self.protein_number += 1
128 self.add_to_log("Number of proteins in input file")
129
130
131 if self.time_control:
132 if self.protein_number%20000==0:
133 sys.stderr.write("%s proteins\t%s seq-taxid entries\tin %s seconds\n" %(self.protein_number,self.total_entries,time.time()-self.initial_time))
134
135 protein_title_line = header_line
136 protein_sequence = sequence
137
138
139
140 """
141 Now, we have a proteinPiana that will be asigned to all other codes that we find in the title line
142 """
143
144
145
146
147
148
149
150
151
152
153
154
155
156 title_entries = protein_title_line.split("\001")
157
158
159 for title_entrie in title_entries:
160
161
162 nr_object = ExternalEntity( source_database = self.database, type="protein" )
163
164 nr_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "proteinsequence",
165 value = ProteinSequence(protein_sequence)))
166
167 self.total_entries += 1
168
169 gi = None
170
171 if self.verbose:
172 sys.stderr.write("One entry of the title is: %s\n" %(title_entrie))
173
174 title_atoms = title_entrie.split('|')
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216 accessionNumber = None
217 uniprot_accession = None
218 uniprot_entry = None
219 ref_seq = None
220 pdb_code = None
221 pdb_chain = None
222 pir = None
223 description = None
224 species_name = None
225
226
227 for i in xrange(len(title_atoms)):
228 if title_atoms[i] == "gi":
229 if gi is None:
230 gi = int(title_atoms[i+1])
231 nr_object.add_attribute(ExternalEntityAttribute(attribute_identifier = "gi",
232 value = gi,
233 type = "unique" ))
234 else:
235 sys.stderr.write(" Error parsing... how an entry can have several gi values? \n")
236 elif title_atoms[i] == "gb" or title_atoms[i] == "emb" or title_atoms[i] == "dbj":
237 if accessionNumber is None:
238 accession = re.search("(\w+)\.(\d+)",title_atoms[i+1])
239 if accession:
240 accessionNumber = accession.group(1)
241 accessionNumber_version = accession.group(2)
242 nr_object.add_attribute(ExternalEntityAttribute(attribute_identifier="accessionNumber",
243 value = accessionNumber,
244 version = accessionNumber_version,
245 type = "unique" ))
246 else:
247 sys.stderr.write(" Error parsing... how an entry can have several accesionNumber.version values? \n")
248 elif title_atoms[i] == "pir":
249 if pir is None:
250 pirRe = re.search("(\w+)\.*",title_atoms[i+2])
251 if pirRe:
252 pir = pirRe.group(1)
253 nr_object.add_attribute(ExternalEntityAttribute(attribute_identifier = "pir",
254 value = pir,
255 type = "cross-reference" ))
256 else:
257 sys.stderr.write(" Error parsing... How an entry can have several pir values? \n")
258
259 elif title_atoms[i] == "sp":
260 if uniprot_accession is None:
261 uniprot_accession = title_atoms[i+1][0:6]
262 else:
263 sys.stderr.write(" There are more than one uniprot accession for the same gi... \n")
264 if uniprot_entry is None:
265
266
267 if len(title_atoms) > (i+2):
268 uniprot_entry = (title_atoms[i+2].split())[0].strip()
269 nr_object.add_attribute(ExternalEntityAttribute(attribute_identifier = "uniprotentry",
270 value = uniprot_entry,
271 type = "cross-reference"))
272 else:
273 sys.stderr.write(" There are more than one uniprot entry for the same gi... \n")
274 elif title_atoms[i] == "ref":
275 if ref_seq is None:
276 ref_seq = title_atoms[i+1].strip().split(".")
277 nr_object.add_attribute(ExternalEntityAttribute(attribute_identifier = "refseq",
278 value = ref_seq[0],
279 version = ref_seq[1],
280 type = "cross-reference"))
281 else:
282 sys.stderr.write(" There are more than one uniprot RefSeq for the same gi... \n")
283 elif title_atoms[i] == "pdb":
284 if pdb_code is None:
285 pdb_code = title_atoms[i+1].strip()
286 pdb_chain = title_atoms[i+2][0].strip()
287 nr_object.add_attribute(ExternalEntityAttribute(attribute_identifier = "pdb",
288 value = pdb_code,
289 additional_fields = { "chain": pdb_chain },
290 type = "cross-reference"))
291 else:
292 sys.stderr.write(" There are more than one PDB for the same gi... \n")
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315 starts = []
316 ends = []
317
318 for x in xrange(len(title_entrie)):
319 if title_entrie[x]=='[':
320 starts.append(x)
321 elif title_entrie[x]==']':
322 ends.append(x)
323
324 starts.reverse()
325 ends.reverse()
326
327 try:
328 limit = starts[0]
329
330 for x in xrange(1,len(ends)):
331 if ends[x]>starts[x-1]:
332 limit = starts[x]
333 continue
334 limit = starts[x-1]
335 break
336
337 species_name = title_entrie[limit+1:-1]
338
339 try:
340
341 tax_id_value = dict_name_tax[species_name.lower()]
342 nr_object.add_attribute( ExternalEntityAttribute( attribute_identifier="taxid", value=tax_id_value) )
343
344 except:
345
346 try:
347 tax_id_value = dict_name_tax[species_name.split(",")[0].lower()]
348 nr_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "taxID", value = tax_id_value ))
349 except:
350
351 try:
352 tax_id_value = dict_name_tax[species_name.split("=")[0].lower()]
353 nr_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "taxID", value = tax_id_value ))
354
355 except:
356 if self.verbose:
357 sys.stderr.write("%s not found\n" %(species_name))
358
359
360 nr_object.add_attribute( ExternalEntityAttribute(attribute_identifier = "description",
361 value = title_entrie[:limit].strip() ) )
362
363 except:
364 nr_object.add_attribute( ExternalEntityAttribute(attribute_identifier = "description",
365 value = title_entrie.strip()))
366
367
368 self.biana_access.insert_new_external_entity( externalEntity = nr_object )
369
370
371
372 if self.verbose:
373 sys.stderr.write( "\n---------------reading next record---------------\n")
374
375
376
377