1 """
2 BIANA: Biologic Interactions and Network Analysis
3 Copyright (C) 2009 Javier Garcia-Garcia, Emre Guney, Baldo Oliva
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18 """
19
20
21 from bianaParser import *
22
23
25 """
26 Genpept Parser class
27 """
28
29 name = "ncbi_genpept"
30 description = "Inserts NCBI Genpet database into BIANA"
31 external_entity_definition = "A external entity represents a protein"
32 external_entity_relations = ""
33
44
45
46
47
49
50 def insert_fsa_aa_files(arg,dirname,names):
51 for name in names:
52 if name.endswith('.fsa_aa'):
53 file_fd = open(os.path.join(dirname,name),'r')
54 elif name.endswith('.fsa_aa.gz'):
55 file_fd = gzip.open(os.path.join(dirname,name),'r')
56 else:
57 continue
58
59 self.parse_fasta_file(file_fd)
60
61
62
63 file_fd.close()
64
65
66 self.protein_number=0
67
68 self.dict_name_tax = self.biana_access.get_taxonomy_names_taxID_dict()
69
70 if( len(self.dict_name_tax) == 0 ):
71 raise ValueError("Taxonomies have not been inserted. You must previously insert information about taxonomy")
72
73
74 self.accepted_coll_accessions = { "gb": None,
75 "dbj": None,
76 "emb": None}
77
78
79 os.path.walk(self.input_file,insert_fsa_aa_files,None)
80
81
82
84
85
86
87
88
89
90
91 if self.verbose:
92 sys.stderr.write("Loading taxonomies\n")
93
94
95 if self.verbose:
96 sys.stderr.write( "Processing file\n")
97
98
99 sequence = []
100 protein_title_line = None
101
102 for line in fasta_file_fd:
103
104 if line[0]==">":
105
106 if len(sequence)>0:
107 self.parse_genpept_entry(header_line = protein_title_line, sequence = "".join(sequence))
108
109 protein_title_line = line[1:]
110 sequence = []
111 else:
112 sequence.append(line.strip())
113
114 if len(sequence)>0:
115 self.parse_genpept_entry(header_line = protein_title_line, sequence = "".join(sequence))
116
117
118 - def parse_genpept_entry(self, header_line, sequence):
119
120 genpept_object = ExternalEntity( source_database = self.database, type="protein" )
121
122 self.protein_number += 1
123
124 self.add_to_log("Number of proteins processed")
125
126 if self.verbose:
127 sys.stderr.write( "===================================\n")
128 sys.stderr.write( " NEW ENTRY\n")
129 sys.stderr.write( "===================================\n")
130
131
132 if self.time_control:
133 if self.protein_number%20000==0:
134 sys.stderr.write("%s proteins done in %s seconds\n" %(self.protein_number,time.time()-self.initial_time))
135
136 protein_title_line = header_line
137 protein_sequence = sequence
138
139 if protein_sequence:
140 genpept_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "proteinsequence",
141 value = ProteinSequence(protein_sequence)) )
142
143 if self.verbose:
144 sys.stderr.write("title line is : %s\n" %(protein_title_line))
145
146
147
148
149
150
151
152
153
154 title_atoms = protein_title_line.split('|')
155
156 gi_id = int(title_atoms[1])
157
158 accessionNum = None
159 accessionVersion = None
160
161 if self.accepted_coll_accessions.has_key(title_atoms[2]):
162
163 coll_acc = title_atoms[3].strip()
164
165
166 try:
167 splited = re.split("\.",coll_acc)
168 accessionNum = splited[0]
169 accessionVersion = splited[1]
170 except:
171 accessionNum = coll_acc
172
173 temp = []
174 for x in xrange(4,len(title_atoms)):
175 temp.append(title_atoms[x])
176 temp_protein_name = "".join(temp)
177
178
179
180
181 starts = []
182 ends = []
183
184 for x in xrange(len(temp_protein_name)):
185 if temp_protein_name[x]=='[':
186 starts.append(x)
187 elif temp_protein_name[x]==']':
188 ends.append(x)
189
190 starts.reverse()
191 ends.reverse()
192
193 try:
194 limit = starts[0]
195
196 for x in xrange(1,len(ends)):
197 if ends[x]>starts[x-1]:
198 limit = starts[x]
199 continue
200 limit = starts[x-1]
201 break
202
203
204 species_name = temp_protein_name[limit+1:-1]
205
206 try:
207
208 tax_id_value = self.dict_name_tax[species_name.lower()]
209 genpept_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "taxID",
210 value = tax_id_value ))
211 except:
212
213 try:
214 tax_id_value = self.dict_name_tax[species_name.split(",")[0].lower()]
215 genpept_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "taxID",
216 value = tax_id_value ))
217 except:
218
219 try:
220 tax_id_value = self.dict_name_tax[species_name.split("=")[0].lower()]
221 genpept_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "taxID",
222 value = tax_id_value ))
223
224 except:
225 if self.verbose:
226 sys.stderr.write("%s not found\n" %(species_name))
227
228
229 genpept_object.add_attribute( ExternalEntityAttribute(attribute_identifier = "description",
230 value = temp_protein_name[:limit].strip() ) )
231
232 self.add_to_log("Number of protein descriptions inserted")
233
234 except:
235 genpept_object.add_attribute( ExternalEntityAttribute(attribute_identifier = "description",
236 value = temp_protein_name.strip() ) )
237 self.add_to_log("Number of protein descriptions inserted")
238
239
240 """
241 Now, we have a proteinPiana for the (sequence, tax_id), either the one already existing before or a new one.
242
243 Just add protein information to database tables.
244
245 TO DO!! this would be a good place to check for consistency between uniprot and genbank, using embl accessions
246 """
247
248 if gi_id:
249 genpept_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "gi",
250 value = gi_id,
251 type = "unique" ) )
252 self.add_to_log("Number of gi codes inserted")
253
254 if accessionNum != "":
255 if accessionVersion is None:
256 genpept_object.add_attribute( ExternalEntityAttribute(attribute_identifier = "accessionNumber",
257 value = accessionNum,
258 type = "unique" ) )
259 else:
260 genpept_object.add_attribute( ExternalEntityAttribute(attribute_identifier = "accessionNumber",
261 value = accessionNum,
262 version = accessionVersion,
263 type = "unique"))
264 self.add_to_log("Number of accessionNumber inserted")
265
266
267
268
269 if self.verbose:
270 sys.stderr.write( "reading next record\n")
271
272
273 self.biana_access.insert_new_external_entity( externalEntity = genpept_object )
274