1 """
2 BIANA: Biologic Interactions and Network Analysis
3 Copyright (C) 2009 Javier Garcia-Garcia, Emre Guney, Baldo Oliva
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18 """
19
20 """
21 File : pfamParser.py
22 Author : Javier Garcia
23 Creation : 19 December 2007
24 Contents : fills up tables in database biana with information from PFAM
25 Called from :
26 =======================================================================================================
27
28 """
29
30
31
32
33 from bianaParser import *
34
35
37 """
38 PFAM Parser Class
39 """
40
41 name = "pfam"
42 description = "PFAM Parser. Collection of multiple sequence alignments and hidden markov models covering many common protein domains and families"
43 external_entity_definition = "A external entity represents a pfam domain"
44 external_entity_relations = ""
45
47
48
49
50 BianaParser.__init__(self, default_db_description = "Collection of multiple sequence alignments and hidden markov models covering many common protein domains and families",
51 default_script_name = "pfamParser.py",
52 default_script_description = PFAMParser.description,
53 additional_optional_arguments = [("include-sequence-mapping",True,"Includes sequence information and maps pfam to them"),
54 ("pfamA-file-name=","Pfam-A.full.gz","Name of the pfam A file. If None, it is not used"),
55 ("pfamB-file-name=","Pfam-B.gz","Name of the pfam B file. If None, it is not used"),
56 ("pfamSeq-file-name=","pfamseq.gz","Name of the file with pfam sequences (only used if \"include-sequence-mapping\" option is selected")])
57 self.default_eE_attribute = "pfam"
58
59
60
62 """
63 Method that implements the specific operations of PFAM parser
64 """
65
66
67 self.biana_access._add_transfer_attribute( externalDatabaseID = self.database.get_id(),
68 key_attribute = "uniprotaccession",
69 transfer_attribute="pfam" )
70
71 self.biana_access._add_transfer_attribute( externalDatabaseID = self.database.get_id(),
72 key_attribute = "uniprotentry",
73 transfer_attribute="pfam" )
74
75 if not self.input_file.endswith(os.sep):
76 self.input_file += os.sep
77
78 self.uniprot_acc_seq_md5_dict = {}
79
80 self.insert_mapping = None
81
82
83
84 if self.arguments_dic["include-sequence-mapping"]:
85
86 sequences_parsing_time = time.time()
87
88 self.seq_database = ExternalDatabase( databaseName = self.sourcedb_name+"_sequences",
89 databaseVersion = self.sourcedb_version,
90 databaseFile = self.input_file.split(os.sep)[-1],
91 databaseDescription = self.database_description,
92 defaultExternalEntityAttribute = "proteinSequence")
93
94 self.biana_access.insert_new_external_database( externalDatabase = self.seq_database )
95
96 self.insert_mapping = 1
97
98 pfamseq_file = self.input_file+self.arguments_dic["pfamSeq-file-name"]
99
100 if pfamseq_file.endswith(".gz"):
101 pfamseq_input_file_fd = gzip.open(pfamseq_file,'r')
102 else:
103 pfamseq_input_file_fd = file(pfamseq_file, 'r')
104
105 sequence = []
106 protein_title_line = None
107
108 protein_number = 0
109
110
111
112 uniprot_acc = None
113 uniprot_acc_version = None
114 uniprot_entry = None
115 sequence = None
116
117 self.title_regex = re.compile("^(\w+)\.(\d+)\s+(\S+)")
118
119 for line in pfamseq_input_file_fd:
120
121 if line[0]==">":
122 protein_number += 1
123
124 if self.time_control:
125 if protein_number%20000==0:
126 sys.stderr.write("%s proteins in %s seconds\n" %(protein_number,time.time()-self.initial_time))
127
128 if sequence is not None:
129 self.parse_pfam_seq_record(header_line = protein_title_line, sequence = "".join(sequence))
130
131 protein_title_line = line[1:]
132 sequence = []
133 else:
134 sequence.append(line.strip())
135
136 if len(sequence)>0:
137 self.parse_pfam_seq_record(header_line = protein_title_line, sequence = "".join(sequence))
138
139
140 self.seq_database.set_parsing_time( int(time.time() - sequences_parsing_time) )
141 self.biana_access.update_external_database_external_entity_attributes( self.seq_database )
142
143 pfamseq_input_file_fd.close()
144
145
146 if self.arguments_dic["pfamA-file-name"].lower() != "none":
147 self.parse_pfam_file(pfamType='A', pfamFile= self.input_file+self.arguments_dic["pfamA-file-name"])
148
149
150 if self.arguments_dic["pfamB-file-name"].lower() != "none":
151 self.parse_pfam_file(pfamType='B', pfamFile= self.input_file+self.arguments_dic["pfamB-file-name"])
152
153
155
156 pfamseq_object = ExternalEntity( source_database = self.seq_database, type="protein" )
157
158 m = self.title_regex.match(header_line)
159
160 if m:
161 uniprot_acc = m.group(1)
162 uniprot_acc_version = m.group(2)
163 uniprot_entry = m.group(3)
164 else:
165 raise "ERROR in parsing line %s" %header_line
166
167 pfamseq_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "uniprotentry", value = uniprot_entry,type = "cross-reference" ))
168 pfamseq_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "uniprotaccession", value = uniprot_acc, version = uniprot_acc_version, type = "cross-reference"))
169
170 sequenceObject = ProteinSequence(sequence.strip())
171
172 pfamseq_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "proteinSequence", value = sequenceObject))
173
174 self.uniprot_acc_seq_md5_dict["%s.%s" %(uniprot_acc, uniprot_acc_version)] = sequenceObject.get_sequence_MD5()
175
176 self.biana_access.insert_new_external_entity( externalEntity = pfamseq_object )
177
178
179
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200 if pfamFile.endswith(".gz"):
201 pfam_input_file_fd = gzip.open(pfamFile,'r')
202 else:
203 pfam_input_file_fd = file(pfamFile, 'r')
204
205 line_number=0
206
207
208 pfam_object = ExternalEntity( source_database = self.database, type="pattern")
209
210 search_id = re.compile("^#=GF ID\s+(.+)$")
211 search_description = re.compile("^#=GF DE\s+(.+)$")
212 search_prosite = re.compile("^#=GF DR\s+PROSITE;\s+(\w+);")
213 search_interpro = re.compile("^#=GF DR\s+INTERPRO;\s+(\w+);")
214 search_scop = re.compile("^#=GF DR\s+SCOP;\s+(\w+);\s+(\w+);")
215 search_prints = re.compile("^#=GF DR\s+PRINTS;\s+(\w+);")
216 search_pfam_ac = re.compile("^#=GF AC\s+(.+)\.*(\d*)$")
217 search_pfama = re.compile("^#=GF DR\s+PFAMA;\s+(.+)\.*(\d*);$")
218 search_pfamb = re.compile("^#=GF DR\s+PFAMB;\s+(.+);$")
219 search_prodom = re.compile("^#=GF DR\s+PRODOM;\s+(.+);$")
220 search_end = re.compile("^\/\/$")
221
222
223
224 search_uniprot = re.compile("^#=GS\s+([\w\_]+)\/(\d+\-\d+)\s+AC\s+(\w+)\.(\d+)")
225
226
227 search_pdb_cross_ref = re.compile("^#=GS\s+[\w\_]+\/\d+\-\d+\s+DR\s+PDB;\s+(\w+)\s+(\w*);\s+(\d+)\-(\d+)")
228
229 entries = 0
230
231 for line in pfam_input_file_fd:
232
233 pfamAC = None
234 pfamDescription = None
235 pfamType = pfamType
236 pfamName = None
237
238 line_number += 1
239
240 try:
241 line.strip()
242
243 if( search_end.match(line) ):
244
245
246
247 self.biana_access.insert_new_external_entity( externalEntity = pfam_object )
248
249 if self.time_control:
250 if entries%200==0:
251 sys.stderr.write("%s pfam entries in %s seconds\n" %(entries,time.time()-self.initial_time))
252
253 pfam_object = ExternalEntity( source_database = self.database, type="pattern")
254 continue
255
256
257
258 m = search_id.match(line)
259 if m:
260 entries += 1
261 pfamName = m.group(1)
262 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "name", value = pfamName, type = "unique"))
263 continue
264
265
266 m = search_description.match(line)
267 if m:
268 pfamDescription = m.group(1)
269 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "description", value = pfamDescription))
270 continue
271
272
273 m = search_prints.match(line)
274 if m:
275 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "prints", value = m.group(1), type = "cross-reference"))
276
277
278
279 m = search_prodom.match(line)
280 if m:
281 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "prodom", value = m.group(1), type = "cross-reference"))
282
283
284 m = search_pfama.match(line)
285 if m:
286 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "pfam", value = m.group(1), type = "cross-reference"))
287
288
289 m = search_pfamb.match(line)
290 if m:
291 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "pfam", value = m.group(1), type = "cross-reference"))
292 continue
293
294
295 m = search_interpro.match(line)
296 if m:
297 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "interpro", value = m.group(1), type = "cross-reference"))
298 continue
299
300
301
302 m = search_pfam_ac.match(line)
303 if m:
304 pfamAC = m.group(1)
305 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "pfam", value = pfamAC, type = "unique") )
306 continue
307
308
309 m = search_prosite.match(line)
310 if m:
311 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "prosite", value = m.group(1), type = "cross-reference"))
312 continue
313
314
315 m = search_uniprot.match(line)
316 if m:
317 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "uniprotaccession", value = m.group(3), version = m.group(4), type = "cross-reference") )
318 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "uniprotentry", value = m.group(1), type = "cross-reference"))
319
320 if self.insert_mapping:
321 try:
322 sequenceMD5 = self.uniprot_acc_seq_md5_dict["%s.%s" %(m.group(3),m.group(4))]
323 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "sequenceMap", value = sequenceMD5,
324 additional_fields = {"seq_range": m.group(2)} ))
325 except:
326 sys.stderr.write("SequenceMD5 not found for %s.%s\n" %(m.group(3),m.group(4)))
327
328
329
330 continue
331
332
333 m = search_pdb_cross_ref.match(line)
334 if m:
335 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "pdb", value = m.group(1), type = "cross-reference",
336 additional_fields = {"chain": m.group(2),
337 "pdb_range": "%s-%s" %(m.group(3),m.group(4))} ))
338 continue
339
340 except:
341 print line
342 print traceback.print_exc()
343 sys.stderr.write("Error in parsing line %s\n" %(line_number))
344 raise ValueError("error in parsing line")
345
346
347 if pfam_object is not None:
348 self.biana_access.insert_new_external_entity( externalEntity = pfam_object )
349
350 pfam_input_file_fd.close()
351