# Star imports stay first (in their original order) so the explicit module
# imports below always win if a star import re-exports a same-named object.
from bianaParser import *
from biana.BianaDB import database
from sets import *

import fnmatch
import gzip
import itertools
import os
import re
import subprocess
import sys
import time
5
"""
STRING Parser Class

Loads STRING protein sequences, protein aliases and detailed protein links
into a BIANA database.
"""
# Batch size: how many sequence records are read from the sequences file
# before their aliases are fetched and the proteins are inserted.
N_MAX_ENTRY_AT_ONCE = 5000

# Parser identity and human-readable descriptions.
# NOTE(review): presumably consumed by the BIANA parser framework -- confirm.
name = "string"
description = "This program fills up tables in database biana related with STRING"
external_entity_definition = "A protein in STRING with sequence"
external_entity_relations = "A link given between two proteins in STRING with score"
15
# Maps the source labels found in the STRING alias data to the BIANA attribute
# identifier under which the alias is stored.
#   - None: the source is known but deliberately not imported.
#   - "":   also skipped by the insertion code (it ignores blank identifiers).
# Labels missing from this table are collected as "unknown databases".
datatype_to_biana_type = {
    "SWISSPROT_DE": "name",
    "SWISSPROT_GN": "geneSymbol",
    "SWISSPROT_ID": "uniprotEntry",
    "SWISSPROT_AC": "uniprotAccession",
    "SWISSPROT_DR_EMBL": "AccessionNumber",
    "SWISSPROT_DR_PIR": "PIR",
    "SWISSPROT_DR_SGD": "SGD",
    "GenomeReviews": "GenomeReviews",
    "Uniprot/SWISSPROT": "uniprotEntry",
    "Uniprot/SPTREMBL": "uniprotEntry",
    "Ensembl_Uniprot/SWISS": "uniprotEntry",
    "EMBL": "AccessionNumber",
    "RefSeq": "RefSeq",
    "RefSeq_peptide": "RefSeq",
    "RefSeq_dna": "RefSeq",
    "EntrezGene": "GeneID",
    "Ensembl": "Ensembl",
    "UniGene": "Unigene",
    "Ensembl_SGD": "SGD",
    "SGD": "SGD",
    "PDB": "PDB",
    "IPI": "IPI",
    "HGNC": "HGNC",
    "HPRD": "HPRD",
    "PUBMED": "Pubmed",
    "HUGO": "GeneSymbol",
    "gene": "Tair",
    "RGD": "RGD",
    "MIM": "MIM",
    "wormbase_transcript": "WormBaseSequenceName",
    "Ensembl_HUGO_Approved_Name": "Name",
    "Ensembl_HUGO_Aliases": "Name",
    "Ensembl_HUGO_RefSeq_IDs": "RefSeq",
    "Ensembl_HUGO_Accession_Numbers": "AccessionNumber",
    "Ensembl_HUGO_HGNC_ID": "HGNC",
    "HUGE": "Huge",
    "GO": "GO",
    "PRINTS": "PRINTS",
    "primary": None,
    "Extra": None,
    "protein_id": None,
    "wormpep_id": None,
    "CCDS": None,
    "aniseed": None,
    "ciona_int_jgi": None,
    "cint_jgi_v2": None,
    "cint_jgi_v1": None,
    "cint_aniseed_v1": None,
    "Ensembl_HUGO_Previous_Symbols": None,
    "Uniprot/SPTREMBL_predicted": None,
    "RefSeq_peptide_predicted": None,
    "RefSeq_dna_predicted": None,
    "protein_id_predicted": None,
    "prediction_SPTREMBL": None,
    "EMBL_predicted": None,
    "Genoscope_annotated_gene": None,
    "gadfly_translation_cgid": None,
    "gadfly_transcript_cgid": None,
    "gadfly_gene_cgid": None,
    "wormbase_gene": None,
    "wormbase_locus": None,
    "flybase_polypeptide_id": None,
    "flybase_gene_id": None,
    "flybase_transcript_id": None,
    "flybase_annotation_id": None,
    "FlyBaseName_translations": None,
    "FlyBaseName_transcript": None,
    "FlyBaseName_gene": None,
    "Celera_Trans": None,
    "Celera_Pep": "",
    "Celera_Gene": "",
    "MEDLINE": None,
    "OTTT": None,
    "Genoscope_pred_gene": None,
    "Genoscope_pred_transcript": None,
    "Tiffin": None,
    "miRNA_Registry": None,
    "AFFY_Mouse430_2": None,
    "AFFY_C_elegans": None,
    "AFFY_Canine": None,
    "AFFY_MG_U74Cv2": None,
    "AFFY_Zebrafish": None,
    "AFFY_HG_U95E": None,
    "AFFY_HG_U95D": None,
    "AFFY_HG_U95C": None,
    "AFFY_HG_U95B": None,
    "AFFY_HG_U133A": None,
    "AFFY_HG_U133B": None,
    "AFFY_HG_U133_PLUS_2": None,
    "AFFY_HG_U95Av2": None,
    "AFFY_MG_U74Av2": None,
    "AFFY_HuGeneFL": None,
    "AFFY_HC_G110 and CCDS": None,
    "AFFY_HC_G110": None,
    "AFFY_U133_X3P": None,
    "AFFY_RG_U34B": None,
    "AFFY_RG_U34C": None,
    "AFFY_RG_U34A": None,
    "AFFY_DrosGenome1": None,
    "AFFY_Drosophila_2": None,
    "AFFY_HG_U133A_2": None,
    "AFFY_Chicken": None,
    "AFFY_Mouse430A_2": None,
    "AFFY_HG_Focus": None,
    "AFFY_MG_U74Bv2": None,
    "AFFY_Rat230_2": None,
    "ZFIN_xpat": None,
    "ZFIN_ID": None,
    "Ens_Hs_translation": None,
    "Uniprot/Varsplic": None,
    "MarkerSymbol": None,
    "Xenopus_Jamboree": None,
    "AgilentCGH": None,
    "Ens_Hs_transcript": None,
    "AgilentProbe": None,
    "Anopheles_symbol": None,
}
134
# Finds the "SWISSPROT_XX" marker inside a source label so that any prefix in
# front of it can be dropped before the label is looked up.
# The trailing "[.]*" matches zero or more literal dots and therefore never
# prevents a match; it is kept so the pattern stays identical.
swissprot_re = re.compile(r"SWISSPROT_[A-Z]{2}[.]*")
136
138
def __init__(self):
    """
    Configure the parser through BianaParser.__init__ and set up the
    STRING-specific state:

      - default_eE_attribute: set to "string"
      - use_existing_temp: value of the "use-existing-temp-alias-table" option
      - string_protein_object_number: running count of inserted proteins
      - setUnknownDB: source labels that have no entry in the mapping table
      - alias_temp_table: definition of the "temp_string_aliases" table
        (columns id, alias, source_list)
    """
    # NOTE(review): the `def` line is absent from this copy of the file; the
    # signature is reconstructed from the BianaParser.__init__(self, ...) call.
    BianaParser.__init__(
        self,
        default_db_description="Search Tool for the Retrieval of Interacting Genes/Proteins",
        default_script_name="stringParser.py",
        default_script_description="This program fills up tables in database biana related to STRING",
        additional_compulsory_arguments=[],
        additional_optional_arguments=[
            ("use-existing-temp-alias-table", 0,
             "Uses previously created temporary alias table in the database"),
        ],
    )
    self.default_eE_attribute = "string"
    self.use_existing_temp = self.arguments_dic["use-existing-temp-alias-table"]
    self.string_protein_object_number = 0
    self.setUnknownDB = Set()
    # NOTE(review): ("id, alias") is a plain string, not a 2-tuple. Left
    # unchanged because how TableDB consumes primary_key is not visible here;
    # confirm whether ("id", "alias") was intended.
    self.alias_temp_table = database.TableDB(
        table_name="temp_string_aliases",
        table_fields=[
            database.FieldDB(field_name="id", data_type="varchar(53)"),
            database.FieldDB(field_name="alias", data_type="varchar(130)"),
            database.FieldDB(field_name="source_list", data_type="varchar(190)"),
        ],
        primary_key=("id, alias"),
    )
154
def parse_database(self):
    """
    Method that implements the specific operations of string parser

    Steps, in order:
      1. Obtain the sequences file, the aliases file and the links file.
      2. Unless an existing temporary alias table is reused, load the aliases
         file into the database.
      3. Read the sequences file in batches of N_MAX_ENTRY_AT_ONCE records and
         insert one protein per STRING id together with the aliases fetched
         for that batch; ids without aliases are inserted with their sequence
         only.
      4. Read the links file and insert one "functional_association" relation
         per data line, first inserting a sequence-less protein for any id
         that was not inserted in step 3.
    """
    # NOTE(review): the `def` line is absent from this copy of the file; the
    # name `parse_database` is assumed -- confirm against the BianaParser
    # base class before merging.
    (sequences_file_fd, aliases_file, links_file_fd) = self._get_data_file_names()

    # STRING id -> value returned by the protein insertion (used below as the
    # participant id of relations).
    processed_string_ids_to_external_ids = {}

    if self.use_existing_temp == 0:
        self._insert_protein_information_from_alias_file_to_database(aliases_file)

    nSequence = 0
    lineLastRed = sequences_file_fd.readline()
    while lineLastRed:
        lineLastRed, dictIdToSequence = self._readGivenNumberOfSequencesToDictionary(
            sequences_file_fd, lineLastRed, self.N_MAX_ENTRY_AT_ONCE)

        setSequenced = Set(dictIdToSequence.keys())
        nSequence += len(setSequenced)
        list_information_tuple = self._fetch_given_list_protein_information_from_database(setSequenced)

        # Rows are (id, alias, source_list); consecutive rows sharing an id
        # form the alias list of one protein.  Truthiness (rather than a
        # comparison with `()`) also copes with an empty list result.
        setAliased = Set()
        if list_information_tuple:
            for id_word, rows in itertools.groupby(list_information_tuple, key=lambda row: row[0]):
                setAliased.add(id_word)
                aliases = [(alias, source_list) for (_, alias, source_list) in rows]
                processed_string_ids_to_external_ids[id_word] = \
                    self._insert_new_string_protein_object_into_database(
                        id_word, dictIdToSequence[id_word], aliases)
                self.check_time()

        # Proteins of this batch that have a sequence but no alias row.
        setToBeProcessed = setSequenced - setAliased
        if self.verbose and len(setToBeProcessed):
            print("Sequence entries without alias:  %s" % len(setToBeProcessed))
        for id_word in setToBeProcessed:
            processed_string_ids_to_external_ids[id_word] = \
                self._insert_new_string_protein_object_into_database(
                    id_word, dictIdToSequence[id_word], [])
            self.check_time()

    self._close_file_descriptor(sequences_file_fd)

    if self.verbose:
        print("Number of entries with sequence:  %s" % nSequence)
        alias_count = self.biana_access.db.select_db_content(
            sql_query="SELECT COUNT(DISTINCT(id)) FROM %s" % self.alias_temp_table.get_table_name(),
            answer_mode="single")
        print("Number of entries with alias:  %s" % (alias_count,))
        print("Unknown databases: %s" % (self.setUnknownDB,))

    string_relation_object_number = 0
    for line in links_file_fd:
        # Skip comment lines and the column-header line.
        if line.startswith("#") or line.startswith("protein"):
            continue
        line_fields = line.strip().split()
        if len(line_fields) < 10:
            if self.verbose:
                print("Warning: Format inconsistency - missing scores %s" % line)
            continue

        # `database_score` (not `database`) so the imported `database` module
        # is not shadowed inside this method.
        (id_word1, id_word2, neighborhood, fusion, cooccurence, coexpression,
         experimental, database_score, textmining, score) = line_fields[:10]

        # Make sure both participants exist as proteins before linking them.
        for id_word in (id_word1, id_word2):
            if id_word in processed_string_ids_to_external_ids:
                continue
            list_information_tuple = self._fetch_given_protein_information_from_database(id_word)
            if not list_information_tuple:
                if self.verbose:
                    print("Warning: id not found in temp alias table: %s" % id_word)
                list_information_tuple = ()
            processed_string_ids_to_external_ids[id_word] = \
                self._insert_new_string_protein_object_into_database(
                    id_word, None, list_information_tuple)

        string_relation_object = ExternalEntityRelation(
            source_database=self.database, relation_type="functional_association")
        string_relation_object.add_participant(
            externalEntityID=processed_string_ids_to_external_ids[id_word1])
        string_relation_object.add_participant(
            externalEntityID=processed_string_ids_to_external_ids[id_word2])
        string_relation_object.add_attribute(ExternalEntityRelationAttribute(
            attribute_identifier="STRINGScore",
            value=score,
            additional_fields={
                "neighborhood": neighborhood,
                "fusion": fusion,
                "cooccurence": cooccurence,
                "coexpression": coexpression,
                "experimental": experimental,
                "db": database_score,
                "textmining": textmining,
            }))
        self.biana_access.insert_new_external_entity(externalEntity=string_relation_object)

        string_relation_object_number += 1
        if self.time_control and string_relation_object_number % 20000 == 0:
            sys.stderr.write("%s relation entries done in %s seconds\n" % (
                string_relation_object_number, time.time() - self.initial_time))

    self._close_file_descriptor(links_file_fd)
    return
273
def check_time(self):
    """
    Count one more inserted protein and, when time control is enabled, write
    a progress line to stderr every 20000 proteins.
    """
    # NOTE(review): the `def` line is absent from this copy of the file; the
    # signature is reconstructed from the `self.check_time()` call sites.
    self.string_protein_object_number += 1
    if self.time_control and self.string_protein_object_number % 20000 == 0:
        sys.stderr.write("%s entries done in %s seconds\n" % (
            self.string_protein_object_number, time.time() - self.initial_time))
    return
280
def _readGivenNumberOfSequencesToDictionary(self, sequences_file_fd, lineLastRed, nSequence):
    """
    Read up to nSequence records from the sequences file.

    sequences_file_fd -- open sequences file
    lineLastRed       -- last line already read from the file; a false value
                         (e.g. "") means there is nothing left to read
    nSequence         -- maximum number of records to read in this call

    Returns (lineLastRed, dictIdToSequence): the line at which reading
    stopped, and a dictionary mapping each id to its sequence as returned by
    _readNextSequenceInformationFromSequenceFile.  A repeated id keeps only
    its last sequence.
    """
    # NOTE(review): the `def` line is absent from this copy of the file; the
    # signature is reconstructed from the call site and the names used below.
    dictIdToSequence = {}
    records_read = 0
    while lineLastRed and records_read < nSequence:
        lineLastRed, id_word, sequence = self._readNextSequenceInformationFromSequenceFile(
            sequences_file_fd, lineLastRed)
        dictIdToSequence[id_word] = sequence
        records_read += 1
    return lineLastRed, dictIdToSequence
289
310
312
def _insert_new_string_protein_object_into_database(self, id_word, sequence, list_information_tuple):
    """
    Build one STRING protein entity and insert it into the database.

    id_word                -- STRING identifier; the text before the first "."
                              is stored as taxID, the rest as the STRING id
    sequence               -- iterable of sequence fragments (joined with ""),
                              or None when no sequence is available
    list_information_tuple -- iterable of (alias, source_list_str) pairs;
                              source_list_str is a whitespace-separated list
                              of source labels for that alias

    Returns the result of biana_access.insert_new_external_entity.
    """
    # NOTE(review): the `def` line is absent from this copy of the file; the
    # signature is reconstructed from the call sites and the names used below.

    # An id without "." previously went through find() == -1, which stored
    # the id minus its last character as taxID.  Such ids now get no taxID.
    if "." in id_word:
        tax, string_id = id_word.split(".", 1)
    else:
        tax, string_id = None, id_word

    string_protein_object = ExternalEntity(source_database=self.database, type="protein")

    # NOTE(review): the results of the _transform_... calls in this method
    # were never used by the original code either; the calls are kept in case
    # they validate the value -- confirm whether the transformed value should
    # be stored instead.
    self.biana_access._transform_attribute_value_data_type_to_biana_database_attribute_data_type(
        attribute_identifier="STRING", value=string_id)
    string_protein_object.add_attribute(ExternalEntityAttribute(
        attribute_identifier="STRING", value=string_id, type="unique"))

    if tax is not None:
        self.biana_access._transform_attribute_value_data_type_to_biana_database_attribute_data_type(
            attribute_identifier="taxID", value=tax)
        string_protein_object.add_attribute(ExternalEntityAttribute(
            attribute_identifier="taxID", value=tax))

    if sequence is not None:
        string_protein_object.add_attribute(ExternalEntityAttribute(
            attribute_identifier="proteinSequence",
            value=ProteinSequence("".join(sequence))))

    for alias, source_list_str in list_information_tuple:
        for source in source_list_str.split():
            source_org = source
            # Drop whatever precedes a "SWISSPROT_XX" marker in the label.
            search = self.swissprot_re.search(source)
            if search:
                source = source[search.start():]

            if source not in self.datatype_to_biana_type:
                # Remember the label as it appeared in the data.
                self.setUnknownDB.add(source_org)
                continue
            source_db = self.datatype_to_biana_type[source]
            # None and blank identifiers mark sources that are not imported.
            if source_db is None or source_db.strip() == "":
                continue

            attribute_type = "cross-reference"
            # As in the original code, the stripped alias is also what any
            # later source label of the same row sees.
            if source_db.lower() == "hgnc" and alias.startswith("HGNC:"):
                alias = alias[5:]
            if source == "Ensembl_HUGO_Aliases":
                attribute_type = "alias"
            if source == "Ensembl_HUGO_Approved_Name":
                attribute_type = "unique"

            self.biana_access._transform_attribute_value_data_type_to_biana_database_attribute_data_type(
                attribute_identifier=source_db, value=alias)
            string_protein_object.add_attribute(ExternalEntityAttribute(
                attribute_identifier=source_db, value=alias, type=attribute_type))

    return self.biana_access.insert_new_external_entity(externalEntity=string_protein_object)
360
407
410
413
418
def _get_data_file_names(self):
    """
    Locate the STRING data files inside the directory self.input_file.

    A file is selected when its path matches one of these patterns followed
    by self.sourcedb_version:
      *protein.links.detailed*  -> opened with _get_file_descriptor
      *protein.sequences*       -> opened with _get_file_descriptor
      *protein.aliases*         -> returned as a path; a ".gz" file is
                                   decompressed with gunzip first

    Side effect: a path separator is appended to self.input_file if missing.

    Returns (sequences_file_fd, aliases_file, links_file_fd); an element is
    None when no matching file was found.
    """
    # NOTE(review): the `def` line is absent from this copy of the file; the
    # signature is reconstructed from the `self._get_data_file_names()` call.
    print("%s" % (self.input_file,))
    (sequences_file_fd, aliases_file, links_file_fd) = (None, None, None)
    if not self.input_file.endswith(os.sep):
        self.input_file += os.sep
    directoryData = os.path.dirname(self.input_file) + os.sep

    for entry in os.listdir(directoryData):
        path = directoryData + entry

        if fnmatch.fnmatch(path, '*protein.links.detailed*%s*' % self.sourcedb_version):
            links_file_fd = self._get_file_descriptor(path)
        if fnmatch.fnmatch(path, '*protein.sequences*%s*' % self.sourcedb_version):
            sequences_file_fd = self._get_file_descriptor(path)
        if fnmatch.fnmatch(path, '*protein.aliases*%s*' % self.sourcedb_version):
            if path.endswith(".gz"):
                # Argument list instead of a shell string: paths containing
                # spaces or shell metacharacters are passed through verbatim.
                try:
                    status = subprocess.call(["gunzip", path])
                except OSError:
                    status = -1
                if status != 0:
                    # Stay best-effort like the original, but do not fail silently.
                    sys.stderr.write("Warning: gunzip failed for %s\n" % path)
                aliases_file = path[:-3]
            else:
                aliases_file = path

    print("%s %s %s" % (links_file_fd, sequences_file_fd, aliases_file))
    return (sequences_file_fd, aliases_file, links_file_fd)
442
def _get_file_descriptor(self, file):
    """
    Print the path and open it for reading.

    file -- path of the file to open; paths ending in ".gz" are opened with
            gzip.open, all others with the built-in open.

    Returns the open file object.
    """
    # NOTE(review): the `def` line is absent from this copy of the file; the
    # signature is reconstructed from the `self._get_file_descriptor(file)`
    # call sites.  The parameter keeps its original name `file` (which shadows
    # a Python 2 builtin) so the interface is unchanged.
    print("%s" % (file,))
    if file.endswith(".gz"):
        return gzip.open(file, 'r')
    return open(file, 'r')
449
453
454
455