biana.BianaParser.stringParser

7 """ 8 STRING Parser Class 9 """ 10 N_MAX_ENTRY_AT_ONCE = 5000 #10000 #20000 #! 11 name = "string" 12 description = "This program fills up tables in database biana related with STRING" 13 external_entity_definition = "A protein in STRING with sequence" 14 external_entity_relations = "A link given between two proteins in STRING with score" 15 16 datatype_to_biana_type = { "SWISSPROT_DE": "name", 17 "SWISSPROT_GN": "geneSymbol", 18 "SWISSPROT_ID": "uniprotEntry", 19 "SWISSPROT_AC": "uniprotAccession", 20 "SWISSPROT_DR_EMBL": "AccessionNumber", 21 "SWISSPROT_DR_PIR": "PIR", 22 "SWISSPROT_DR_SGD": "SGD", 23 "GenomeReviews": "GenomeReviews", 24 "Uniprot/SWISSPROT": "uniprotEntry", 25 "Uniprot/SPTREMBL": "uniprotEntry", 26 "Uniprot/SPTREMBL": "uniprotEntry", 27 "Ensembl_Uniprot/SWISS": "uniprotEntry", 28 "EMBL": "AccessionNumber", 29 "RefSeq": "RefSeq", 30 "RefSeq_peptide": "RefSeq", 31 "RefSeq_dna": "RefSeq", 32 "EntrezGene": "GeneID", 33 "Ensembl": "Ensembl", 34 "UniGene": "Unigene", 35 "Ensembl_SGD": "SGD", 36 "SGD": "SGD", 37 "PDB": "PDB", 38 "IPI": "IPI", 39 "HGNC": "HGNC", 40 "HPRD": "HPRD", 41 "PUBMED": "Pubmed", 42 "HUGO": "GeneSymbol", 43 "gene": "Tair", 44 "RGD": "RGD", 45 "MIM": "MIM", 46 "wormbase_transcript": "WormBaseSequenceName", 47 "Ensembl_HUGO_Approved_Name": "Name", # unique 48 "Ensembl_HUGO_Aliases": "Name", # alias 49 "Ensembl_HUGO_RefSeq_IDs": "RefSeq", 50 "Ensembl_HUGO_Accession_Numbers": "AccessionNumber", 51 "Ensembl_HUGO_HGNC_ID": "HGNC", 52 "HUGE": "Huge", # no entry w/ this source 53 "GO": "GO", # no entry 54 "PRINTS": "PRINTS", # no entry 55 "primary": None, # telling that x is x (primary), no idea why it is there.. 
56 "Extra": None, 57 "protein_id": None, 58 "wormpep_id": None, 59 "CCDS": None, 60 "aniseed": None, 61 "ciona_int_jgi": None, 62 "cint_jgi_v2": None, 63 "cint_jgi_v1": None, 64 "cint_aniseed_v1": None, 65 "Ensembl_HUGO_Previous_Symbols": None, 66 "Uniprot/SPTREMBL_predicted": None, 67 "RefSeq_peptide_predicted": None, 68 "RefSeq_dna_predicted": None, 69 "protein_id_predicted": None, 70 "prediction_SPTREMBL": None, 71 "EMBL_predicted": None, 72 "Genoscope_annotated_gene": None, 73 "gadfly_translation_cgid": None, 74 "gadfly_transcript_cgid": None, 75 "gadfly_gene_cgid": None, 76 "wormbase_gene": None, 77 "wormbase_locus": None, 78 "flybase_polypeptide_id": None, 79 "flybase_gene_id": None, 80 "flybase_transcript_id": None, 81 "flybase_annotation_id": None, 82 "FlyBaseName_translations": None, 83 "FlyBaseName_transcript": None, 84 "FlyBaseName_gene": None, 85 "Celera_Trans": None, 86 "Celera_Pep": "", 87 "Celera_Gene": "", 88 "MEDLINE": None, 89 "OTTT": None, 90 "Genoscope_pred_gene": None, 91 "Genoscope_pred_transcript": None, 92 "Tiffin": None, 93 "miRNA_Registry": None, 94 "AFFY_Mouse430_2": None, 95 "AFFY_C_elegans": None, 96 "AFFY_Canine": None, 97 "AFFY_MG_U74Cv2": None, 98 "AFFY_Zebrafish": None, 99 "AFFY_HG_U95E": None, 100 "AFFY_HG_U95D": None, 101 "AFFY_HG_U95C": None, 102 "AFFY_HG_U95B": None, 103 "AFFY_HG_U133A": None, 104 "AFFY_HG_U133B": None, 105 "AFFY_HG_U133_PLUS_2": None, 106 "AFFY_HG_U95Av2": None, 107 "AFFY_MG_U74Av2": None, 108 "AFFY_HuGeneFL": None, 109 "AFFY_HC_G110 and CCDS": None, 110 "AFFY_HC_G110": None, 111 "AFFY_U133_X3P": None, 112 "AFFY_RG_U34B": None, 113 "AFFY_RG_U34C": None, 114 "AFFY_RG_U34A": None, 115 "AFFY_DrosGenome1": None, 116 "AFFY_Drosophila_2": None, 117 "AFFY_HG_U133A_2": None, 118 "AFFY_Chicken": None, 119 "AFFY_Mouse430A_2": None, 120 "AFFY_HG_Focus": None, 121 "AFFY_MG_U74Bv2": None, 122 "AFFY_Rat230_2": None, 123 "ZFIN_xpat": None, 124 "ZFIN_ID": None, 125 "Ens_Hs_translation": None, 126 "Uniprot/Varsplic": None, 127 
"MarkerSymbol": None, 128 "Xenopus_Jamboree": None, 129 "AgilentCGH": None, 130 "Ens_Hs_transcript": None, 131 "AgilentProbe": None, 132 "Anopheles_symbol": None, 133 } 134 135 swissprot_re = re.compile("SWISSPROT_[A-Z]{2}[.]*") 136

def __init__(self):
    """
    Initialise the parser with its default values, register the optional
    "use-existing-temp-alias-table" argument and describe the temporary
    table that holds the contents of the STRING alias file.
    """
    # Start with the default values
    optional_arguments = [("use-existing-temp-alias-table", 0, "Uses previously created temporary alias table in the database")]
    BianaParser.__init__(self,
                         default_db_description = "Search Tool for the Retrieval of Interacting Genes/Proteins",
                         default_script_name = "stringParser.py",
                         default_script_description = "This program fills up tables in database biana related to STRING",
                         additional_compulsory_arguments = [],
                         additional_optional_arguments = optional_arguments)

    self.default_eE_attribute = "string"
    self.use_existing_temp = self.arguments_dic["use-existing-temp-alias-table"]
    # counter of inserted proteins, advanced by check_time()
    self.string_protein_object_number = 0
    # alias source names that are missing from datatype_to_biana_type
    self.setUnknownDB = Set()

    # one row per (STRING id, alias) pair together with the sources naming it
    alias_table_fields = [ database.FieldDB(field_name = "id", data_type = "varchar(53)"),
                           database.FieldDB(field_name = "alias", data_type = "varchar(130)"),
                           database.FieldDB(field_name = "source_list", data_type = "varchar(190)") ]
    # NOTE(review): the primary key is handed over as the single string
    # "id, alias", not as a tuple of two names - confirm TableDB expects that.
    self.alias_temp_table = database.TableDB( table_name = "temp_string_aliases",
                                              table_fields = alias_table_fields,
                                              primary_key = "id, alias" ) #, indices=[("alias")] )

154

def parse_database(self):
    """
    Method that implements the specific operations of string parser

    Steps:
      1. locate the STRING sequence, alias and link files;
      2. unless an existing temporary alias table is reused, load the alias
         file into the temporary alias table;
      3. insert one protein per entry of the sequence file, attaching the
         aliases found for it in the temporary table;
      4. insert one "functional_association" relation per line of the links
         file, first inserting (without sequence) any participant that has
         not been inserted yet.
    """
    ## Get STRING protein sequence, alias and links files
    (sequences_file_fd, aliases_file, links_file_fd) = self._get_data_file_names()

    ## insert alias information to database as a temporary table to be used later
    if self.use_existing_temp == 0:
        self._insert_protein_information_from_alias_file_to_database(aliases_file)

    ## dictionary storing processed string ids and corresponding external ids assigned to them in Biana database
    processed_string_ids_to_external_ids = self._insert_sequenced_proteins(sequences_file_fd)

    if self.verbose:
        print("Unknown databases: %s" % (self.setUnknownDB,))

    ## Insert relations
    self._insert_relations(links_file_fd, processed_string_ids_to_external_ids)

    #self._remove_protein_information_from_database() #!
    return

def _insert_sequenced_proteins(self, sequences_file_fd):
    """
    Insert every protein of the sequence file, batch by batch, and return a
    dictionary mapping each STRING id to the value returned by
    _insert_new_string_protein_object_into_database for it.

    The sequence file descriptor is closed before returning, also on error.
    """
    processed_string_ids_to_external_ids = {}
    nSequence = 0
    lineLastRed = sequences_file_fd.readline()
    try:
        while lineLastRed:
            ## get identifier and sequence information
            lineLastRed, dictIdToSequence = self._readGivenNumberOfSequencesToDictionary(sequences_file_fd, lineLastRed, self.N_MAX_ENTRY_AT_ONCE)
            setSequenced = Set(dictIdToSequence.keys())
            nSequence += len(setSequenced)

            ## fetch alias information for the proteins of this batch
            list_information_tuple = self._fetch_given_list_protein_information_from_database(setSequenced)

            ## Group the alias rows per protein. The query carries no ORDER BY,
            ## so rows of one id are not guaranteed to be adjacent; grouping in
            ## a dictionary inserts each protein exactly once with all of its
            ## aliases. First-appearance order is kept so that the insertion
            ## order equals the previous one whenever rows do arrive grouped.
            aliases_by_id = {}
            ids_in_row_order = []
            for id_word, alias, source_list in list_information_tuple:
                if id_word not in aliases_by_id:
                    aliases_by_id[id_word] = []
                    ids_in_row_order.append(id_word)
                aliases_by_id[id_word].append((alias, source_list))

            ## Start & insert new entry
            for id_word in ids_in_row_order:
                processed_string_ids_to_external_ids[id_word] = self._insert_new_string_protein_object_into_database(id_word, dictIdToSequence[id_word], aliases_by_id[id_word])
                self.check_time()

            ## entries that have a sequence but no alias at all
            setToBeProcessed = setSequenced - Set(ids_in_row_order)
            if self.verbose and len(setToBeProcessed):
                print("Sequence entries without alias:  %s" % len(setToBeProcessed))
            for id_word in setToBeProcessed:
                processed_string_ids_to_external_ids[id_word] = self._insert_new_string_protein_object_into_database(id_word, dictIdToSequence[id_word], [])
                self.check_time()
    finally:
        self._close_file_descriptor(sequences_file_fd)

    if self.verbose:
        print("Number of entries with sequence:  %s" % nSequence)
        nAliased = self.biana_access.db.select_db_content( sql_query = "SELECT COUNT(DISTINCT(id)) FROM %s" % self.alias_temp_table.get_table_name(), answer_mode = "single" )
        print("Number of entries with alias:  %s" % (nAliased,))

    return processed_string_ids_to_external_ids

def _insert_protein_without_sequence(self, id_word):
    """
    Insert a protein that only appears in the links file (no sequence),
    using whatever aliases the temporary alias table holds for it, and
    return the value returned by the insertion.
    """
    list_information_tuple = self._fetch_given_protein_information_from_database(id_word)
    # truthiness instead of "== ()" so that any empty result is recognised
    if not list_information_tuple:
        if self.verbose:
            print("Warning: id not found in temp alias table: %s" % id_word)
    return self._insert_new_string_protein_object_into_database(id_word, None, list_information_tuple or ())

def _insert_relations(self, links_file_fd, processed_string_ids_to_external_ids):
    """
    Insert one relation with a STRINGScore attribute per data line of the
    links file. Lines starting with "#" or "protein" are skipped, as are
    lines with fewer than 10 whitespace separated fields.

    The links file descriptor is closed before returning, also on error.
    """
    string_relation_object_number = 0
    try:
        for line in links_file_fd:
            ## get interaction & score information between two entries
            if line.startswith("#") or line.startswith("protein"):
                continue
            line_fields = line.strip().split()
            if len(line_fields) < 10:
                if self.verbose:
                    print("Warning: Format inconsistency - missing scores %s" % line)
                continue
            # "database_score" instead of "database": the latter would hide
            # the database module inside this function
            (id_word1, id_word2, neighborhood, fusion, cooccurence, coexpression,
             experimental, database_score, textmining, score) = line_fields[:10]

            ## if sequence information for these proteins was not available they were not inserted
            ## in these cases insert their alias information to the database
            for id_word in (id_word1, id_word2):
                if id_word not in processed_string_ids_to_external_ids:
                    processed_string_ids_to_external_ids[id_word] = self._insert_protein_without_sequence(id_word)

            ## Start & insert new entry relation
            string_relation_object = ExternalEntityRelation( source_database = self.database, relation_type="functional_association") #"interaction" )
            string_relation_object.add_participant( externalEntityID = processed_string_ids_to_external_ids[id_word1] )
            string_relation_object.add_participant( externalEntityID = processed_string_ids_to_external_ids[id_word2] )
            string_relation_object.add_attribute(ExternalEntityRelationAttribute( attribute_identifier = "STRINGScore",
                                                                                  value = score,
                                                                                  additional_fields = { "neighborhood": neighborhood,
                                                                                                        "fusion": fusion,
                                                                                                        "cooccurence": cooccurence,
                                                                                                        "coexpression": coexpression,
                                                                                                        "experimental": experimental,
                                                                                                        "db": database_score,
                                                                                                        "textmining": textmining }))
            self.biana_access.insert_new_external_entity( externalEntity = string_relation_object )
            string_relation_object_number += 1
            if self.time_control:
                if string_relation_object_number%20000==0:
                    sys.stderr.write("%s relation entries done in %s seconds\n" %(string_relation_object_number,time.time()-self.initial_time))
    finally:
        self._close_file_descriptor(links_file_fd)

273

def check_time(self):
    """
    Count one more inserted protein and, when time control is enabled,
    report the elapsed time on stderr after every 20000th protein.
    """
    self.string_protein_object_number += 1
    if not self.time_control:
        return
    if self.string_protein_object_number % 20000 != 0:
        return
    elapsed_seconds = time.time() - self.initial_time
    sys.stderr.write("%s entries done in %s seconds\n" % (self.string_protein_object_number, elapsed_seconds))

280

def _readGivenNumberOfSequencesToDictionary(self, sequences_file_fd, lineLastRed, nSequence):
    """
    Read up to nSequence entries from the sequence file.

    Parameters:
      sequences_file_fd -- open sequence file, positioned right after lineLastRed
      lineLastRed       -- the line read last from the file (start of the next entry)
      nSequence         -- maximum number of entries to read in this call

    Returns the tuple (lineLastRed, dictIdToSequence): the first line that
    was read but not consumed yet (empty string at end of file) and a
    dictionary mapping each identifier to its sequence.
    """
    dictIdToSequence = {}
    nRead = 0
    while lineLastRed and nRead < nSequence:
        entry = self._readNextSequenceInformationFromSequenceFile(sequences_file_fd, lineLastRed)
        if entry is None:
            # The reader gave up on a malformed line without telling where it
            # stopped; unpacking None would raise a TypeError, so resume with
            # the next line of the file instead.
            lineLastRed = sequences_file_fd.readline()
            continue
        lineLastRed, id_word, sequence = entry
        if id_word is None:
            # no entry was found (e.g. end of file reached), nothing to store
            continue
        nRead += 1
        dictIdToSequence[id_word] = sequence
    return lineLastRed, dictIdToSequence

289

def _readNextSequenceInformationFromSequenceFile(self, sequences_file_fd, line):
    """
    Read the next entry (a ">" header line followed by sequence lines) from
    the sequence file.

    Parameters:
      sequences_file_fd -- open sequence file, positioned right after line
      line              -- the line read last from the file

    Returns the tuple (line, id_word, sequence):
      line     -- first line not consumed yet (next header, or "" at end of file)
      id_word  -- first word of the header without the leading ">"
      sequence -- the sequence lines joined together, whitespace stripped

    Comment lines ("#"), lines that are not a header where one is expected,
    and headers directly followed by another header are skipped (with a
    warning in verbose mode for the latter two). Previously these cases made
    the method return None, which broke the 3-tuple contract callers unpack.
    When the file ends before an entry is found, (line, None, None) is
    returned with line being the empty string.
    """
    while line:
        if line.startswith("#"):
            line = sequences_file_fd.readline()
            continue
        if not line.startswith(">"):
            if self.verbose:
                print("Warning: unexpected line in sequences file: %s" % line)
            line = sequences_file_fd.readline()
            continue
        id_word = line.strip().split()[0][1:]
        line = sequences_file_fd.readline()
        if line.startswith(">"):
            # header without any sequence line: drop it, go on with the new header
            if self.verbose:
                print("Warning: unexpected > in fasta file %s" % line)
            continue
        # collect the pieces and join once instead of repeated string concatenation
        sequence_parts = []
        while line and not line.startswith('>'):
            sequence_parts.append(line.strip())
            line = sequences_file_fd.readline()
        return line, id_word, "".join(sequence_parts)
    return line, None, None

310

def _insert_new_string_protein_object_into_database(self, id_word, sequence, list_information_tuple):
    """
    Build a "protein" external entity for one STRING entry and insert it.

    Parameters:
      id_word                -- identifier of the form "<tax>.<id>"; the part
                                before the first "." is stored as taxID, the
                                rest as STRING attribute
      sequence               -- protein sequence, or None when not available
      list_information_tuple -- iterable of (alias, source_list_str) pairs,
                                source_list_str holding whitespace separated
                                source names

    Returns whatever biana_access.insert_new_external_entity returns.
    """
    dot_position = id_word.find(".")
    taxonomy_id = id_word[:dot_position]
    string_id = id_word[dot_position+1:]

    string_protein_object = ExternalEntity( source_database = self.database, type="protein" )

    # NOTE(review): the value returned by the transformation is not used by
    # this method; presumably the call is kept as a validity check - confirm.
    transform = self.biana_access._transform_attribute_value_data_type_to_biana_database_attribute_data_type

    transform( attribute_identifier="STRING", value=string_id )
    string_protein_object.add_attribute(ExternalEntityAttribute(attribute_identifier = "STRING", value = string_id, type = "unique"))
    transform( attribute_identifier="taxID", value=taxonomy_id )
    string_protein_object.add_attribute(ExternalEntityAttribute(attribute_identifier = "taxID", value = taxonomy_id))

    if sequence is not None:
        string_protein_object.add_attribute(ExternalEntityAttribute(attribute_identifier="proteinSequence", value = ProteinSequence("".join(sequence))))

    for alias, source_list_str in list_information_tuple:
        for original_source in source_list_str.split():
            # drop anything in front of "SWISSPROT_xx" before the lookup
            source = original_source
            match = self.swissprot_re.search(original_source)
            if match:
                source = original_source[match.start():]

            if source not in self.datatype_to_biana_type:
                # remember the unknown source under its original name
                self.setUnknownDB.add(original_source)
                continue

            source_db = self.datatype_to_biana_type[source]
            # None and blank entries mark sources that are not stored
            if source_db is None:
                continue
            if source_db.strip() == "":
                continue

            # the stripped alias is kept for the remaining sources of this alias
            if source_db.lower() == "hgnc" and alias.startswith("HGNC:"):
                alias = alias[5:]

            attribute_type = "cross-reference"
            if source == "Ensembl_HUGO_Aliases":
                attribute_type = "alias"
            if source == "Ensembl_HUGO_Approved_Name":
                attribute_type = "unique"

            transform( attribute_identifier=source_db, value=alias )
            string_protein_object.add_attribute(ExternalEntityAttribute(attribute_identifier = source_db, value = alias, type = attribute_type))

    return self.biana_access.insert_new_external_entity( externalEntity = string_protein_object )

360

def _insert_protein_information_from_alias_file_to_database(self, aliases_file):
    """
    Create the temporary alias table and fill it with the alias file.

    Each data line of the alias file is expected to hold at least four tab
    separated fields: tax id, protein id, alias and a whitespace separated
    list of sources. One row ("<tax>.<id>", alias, source list) is inserted
    per line. Lines starting with "#" are skipped; lines with fewer than
    four fields (e.g. blank lines) are skipped as well, with a warning in
    verbose mode, instead of aborting with an IndexError.

    Parameters:
      aliases_file -- path of the alias file
    """
    if self.verbose:
        print("Creating temporary database for aliases..")
    self.biana_access.db.insert_db_content( self.alias_temp_table.create_mysql_query(), answer_mode = None )
    aliases_file_fd = self._get_file_descriptor(aliases_file)
    # longest values seen, reported in verbose mode to help sizing the columns
    nMaxId = 0
    nMaxAlias = 0
    nMaxSourceList = 0
    try:
        for line in aliases_file_fd:
            if line.startswith('#'):
                continue
            words = line.split('\t')
            if len(words) < 4:
                if self.verbose:
                    print("Warning: Format inconsistency - missing fields in alias file: %s" % line)
                continue
            tax = words[0]
            # split() never yields None, so test for emptiness instead
            if self.verbose and not tax:
                print("Warning: None taxId: %s" % line)
            id_word = "%s.%s" % (tax, words[1])
            alias = words[2].replace("\"", "") #.strip("\"")
            if self.verbose and not alias:
                print("Warning: None alias: %s" % line)
            source_list_str = " ".join(words[3].split())
            nMaxId = max(nMaxId, len(id_word))
            nMaxAlias = max(nMaxAlias, len(alias))
            nMaxSourceList = max(nMaxSourceList, len(source_list_str))
            self.biana_access.db.insert_db_content( self.biana_access.db._get_insert_sql_query(table = self.alias_temp_table.get_table_name(),
                                                                                                 column_values = [('id', id_word),
                                                                                                                  ('alias', alias),
                                                                                                                  ('source_list', source_list_str)] ),
                                                    answer_mode = None )
        self.biana_access.db._empty_buffer()
    finally:
        # release the file even when an insertion fails
        self._close_file_descriptor(aliases_file_fd)

    if self.verbose:
        print("Max id-alias-sourcelist:  %s %s %s" % (nMaxId, nMaxAlias, nMaxSourceList))
        print("Temporary database for aliases is created!")
    return

407

def _fetch_given_list_protein_information_from_database(self, list_id):
    """
    Select the (id, alias, source_list) rows of the temporary alias table
    whose id is one of list_id and return the raw query result.

    Parameters:
      list_id -- iterable of id strings; they are double-quoted and comma
                 joined into the IN (...) condition as they are (no escaping)

    The query carries no ORDER BY clause.
    """
    db = self.biana_access.db
    alias_table_name = self.alias_temp_table.get_table_name()
    quoted_id_list = "(\"%s\")" % "\",\"".join(list_id)
    # NOTE(review): the trailing None of the condition tuple is passed on
    # unchanged; its meaning is defined by _get_select_sql_query - confirm.
    sql_query = db._get_select_sql_query( tables = [ alias_table_name ],
                                          columns = ['id', 'alias', 'source_list'],
                                          fixed_conditions = [('id', 'IN', quoted_id_list, None)] )
    return db.select_db_content( sql_query, answer_mode = "raw" ) #, remove_duplicates="yes" )

410

def _fetch_given_protein_information_from_database(self, id_word):
    """
    Select the (alias, source_list) rows stored in the temporary alias table
    for the single id id_word and return the raw query result.
    """
    db = self.biana_access.db
    alias_table_name = self.alias_temp_table.get_table_name()
    sql_query = db._get_select_sql_query( tables = [ alias_table_name ],
                                          columns = ['alias', 'source_list'],
                                          fixed_conditions = [('id', '=', id_word)] )
    return db.select_db_content( sql_query, answer_mode = "raw" ) #, remove_duplicates="yes" )

413

def _remove_protein_information_from_database(self):
    """
    Empty the temporary alias table and then drop it.
    """
    alias_table = self.alias_temp_table
    execute = self.biana_access.db.insert_db_content
    # delete the rows first, then remove the table itself
    execute( "DELETE FROM %s" % alias_table.get_table_name() )
    execute( alias_table.get_drop_query() )

418

def _get_data_file_names(self):
    """
    Locate the STRING data files inside the directory given as input file.

    A trailing path separator is appended to self.input_file when missing.
    Files are recognised by name ("protein.links.detailed", "protein.sequences",
    "protein.aliases") followed somewhere by self.sourcedb_version; when
    several files match the same pattern the one listed last wins.

    Returns the tuple (sequences_file_fd, aliases_file, links_file_fd):
    open file descriptors for the sequence and link files and the *path* of
    the alias file. Each element is None when no matching file was found.

    A gzipped alias file is decompressed with the external "gunzip" program.
    The program is started without a shell, so paths containing spaces or
    shell metacharacters are handled safely. If gunzip is missing or fails,
    the path of the compressed file is returned instead;
    _get_file_descriptor opens ".gz" files transparently.
    """
    import subprocess

    print(self.input_file)
    sequences_file_fd = None
    aliases_file = None
    links_file_fd = None
    if not self.input_file.endswith(os.sep):
        self.input_file += os.sep
    directoryData = os.path.dirname(self.input_file) + os.sep
    # find string data files -- not checking if different versions exists
    for file_name in os.listdir(directoryData):
        path = directoryData + file_name
        if fnmatch.fnmatch(path, '*protein.links.detailed*%s*' % self.sourcedb_version):
            links_file_fd = self._get_file_descriptor(path)
        if fnmatch.fnmatch(path, '*protein.sequences*%s*' % self.sourcedb_version):
            sequences_file_fd = self._get_file_descriptor(path)
        if fnmatch.fnmatch(path, '*protein.aliases*%s*' % self.sourcedb_version):
            aliases_file = path
            if path.endswith(".gz"):
                try:
                    gunzip_failed = subprocess.call(["gunzip", path]) != 0
                except OSError:
                    # gunzip is not installed / cannot be executed
                    gunzip_failed = True
                if not gunzip_failed:
                    aliases_file = path[:-3]
    print("%s %s %s" % (links_file_fd, sequences_file_fd, aliases_file))
    return (sequences_file_fd, aliases_file, links_file_fd)

442

def _get_file_descriptor(self, file):
    """
    Print the given path and open it for reading: through gzip when the
    name ends with ".gz", as a regular file otherwise.

    Returns the opened file object.
    """
    print(file)
    if not file.endswith(".gz"):
        return open(file, 'r')
    return gzip.open(file,'r')

449

def _close_file_descriptor(self, fd):
    """
    Close the given file descriptor.

    Parameters:
      fd -- object with a close() method, or None. None is accepted because
            _get_data_file_names leaves a descriptor as None when the
            corresponding data file was not found; closing then is a no-op
            instead of an AttributeError.
    """
    if fd is None:
        return
    fd.close()

Source Code for Module biana.BianaParser.stringParser