Package biana :: Package BianaParser :: Module uniprotParser
[hide private]
[frames | no frames]

Source Code for Module biana.BianaParser.uniprotParser

  1  """ 
  2      BIANA: Biologic Interactions and Network Analysis 
  3      Copyright (C) 2009  Javier Garcia-Garcia, Emre Guney, Baldo Oliva 
  4   
  5      This program is free software: you can redistribute it and/or modify 
  6      it under the terms of the GNU General Public License as published by 
  7      the Free Software Foundation, either version 3 of the License, or 
  8      (at your option) any later version. 
  9   
 10      This program is distributed in the hope that it will be useful, 
 11      but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 13      GNU General Public License for more details. 
 14   
 15      You should have received a copy of the GNU General Public License 
 16      along with this program.  If not, see <http://www.gnu.org/licenses/>. 
 17   
 18  """ 
 19   
 20  from bianaParser import * 
 21   
 22   
class UniprotParser(BianaParser):
    """
    Uniprot Parser Class

    Reads a UniProt flat file line by line and inserts one "protein"
    external entity into BIANA for every entry (entries end with "//").
    """

    name = "uniprot"
    description = "This file implements a program that fills up tables in database biana with information of uniprot databases"
    external_entity_definition = "A external entity represents a protein"
    external_entity_relations = ""

    def __init__(self):
        """
        Configure the generic BianaParser with the uniprot defaults and
        initialize the input file descriptor.
        """

        # Start with the default values
        BianaParser.__init__(self, default_db_description = "Uniprot database",
                             default_script_name = "uniprotParser.py",
                             default_script_description = UniprotParser.description,
                             additional_optional_arguments = [])
        self.default_eE_attribute = "uniprotaccession"
        self.initialize_input_file_descriptor()

    def parse_database(self):
        """
        Method that implements the specific operations of uniprot parser

        If executing in self.mode "tables", it is necessary to insert the tables here

        Attributes are accumulated on the current ExternalEntity while the
        lines of an entry are read; when the "//" terminator is found the
        entity is completed (sequence, description, EC codes, comments) and
        inserted through self.biana_access.
        """

        protein_number = 0

        # New entry regex
        new_regex = re.compile(r"^\/\/\s*$")

        # General regex
        id_regex = re.compile(r"^ID\s+(\S+)\s*")
        ac_regex = re.compile(r"^AC\s+(.+);\s*$")
        ac_version_regex = re.compile(r"sequence version (\d+)")
        de_regex = re.compile(r"^DE\s+(.+)\s*$")
        taxID_regex = re.compile(r"^OX\s+NCBI_TaxID=(\d+);")
        keyword_regex = re.compile(r"^KW\s+(.+);$")
        ec_regex = re.compile(r"EC=(.+\..+\..+\..+)\;")

        # GeneName regex
        geneName_regex = re.compile(r"^GN")
        gene_name_regex = re.compile(r"Name=([^;]+);")
        gene_orf_name_regex = re.compile(r"ORFNames=([^;]+);")
        gene_synonyms_regex = re.compile(r"Synonyms=([^;]+);")
        gene_orderedLocusNames = re.compile(r"OrderedLocusNames=([^;]+);")

        # Cross-references regular expressions
        cross_regex = re.compile(r"^DR")

        # Cross-references whose first captured group is stored directly.
        # Every pattern is anchored on a different database name, so a DR
        # line can match at most one of them and the order is irrelevant.
        # IntAct and ArrayExpress are not parsed (in the original code they
        # were noted as possibly being the same as the UniProt accession).
        cross_reference = {"type": "cross-reference"}
        simple_cross_references = [
            (re.compile(r"^DR\s+WormBase\;\s*WBGene(\d+)\;\s*(.+)\.\s*$"), "WormBaseGeneID", cross_reference),
            (re.compile(r"^DR\s+WormPep\;\s+(.+)\;\s*CE(\d+)\.\s*$"), "WormBaseSequenceName", cross_reference),
            (re.compile(r"^DR\s+DIP\;\s+DIP\:(.+)\;"), "DIP", cross_reference),
            (re.compile(r"^DR\s+TIGR\;\s+(.+)\;"), "tigr", cross_reference),
            (re.compile(r"^DR\s+CYGD\;\s+(.+)\;"), "cygd", cross_reference),
            (re.compile(r"^DR\s+RGD\;\s+(\d+)\;"), "rgd", cross_reference),
            (re.compile(r"^DR\s+Pfam;\s*(\S+);"), "pfam", cross_reference),
            (re.compile(r"^DR\s+KEGG;\s*(\S+);"), "kegggene", cross_reference),
            (re.compile(r"^DR\s+InterPro;\s*(\S+);"), "interpro", cross_reference),
            (re.compile(r"^DR\s+PROSITE;\s*(\S+);"), "prosite", cross_reference),
            (re.compile(r"^DR\s+ProDom;\s*(\S+);"), "prodom", cross_reference),
            # MIM is stored with the default attribute type, as before
            (re.compile(r"^DR\s+MIM;\s*(\S+);"), "mim", {}),
            (re.compile(r"^DR\s+PIR;\s*(\S+);"), "pir", cross_reference),
            (re.compile(r"^DR\s+PRINTS;\s*(\S+);"), "prints", cross_reference),
            (re.compile(r"^DR\s+Ensembl;\s*(\S+);"), "ensembl", cross_reference),
            (re.compile(r"^DR\s+EMBL;\s*(\S+);"), "accessionNumber", cross_reference),
            (re.compile(r"^DR\s+GeneID;\s*(\S+);"), "geneID", cross_reference),
            (re.compile(r"^DR\s+GO;\s*GO\:(\d+);"), "go", cross_reference),
            (re.compile(r"^DR\s+UniGene;\s*(\S+);"), "unigene", cross_reference),
            (re.compile(r"^DR\s+HGNC;\s*HGNC\:(\d+);"), "hgnc", cross_reference),
            (re.compile(r"^DR\s+FlyBase;\s*(\S+);"), "flybase", cross_reference),
            (re.compile(r"^DR\s+MGI;\s*MGI:(\d+);"), "MGI", cross_reference),
            (re.compile(r"^DR\s*Reactome;\s*REACT_(\d+);"), "reactome", cross_reference),
            (re.compile(r"^DR\s+SGD;\s*(\w+);"), "SGD", cross_reference),
        ]

        # Cross-references that need specific processing
        refseq_regex = re.compile(r"^DR\s+RefSeq;\s*(\S+);")
        pdb_regex = re.compile(r"^DR\s+PDB;\s*(\S+);.+;.+;(.+).")
        pdb_fragment_regex = re.compile(r"\s*(.+)=(.+)\s*")
        pdb_range_regex = re.compile(r"(\d+)-(\d+)")

        # Sequence
        sequence_regex = re.compile(r"^\s+(.+)$")

        # Comments
        new_comment_regex = re.compile(r"^CC\s+\-\!\-")
        general_comment_regex = re.compile(r"^CC\s+(.+)$")
        subcellular_location_regex = re.compile(r"SUBCELLULAR LOCATION:\s*(.*)$")
        function_regex = re.compile(r"FUNCTION:\s*(.*)$")
        disease_regex = re.compile(r"DISEASE:\s*(.*)$")

        # Start first uniprotObject
        uniprotObject = ExternalEntity( source_database = self.database, type="protein" )

        # Per-entry state, reset every time an entry is completed
        description = []
        sequence = []
        comments = { "SubcellularLocation": [],
                     "Disease": [],
                     "Function": [] }
        actual_comment = None
        uniprot_accession_list = []

        self.initialize_input_file_descriptor()

        # START PARSING
        for line in self.input_file_fd:

            # New entry
            if new_regex.match(line):

                if uniprotObject is not None:
                    # add sequence
                    uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="proteinSequence", value=ProteinSequence("".join(sequence))))

                    # add description
                    if len(description) > 0:
                        desc_str = " ".join(description)
                        uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="description", value=desc_str))

                        # Detect EC codes in the description. Done only when
                        # this entry has a description, so that desc_str is
                        # never undefined or inherited from a previous entry.
                        if desc_str != "":
                            for enzyme in ec_regex.findall(desc_str):
                                uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="EC", value=enzyme, type="cross-reference"))

                    # add comments
                    if len(comments["Function"]) > 0:
                        uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="function", value= " ".join(comments["Function"])))

                    if len(comments["Disease"]) > 0:
                        uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="disease", value = " ".join(comments["Disease"])))

                    if len(comments["SubcellularLocation"]) > 0:
                        uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="subcellularLocation", value = " ".join(comments["SubcellularLocation"])))

                    # Insert
                    self.biana_access.insert_new_external_entity( externalEntity = uniprotObject )

                # restart variables. The accession list is cleared as well so
                # that accessions of an entry without a "sequence version"
                # line are not attached to the next protein.
                description = []
                sequence = []
                comments = { "SubcellularLocation": [],
                             "Disease": [],
                             "Function": [] }
                actual_comment = None
                uniprot_accession_list = []

                # Start new object
                uniprotObject = ExternalEntity( source_database = self.database, type="protein" )
                protein_number += 1

                if self.time_control:
                    if protein_number%20000 == 0:
                        sys.stderr.write("%s proteins done in %s seconds\n" %(protein_number,time.time()-self.initial_time))

                # The terminator line carries no other information
                continue

            m = id_regex.match(line)
            if m:
                uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="uniprotentry", value=m.group(1), type="unique"))
                continue

            m = ac_regex.match(line)
            if m:
                uniprot_accession_list.extend([ x.strip() for x in m.group(1).split(";") ])
                continue

            m = ac_version_regex.search(line)
            if m:
                # First one is the primary accession, the following ones are
                # previous accessions
                for position, accession in enumerate(uniprot_accession_list):
                    if position == 0:
                        accession_type = "unique"
                    else:
                        accession_type = "previous"
                    uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="uniprotaccession", value=accession, version=m.group(1), type=accession_type))
                uniprot_accession_list = []
                continue

            m = de_regex.match(line)
            if m:
                description.append( m.group(1) )
                continue

            m = taxID_regex.match(line)
            if m:
                uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="taxID", value=m.group(1)))
                continue

            m = keyword_regex.match(line)
            if m:
                for keyword in m.group(1).split(";"):
                    uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="keyword", value=keyword))
                continue

            # Sequence lines are the ones starting with whitespace
            m = sequence_regex.match(line)
            if m:
                sequence.append( m.group(1).replace(" ","") )

            # Gene
            if geneName_regex.match(line):
                m = gene_name_regex.search(line)
                if m:
                    uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="geneSymbol", value=m.group(1),type="unique"))

                m = gene_orf_name_regex.search(line)
                if m:
                    for orf_name in m.group(1).split(","):
                        uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="ORFName", value=orf_name,type="alias"))

                m = gene_synonyms_regex.search(line)
                if m:
                    for synonym in m.group(1).split(","):
                        uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="geneSymbol", value=synonym, type="synonym"))

                m = gene_orderedLocusNames.search(line)
                if m:
                    for locus_name in m.group(1).split(","):
                        uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="OrderedLocusName", value=locus_name, type="alias"))

                continue

            # COMMENTS
            m = general_comment_regex.match(line)
            if m:
                if new_comment_regex.match(line):
                    # A "-!-" line starts a new comment topic: only the three
                    # topics below are stored, the rest are skipped
                    actual_comment = None
                    m = subcellular_location_regex.search(line)
                    if m:
                        actual_comment = "SubcellularLocation"
                    else:
                        m = function_regex.search(line)
                        if m:
                            actual_comment = "Function"
                        else:
                            m = disease_regex.search(line)
                            if m:
                                actual_comment = "Disease"

                    if actual_comment is not None:
                        comments[actual_comment].append(m.group(1))

                else:
                    # Continuation line of the current comment topic
                    if actual_comment is not None:
                        comments[actual_comment].append(m.group(1))

            # CROSS-REFERENCES
            if cross_regex.match(line):

                matched = False
                for xref_regex, identifier, extra_arguments in simple_cross_references:
                    m = xref_regex.match(line)
                    if m:
                        uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier=identifier, value=m.group(1), **extra_arguments))
                        matched = True
                        break
                if matched:
                    continue

                m = refseq_regex.match(line)
                if m:
                    rs = m.group(1).split('.')
                    if len(rs) == 2:
                        uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="refseq", value=rs[0], version=rs[1], type="cross-reference"))
                    else:
                        sys.stdout.write("Refseq %s has no version?\n" % m.group(1))
                    continue

                m = pdb_regex.match(line)
                if m:
                    pdb_code = m.group(1)

                    # Each fragment looks like "<chain>[/<chain>...]=<start>-<end>"
                    for actual_frag in m.group(2).split(","):
                        fragment_match = pdb_fragment_regex.search(actual_frag)
                        if not fragment_match:
                            continue

                        chains = fragment_match.group(1).split("/")
                        range_match = pdb_range_regex.search(fragment_match.group(2))
                        if range_match:
                            # Not named "range": that would hide the builtin
                            # in the whole method
                            pdb_range = "%s-%s" %(range_match.group(1),range_match.group(2))
                            for chain in chains:
                                uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="pdb", value=pdb_code, type = "cross-reference",
                                                                                    additional_fields = {"chain": chain,
                                                                                                         "pdb_range": pdb_range }))
                        else:
                            for chain in chains:
                                uniprotObject.add_attribute(ExternalEntityAttribute(attribute_identifier="pdb", value=pdb_code, type="cross-reference",
                                                                                    additional_fields = {"chain": chain}))

                    continue
454