Package biana :: Package BianaParser :: Module ipiParser
[hide private]
[frames | no frames]

Source Code for Module biana.BianaParser.ipiParser

  1  """ 
  2      BIANA: Biologic Interactions and Network Analysis 
  3      Copyright (C) 2009  Javier Garcia-Garcia, Emre Guney, Baldo Oliva 
  4   
  5      This program is free software: you can redistribute it and/or modify 
  6      it under the terms of the GNU General Public License as published by 
  7      the Free Software Foundation, either version 3 of the License, or 
  8      (at your option) any later version. 
  9   
 10      This program is distributed in the hope that it will be useful, 
 11      but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 13      GNU General Public License for more details. 
 14   
 15      You should have received a copy of the GNU General Public License 
 16      along with this program.  If not, see <http://www.gnu.org/licenses/>. 
 17   
 18  """ 
 19   
 20   
 21  """ 
 22  File        : ipi2piana.py 
 23  Author      : Javier Garcia 
 24  Creation    : November 2007 
 25  Contents    : fills up tables in database piana with information from IPI 
 26  Called from :  
 27  ======================================================================================================= 
 28  """ 
 29   
 30  import re 
 31  from bianaParser import * 
 32   
 33   
class IPIParser(BianaParser):
    """
    IPI Parser Class

    Reads IPI FASTA files and inserts one "protein" external entity per
    FASTA record into BIANA through self.biana_access.
    """

    name = "ipi"
    description = "Inserts information of IPI database into BIANA"
    external_entity_definition = "External entities are proteins"
    external_entity_relations = ""

    # Patterns are compiled once here instead of once per input line.
    _ipi_file_re = re.compile(r"ipi.\w+\.fasta")
    _field_re = re.compile(r"([\w\-]+)\:(\S+)")
    _tax_id_re = re.compile(r"Tax_Id=(\d+)")
    _gene_symbol_re = re.compile(r"Gene_Symbol=(\S+)\s+(.*)")
    # Alternation, not a character class: "[Emb|Gb]" would match any single
    # one of the characters E, m, b, | or G in front of the pipe.
    _accession_re = re.compile(r"(?:Emb|Gb)\|(\S+)")

    # Header identifier type -> (BIANA attribute identifier, attribute type).
    # NOTE(review): "uniprotaccession" and "UniprotAccession" differ in case;
    # both spellings are kept exactly as they were - confirm whether BIANA
    # treats attribute identifiers case-insensitively.
    _header_fields = {
        "IPI": ("ipi", "unique"),
        "ENSEMBL": ("ensembl", "cross-reference"),
        "REFSEQ": ("refseq", "cross-reference"),
        "TREMBL": ("uniprotaccession", "cross-reference"),
        "SWISS-PROT": ("UniprotAccession", "cross-reference"),
        "TAIR": ("tair", "cross-reference"),
    }

    def __init__(self):
        """
        Initializes the parser with the IPI default values and sets "ipi"
        as the default external entity attribute.
        """

        # Start with the default values

        BianaParser.__init__(self, default_db_description = "IPI. International Protein Index",
                             default_script_name = "ipi2piana.py",
                             default_script_description = IPIParser.description,
                             additional_compulsory_arguments = [])
        self.default_eE_attribute = "ipi"

    def parse_database(self):
        """
        Parses the input given to the parser.

        If initialize_input_file_descriptor() leaves self.input_file_fd set,
        that descriptor is parsed. Otherwise self.input_file is treated as a
        directory and every entry whose name matches "ipi.\\w+\\.fasta"
        (optionally ending in ".gz") is opened and parsed in turn.
        """

        self.initialize_input_file_descriptor()

        if self.input_file_fd is not None:
            self.parse_file()
            return

        # is a directory
        dirname = os.path.dirname(self.input_file+os.sep)+os.sep

        for current_file in os.listdir(self.input_file):
            if not self._ipi_file_re.search(current_file):
                continue

            if current_file.endswith(".gz"):
                self.input_file_fd = gzip.open(dirname+current_file, 'r')
            else:
                self.input_file_fd = open(dirname+current_file, 'r')

            # Files opened here are owned by this method, so release each one
            # as soon as it has been parsed, even if parsing fails.
            try:
                self.parse_file()
            finally:
                self.input_file_fd.close()

    def parse_file(self):
        """
        Method that implements the specific operations of the IPI parser.

        Iterates over self.input_file_fd. Every line starting with ">" opens
        a new entry; all other lines are accumulated as its protein sequence.
        An entry is inserted into BIANA when the next header is found or when
        the input ends.
        """

        # Example:

        #>IPI:IPI00000001.2|SWISS-PROT:O95793-1|TREMBL:Q59F99|ENSEMBL:ENSP00000360922;ENSP00000379466|REFSEQ:NP_059347|H-INV:HIT000329496|VEGA:OTTHUMP00000031233 Tax_Id=9606 Gene_Symbol=STAU1 Isoform Long of Double-stranded RNA-binding protein Staufen homolog 1
        #MSQVQVQVQNPSAALSGSQILNKNQSLLSQPLMSIPSTTSSLPSENAGRPIQNSALPSAS
        #ITSTSAAAESITPTVELNALCMKLGKKPMYKPVDPYSRMQSTYNYNMRGGAYPPRYFYPF

        ipi_object = None
        ipi_object_number = 0
        actual_sequence = []

        for line in self.input_file_fd:

            # The header line is deliberately left unstripped: the
            # Gene_Symbol pattern needs whitespace after the symbol, which
            # may only be the trailing newline.
            if line.startswith(">"):

                # Insert the last entry to the database
                if ipi_object is not None:
                    self._insert_entry(ipi_object, actual_sequence)

                # Start new entry
                ipi_object = self._parse_header(line)
                ipi_object_number += 1

                if self.time_control and ipi_object_number%20000==0:
                    sys.stderr.write("%s entries done in %s seconds\n" %(ipi_object_number,time.time()-self.initial_time))

                actual_sequence = []

            else:
                # Sequence line
                actual_sequence.append(line.strip())

        # Insert the last entry
        if ipi_object is not None:
            self._insert_entry(ipi_object, actual_sequence)

    def _parse_header(self, line):
        """
        Builds the external entity described by a FASTA header line.

        line is the header including its leading ">". Returns a new
        ExternalEntity of type "protein" carrying the identifiers, taxonomy
        id, gene symbol and description/accession found in the header.
        """

        ipi_object = ExternalEntity( source_database = self.database, type="protein" )

        for actual_field in line.lstrip(">").split("|"):

            search = self._field_re.search(actual_field)
            if not search:
                continue

            identifier_type = search.group(1)
            target = self._header_fields.get(identifier_type)
            if target is None:
                # Identifier types that are not in the table are ignored
                continue

            attribute_identifier, attribute_type = target

            for actual_value in search.group(2).split(";"):
                if identifier_type == "SWISS-PROT":
                    # Keep only the first six characters (e.g. "O95793-1"
                    # becomes "O95793")
                    actual_value = actual_value[0:6]

                ipi_object.add_attribute(ExternalEntityAttribute(attribute_identifier = attribute_identifier,
                                                                 value = actual_value,
                                                                 type = attribute_type ))

        search = self._tax_id_re.search(line)
        if search:
            ipi_object.add_attribute(ExternalEntityAttribute(attribute_identifier="taxID",
                                                             value = search.group(1) ))

        search = self._gene_symbol_re.search(line)
        if search:
            # Use the captured symbol; the previous code stored the last
            # cross-reference value that happened to be parsed before it.
            ipi_object.add_attribute(ExternalEntityAttribute(attribute_identifier="geneSymbol",
                                                             value = search.group(1),
                                                             type = "cross-reference" ))

            search2 = self._accession_re.search(search.group(2))
            if search2:
                ipi_object.add_attribute(ExternalEntityAttribute(attribute_identifier = "accessionNumber",
                                                                 value = search2.group(1),
                                                                 type = "cross-reference" ))
            else:
                ipi_object.add_attribute(ExternalEntityAttribute(attribute_identifier="description",
                                                                 value = search.group(2) ))

        return ipi_object

    def _insert_entry(self, ipi_object, sequence_chunks):
        """
        Attaches the accumulated sequence lines to ipi_object as its
        "proteinSequence" attribute and inserts the entity into BIANA.
        """

        ipi_object.add_attribute(ExternalEntityAttribute(attribute_identifier="proteinSequence",
                                                         value = ProteinSequence("".join(sequence_chunks))))

        self.biana_access.insert_new_external_entity( externalEntity = ipi_object )
197