biana.BianaParser.ncbiGenPeptParser

1 """ 2 BIANA: Biologic Interactions and Network Analysis 3 Copyright (C) 2009 Javier Garcia-Garcia, Emre Guney, Baldo Oliva 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation, either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. 17 18 """ 19 20 21 from bianaParser import * 22 23

24 -class NcbiGenPeptParserGenpeptParser(BianaParser):

25 """ 26 Genpept Parser class 27 """ 28 29 name = "ncbi_genpept" 30 description = "Inserts NCBI Genpet database into BIANA" 31 external_entity_definition = "A external entity represents a protein" 32 external_entity_relations = "" 33

34 - def __init__(self):

35 36 # Start with the default values 37 38 BianaParser.__init__(self, default_db_description = "NCBI genbank", 39 default_script_name = "ncbiGenPeptParser.py", 40 default_script_description = NcbiGenPeptParserGenpeptParser.description, 41 additional_optional_arguments = []) 42 self.default_eE_attribute = "proteinSequence" 43 self.initialize_input_file_descriptor()

44 45 46 47

48 - def parse_database(self):

49 50 def insert_fsa_aa_files(arg,dirname,names): 51 for name in names: 52 if name.endswith('.fsa_aa'): 53 file_fd = open(os.path.join(dirname,name),'r') 54 elif name.endswith('.fsa_aa.gz'): 55 file_fd = gzip.open(os.path.join(dirname,name),'r') 56 else: 57 continue 58 #try: 59 self.parse_fasta_file(file_fd) 60 #except: 61 # sys.stderr.write("Error parsing file %s\n" %name) 62 # traceback.print_exc() 63 file_fd.close()

64 65 66 self.protein_number=0 67 68 self.dict_name_tax = self.biana_access.get_taxonomy_names_taxID_dict() 69 70 if( len(self.dict_name_tax) == 0 ): 71 raise ValueError("Taxonomies have not been inserted. You must previously insert information about taxonomy") 72 73 # All these tags will be considered to be pointing to id type Accession 74 self.accepted_coll_accessions = { "gb": None, 75 "dbj": None, 76 "emb": None} 77 78 # Run all the path to insert all the hssp files of the path 79 os.path.walk(self.input_file,insert_fsa_aa_files,None)

80 81 82

83 - def parse_fasta_file(self, fasta_file_fd):

84 85 # Define the parser information to print in the help: 86 87 # STEP 3: THE PARSER ITSELF. IT MUST USE THE METHODS OF PianaDBaccess 88 89 90 91 if self.verbose: 92 sys.stderr.write("Loading taxonomies\n") 93 94 95 if self.verbose: 96 sys.stderr.write( "Processing file\n") 97 98 99 sequence = [] 100 protein_title_line = None 101 102 for line in fasta_file_fd: 103 104 if line[0]==">": 105 106 if len(sequence)>0: 107 self.parse_genpept_entry(header_line = protein_title_line, sequence = "".join(sequence)) 108 109 protein_title_line = line[1:] 110 sequence = [] 111 else: 112 sequence.append(line.strip()) 113 114 if len(sequence)>0: 115 self.parse_genpept_entry(header_line = protein_title_line, sequence = "".join(sequence))

116 117

118 - def parse_genpept_entry(self, header_line, sequence):

119 120 genpept_object = ExternalEntity( source_database = self.database, type="protein" ) 121 122 self.protein_number += 1 123 124 self.add_to_log("Number of proteins processed") 125 126 if self.verbose: 127 sys.stderr.write( "===================================\n") 128 sys.stderr.write( " NEW ENTRY\n") 129 sys.stderr.write( "===================================\n") 130 131 132 if self.time_control: 133 if self.protein_number%20000==0: 134 sys.stderr.write("%s proteins done in %s seconds\n" %(self.protein_number,time.time()-self.initial_time)) 135 136 protein_title_line = header_line 137 protein_sequence = sequence 138 139 if protein_sequence: 140 genpept_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "proteinsequence", 141 value = ProteinSequence(protein_sequence)) ) 142 143 if self.verbose: 144 sys.stderr.write("title line is : %s\n" %(protein_title_line)) 145 146 # title looks like this: gi|gi_id|sourceDB|acc| protein name [species] 147 # - gi|4105707|gb|AAD02507.1| carbamate kinase [Trichomonas vaginalis] 148 # - gi|4105707|gb|AAD02507.1| carbamate kinase [Trichomonas vaginalis] 149 # - gi|4105707|gb|AAD02507.1| carbamate kinase 150 # - gi|4105707|dbj|BBAS02507.1| carbamate kinase 4243 151 # 152 # (character '>' from input file has been removed) 153 154 title_atoms = protein_title_line.split('|') 155 156 gi_id = int(title_atoms[1]) 157 158 accessionNum = None 159 accessionVersion = None 160 161 if self.accepted_coll_accessions.has_key(title_atoms[2]): 162 # we consider accessions from sources in accepted_coll_accessions 163 coll_acc = title_atoms[3].strip() 164 165 # Split Accession Number and Version 166 try: 167 splited = re.split("\.",coll_acc) 168 accessionNum = splited[0] 169 accessionVersion = splited[1] 170 except: 171 accessionNum = coll_acc 172 173 temp = [] 174 for x in xrange(4,len(title_atoms)): 175 temp.append(title_atoms[x]) 176 temp_protein_name = "".join(temp) 177 178 179 # Process species name... 180 181 starts = [] 182 ends = [] 183 184 for x in xrange(len(temp_protein_name)): 185 if temp_protein_name[x]=='[': 186 starts.append(x) 187 elif temp_protein_name[x]==']': 188 ends.append(x) 189 190 starts.reverse() 191 ends.reverse() 192 193 try: 194 limit = starts[0] 195 196 for x in xrange(1,len(ends)): 197 if ends[x]>starts[x-1]: 198 limit = starts[x] 199 continue 200 limit = starts[x-1] 201 break 202 203 204 species_name = temp_protein_name[limit+1:-1] 205 206 try: 207 208 tax_id_value = self.dict_name_tax[species_name.lower()] 209 genpept_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "taxID", 210 value = tax_id_value )) 211 except: 212 # If not found, try to split by "," or by "=" 213 try: 214 tax_id_value = self.dict_name_tax[species_name.split(",")[0].lower()] 215 genpept_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "taxID", 216 value = tax_id_value )) 217 except: 218 219 try: 220 tax_id_value = self.dict_name_tax[species_name.split("=")[0].lower()] 221 genpept_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "taxID", 222 value = tax_id_value )) 223 224 except: 225 if self.verbose: 226 sys.stderr.write("%s not found\n" %(species_name)) 227 228 229 genpept_object.add_attribute( ExternalEntityAttribute(attribute_identifier = "description", 230 value = temp_protein_name[:limit].strip() ) ) 231 232 self.add_to_log("Number of protein descriptions inserted") 233 234 except: 235 genpept_object.add_attribute( ExternalEntityAttribute(attribute_identifier = "description", 236 value = temp_protein_name.strip() ) ) 237 self.add_to_log("Number of protein descriptions inserted") 238 239 240 """ 241 Now, we have a proteinPiana for the (sequence, tax_id), either the one already existing before or a new one. 242 243 Just add protein information to database tables. 244 245 TO DO!! this would be a good place to check for consistency between uniprot and genbank, using embl accessions 246 """ 247 248 if gi_id: 249 genpept_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "gi", 250 value = gi_id, 251 type = "unique" ) ) 252 self.add_to_log("Number of gi codes inserted") 253 254 if accessionNum != "": 255 if accessionVersion is None: 256 genpept_object.add_attribute( ExternalEntityAttribute(attribute_identifier = "accessionNumber", 257 value = accessionNum, 258 type = "unique" ) ) 259 else: 260 genpept_object.add_attribute( ExternalEntityAttribute(attribute_identifier = "accessionNumber", 261 value = accessionNum, 262 version = accessionVersion, 263 type = "unique")) 264 self.add_to_log("Number of accessionNumber inserted") 265 266 267 268 # reading next record 269 if self.verbose: 270 sys.stderr.write( "reading next record\n") 271 272 273 self.biana_access.insert_new_external_entity( externalEntity = genpept_object )

274

Source Code for Module biana.BianaParser.ncbiGenPeptParser