biana.BianaParser.nrParser

45 """ 46 NCBI nr Parser Class 47 """ 48 49 name = "nr" 50 description = "This file implements a program that fills up tables in BIANA database with information from nr" 51 external_entity_definition = "A external entity represents a protein" 52 external_entity_relations = "" 53

54 - def __init__(self):

55 56 # Start with the default values 57 58 BianaParser.__init__(self, default_db_description = "Non-Redundant NCBI database", 59 default_script_name = "nrParser.py", 60 default_script_description = NrParser.description) 61 62 self.default_eE_attribute = "proteinSequence" 63 self.initialize_input_file_descriptor()

64 65

66 - def parse_database(self):

67 """ 68 Method that implements the specific operations of nr parser 69 """ 70 71 # Read specific parameters 72 73 verbose_all = 0 74 75 self.initialize_input_file_descriptor() 76 77 78 # Initialize protein number (to the output printing...) 79 self.protein_number=0 # Counter of total of protein lines in file 80 self.total_entries=0 # Counter of total of sequence-taxid entries in file 81 self.total_inconsistencies=0 # counter of inconsistencies found (inconsistencies with identifiers cross-references) 82 83 #nr_parser = Fasta.RecordParser() 84 #nr_input_file = self.input_file_fd 85 #nr_iterator = Fasta.Iterator(nr_input_file, nr_parser) 86 87 88 #nr_record = nr_iterator.next() 89 90 self.initialize_input_file_descriptor() 91 92 93 if self.verbose: 94 print "Processing file" 95 96 # get dictionary for gis and tax ids 97 if self.verbose: 98 print "Reading gi/taxonomy information..." 99 100 dict_name_tax = self.biana_access.get_taxonomy_names_taxID_dict() 101 102 103 # Dictionary to save inconsistencies (when a proteinPiana does not exists but exists any external Identifier...) 104 inconsistencies = {} 105 106 # while record read is not None, parse the record and insert data into piana 107 108 sequence = [] 109 protein_title_line = None 110 111 for line in self.input_file_fd: 112 113 if line[0]==">": 114 if len(sequence)>0: 115 self.parse_nr_record(header_line = protein_title_line, sequence = "".join(sequence)) 116 protein_title_line = line[1:] 117 sequence = [] 118 else: 119 sequence.append(line.strip()) 120 121 if len(sequence)>0: 122 self.parse_nr_record(header_line = protein_title_line, sequence = "".join(sequence))

123 124

125 - def parse_nr_record(self, header_line, sequence):

126 127 self.protein_number += 1 128 self.add_to_log("Number of proteins in input file") 129 130 131 if self.time_control: 132 if self.protein_number%20000==0: 133 sys.stderr.write("%s proteins\t%s seq-taxid entries\tin %s seconds\n" %(self.protein_number,self.total_entries,time.time()-self.initial_time)) 134 135 protein_title_line = header_line 136 protein_sequence = sequence 137 #protein_title_line = nr_record.title 138 #protein_sequence = nr_record.sequence.strip() 139 140 """ 141 Now, we have a proteinPiana that will be asigned to all other codes that we find in the title line 142 """ 143 # title can look like this: gi|1346670|sp|P04175|NCPR_PIG NADPH--cytochrome P450 reductase (CPR) (P450R) 144 # gi|5832586|dbj|BAA84019.1| maturase [Eucharis grandiflora] 145 # gi|223154|prf||0601198A polymerase beta,RNA 146 # gi|17538019|ref|NP_496216.1| surfeit 5 (18.1 kD) (2K591) [Caenorhabditis elegans]$gi|2497033|sp|Q23679|YWM3_CAEEL Hypothetical protein ZK970.3 in chromosome II$gi|3881897|emb|CAA88887.1| Hypothetical protein ZK970.3 [Caenorhabditis elegans]$gi|7511441|pir||T28127 hypothetical protein ZK970.3 - Caenorhabditis elegans 147 # 148 # (character '>' from input file has been removed) 149 # (where character $ is actually octal character 001) 150 # 151 # in nr (as opposed to genpept) a title can have several gi identifiers for the same sequence 152 # however, nr is a mess, so it is easier to get species and so on from genpept (that is why we parse it first) 153 154 155 # So, given the messy format of nr, first of all split by octal character 001 to see if there is more than one gi 156 title_entries = protein_title_line.split("\001") 157 158 # for each of the title entries (normally just one) split it, retrieve information and insert codes, using same sequence for all) 159 for title_entrie in title_entries: 160 161 162 nr_object = ExternalEntity( source_database = self.database, type="protein" ) 163 164 nr_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "proteinsequence", 165 value = ProteinSequence(protein_sequence))) 166 167 self.total_entries += 1 168 169 gi = None 170 171 if self.verbose: 172 sys.stderr.write("One entry of the title is: %s\n" %(title_entrie)) 173 174 title_atoms = title_entrie.split('|') 175 176 # title_atom[1] (should) is always the gi code 177 #if( title_atoms[0] == "gi" ): 178 # gi_id = int(title_atoms[1]) 179 180 # 181 # title_atom[2] can be (exhaustive list as of 10.2004): 182 # 183 # - gb: for EMBL accessions 184 # - emb: for EMBL protein id 185 # - pir: for PIR accessions 186 # - sp: for swissprot 187 # - dbj: dna japan 188 # - prf: ??? 189 # - ref: ref_seq 190 # - pdb: pdb code 191 # - tpg: ??? 192 # - tpe: ??? 193 194 ### UPDATED: February 2007: 195 196 # - gb: for GenBank acession Number 197 # - emb: for EMBL accession Number 198 # - dbj: for DDBJ, DNA Database of Japan 199 # - pir: for NBRF PIR 200 # - prf: for Protein Research Foundation 201 # - sp: for Swiss-Prot Entries 202 # - pdb: for Brookhaven Protein Data Bank 203 # - pat: for Patents 204 # - bbs: for GenInfo Backbone Id 205 # - gnl: for General database Identifier 206 # - ref: for NCBI Reference Sequence 207 # - lcl: for Local Sequence Identifier 208 209 # title_atom[3] is the value (a protein id) indicated in title_atom[2] 210 211 # In theory, only one has to exist... (because of the EMBL/GenBank/DDBJ nucleotide sequence database... 212 #gb_accession = None 213 #embl_accession = None 214 #dbj_accession = None 215 216 accessionNumber = None 217 uniprot_accession = None 218 uniprot_entry = None 219 ref_seq = None 220 pdb_code = None 221 pdb_chain = None 222 pir = None 223 description = None 224 species_name = None 225 226 # Read all the interesting cross-references: 227 for i in xrange(len(title_atoms)): 228 if title_atoms[i] == "gi": 229 if gi is None: 230 gi = int(title_atoms[i+1]) 231 nr_object.add_attribute(ExternalEntityAttribute(attribute_identifier = "gi", 232 value = gi, 233 type = "unique" )) 234 else: 235 sys.stderr.write(" Error parsing... how an entry can have several gi values? \n") 236 elif title_atoms[i] == "gb" or title_atoms[i] == "emb" or title_atoms[i] == "dbj": 237 if accessionNumber is None: 238 accession = re.search("(\w+)\.(\d+)",title_atoms[i+1]) 239 if accession: 240 accessionNumber = accession.group(1) 241 accessionNumber_version = accession.group(2) 242 nr_object.add_attribute(ExternalEntityAttribute(attribute_identifier="accessionNumber", 243 value = accessionNumber, 244 version = accessionNumber_version, 245 type = "unique" )) 246 else: 247 sys.stderr.write(" Error parsing... how an entry can have several accesionNumber.version values? \n") 248 elif title_atoms[i] == "pir": 249 if pir is None: 250 pirRe = re.search("(\w+)\.*",title_atoms[i+2]) 251 if pirRe: 252 pir = pirRe.group(1) 253 nr_object.add_attribute(ExternalEntityAttribute(attribute_identifier = "pir", 254 value = pir, 255 type = "cross-reference" )) 256 else: 257 sys.stderr.write(" Error parsing... How an entry can have several pir values? \n") 258 259 elif title_atoms[i] == "sp": 260 if uniprot_accession is None: 261 uniprot_accession = title_atoms[i+1][0:6] 262 else: 263 sys.stderr.write(" There are more than one uniprot accession for the same gi... \n") 264 if uniprot_entry is None: 265 # "Chapuza" because there is one entry that has no uniprotEntry name...and the parser does not work correctly 266 # This entry is: gi|20148613|gb|AAM10197.1| similar to high affinity sulfate transporter 2|sp|P53392 [Arabidopsis thaliana] 267 if len(title_atoms) > (i+2): 268 uniprot_entry = (title_atoms[i+2].split())[0].strip() 269 nr_object.add_attribute(ExternalEntityAttribute(attribute_identifier = "uniprotentry", 270 value = uniprot_entry, 271 type = "cross-reference")) 272 else: 273 sys.stderr.write(" There are more than one uniprot entry for the same gi... \n") 274 elif title_atoms[i] == "ref": 275 if ref_seq is None: 276 ref_seq = title_atoms[i+1].strip().split(".") 277 nr_object.add_attribute(ExternalEntityAttribute(attribute_identifier = "refseq", 278 value = ref_seq[0], 279 version = ref_seq[1], 280 type = "cross-reference")) 281 else: 282 sys.stderr.write(" There are more than one uniprot RefSeq for the same gi... \n") 283 elif title_atoms[i] == "pdb": 284 if pdb_code is None: 285 pdb_code = title_atoms[i+1].strip() 286 pdb_chain = title_atoms[i+2][0].strip() 287 nr_object.add_attribute(ExternalEntityAttribute(attribute_identifier = "pdb", 288 value = pdb_code, 289 additional_fields = { "chain": pdb_chain }, 290 type = "cross-reference")) 291 else: 292 sys.stderr.write(" There are more than one PDB for the same gi... \n") 293 294 # Take the description 295 # has_description = re.search("\|([^\|]*)\[",title_entrie) 296 # if has_description: 297 # description = has_description.group(1) 298 # if description != "": 299 # description = description.replace('"'," ").replace("\n", " ").replace("'"," ").replace('\\', " ").strip(), 300 # nr_object.add_attribute(ExternalEntityAttribute(attribute_identifier = "description", 301 # value = description )) 302 303 304 305 # TO CHECK: 306 # Take the specie: 307 #has_specie = re.search("\[(.*)\]$",title_entrie) 308 #if has_specie: 309 # species_name = has_specie.group(1).replace('"','').replace("(","").strip() 310 311 312 # Now, get the tax_ID associated with the specie 313 # Process species name... 314 315 starts = [] 316 ends = [] 317 318 for x in xrange(len(title_entrie)): 319 if title_entrie[x]=='[': 320 starts.append(x) 321 elif title_entrie[x]==']': 322 ends.append(x) 323 324 starts.reverse() 325 ends.reverse() 326 327 try: 328 limit = starts[0] 329 330 for x in xrange(1,len(ends)): 331 if ends[x]>starts[x-1]: 332 limit = starts[x] 333 continue 334 limit = starts[x-1] 335 break 336 337 species_name = title_entrie[limit+1:-1] 338 339 try: 340 341 tax_id_value = dict_name_tax[species_name.lower()] 342 nr_object.add_attribute( ExternalEntityAttribute( attribute_identifier="taxid", value=tax_id_value) ) 343 344 except: 345 # If not found, try to split by "," or by "=" 346 try: 347 tax_id_value = dict_name_tax[species_name.split(",")[0].lower()] 348 nr_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "taxID", value = tax_id_value )) 349 except: 350 351 try: 352 tax_id_value = dict_name_tax[species_name.split("=")[0].lower()] 353 nr_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "taxID", value = tax_id_value )) 354 355 except: 356 if self.verbose: 357 sys.stderr.write("%s not found\n" %(species_name)) 358 359 360 nr_object.add_attribute( ExternalEntityAttribute(attribute_identifier = "description", 361 value = title_entrie[:limit].strip() ) ) 362 363 except: 364 nr_object.add_attribute( ExternalEntityAttribute(attribute_identifier = "description", 365 value = title_entrie.strip())) 366 367 368 self.biana_access.insert_new_external_entity( externalEntity = nr_object ) 369 370 371 # reading next record 372 if self.verbose: 373 sys.stderr.write( "\n---------------reading next record---------------\n")

Source Code for Module biana.BianaParser.nrParser