biana.BianaParser.keggGeneParser

"""
Parser that fills the BIANA database with information of the KEGG Gene Database.
"""

# Short identifier of this parser.
name = "kegg_gene"

# Human readable summary; also handed to the base parser as script description.
description = "This file implements a program that fills up tables in database biana with information of kegg Gene Database"

# What an external entity stands for in this parser.
external_entity_definition = "A external entity represents a gene"

# This parser declares no external entity relations.
external_entity_relations = ""

def __init__(self):
    """
    Set up the parser with its KEGG GENE defaults.

    The base BianaParser initializer receives the default database
    description, script name and script description; afterwards the default
    external entity attribute is set to "keggGene" and the input file
    descriptor is initialized.
    """

    # Default values handed over to the generic parser
    parser_defaults = {
        "default_db_description": "KEGG GENE database",
        "default_script_name": "keggGeneParser.py",
        "default_script_description": KeggGeneParser.description,
    }
    BianaParser.__init__(self, **parser_defaults)

    self.default_eE_attribute = "keggGene"
    self.initialize_input_file_descriptor()

37

def parse_database(self):
    """
    Parse the KEGG GENE flat file and insert one external entity per ENTRY.

    Reads self.input_file_fd line by line. Every line matching the ENTRY
    pattern starts a new external entity; the following field lines
    (DEFINITION, DBLINK/DBLINKS, MOTIF, AASEQ, NTSEQ) and their continuation
    lines are collected and turned into attributes of that entity. Entities
    are stored through self.biana_access.insert_new_external_entity().

    Behaviour worth knowing:
      - A pending multi-line field is flushed into the entity it belongs to
        before that entity is inserted (both when the next ENTRY starts and
        at end of file), so the last field of an entry is neither lost nor
        attached to the following entry.
      - Entries whose type is not recognized are reported on stdout and
        skipped instead of being inserted with the type of a previous entry.
      - Species names that cannot be mapped to a taxonomy id are collected
        and printed once at the end.
      - If self.time_control is set, progress is written to stderr every
        20000 entries.
    """

    # Continuation lines start with at least three blanks
    continue_field_regex = re.compile(r"^\s{3,}([^;]+);*$")
    field_regex = re.compile(r"^(\w+)\s+([^;]+);*$")

    # The "+" is inside the group so the whole EC number is captured,
    # not only its last character
    ec_regex = re.compile(r"\[EC\:([\d\.]+)\]")

    # In this case, the entry contains information about the specie
    #ENTRY       ZMO0001           CDS       Z.mobilis
    entry_regex = re.compile(r"ENTRY\s+(\w+)\s+([\w\_]+)\s+([\w\.]+)$")

    # Database labels may contain hyphens (NCBI-GI, NCBI-GeneID)
    dblink_split_regex = re.compile(r"([\w\-]+)\:")

    # KEGG entry type -> external entity type
    entity_types = {"misc_RNA": "RNA",
                    "tRNA": "tRNA",
                    "rRNA": "rRNA",
                    "mRNA": "mRNA",
                    "CDS": "CDS",
                    "snRNA": "snRNA",
                    "snoRNA": "snoRNA",
                    "gene": "gene"}

    # Database label -> attribute identifier, per field
    dblink_attributes = {"NCBI-GI": "gi",
                         "NCBI-GeneID": "geneID",
                         "UniProt": "uniprotAccession",
                         "TIGR": "tigr"}
    motif_attributes = {"Pfam": "pfam",
                        "PROSITE": "prosite"}

    def add_cross_references(entity, values, attribute_by_db):
        # Adds a cross-reference attribute for every identifier of the known databases
        tokens = dblink_split_regex.split(" ".join(values))
        # tokens = [text before first label, label1, ids1, label2, ids2, ...]
        for db_name, db_ids in zip(tokens[1::2], tokens[2::2]):
            attribute_identifier = attribute_by_db.get(db_name)
            if attribute_identifier is None:
                continue
            # split() without argument drops empty identifiers
            for db_id in db_ids.split():
                entity.add_attribute(ExternalEntityAttribute(attribute_identifier=attribute_identifier,
                                                             value=db_id,
                                                             type="cross-reference"))

    def flush_field(entity, field, values):
        # Converts the collected lines of a finished field into attributes of entity
        if entity is None or field is None:
            return

        if field == "DEFINITION":
            entity.add_attribute(ExternalEntityAttribute(attribute_identifier="description",
                                                         value=" ".join(values)))
            ec_match = ec_regex.search("".join(values))
            if ec_match:
                entity.add_attribute(ExternalEntityAttribute(attribute_identifier="ec",
                                                             value=ec_match.group(1)))

        # NOTE(review): current KEGG flat files appear to name this field
        # DBLINKS; both spellings are accepted - confirm against the input files
        elif field in ("DBLINK", "DBLINKS"):
            add_cross_references(entity, values, dblink_attributes)

        elif field == "MOTIF":
            add_cross_references(entity, values, motif_attributes)

        # For sequences the first collected value is skipped: it comes from
        # the field header line, the residues are on the continuation lines
        elif field == "AASEQ":
            entity.add_attribute(ExternalEntityAttribute(attribute_identifier="proteinSequence",
                                                         value=ProteinSequence("".join(values[1:]))))

        elif field == "NTSEQ":
            entity.add_attribute(ExternalEntityAttribute(attribute_identifier="nucleotideSequence",
                                                         value=DNASequence("".join(values[1:]))))

    kegg_gene_object = None

    temp_value = []       # List used to store the information of those fields that can have more than a single line
    current_field = None

    number_of_entries = 0

    full_name_tax = self.biana_access.get_taxonomy_names_taxID_dict()

    if not full_name_tax:
        print("Taxonomy won't be inserted as Taxonomy database has not been previously inserted")

    # Transform species name: "Genus species" -> "G.species", the form used in ENTRY lines.
    # Different species sharing the same abbreviation overwrite each other.
    dict_name_tax = {}
    for current_tax_name in full_name_tax:
        splitted = current_tax_name.split(" ")
        if len(splitted) == 2:
            dict_name_tax[current_tax_name[0].upper() + "." + splitted[1]] = full_name_tax[current_tax_name]

    del full_name_tax

    not_recognized_tax_id_names = set()

    for line in self.input_file_fd:

        # Without the line terminator the captured values do not carry a trailing newline
        line = line.rstrip()

        m = entry_regex.search(line)

        if m:

            # Close the previous entry: its last field is still pending
            flush_field(kegg_gene_object, current_field, temp_value)
            current_field = None
            temp_value = []

            if kegg_gene_object is not None:
                self.biana_access.insert_new_external_entity(externalEntity=kegg_gene_object)

            # It should be gene or protein... to check!!!
            entity_type = entity_types.get(m.group(2))

            if entity_type is None:
                print("type %s not recognized..." % (m.group(2)))
                # Skip this entry; its fields are ignored until the next ENTRY
                kegg_gene_object = None
            else:
                kegg_gene_object = ExternalEntity(source_database=self.database, type=entity_type)
                kegg_gene_object.add_attribute(ExternalEntityAttribute(attribute_identifier="keggGene",
                                                                       value=m.group(1),
                                                                       type="unique"))

                specie_name = m.group(3)
                if specie_name in dict_name_tax:
                    kegg_gene_object.add_attribute(ExternalEntityAttribute(attribute_identifier="taxID",
                                                                           value=dict_name_tax[specie_name]))
                else:
                    not_recognized_tax_id_names.add(specie_name)

            number_of_entries += 1
            if self.time_control and number_of_entries % 20000 == 0:
                sys.stderr.write("%s entries done in %s seconds\n" % (number_of_entries, time.time() - self.initial_time))

            continue

        if kegg_gene_object is None:
            # Line outside of any usable entry
            continue

        new_field = field_regex.match(line)
        if new_field:
            # A new field starts: the previous one is complete
            flush_field(kegg_gene_object, current_field, temp_value)
            current_field = new_field.group(1)
            temp_value = [new_field.group(2)]
        else:
            cont_value = continue_field_regex.match(line)
            if cont_value:
                temp_value.append(cont_value.group(1))

    # Insert the last one
    flush_field(kegg_gene_object, current_field, temp_value)
    if kegg_gene_object is not None:
        self.biana_access.insert_new_external_entity(externalEntity=kegg_gene_object)

    print("Not recognized specie names: \n%s" % "\n".join(sorted(not_recognized_tax_id_names)))

Source Code for Module biana.BianaParser.keggGeneParser