1 """
2 File : keggGeneParser.py
3 Author : Javier Garcia Garcia
4 Creation : January 2008
5 Contents : fills up tables in database biana with information from kegg gene database
6 Called from :
7
8 =======================================================================================================
9
10 This file implements a program that fills up tables in database biana with information of kegg gene databases
11
12 """
13
14 from bianaParser import *
15 import sets
16
17
19 """
20
21 """
22
# Descriptive metadata for the KEGG gene parser.
# NOTE(review): these look like class-level attributes of the parser class
# (its header is not visible in this excerpt) -- confirm when splicing.
name = "kegg_gene"
description = (
    "This file implements a program that fills up tables in database biana "
    "with information of kegg Gene Database"
)
external_entity_definition = "A external entity represents a gene"
external_entity_relations = ""
27
37
39 """
40 """
41
42
# Regular expressions used to recognise the lines of the input file.

# Continuation line: three or more leading whitespace characters, then the
# value, optionally followed by trailing semicolons.
continue_field_regex = re.compile(r"^\s{3,}([^;]+);*$")
# First line of a field: field name at the start of the line, whitespace,
# then the value, optionally followed by trailing semicolons.
field_regex = re.compile(r"^(\w+)\s+([^;]+);*$")
pathway_regex = re.compile(r"PATH\:\s+(map|rn)(\d+)\s+(.+)$")
# EC tag such as "[EC:1.1.1.1]". The quantifier is placed inside the group so
# that group 1 holds the complete number instead of only its last character.
ec_regex = re.compile(r"\[EC\:([\d\.]+)\]")

space_regex = re.compile(r"\s+")
parenthesis_regex = re.compile(r"\(.+\)")

# ENTRY line: group 1 is the entry identifier, group 2 the entry type and
# group 3 the name that is looked up in the taxonomy dictionary.
entry_regex = re.compile(r"ENTRY\s+(\w+)\s+([\w\_]+)\s+([\w\.]+)$")

# Database label that precedes a list of identifiers ("UniProt:", "Pfam:",
# "NCBI-GI:", ...). The hyphen is part of the character class so that labels
# such as "NCBI-GI" are captured whole; with plain \w+ only "GI" was captured
# and the comparisons against "NCBI-GI"/"NCBI-GeneID" could never succeed.
dblink_split_regex = re.compile(r"([\w\-]+)\:")

# Parser state shared by the statements that follow.
kegg_gene_object = None   # entity currently being built (None before the first entry)
temp_value = []           # value chunks collected for the field being read
current_field = None      # name of the field being read
number_of_entries = 0     # entries processed so far (used for progress output)
63
# NOTE(review): these statements use `self`, so they belong inside a parser
# method whose `def` line is not visible in this excerpt; re-indent on splice.

# Mapping from taxonomy name to tax ID, as provided by the database access object.
dict_name_tax = self.biana_access.get_taxonomy_names_taxID_dict()
new_dict_name_tax = {}

if not dict_name_tax:
    print("Taxonomy won't be inserted as Taxonomy database has not been previously inserted")

# Re-key the two-word names by an abbreviated form: the first character of the
# name in upper case, a dot, and the second word. Names that do not consist of
# exactly two space-separated words are dropped.
for current_tax_name in dict_name_tax:
    splitted = current_tax_name.split(" ")
    if len(splitted) == 2:
        abbreviated_name = current_tax_name[0].upper() + "." + splitted[1]
        new_dict_name_tax[abbreviated_name] = dict_name_tax[current_tax_name]

# From here on only the abbreviated mapping is used.
dict_name_tax = new_dict_name_tax

# Names found in the input that have no entry in dict_name_tax.
# The builtin set replaces the deprecated sets.Set.
not_recognized_tax_id_names = set()
80
# NOTE(review): these statements use `self`, so they belong inside a parser
# method whose `def` line is not visible in this excerpt; re-indent on splice.

# Accepted entry types (second column of the ENTRY line) and the entity type
# stored for each of them.
entity_type_by_entry_type = {
    "misc_RNA": "RNA",
    "tRNA": "tRNA",
    "rRNA": "rRNA",
    "mRNA": "mRNA",
    "CDS": "CDS",
    "snRNA": "snRNA",
    "snoRNA": "snoRNA",
    "gene": "gene",
}

# Database label -> attribute identifier for the cross-reference fields.
dblink_attribute_by_label = {
    "NCBI-GI": "gi",
    "NCBI-GeneID": "geneID",
    "UniProt": "uniprotAccession",
    "TIGR": "tigr",
}
motif_attribute_by_label = {
    "Pfam": "pfam",
    "PROSITE": "prosite",
}


def _add_cross_references(entity, chunks, attribute_by_label):
    """Add one cross-reference attribute per identifier listed in chunks.

    chunks are the collected value pieces of a field of the form
    "Label: id id Label: id ...". Labels are recognised with
    dblink_split_regex; labels missing from attribute_by_label are ignored.
    """
    pieces = [x.strip() for x in dblink_split_regex.split(" ".join(chunks))]
    # re.split with one capturing group returns text, label, text, label, ...
    # so the labels sit at the odd positions and each is followed by its text.
    for position in range(1, len(pieces) - 1, 2):
        attribute_identifier = attribute_by_label.get(pieces[position])
        if attribute_identifier is None:
            continue
        # split() without argument drops empty strings and stray newlines
        for value in pieces[position + 1].split():
            entity.add_attribute(ExternalEntityAttribute(attribute_identifier=attribute_identifier,
                                                         value=value,
                                                         type="cross-reference"))


def _store_field(entity, field_name, chunks):
    """Translate one completely read field into attributes of entity.

    Does nothing when there is no current entity or no current field.
    """
    if entity is None or field_name is None:
        return

    if field_name == "DEFINITION":
        entity.add_attribute(ExternalEntityAttribute(attribute_identifier="description",
                                                     value=" ".join(chunks)))
        ec_match = ec_regex.search("".join(chunks))
        if ec_match:
            entity.add_attribute(ExternalEntityAttribute(attribute_identifier="ec",
                                                         value=ec_match.group(1)))

    # NOTE(review): KEGG flat files appear to label this field "DBLINKS";
    # both spellings are accepted -- confirm against the actual input files.
    elif field_name in ("DBLINK", "DBLINKS"):
        _add_cross_references(entity, chunks, dblink_attribute_by_label)

    elif field_name == "MOTIF":
        _add_cross_references(entity, chunks, motif_attribute_by_label)

    elif field_name == "AASEQ":
        # The first chunk (the value on the field's own line) is not part of
        # the sequence -- presumably the sequence length; verify.
        entity.add_attribute(ExternalEntityAttribute(attribute_identifier="proteinSequence",
                                                     value=ProteinSequence("".join(chunks[1:]))))

    elif field_name == "NTSEQ":
        entity.add_attribute(ExternalEntityAttribute(attribute_identifier="nucleotideSequence",
                                                     value=DNASequence("".join(chunks[1:]))))


for line in self.input_file_fd:

    m = entry_regex.search(line)

    if m:
        # A new entry starts: the field still pending belongs to the previous
        # entry, so store it there before inserting that entry. Without this
        # the last field of every entry ended up on the following entry.
        _store_field(kegg_gene_object, current_field, temp_value)
        if kegg_gene_object is not None:
            self.biana_access.insert_new_external_entity(externalEntity=kegg_gene_object)
        kegg_gene_object = None
        current_field = None
        temp_value = []

        entity_type = entity_type_by_entry_type.get(m.group(2))
        if entity_type is None:
            # Unknown entry type: report it and skip the whole entry instead of
            # reusing the type of the previous entry (or failing on the first).
            print("type %s not recognized..." % (m.group(2)))
            continue

        kegg_gene_object = ExternalEntity(source_database=self.database, type=entity_type)
        kegg_gene_object.add_attribute(ExternalEntityAttribute(attribute_identifier="keggGene",
                                                               value=m.group(1),
                                                               type="unique"))

        # Only a missing name is treated as "not recognized"; any other error
        # is allowed to propagate.
        try:
            tax_id = dict_name_tax[m.group(3)]
        except KeyError:
            not_recognized_tax_id_names.add(m.group(3))
        else:
            kegg_gene_object.add_attribute(ExternalEntityAttribute(attribute_identifier="taxID",
                                                                   value=tax_id))

        number_of_entries += 1
        if self.time_control and number_of_entries % 20000 == 0:
            sys.stderr.write("%s entries done in %s seconds\n" % (number_of_entries, time.time() - self.initial_time))

        continue

    new_field = field_regex.match(line)
    if new_field:
        # The previous field is complete once the next one starts.
        _store_field(kegg_gene_object, current_field, temp_value)
        current_field = new_field.group(1)
        # strip(): the value group also captures the trailing newline
        temp_value = [new_field.group(2).strip()]
    else:
        cont_value = continue_field_regex.match(line)
        if cont_value:
            temp_value.append(cont_value.group(1).strip())

# End of input: store the pending field and insert the last entry.
_store_field(kegg_gene_object, current_field, temp_value)
if kegg_gene_object is not None:
    self.biana_access.insert_new_external_entity(externalEntity=kegg_gene_object)

print("Not recognized specie names: \n%s" % "\n".join(not_recognized_tax_id_names))
187