1 """
2 BIANA: Biologic Interactions and Network Analysis
3 Copyright (C) 2009 Javier Garcia-Garcia, Emre Guney, Baldo Oliva
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18 """
19
20 from bianaParser import *
21 import sets
22
class CogParser(BianaParser):
    """
    COG Parser Class

    Reads a directory holding the COG flat files "myva", "myva=gb",
    "org.txt", "fun.txt" and "whog", and inserts one "protein" external
    entity for every sequence record found in "myva".
    """

    name = "cog"
    description = "Clusters of Orthologous Groups of proteins (COGs)"
    external_entity_definition = "An element in a COG"
    external_entity_relations = "A COG"

    # Files that must all exist in the input directory, listed in the order
    # in which a missing one is reported.
    _REQUIRED_FILES = ("myva", "myva=gb", "org.txt", "fun.txt", "whog")

    def __init__(self):
        """
        Initialise the generic parser with the COG-specific defaults.
        """
        BianaParser.__init__(self, default_db_description = "COG database",
                             default_script_name = "cogParser.py",
                             default_script_description = CogParser.description,
                             additional_optional_arguments = [])
        self.default_eE_attribute = "cog"

    def _data_path(self, file_name):
        """Return the path of file_name inside the COG input directory."""
        return os.path.join(self.input_path, file_name)

    @staticmethod
    def _iter_matches(file_path, regex):
        """
        Yield the match object of every line of file_path matched by regex.

        The file is opened in a "with" block so the handle is released even
        when the consumer raises while processing a line.
        """
        with open(file_path, 'r') as input_fd:
            for line in input_fd:
                m = regex.match(line)
                if m:
                    yield m

    def _read_species_taxids(self):
        """
        Read "org.txt".

        Returns a dict mapping the lower-cased first field of each line to
        its second field; the values are later stored as "taxID" attributes.
        """
        sp_taxid_regex = re.compile(r"\s*(\S+)\s+(\S+)\s+")
        specie_taxid_dict = {}
        for m in self._iter_matches(self._data_path("org.txt"), sp_taxid_regex):
            specie_taxid_dict[m.group(1).lower()] = m.group(2)
        return specie_taxid_dict

    def _read_functions(self):
        """
        Read "fun.txt".

        Returns a dict mapping the code found between square brackets at the
        start of a line to the text that follows it.
        """
        funct_regex = re.compile(r"\s*\[(\w+)\]\s+(.+)$")
        function_dict = {}
        for m in self._iter_matches(self._data_path("fun.txt"), funct_regex):
            function_dict[m.group(1)] = m.group(2)
        return function_dict

    def _read_name_to_gi(self):
        """
        Read "myva=gb".

        Returns a dict mapping the lower-cased first field of each two-field
        line to its second field. Lines whose second field is the
        placeholder "gi?" are skipped.
        """
        name2gi_regex = re.compile(r"\s*(\S+)\s+(\S+)\s+$")
        name_to_gi_dict = {}
        for m in self._iter_matches(self._data_path("myva=gb"), name2gi_regex):
            if m.group(2) != "gi?":
                name_to_gi_dict[m.group(1).lower()] = m.group(2)
        return name_to_gi_dict

    def _read_whog(self):
        """
        Read "whog".

        Two kinds of lines are recognised:
          - header lines "[<function codes>] <COG id> <description>", which
            start a new COG;
          - member lines "<3 characters>: <names...>", which assign the
            listed names to the COG opened by the last header line.

        Returns a tuple (name2species_dict, name2cogs_dict, cogs_funct_dict):
          - name2species_dict: lower-cased member name -> set of lower-cased
            three-character prefixes it was listed under;
          - name2cogs_dict: lower-cased member name -> list of COG ids;
          - cogs_funct_dict: COG id -> function codes string.

        Raises ValueError if a member line appears before any header line.
        """
        new_cog_regex = re.compile(r"\s*\[(\w+)\]\s+(\w+)\s+(.+)$")
        assignment_regex = re.compile(r"\s*(\w{3})\:\s+(.+)$")

        name2species_dict = {}
        name2cogs_dict = {}
        cogs_funct_dict = {}
        current_cog = None

        whog_path = self._data_path("whog")
        with open(whog_path, 'r') as whog_file_fd:
            for line in whog_file_fd:

                m = new_cog_regex.match(line)
                if m:
                    current_cog = m.group(2)
                    cogs_funct_dict[current_cog] = m.group(1)
                    continue

                m = assignment_regex.match(line)
                if not m:
                    continue

                if current_cog is None:
                    raise ValueError("Found COG members before any COG definition in %s" %whog_path)

                specie_code = m.group(1).lower()
                # split() without arguments drops the empty strings that
                # split(" ") produces on repeated or trailing blanks, which
                # would otherwise be registered as members named "".
                for current_component in m.group(2).split():
                    component_key = current_component.lower()
                    name2cogs_dict.setdefault(component_key, []).append(current_cog)
                    name2species_dict.setdefault(component_key, set()).add(specie_code)

        return name2species_dict, name2cogs_dict, cogs_funct_dict

    def parse_database(self):
        """
        Parse the COG directory given as input and insert its proteins.

        NOTE(review): the "def" line of this method was not visible in the
        reviewed listing; "parse_database" is assumed to be the hook that
        BianaParser calls -- confirm against the base class.

        Raises ValueError when the input is not a directory, when one of
        the required files is missing, or when "whog" is malformed (see
        _read_whog).
        """
        if not os.path.isdir(self.input_file):
            raise ValueError("You must specify a path instead of a file")
        self.input_path = self.input_file

        for current_file in self._REQUIRED_FILES:
            if not os.path.exists(self._data_path(current_file)):
                raise ValueError("File %s is missing in %s" %(current_file, self.input_path))

        specie_taxid_dict = self._read_species_taxids()
        function_dict = self._read_functions()
        name_to_gi_dict = self._read_name_to_gi()
        name2species_dict, name2cogs_dict, cogs_funct_dict = self._read_whog()

        def create_and_insert_eE(protein_name, sequence):
            """Build the external entity of one protein and insert it."""
            protein_key = protein_name.lower()

            eE_object = ExternalEntity( source_database = self.database, type="protein" )
            eE_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "proteinsequence",
                                                             value = ProteinSequence("".join(sequence))))

            if protein_key in name_to_gi_dict:
                eE_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "GI",
                                                                 value = name_to_gi_dict[protein_key]) )

            # name2species_dict and name2cogs_dict are always filled with
            # the same keys, so one membership test guards both lookups.
            if protein_key in name2species_dict:
                species = name2species_dict[protein_key]

                if len(species)>1:
                    print("Protein %s has more than a single specie assigned!" %protein_name)

                for current_specie in species:
                    eE_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "taxID",
                                                                     value = specie_taxid_dict[current_specie.lower()] ) )

                for current_cog in name2cogs_dict[protein_key]:
                    eE_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "COG",
                                                                      value = current_cog ) )
                    # The function codes of a COG are stored as one string;
                    # iterating it yields one code character at a time.
                    for current_function in cogs_funct_dict[current_cog]:
                        eE_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "function",
                                                                          value = function_dict[current_function] ) )

            eE_object.add_attribute( ExternalEntityAttribute( attribute_identifier = "OrderedLocusName",
                                                              value = protein_name ) )

            self.biana_access.insert_new_external_entity( externalEntity = eE_object )

        protein_name_regex = re.compile(r">(.+)$")
        protein_name = None
        sequence = []

        with open(self._data_path("myva"), 'r') as fasta_file_fd:
            for line in fasta_file_fd:

                m = protein_name_regex.match(line)
                if m:
                    # A new header closes the previous record.
                    if sequence:
                        create_and_insert_eE(protein_name, sequence)
                    sequence = []
                    protein_name = m.group(1)
                else:
                    # Skip blank lines so that a header followed only by
                    # empty lines does not produce an empty-sequence entity.
                    residues = line.strip()
                    if residues:
                        sequence.append(residues)

        # Flush the last record of the file.
        if sequence:
            create_and_insert_eE(protein_name, sequence)
196
197
198
199
200