1 """
2 BIANA: Biologic Interactions and Network Analysis
3 Copyright (C) 2009 Javier Garcia-Garcia, Emre Guney, Baldo Oliva
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18 """
19
20
21 """
22 File : ipi2piana.py
23 Author : Javier Garcia
24 Creation : November 2007
25 Contents : fills up tables in database piana with information from IPI
26 Called from :
27 =======================================================================================================
28 """
29
30 import re
31 from bianaParser import *
32
33
35 """
36 IPI Parser Class
37 """
38
# Short identifier of this parser.
name = "ipi"

# Statements about the external entities this parser creates and how they
# relate to each other (no relations are declared for IPI).
external_entity_relations = ""
external_entity_definition = "External entities are proteins"

# One-line summary; the constructor passes it on as the default script
# description.
description = "Inserts information of IPI database into BIANA"
43
45
46
47
# Default values handed over, unchanged, to the BianaParser initializer.
parser_defaults = {
    "default_db_description": "IPI. International Protein Index",
    "default_script_name": "ipi2piana.py",
    "default_script_description": IPIParser.description,
    "additional_compulsory_arguments": [],
}
BianaParser.__init__(self, **parser_defaults)

# Set only after the base initializer has run.
self.default_eE_attribute = "ipi"
53
54
71
72
"""
Method that implements the specific operations of the IPI parser.

Reads self.input_file_fd line by line. A line starting with '>' is a
header: it opens a new protein ExternalEntity and is searched for
database identifiers, a taxonomy id, a gene symbol and a description.
Any other non-empty line is taken as a piece of the current sequence.
An entity is handed to self.biana_access.insert_new_external_entity()
once its sequence is complete, that is, when the next header or the end
of the input is reached.
"""

# The patterns do not depend on the line being read: compile them once.
# "DATABASE:value1;value2" items inside the '|'-separated header fields.
field_search_re = re.compile(r"([\w\-]+):(\S+)")
tax_id_regex = re.compile(r"Tax_Id=(\d+)")
# "\s*" rather than "\s+": the header is stripped before matching, so a
# gene symbol that is the last token of the line must still be accepted.
gene_symbol_regex = re.compile(r"Gene_Symbol=(\S+)\s*(.*)")
# Alternation between the two prefixes. The former "[Emb|Gb]" was a
# character class and matched any single one of the characters E, m, b, |, G.
accession_regex = re.compile(r"(?:Emb|Gb)\|(\S+)")

# Header database tag -> attribute identifier stored as a cross-reference.
cross_reference_attributes = {
    "ENSEMBL": "ensembl",
    "REFSEQ": "refseq",
    "TREMBL": "uniprotaccession",
    "SWISS-PROT": "UniprotAccession",
    "TAIR": "tair",
}

ipi_object = None
ipi_object_number = 0
actual_sequence = []

for line in self.input_file_fd:

    # str.strip() returns a new string, so the result has to be kept.
    line = line.strip()
    if not line:
        continue

    if not line.startswith(">"):
        # Sequence line belonging to the entry opened by the last header.
        actual_sequence.append(line)
        continue

    # A new header closes the previous entry: store it with its sequence.
    if ipi_object is not None:
        ipi_object.add_attribute(ExternalEntityAttribute(attribute_identifier="proteinSequence",
                                                         value=ProteinSequence("".join(actual_sequence))))
        self.biana_access.insert_new_external_entity(externalEntity=ipi_object)

    ipi_object = ExternalEntity(source_database=self.database, type="protein")
    ipi_object_number += 1

    if self.time_control and ipi_object_number % 20000 == 0:
        sys.stderr.write("%s entries done in %s seconds\n" %(ipi_object_number,time.time()-self.initial_time))

    actual_sequence = []

    for actual_field in line.lstrip(">").split("|"):

        search = field_search_re.search(actual_field)
        if not search:
            continue

        identifier_type = search.group(1)
        values = search.group(2).split(";")

        if identifier_type == "IPI":
            for actual_value in values:
                ipi_object.add_attribute(ExternalEntityAttribute(attribute_identifier="ipi",
                                                                 value=actual_value,
                                                                 type="unique"))

        elif identifier_type in cross_reference_attributes:
            attribute_identifier = cross_reference_attributes[identifier_type]
            for actual_value in values:
                if identifier_type == "SWISS-PROT":
                    # Only the first six characters of the value are kept.
                    actual_value = actual_value[0:6]
                ipi_object.add_attribute(ExternalEntityAttribute(attribute_identifier=attribute_identifier,
                                                                 value=actual_value,
                                                                 type="cross-reference"))

    search = tax_id_regex.search(line)
    if search:
        ipi_object.add_attribute(ExternalEntityAttribute(attribute_identifier="taxID",
                                                         value=search.group(1)))

    search = gene_symbol_regex.search(line)
    if search:
        # The symbol is the first captured group. The previous code stored
        # the loop variable left over from the identifier fields instead.
        ipi_object.add_attribute(ExternalEntityAttribute(attribute_identifier="geneSymbol",
                                                         value=search.group(1),
                                                         type="cross-reference"))

        # The text after the symbol is either an "Emb|..." / "Gb|..."
        # accession reference or a free-text description.
        search2 = accession_regex.search(search.group(2))
        if search2:
            ipi_object.add_attribute(ExternalEntityAttribute(attribute_identifier="accessionNumber",
                                                             value=search2.group(1),
                                                             type="cross-reference"))
        else:
            ipi_object.add_attribute(ExternalEntityAttribute(attribute_identifier="description",
                                                             value=search.group(2)))

# The last entry has no following header to trigger its insertion.
if ipi_object is not None:
    ipi_object.add_attribute(ExternalEntityAttribute(attribute_identifier="proteinSequence",
                                                     value=ProteinSequence("".join(actual_sequence))))
    self.biana_access.insert_new_external_entity(externalEntity=ipi_object)
197