"""
    BIANA: Biologic Interactions and Network Analysis
    Copyright (C) 2009  Javier Garcia-Garcia, Emre Guney, Baldo Oliva

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""

"""
File        : hgnc2piana.py
Author      : Javier Garcia
Creation    : 14 November 2007
Contents    : fills up tables in the BIANA database with information from HGNC
Called from :
=======================================================================================================

"""


import re
import sys
import traceback

from bianaParser import *


class HGNCParser(BianaParser):
    """
    HGNC Parser Class

    Reads a tab-separated HGNC export whose first line is a header row and
    inserts one "protein" external entity per data line through
    ``self.biana_access``.
    """

    # NOTE(review): the `class` header and both `def` headers were missing
    # from the reviewed text.  `HGNCParser(BianaParser)` and `__init__(self)`
    # are implied by the constructor call below; the method name
    # `parse_database` is presumed from the BianaParser convention -- confirm
    # against bianaParser.py before merging.

    name = "hgnc"
    description = "This file implements a program that fills up tables in BIANA database with information from HGNC"
    external_entity_definition = "A external entity represents a protein"
    external_entity_relations = ""

    # An Enzyme ID is only kept when it has four dot-separated parts.
    _EC_NUMBER_REGEX = r"\s*(.+\..+\..+\..+)"

    def __init__(self):
        """
        Set up the generic BIANA parser with the HGNC defaults and declare
        "hgnc" as the default external entity attribute.
        """

        BianaParser.__init__(self, default_db_description = "HUGO Gene Nomenclature Committee",
                             default_script_name = "hgncParser.py",
                             default_script_description = HGNCParser.description,
                             additional_compulsory_arguments = [])
        self.default_eE_attribute = "hgnc"

    def parse_database(self):
        """
        Method that implements the specific operations of HGNC parser

        The first line of the input file is the header; every later
        non-blank line becomes one external entity.  Columns are located by
        their header *name*, so their position in the file does not matter.
        The layout this parser was written against is:

         0 : HGNC ID
         1 : Approved Symbol
         2 : Approved Name
         3 : Status
         4 : Locus Type
         5 : Previous Symbols
         6 : Previous Names
         7 : Aliases
         8 : Name Aliases
         9 : Chromosome
        10 : Date Approved
        11 : Date Modified
        12 : Date Symbol Changed
        13 : Date Name Changed
        14 : Accession Numbers
        15 : Enzyme IDs
        16 : Entrez Gene ID
        17 : Ensembl Gene ID
        18 : Mouse Genome Database ID
        19 : Specialist Database Links
        20 : Specialist Database IDs
        21 : Pubmed IDs
        22 : RefSeq IDs
        23 : Gene Family Name
        24 : Record Type
        25 : Primary IDs
        26 : Secondary IDs
        27 : CCDS IDs
        28 : VEGA IDs
        29 : Locus Specific Databases
        30 : GDB ID (mapped data)
        31 : Entrez Gene ID (mapped data supplied by NCBI)
        32 : OMIM ID (mapped data supplied by NCBI)
        33 : RefSeq (mapped data supplied by NCBI)
        34 : UniProt ID (mapped data supplied by UniProt)
        35 : Ensembl ID (mapped data supplied by Ensembl)
        36 : UCSC ID (mapped data supplied by UCSC)
        37 : Rat Genome Database ID (mapped data supplied by RGD)

        Raises:
            Exception: whatever error occurred while parsing or inserting a
                line is re-raised after the traceback and the offending line
                number have been written to stderr (for example KeyError
                when an expected header is missing, IndexError when a line
                is too short).
        """

        self.initialize_input_file_descriptor()

        line_number = 0
        header_columns = {}
        columns = 0

        for line in self.input_file_fd:

            line_number += 1

            if line_number == 1:
                value_list = line.strip().split("\t")
                header_columns = dict((column_name, index)
                                      for index, column_name in enumerate(value_list))
                columns = len(value_list)
                continue

            # A blank line (typically at the end of the file) carries no
            # record; skip it instead of failing on the first column lookup.
            if not line.strip():
                continue

            try:
                # Only the line terminator is removed: stripping tabs would
                # drop trailing empty columns and shift the field count.
                line_fields = line.rstrip("\r\n").split("\t")

                if len(line_fields) != columns:
                    sys.stderr.write("Incorrect fields number\n%s\n" %(line))

                hgnc_object = self._build_external_entity(line_fields, header_columns)

                self.biana_access.insert_new_external_entity( externalEntity = hgnc_object )

            except Exception:
                traceback.print_exc()
                sys.stderr.write("Error in parsing line %s\n" %(line_number))
                # Re-raise the original error so its type and message are
                # not replaced by an anonymous Exception.
                raise

    def _build_external_entity(self, line_fields, header_columns):
        """
        Build the external entity described by one HGNC data line.

        line_fields    : list with the tab-separated values of the line
        header_columns : dictionary mapping header name to column index

        Returns the populated ExternalEntity; it is not inserted here.
        """

        def field(column_name):
            return line_fields[header_columns[column_name]].strip()

        hgnc_object = ExternalEntity( source_database = self.database, type="protein" )

        self._add_values(hgnc_object, "taxid", [9606], "unique")

        hgnc_id = field("HGNC ID")
        if hgnc_id.startswith("HGNC:"):
            hgnc_id = hgnc_id[len("HGNC:"):]
        self._add_values(hgnc_object, "hgnc", [hgnc_id], "unique")

        # The approved symbol and name are always stored, even when empty.
        self._add_values(hgnc_object, "geneSymbol", [field("Approved Symbol")], "unique")
        self._add_values(hgnc_object, "description", [field("Approved Name")])

        self._add_values(hgnc_object, "geneSymbol",
                         self._split_list(field("Previous Symbols")), "previous")

        self._add_values(hgnc_object, "description",
                         self._split_quoted_names(field("Previous Names")))

        self._add_values(hgnc_object, "geneSymbol",
                         self._split_list(field("Aliases")), "alias")

        self._add_values(hgnc_object, "accessionNumber",
                         self._split_list(field("Accession Numbers")), "cross-reference")

        # NOTE(review): "Name Aliases" is split on bare commas, unlike
        # "Previous Names".  If HGNC quotes this column the same way, names
        # containing a comma are fragmented -- confirm against a real export.
        self._add_values(hgnc_object, "description",
                         self._split_list(field("Name Aliases")), "alias")

        enzyme_ids = []
        for candidate in self._split_list(field("Enzyme IDs")):
            match = re.match(self._EC_NUMBER_REGEX, candidate)
            if match:
                enzyme_ids.append(match.group(1))
        self._add_values(hgnc_object, "EC", enzyme_ids, "cross-reference")

        self._add_values(hgnc_object, "geneID",
                         self._split_list(field("Entrez Gene ID")), "cross-reference")

        self._add_values(hgnc_object, "mgi",
                         self._split_list(field("Mouse Genome Database ID"), "MGI:"),
                         "cross-reference")

        self._add_values(hgnc_object, "refseq",
                         self._split_list(field("RefSeq IDs")), "cross-reference")

        self._add_values(hgnc_object, "gdb",
                         self._split_list(field("GDB ID (mapped data)"), "GDB:"),
                         "cross-reference")

        self._add_values(hgnc_object, "geneID",
                         self._split_list(field("Entrez Gene ID (mapped data supplied by NCBI)")),
                         "cross-reference")

        self._add_values(hgnc_object, "mim",
                         self._split_list(field("OMIM ID (mapped data supplied by NCBI)")))

        self._add_values(hgnc_object, "refseq",
                         self._split_list(field("RefSeq (mapped data supplied by NCBI)")),
                         "cross-reference")

        self._add_values(hgnc_object, "uniprotaccession",
                         self._split_list(field("UniProt ID (mapped data supplied by UniProt)")),
                         "cross-reference")

        self._add_values(hgnc_object, "rgd",
                         self._split_list(field("Rat Genome Database ID (mapped data supplied by RGD)"),
                                          "RGD:"),
                         "cross-reference")

        return hgnc_object

    @staticmethod
    def _add_values(entity, identifier, values, attribute_type=None):
        """
        Add one attribute named "identifier" to "entity" per item of "values".

        "attribute_type" is passed on as the attribute "type" only when given,
        so the ExternalEntityAttribute default applies otherwise.
        """

        for value in values:
            keyword_arguments = {"attribute_identifier": identifier, "value": value}
            if attribute_type is not None:
                keyword_arguments["type"] = attribute_type
            entity.add_attribute(ExternalEntityAttribute(**keyword_arguments))

    @staticmethod
    def _split_list(raw_value, prefix=None):
        """
        Split a comma-separated cell into its non-empty, stripped items.

        When "prefix" is given (e.g. "MGI:") it is removed from the start of
        each item as a literal prefix.  str.lstrip is not used because it
        strips a *set of characters* and does nothing when the item still
        has leading whitespace.
        """

        items = []
        for item in raw_value.split(","):
            item = item.strip()
            if prefix is not None and item.startswith(prefix):
                item = item[len(prefix):].strip()
            if item:
                items.append(item)
        return items

    @staticmethod
    def _split_quoted_names(raw_value):
        """
        Split a cell holding double-quoted names ("name one","name two").

        Whitespace around the separating comma is tolerated and the
        surrounding quotes are removed.  Only quotes and whitespace are
        stripped from each name, never letters.
        """

        names = []
        for name in re.split(r'"\s*,\s*"', raw_value.strip()):
            name = name.strip().strip('"').strip()
            if name:
                names.append(name)
        return names