biana.BianaParser.hgncParser

"""
HGNC Parser Class

Parser that loads HGNC (HUGO Gene Nomenclature Committee) data into a
BIANA database. The class attributes below hold the parser's short name
and the descriptive texts associated with it.
"""

# Short identifier of this parser.
name = "hgnc"
# Human-readable summary of what the parser does; __init__ passes it to
# BianaParser as default_script_description.
description = "This file implements a program that fills up tables in BIANA database with information from HGNC"
# What a single external entity stands for in this parser.
external_entity_definition = "A external entity represents a protein"
# Description of external entity relations; left empty for this parser.
external_entity_relations = ""

def __init__(self):
    """
    Set up the parser with the HGNC default values.

    Delegates to BianaParser.__init__ with the database description, the
    script name and the script description, requiring no additional
    compulsory arguments, and then records "hgnc" as the default external
    entity attribute.
    """

    # Default values handed over to the generic BIANA parser
    parser_defaults = {
        "default_db_description": "HUGO Gene Nomenclature Committee",
        "default_script_name": "hgncParser.py",
        "default_script_description": HGNCParser.description,
        "additional_compulsory_arguments": [],
    }

    BianaParser.__init__(self, **parser_defaults)

    self.default_eE_attribute = "hgnc"

55

def parse_database(self):
    """
    Method that implements the specific operations of HGNC parser

    The input is read line by line as tab-separated text. The first line
    is the header: its column names are mapped to their positions so that
    every field is looked up by name. Each following non-blank line is
    turned into an ExternalEntity of type "protein" (with taxid 9606, as
    HGNC only contains human genes) and stored through
    self.biana_access.insert_new_external_entity.

    Columns of the HGNC download, as listed by the original author
    (CD: multiple values, comma delimited; QCD: multiple quoted values in
    a comma delimited list). Columns marked with * are read by this parser:

         0 : HGNC ID *
         1 : Approved Symbol * (official gene symbol)
         2 : Approved Name * (official gene name, stored as description)
         3 : Status
         4 : Locus Type
         5 : Previous Symbols * (CD)
         6 : Previous Names * (QCD)
         7 : Aliases * (CD)
         8 : Name Aliases * (QCD; original note: "EMPTY")
         9 : Chromosome
        10 : Date Approved
        11 : Date Modified
        12 : Date Symbol Changed (original note: "NOT EXISTS")
        13 : Date Name Changed
        14 : Accession Numbers * (CD)
        15 : Enzyme IDs * (CD)
        16 : Entrez Gene ID * (replaced Locus Link)
        17 : Ensembl Gene ID
        18 : Mouse Genome Database ID *
        19 : Specialist Database Links (CD)
        20 : Specialist Database IDs (CD; original note: "NOT EXISTS")
        21 : Pubmed IDs (CD)
        22 : RefSeq IDs * (CD; original note: "Only One is selected")
        23 : Gene Family Name (CD)
        24 : Record Type
        25 : Primary IDs
        26 : Secondary IDs
        27 : CCDS IDs
        28 : VEGA IDs
        29 : Locus Specific Databases
        30 : GDB ID (mapped data) *
        31 : Entrez Gene ID (mapped data supplied by NCBI) *
        32 : OMIM ID (mapped data supplied by NCBI) *
        33 : RefSeq (mapped data supplied by NCBI) *
        34 : UniProt ID (mapped data supplied by UniProt) *
        35 : Ensembl ID (mapped data supplied by Ensembl)
        36 : UCSC ID (mapped data supplied by UCSC)
        37 : Rat Genome Database ID (mapped data supplied by RGD) *

    Raises:
        Exception: whatever error made a line unparseable (for example a
            KeyError for a column missing from the header) is re-raised
            after the traceback and the line number are written to stderr.
    """

    # No explicit list of tables to lock is configured here; the original
    # note says all tables are locked for the moment.

    self.initialize_input_file_descriptor()

    line_number = 0
    header_columns = {}
    columns = 0

    for line in self.input_file_fd:

        line_number += 1

        # Read columns of header line into dictionary (name -> position)
        if line_number == 1:
            value_list = line.strip().split("\t")
            header_columns = dict((column_name, index)
                                  for index, column_name in enumerate(value_list))
            columns = len(value_list)
            continue

        # A blank line (e.g. at the end of the file) carries no record
        if not line.strip():
            continue

        try:
            hgnc_object = self._hgnc_build_entity(line, header_columns, columns)

            # Save the object in the database
            self.biana_access.insert_new_external_entity(externalEntity=hgnc_object)

        except Exception:
            traceback.print_exc()
            sys.stderr.write("Error in parsing line %s\n" %(line_number))
            # Re-raise the original error so its type and message are kept
            raise


def _hgnc_build_entity(self, line, header_columns, columns):
    """
    Build the ExternalEntity described by one data line of the HGNC file.

    line is the raw text line, header_columns maps column names to field
    positions and columns is the number of fields found in the header.
    A mismatch in the number of fields is reported on stderr but parsing
    is still attempted.
    """

    # Create a new external entity object
    hgnc_object = ExternalEntity(source_database=self.database, type="protein")

    # ADDING TAXID AS IT ONLY CONTAINS HUMAN GENES
    hgnc_object.add_attribute(ExternalEntityAttribute(attribute_identifier="taxid",
                                                      value=9606,
                                                      type="unique"))

    line_fields = line.split("\t")

    if len(line_fields) != columns:
        sys.stderr.write("Incorrect fields number\n%s\n" %(line))

    def field(column_name):
        # Content of the named column, without surrounding whitespace
        return line_fields[header_columns[column_name]].strip()

    hgnc_id = field("HGNC ID")
    if hgnc_id.startswith("HGNC:"):
        hgnc_id = hgnc_id[len("HGNC:"):]
    self._hgnc_add_attributes(hgnc_object, "hgnc", [hgnc_id], "unique")

    self._hgnc_add_attributes(hgnc_object, "geneSymbol",
                              [field("Approved Symbol")], "unique")

    # Official gene name is entered as a description
    self._hgnc_add_attributes(hgnc_object, "description", [field("Approved Name")])

    self._hgnc_add_attributes(hgnc_object, "geneSymbol",
                              self._hgnc_split_values(field("Previous Symbols")),
                              "previous")

    self._hgnc_add_attributes(hgnc_object, "description",
                              self._hgnc_split_quoted_values(field("Previous Names")))

    self._hgnc_add_attributes(hgnc_object, "geneSymbol",
                              self._hgnc_split_values(field("Aliases")),
                              "alias")

    self._hgnc_add_attributes(hgnc_object, "accessionNumber",
                              self._hgnc_split_values(field("Accession Numbers")),
                              "cross-reference")

    # Name aliases are a quoted list whose names may contain commas;
    # fall back to a plain comma split when the field is not quoted.
    name_aliases = field("Name Aliases")
    if '"' in name_aliases:
        name_alias_values = self._hgnc_split_quoted_values(name_aliases)
    else:
        name_alias_values = self._hgnc_split_values(name_aliases)
    self._hgnc_add_attributes(hgnc_object, "description", name_alias_values, "alias")

    # Only keep enzyme identifiers made of four dot-separated parts
    ec_numbers = []
    for candidate in self._hgnc_split_values(field("Enzyme IDs")):
        match = re.match(r"\s*(.+\..+\..+\..+)", candidate)
        if match:
            ec_numbers.append(match.group(1))
    self._hgnc_add_attributes(hgnc_object, "EC", ec_numbers, "cross-reference")

    self._hgnc_add_attributes(hgnc_object, "geneID",
                              self._hgnc_split_values(field("Entrez Gene ID")),
                              "cross-reference")

    self._hgnc_add_attributes(hgnc_object, "mgi",
                              self._hgnc_split_values(field("Mouse Genome Database ID"),
                                                      "MGI:"),
                              "cross-reference")

    self._hgnc_add_attributes(hgnc_object, "refseq",
                              self._hgnc_split_values(field("RefSeq IDs")),
                              "cross-reference")

    self._hgnc_add_attributes(hgnc_object, "gdb",
                              self._hgnc_split_values(field("GDB ID (mapped data)"),
                                                      "GDB:"),
                              "cross-reference")

    self._hgnc_add_attributes(hgnc_object, "geneID",
                              self._hgnc_split_values(
                                  field("Entrez Gene ID (mapped data supplied by NCBI)")),
                              "cross-reference")

    self._hgnc_add_attributes(hgnc_object, "mim",
                              self._hgnc_split_values(
                                  field("OMIM ID (mapped data supplied by NCBI)")))

    self._hgnc_add_attributes(hgnc_object, "refseq",
                              self._hgnc_split_values(
                                  field("RefSeq (mapped data supplied by NCBI)")),
                              "cross-reference")

    self._hgnc_add_attributes(hgnc_object, "uniprotaccession",
                              self._hgnc_split_values(
                                  field("UniProt ID (mapped data supplied by UniProt)")),
                              "cross-reference")

    self._hgnc_add_attributes(hgnc_object, "rgd",
                              self._hgnc_split_values(
                                  field("Rat Genome Database ID (mapped data supplied by RGD)"),
                                  "RGD:"),
                              "cross-reference")

    return hgnc_object


def _hgnc_split_values(self, raw_value, prefix=None):
    """
    Split a comma delimited field into a list of clean values.

    Each value is stripped of surrounding whitespace; when prefix is given
    and a value starts with it, exactly that prefix is removed (str.lstrip
    is not used because it removes a set of characters, not a prefix).
    Empty entries are dropped.
    """
    values = []
    for item in raw_value.split(","):
        item = item.strip()
        if prefix is not None and item.startswith(prefix):
            item = item[len(prefix):].strip()
        if item:
            values.append(item)
    return values


def _hgnc_split_quoted_values(self, raw_value):
    """
    Split a list of double-quoted, comma delimited values.

    The separator is a closing quote, a comma and an opening quote, with
    optional whitespace around the comma, so commas inside a quoted value
    are preserved. Surrounding quotes and whitespace are removed from each
    value and empty entries are dropped.
    """
    values = []
    for item in re.split(r'"\s*,\s*"', raw_value.strip()):
        item = item.strip().strip('"').strip()
        if item:
            values.append(item)
    return values


def _hgnc_add_attributes(self, hgnc_object, attribute_identifier, values, attribute_type=None):
    """
    Add one ExternalEntityAttribute per value to hgnc_object.

    When attribute_type is None the attribute is created without a type
    argument, leaving ExternalEntityAttribute's own default in effect.
    """
    for value in values:
        if attribute_type is None:
            attribute = ExternalEntityAttribute(attribute_identifier=attribute_identifier,
                                                value=value)
        else:
            attribute = ExternalEntityAttribute(attribute_identifier=attribute_identifier,
                                                value=value,
                                                type=attribute_type)
        hgnc_object.add_attribute(attribute)

Source Code for Module biana.BianaParser.hgncParser