Package biana :: Package BianaParser :: Module pfamParser
[hide private]
[frames] | no frames]

Source Code for Module biana.BianaParser.pfamParser

  1  """ 
  2      BIANA: Biologic Interactions and Network Analysis 
  3      Copyright (C) 2009  Javier Garcia-Garcia, Emre Guney, Baldo Oliva 
  4   
  5      This program is free software: you can redistribute it and/or modify 
  6      it under the terms of the GNU General Public License as published by 
  7      the Free Software Foundation, either version 3 of the License, or 
  8      (at your option) any later version. 
  9   
 10      This program is distributed in the hope that it will be useful, 
 11      but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 13      GNU General Public License for more details. 
 14   
 15      You should have received a copy of the GNU General Public License 
 16      along with this program.  If not, see <http://www.gnu.org/licenses/>. 
 17   
 18  """ 
 19   
 20  """ 
 21  File        : pfamParser.py 
 22  Author      : Javier Garcia 
 23  Creation    : 19 December 2007 
 24  Contents    : fills up tables in database biana with information from PFAM 
 25  Called from :  
 26  ======================================================================================================= 
 27   
 28  """ 
 29   
 30   
 31  ## STEP 1: IMPORT NECESSARY MODULES 
 32   
 33  from bianaParser import * 
 34   
 35   
36 -class PFAMParser(BianaParser):
37 """ 38 PFAM Parser Class 39 """ 40 41 name = "pfam" 42 description = "PFAM Parser. Collection of multiple sequence alignments and hidden markov models covering many common protein domains and families" 43 external_entity_definition = "A external entity represents a pfam domain" 44 external_entity_relations = "" 45
46 - def __init__(self):
47 48 # Start with the default values 49 50 BianaParser.__init__(self, default_db_description = "Collection of multiple sequence alignments and hidden markov models covering many common protein domains and families", 51 default_script_name = "pfamParser.py", 52 default_script_description = PFAMParser.description, 53 additional_optional_arguments = [("include-sequence-mapping",True,"Includes sequence information and maps pfam to them"), 54 ("pfamA-file-name=","Pfam-A.full.gz","Name of the pfam A file. If None, it is not used"), 55 ("pfamB-file-name=","Pfam-B.gz","Name of the pfam B file. If None, it is not used"), 56 ("pfamSeq-file-name=","pfamseq.gz","Name of the file with pfam sequences (only used if \"include-sequence-mapping\" option is selected")]) 57 self.default_eE_attribute = "pfam"
58 #self.is_promiscuous = True 59 60
61 - def parse_database(self):
62 """ 63 Method that implements the specific operations of PFAM parser 64 """ 65 66 # Add the possibility to transfer pfam id using uniprotaccession 67 self.biana_access._add_transfer_attribute( externalDatabaseID = self.database.get_id(), 68 key_attribute = "uniprotaccession", 69 transfer_attribute="pfam" ) 70 71 self.biana_access._add_transfer_attribute( externalDatabaseID = self.database.get_id(), 72 key_attribute = "uniprotentry", 73 transfer_attribute="pfam" ) 74 75 if not self.input_file.endswith(os.sep): 76 self.input_file += os.sep 77 78 self.uniprot_acc_seq_md5_dict = {} #! this dictionary is possibly causing memory problems 79 80 self.insert_mapping = None 81 82 # First, if "include_sequence_mapping" option is selected, gets the exact information of sequences used and inserts them to the database 83 84 if self.arguments_dic["include-sequence-mapping"]: 85 86 sequences_parsing_time = time.time() 87 88 self.seq_database = ExternalDatabase( databaseName = self.sourcedb_name+"_sequences", 89 databaseVersion = self.sourcedb_version, 90 databaseFile = self.input_file.split(os.sep)[-1], 91 databaseDescription = self.database_description, 92 defaultExternalEntityAttribute = "proteinSequence") 93 94 self.biana_access.insert_new_external_database( externalDatabase = self.seq_database ) 95 96 self.insert_mapping = 1 97 98 pfamseq_file = self.input_file+self.arguments_dic["pfamSeq-file-name"] 99 100 if pfamseq_file.endswith(".gz"): 101 pfamseq_input_file_fd = gzip.open(pfamseq_file,'r') 102 else: 103 pfamseq_input_file_fd = file(pfamseq_file, 'r') 104 105 sequence = [] 106 protein_title_line = None 107 108 protein_number = 0 109 110 #self.title_regex = re.compile("^(\w+)\.(\d+)") 111 112 uniprot_acc = None 113 uniprot_acc_version = None 114 uniprot_entry = None 115 sequence = None 116 117 self.title_regex = re.compile("^(\w+)\.(\d+)\s+(\S+)") 118 119 for line in pfamseq_input_file_fd: 120 121 if line[0]==">": 122 protein_number += 1 123 124 if self.time_control: 125 if protein_number%20000==0: 126 sys.stderr.write("%s proteins in %s seconds\n" %(protein_number,time.time()-self.initial_time)) 127 128 if sequence is not None: 129 self.parse_pfam_seq_record(header_line = protein_title_line, sequence = "".join(sequence)) 130 131 protein_title_line = line[1:] 132 sequence = [] 133 else: 134 sequence.append(line.strip()) 135 136 if len(sequence)>0: 137 self.parse_pfam_seq_record(header_line = protein_title_line, sequence = "".join(sequence)) 138 139 # Updates the information that this external database has inserted 140 self.seq_database.set_parsing_time( int(time.time() - sequences_parsing_time) ) 141 self.biana_access.update_external_database_external_entity_attributes( self.seq_database ) 142 143 pfamseq_input_file_fd.close() 144 145 146 if self.arguments_dic["pfamA-file-name"].lower() != "none": 147 self.parse_pfam_file(pfamType='A', pfamFile= self.input_file+self.arguments_dic["pfamA-file-name"]) 148 149 150 if self.arguments_dic["pfamB-file-name"].lower() != "none": 151 self.parse_pfam_file(pfamType='B', pfamFile= self.input_file+self.arguments_dic["pfamB-file-name"])
152 153
154 - def parse_pfam_seq_record(self, header_line, sequence):
155 156 pfamseq_object = ExternalEntity( source_database = self.seq_database, type="protein" ) 157 158 m = self.title_regex.match(header_line) 159 160 if m: 161 uniprot_acc = m.group(1) 162 uniprot_acc_version = m.group(2) 163 uniprot_entry = m.group(3) 164 else: 165 raise "ERROR in parsing line %s" %header_line 166 167 pfamseq_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "uniprotentry", value = uniprot_entry,type = "cross-reference" )) 168 pfamseq_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "uniprotaccession", value = uniprot_acc, version = uniprot_acc_version, type = "cross-reference")) 169 170 sequenceObject = ProteinSequence(sequence.strip()) 171 172 pfamseq_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "proteinSequence", value = sequenceObject)) 173 174 self.uniprot_acc_seq_md5_dict["%s.%s" %(uniprot_acc, uniprot_acc_version)] = sequenceObject.get_sequence_MD5() 175 176 self.biana_access.insert_new_external_entity( externalEntity = pfamseq_object )
177 178 179
180 - def parse_pfam_file(self, pfamType, pfamFile):
181 182 # STOCKHOLM 1.0 183 #=GF ID 14-3-3 One word name for family 184 #=GF AC PF00244.11 Accession number in form PFxxxxx.version or PBxxxxxx. 185 #=GF DE 14-3-3 protein Short description of family 186 187 #GF DR PROSITE; PDOC00633; 188 #GF DR SMART; 14_3_3; 189 #GF DR PRINTS; PR00305; 190 #=GF DR SCOP; 1a4o; fa; 191 #=GF DR INTERPRO; IPR000308; 192 #=GF DR PFAMB; PB176422; 193 #=GF DR PRODOM; PD000197; 194 #=GF DR PFAMA; PF00297.13; 195 196 #=GF SQ 560 197 #=GS Q7M332_SHEEP/3-161 AC Q7M332.1 198 #=GS 1433Z_BOVIN/3-236 DR PDB; 1a38 B; 3-228; 199 200 if pfamFile.endswith(".gz"): 201 pfam_input_file_fd = gzip.open(pfamFile,'r') 202 else: 203 pfam_input_file_fd = file(pfamFile, 'r') 204 205 line_number=0 206 207 # Create a new external entity object 208 pfam_object = ExternalEntity( source_database = self.database, type="pattern") 209 210 search_id = re.compile("^#=GF ID\s+(.+)$") 211 search_description = re.compile("^#=GF DE\s+(.+)$") 212 search_prosite = re.compile("^#=GF DR\s+PROSITE;\s+(\w+);") 213 search_interpro = re.compile("^#=GF DR\s+INTERPRO;\s+(\w+);") 214 search_scop = re.compile("^#=GF DR\s+SCOP;\s+(\w+);\s+(\w+);") 215 search_prints = re.compile("^#=GF DR\s+PRINTS;\s+(\w+);") 216 search_pfam_ac = re.compile("^#=GF AC\s+(.+)\.*(\d*)$") 217 search_pfama = re.compile("^#=GF DR\s+PFAMA;\s+(.+)\.*(\d*);$") 218 search_pfamb = re.compile("^#=GF DR\s+PFAMB;\s+(.+);$") 219 search_prodom = re.compile("^#=GF DR\s+PRODOM;\s+(.+);$") 220 search_end = re.compile("^\/\/$") 221 222 223 #=GS Q28DR3_XENTR/4-241 AC Q28DR3.1 224 search_uniprot = re.compile("^#=GS\s+([\w\_]+)\/(\d+\-\d+)\s+AC\s+(\w+)\.(\d+)") 225 226 #=GS 1433T_HUMAN/3-236 DR PDB; 2btp A; 3-234; 227 search_pdb_cross_ref = re.compile("^#=GS\s+[\w\_]+\/\d+\-\d+\s+DR\s+PDB;\s+(\w+)\s+(\w*);\s+(\d+)\-(\d+)") 228 229 entries = 0 230 231 for line in pfam_input_file_fd: 232 233 pfamAC = None 234 pfamDescription = None 235 pfamType = pfamType 236 pfamName = None 237 238 line_number += 1 239 240 try: 241 line.strip() 242 243 if( search_end.match(line) ): 244 # Insert current pfam_object 245 # Insert current pfam attribute properties 246 247 self.biana_access.insert_new_external_entity( externalEntity = pfam_object ) 248 249 if self.time_control: 250 if entries%200==0: 251 sys.stderr.write("%s pfam entries in %s seconds\n" %(entries,time.time()-self.initial_time)) 252 253 pfam_object = ExternalEntity( source_database = self.database, type="pattern") 254 continue 255 256 257 #ID 258 m = search_id.match(line) 259 if m: 260 entries += 1 261 pfamName = m.group(1) 262 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "name", value = pfamName, type = "unique")) 263 continue 264 265 #DESCRIPTION 266 m = search_description.match(line) 267 if m: 268 pfamDescription = m.group(1) 269 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "description", value = pfamDescription)) 270 continue 271 272 #PRINTS 273 m = search_prints.match(line) 274 if m: 275 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "prints", value = m.group(1), type = "cross-reference")) 276 277 278 #PRODOM 279 m = search_prodom.match(line) 280 if m: 281 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "prodom", value = m.group(1), type = "cross-reference")) 282 283 #PFAMA 284 m = search_pfama.match(line) 285 if m: 286 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "pfam", value = m.group(1), type = "cross-reference")) 287 288 #PFAMB 289 m = search_pfamb.match(line) 290 if m: 291 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "pfam", value = m.group(1), type = "cross-reference")) 292 continue 293 294 #INTERPRO 295 m = search_interpro.match(line) 296 if m: 297 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "interpro", value = m.group(1), type = "cross-reference")) 298 continue 299 300 301 #AC 302 m = search_pfam_ac.match(line) 303 if m: 304 pfamAC = m.group(1) 305 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "pfam", value = pfamAC, type = "unique") ) 306 continue 307 308 #PROSITE 309 m = search_prosite.match(line) 310 if m: 311 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "prosite", value = m.group(1), type = "cross-reference")) 312 continue 313 314 #UNIPROT 315 m = search_uniprot.match(line) 316 if m: 317 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "uniprotaccession", value = m.group(3), version = m.group(4), type = "cross-reference") ) 318 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "uniprotentry", value = m.group(1), type = "cross-reference")) 319 320 if self.insert_mapping: 321 try: 322 sequenceMD5 = self.uniprot_acc_seq_md5_dict["%s.%s" %(m.group(3),m.group(4))] 323 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "sequenceMap", value = sequenceMD5, 324 additional_fields = {"seq_range": m.group(2)} )) 325 except: 326 sys.stderr.write("SequenceMD5 not found for %s.%s\n" %(m.group(3),m.group(4))) 327 328 329 330 continue 331 332 #PDB 333 m = search_pdb_cross_ref.match(line) 334 if m: 335 pfam_object.add_attribute(ExternalEntityAttribute( attribute_identifier = "pdb", value = m.group(1), type = "cross-reference", 336 additional_fields = {"chain": m.group(2), 337 "pdb_range": "%s-%s" %(m.group(3),m.group(4))} )) 338 continue 339 340 except: 341 print line 342 print traceback.print_exc() 343 sys.stderr.write("Error in parsing line %s\n" %(line_number)) 344 raise ValueError("error in parsing line") 345 346 347 if pfam_object is not None: 348 self.biana_access.insert_new_external_entity( externalEntity = pfam_object ) 349 350 pfam_input_file_fd.close()
351