biana.BianaParser.cogParser

"""
COG Parser Class
"""

# Class-level metadata describing this parser.
# `description` is passed to BianaParser.__init__ as the default script
# description (see __init__). How `name`, `external_entity_definition` and
# `external_entity_relations` are consumed is not visible in this file --
# presumably they are read by the BianaParser base class (TODO confirm).
name = "cog"
description = "Clusters of Orthologous Groups of proteins (COGs)"
external_entity_definition = "An element in a COG"
external_entity_relations = "A COG"

def __init__(self):
    """
    Set up the parser with the COG-specific default values.

    Delegates to BianaParser.__init__ with the default database
    description, script name and script description, and declares no
    additional optional arguments. Afterwards "cog" is stored as the
    default external entity attribute.
    """

    # Defaults handed over to the generic parser initialisation
    parser_defaults = {
        "default_db_description": "COG database",
        "default_script_name": "cogParser.py",
        "default_script_description": CogParser.description,
        "additional_optional_arguments": [],
    }
    BianaParser.__init__(self, **parser_defaults)

    self.default_eE_attribute = "cog"

    # Kept disabled, as in the original source:
    #self.is_promiscuous = True

def parse_database(self):
    """
    Parse a COG directory and insert one protein external entity per
    FASTA record found in it.

    self.input_file must be a directory containing these files:
      - "org.txt":  species letter code -> taxonomy identifier
      - "fun.txt":  functional category letter -> description
      - "myva=gb":  protein name -> GI (entries whose GI is "gi?" are skipped)
      - "whog":     COG headers followed by "<code>: <proteins...>" lines
      - "myva":     protein sequences in FASTA format

    Every protein in "myva" with a non-empty sequence is inserted through
    self.biana_access.insert_new_external_entity with the attributes
    "proteinsequence", "GI" (if known), "taxID", "COG" and "function"
    (if the protein appears in "whog") and "OrderedLocusName".

    Side effects: sets self.input_path to the input directory.

    Raises:
        ValueError: if self.input_file is not a directory, if a required
            file is missing, or if "whog" assigns proteins before any COG
            header has been read.
        KeyError: if "whog" uses a species code absent from "org.txt" or
            a functional letter absent from "fun.txt".
    """

    # FIRST: Check that all the files exist
    if not os.path.isdir(self.input_file):
        raise ValueError("You must specify a path instead of a file")
    self.input_path = self.input_file

    # "pa" is not necessary for the moment
    required_files = ["myva", "myva=gb", "org.txt", "fun.txt", "whog"]

    for current_file in required_files:
        if not os.path.exists(os.path.join(self.input_path, current_file)):
            raise ValueError("File %s is missing in %s" % (current_file, self.input_path))

    # Read correspondence letters to TaxID for the external entities
    specie_taxid_dict = {}
    sp_taxid_regex = re.compile(r"\s*(\S+)\s+(\S+)\s+")

    with open(os.path.join(self.input_path, "org.txt"), 'r') as species_file_fd:
        for line in species_file_fd:
            m = sp_taxid_regex.match(line)
            if m:
                specie_taxid_dict[m.group(1).lower()] = m.group(2)

    # Read the functional information
    function_dict = {}
    funct_regex = re.compile(r"\s*\[(\w+)\]\s+(.+)$")

    with open(os.path.join(self.input_path, "fun.txt"), 'r') as function_file_fd:
        for line in function_file_fd:
            m = funct_regex.match(line)
            if m:
                function_dict[m.group(1)] = m.group(2)

    # Read the name and gi correspondence.
    # Trailing whitespace is optional so that a last line without a final
    # newline is not silently dropped.
    name_to_gi_dict = {}
    name2gi_regex = re.compile(r"\s*(\S+)\s+(\S+)\s*$")

    with open(os.path.join(self.input_path, "myva=gb"), 'r') as name2gi_file_fd:
        for line in name2gi_file_fd:
            m = name2gi_regex.match(line)
            if m and m.group(2) != "gi?":
                name_to_gi_dict[m.group(1).lower()] = m.group(2)

    # Obtain, from the COGs file, to which specie belongs each protein
    # and the functional classification of each COG
    name2species_dict = {}
    cogs_funct_dict = {}
    name2cogs_dict = {}
    current_cog = None

    new_cog_regex = re.compile(r"\s*\[(\w+)\]\s+(\w+)\s+(.+)$")
    assignment_regex = re.compile(r"\s*(\w{3})\:\s+(.+)$")

    with open(os.path.join(self.input_path, "whog"), 'r') as whog_file_fd:
        for line in whog_file_fd:

            m = new_cog_regex.match(line)
            if m:
                current_cog = m.group(2)
                cogs_funct_dict[current_cog] = m.group(1)
                continue

            m = assignment_regex.match(line)
            if m:
                if current_cog is None:
                    # There is no COG to attribute these proteins to
                    raise ValueError("Protein assignment found before any COG header in whog: %s"
                                     % line.strip())

                specie_code = m.group(1).lower()

                # split() without argument ignores repeated/trailing blanks,
                # so no empty protein names are registered
                for current_component in m.group(2).split():
                    component_key = current_component.lower()
                    name2cogs_dict.setdefault(component_key, []).append(current_cog)
                    name2species_dict.setdefault(component_key, set()).add(specie_code)

    def create_and_insert_eE(protein_name, sequence):
        """
        Build the external entity for a single protein and insert it.

        protein_name is the FASTA header text and sequence the list of
        sequence fragments read for it.
        """
        protein_key = protein_name.lower()

        eE_object = ExternalEntity(source_database=self.database, type="protein")
        eE_object.add_attribute(ExternalEntityAttribute(attribute_identifier="proteinsequence",
                                                        value=ProteinSequence("".join(sequence))))

        if protein_key in name_to_gi_dict:
            eE_object.add_attribute(ExternalEntityAttribute(attribute_identifier="GI",
                                                            value=name_to_gi_dict[protein_key]))

        # Proteins absent from "whog" have neither species nor COGs
        species = name2species_dict.get(protein_key, ())

        if len(species) > 1:
            print("Protein %s has more than a single specie assigned!" % protein_name)

        for current_specie in species:
            eE_object.add_attribute(ExternalEntityAttribute(attribute_identifier="taxID",
                                                            value=specie_taxid_dict[current_specie]))

        for current_cog in name2cogs_dict.get(protein_key, ()):
            eE_object.add_attribute(ExternalEntityAttribute(attribute_identifier="COG",
                                                            value=current_cog))
            # The functional classification is a string of category letters
            for current_function in cogs_funct_dict[current_cog]:
                eE_object.add_attribute(ExternalEntityAttribute(attribute_identifier="function",
                                                                value=function_dict[current_function]))

        # HOW SHOULD THE NAME BE INSERTED??? In NCBI, they appear as Locus Name... Insert it as Ordered Locus Name?
        eE_object.add_attribute(ExternalEntityAttribute(attribute_identifier="OrderedLocusName",
                                                        value=protein_name))

        self.biana_access.insert_new_external_entity(externalEntity=eE_object)

    # Read the sequences and insert the external entities
    sequence = []
    protein_name = None
    protein_name_regex = re.compile(r">(.+)$")

    with open(os.path.join(self.input_path, "myva"), 'r') as fasta_file_fd:
        for line in fasta_file_fd:

            m = protein_name_regex.match(line)
            if m:
                if sequence:
                    create_and_insert_eE(protein_name, sequence)

                sequence = []
                # Strip so trailing blanks or "\r" do not break the lookups
                protein_name = m.group(1).strip()
            else:
                residues = line.strip()
                # Ignore blank lines and any text preceding the first header
                if residues and protein_name is not None:
                    sequence.append(residues)

    # Insert the last protein of the file
    if sequence:
        create_and_insert_eE(protein_name, sequence)

Source Code for Module biana.BianaParser.cogParser