biana.utilities.utilities

1 """ 2 BIANA: Biologic Interactions and Network Analysis 3 Copyright (C) 2009 Javier Garcia-Garcia, Emre Guney, Baldo Oliva 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation, either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. 17 18 """ 19 20 """ 21 File : utilities.py 22 Author : Ramon Aragues & Joan Planas 23 Creation : 15.01.2004 24 Modification: October 2007 25 Contents : miscelaneous utilities used by piana 26 ======================================================================================================= 27 28 """ 29 30 import sys, string, re, md5 31 import fnmatch 32 import os 33 import cPickle 34 import time 35 36 from sets import * 37 38 import math 39 40 41 from Bio.Blast import NCBIStandalone 42 from Bio import Fasta # needed to read fasta format input files 43 44 import gzip 45 46 verbose = 0 47 verbose_detailed = 0 48 verbose_very_detailed = 0 49 verbose_matrix = 0 50 verbose_string_utilities = 0 51 verbose_blast_report = 0 52 53 54 # ------------------------------ 55 # utilities for loading file info 56 # ------------------------------ 57

def return_non_commented_non_empty_lines(file_name=None):
    """
    Returns a list with one element per meaningful line of file "file_name"

    A line is skipped when its first non-whitespace character is a '#',
    or when it contains nothing but whitespace.

    Lines that are kept have their trailing whitespace (including the
    end of line) removed; their leading whitespace is preserved.
    """
    these_lines = []

    # open() replaces the file() builtin, which does not exist in Python 3
    input_fd = open(file_name, "r")
    try:
        for one_line in input_fd:
            stripped_line = one_line.strip()

            # blank line or comment line: nothing to keep
            if not stripped_line or stripped_line.startswith("#"):
                continue

            these_lines.append(one_line.rstrip())
    finally:
        # the handle is released even if reading raises
        input_fd.close()

    return these_lines

75 76 77 # ------------------------------------------------------- 78 # Methods for conversion from one protein code to another 79 # ------------------------------------------------------- 80

def sequence2md5(sequence):
    """
    Return MD5 code for sequence "sequence"

    The code is the MD5 hexdigest of the sequence, followed by its leading
    4 chars and then by its last 4 chars. Leading and trailing whitespace
    is stripped from the sequence before anything is computed.

    For a sequence shorter than 4 chars, both the head and the tail are
    the whole sequence.

    "sequence" is expected to be a text string
    """
    # hashlib replaces the md5 module (deprecated, and removed in Python 3);
    # it is imported here so this function does not rely on the module header
    import hashlib

    sequence = sequence.strip()
    head = sequence[:4]
    tail = sequence[-4:]

    try:
        toconvert = hashlib.md5(sequence)
    except TypeError:
        # the hash constructor only accepts bytes here: encode the text first
        toconvert = hashlib.md5(sequence.encode("utf-8"))

    return toconvert.hexdigest() + head + tail

96 97

def get_id_type(protein_id):
    """
    method that returns a list with potential types of protein code (ie database column) of
    a given protein name "protein_id" for which we do not know the type of code

    This method should be called prior to PianaDBaccess.get_list_proteinPiana() if the identifier type is not known

    Attention!!! This function is only being used by the STRING parser string2piana: that is why
    it only looks for codes that might appear in STRING

    Every pattern is tried, so the returned list can hold several types. The types appear in
    the order unientry, refseq, uniacc, accessionNumber, geneName. An empty list is returned
    when nothing matches.

    Each pattern is only anchored at the beginning of "protein_id" (re.match): characters
    that follow the matched prefix are not checked.
    """
    # (pattern, type of code) pairs, tried in this order
    id_type_patterns = (
        # this is for sure a unientry (ie. uniprot entry)
        (r"[a-zA-Z0-9]{1,6}_[a-zA-Z0-9]{1,5}", "unientry"),
        # this call used to be made without protein_id, which raised a
        # TypeError on every call of get_id_type
        (r"[a-zA-Z]{1,3}_\d+", "refseq"),
        # this is a uniacc or emblAccession
        (r"[OPQ][0-9][a-zA-Z0-9]{3}[0-9]", "uniacc"),
        # being very permissive to consider something a emblAccession...
        (r"[a-zA-Z]{1,3}[0-9]{5,7}", "accessionNumber"),
        # there are letters and numbers... try geneName
        (r"\w+", "geneName"),
    )

    # No final "else" resetting the list is needed: every pattern above starts
    # with a word character, so when "\w+" fails nothing else can have matched.
    list_potential_id_types = []
    for pattern, id_type in id_type_patterns:
        if re.match(pattern, protein_id):
            list_potential_id_types.append(id_type)

    return list_potential_id_types

132 133 134

def get_clean_sequence(input_sequence):
    """
    returns "input_sequence" without any of the characters that are not part of the sequence itself:
    spaces, asterisks, newlines, tabs, carriage returns and underscores are all removed,
    leaving only a contiguous list of aminoacids
    """
    cleaned_sequence = input_sequence

    # remove every occurrence of each unwanted character, one character at a time
    for unwanted_character in (" ", "*", "\n", "\t", "\r", "_"):
        cleaned_sequence = cleaned_sequence.replace(unwanted_character, "")

    return cleaned_sequence

140 141 142 143 144 145 # -------------------------- 146 # String utilities 147 # -------------------------- 148

def parse_string_field_value(input_string=None, separator_field_value=None, global_separators=None):
    """
    parses a string that has field_names and values and returns a list of pairs [[field_name,value], [field_name, value], ...]

    global_separators is a list with all the strings that can act as a separator between groups (e.g. [" ", "|", ";" ])
    If it is None or empty, the whole input string is treated as a single group

    separator_field_value is the string that separates a field name from its value inside a group

    string must follow format:

       [global_separator]*field[separator_field_value]value[global_separator]*field[separator_field_value]value[global_separator]*.....

    meaning that each field has a value

    for example, stringX

        "  ;Name=Ramon ; and Name=Pedro , Synonim=Juan ;  "

    could be converted into a list [[Name, Ramon], [Name, Pedro], [Synonim, Juan]] by calling
    parse_string_field_value(input_string=stringX, separator_field_value="=", global_separators=[" ",";"])

    Groups that do not contain separator_field_value are ignored. If a group contains it more than once,
    only the text before the first one (field) and the text between the first and the second one (value) are used.

    If input_string is None, an empty list is returned.

    Attention!!! Even if space (ie " ") is not in global_separators, a strip() is done before returning the pairs, to remove
                 surrounding spaces from the field names and field values. So, if those spaces are needed, something else has to be done...
    """
    if input_string is None:
        # nothing to parse
        return []

    if verbose_string_utilities:
        sys.stderr.write("Old input string: %s\n" %input_string)

    if global_separators:
        # to simplify the process, convert all global separators into the first one...
        main_separator = global_separators[0]
        for other_separator in global_separators[1:]:
            input_string = input_string.replace(other_separator, main_separator)

        if verbose_string_utilities:
            sys.stderr.write("New input string: %s\n" %input_string)

        # ... so that a single split yields the pieces found between global separators
        string_global_groups = input_string.split(main_separator)
    else:
        # no global separator given: the whole string is the only group
        # (this used to fail with TypeError for None and IndexError for an empty list)
        string_global_groups = [input_string]

    if verbose_string_utilities:
        sys.stderr.write("Global groups: %s\n" %string_global_groups)

    # a group only holds a field and a value if it contains separator_field_value
    pairs_field_value = []
    for string_global_group in string_global_groups:
        if separator_field_value in string_global_group:
            pair_field_value = string_global_group.split(separator_field_value)
            pairs_field_value.append([pair_field_value[0].strip(), pair_field_value[1].strip()])

    return pairs_field_value

205 206 207 208 209 ############################## 210 ### PARSER UTILITIES ######### 211 ##############################

def return_dic_gi_vs_tax(file_name= None):
    """
    returns a dictionary { gi: tax_id,
                           gi: tax_id,
                           ......
                         }

    filled with info from "file_name" (gis and tax_ids are both integers)

    "file_name" is a file name of a file that has two columns separated by whitespace (e.g. tabs)
       1st one is gi code
       2nd one is tax id for that gi

    Lines that do not have exactly two columns are ignored.
    If a gi appears on several lines, the tax id of the last one is kept.
    A ValueError is raised if a column of a two-column line is not an integer.
    """
    dic_gi_tax = {}

    # open() replaces the file() builtin, which does not exist in Python 3
    file_fd = open(file_name, "r")
    try:
        for line in file_fd:
            line_fields = line.split()

            if len(line_fields) == 2:
                dic_gi_tax[int(line_fields[0])] = int(line_fields[1])
    finally:
        # the handle is released even if a line cannot be converted
        file_fd.close()

    return dic_gi_tax

240

Source Code for Module biana.utilities.utilities