1 """
2 BIANA: Biologic Interactions and Network Analysis
3 Copyright (C) 2009 Javier Garcia-Garcia, Emre Guney, Baldo Oliva
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18 """
19
20 """
21 File : utilities.py
22 Author : Ramon Aragues & Joan Planas
23 Creation : 15.01.2004
24 Modification: October 2007
25 Contents : miscellaneous utilities used by piana
26 =======================================================================================================
27
28 """
29
30 import sys, string, re, md5
31 import fnmatch
32 import os
33 import cPickle
34 import time
35
36 from sets import *
37
38 import math
39
40
41 from Bio.Blast import NCBIStandalone
42 from Bio import Fasta
43
44 import gzip
45
# Module-level debug switches; every one of them is off (0) by default.
# In the code visible in this section only verbose_string_utilities is read:
# when it is non-zero, the field/value string parser writes its intermediate
# strings to stderr.
# NOTE(review): the remaining flags are presumably read by code that is not
# visible in this section -- confirm before removing or renaming any of them.
verbose = 0
verbose_detailed = 0
verbose_very_detailed = 0
verbose_matrix = 0
verbose_string_utilities = 0
verbose_blast_report = 0
52
53
54
55
56
57
75
76
77
78
79
80
# NOTE(review): the "def" line of this function is missing from the reviewed
# text; the name and signature are reconstructed from the body and must be
# confirmed against the callers.
def sequence2md5(sequence):
    """
    Return the MD5 code for sequence "sequence".

    The code is the MD5 hex digest of the stripped sequence, followed by the
    first 4 characters and then the last 4 characters of the stripped
    sequence. For sequences shorter than 4 characters the two slices overlap,
    so the same characters appear in both the head and the tail.

    "sequence" is a text string; leading and trailing whitespace is ignored.
    """
    # hashlib replaces the "md5" module, which is deprecated since Python 2.5
    # and does not exist in Python 3. It is imported locally so that this
    # function does not depend on the module-level "md5" import.
    import hashlib

    sequence = sequence.strip()

    # hashlib only digests bytes: a Python 2 "str" already is bytes, while
    # text strings (Python 2 unicode, Python 3 str) must be encoded first.
    if isinstance(sequence, bytes):
        raw_sequence = sequence
    else:
        raw_sequence = sequence.encode("utf-8")

    digested = hashlib.md5(raw_sequence).hexdigest()

    return digested + sequence[:4] + sequence[-4:]
96
97
# NOTE(review): the "def" line of this function is missing from the reviewed
# text; the name and signature are reconstructed from the body and must be
# confirmed against the callers.
def get_code_column(protein_id):
    """
    Return the list of potential types of protein code (ie database column)
    of the protein name "protein_id", for which the type of code is not known.

    The returned list holds, in this order, every type whose pattern matches
    the beginning of "protein_id":

       "unientry", "refseq", "uniacc", "accessionNumber", "geneName"

    An empty list is returned when no pattern matches. Patterns are only
    anchored at the start of "protein_id" (re.match), so trailing characters
    do not prevent a match.

    This method should be called prior to
    PianaDBaccess.get_list_proteinPiana() if the identifier type is not known.

    Attention!!! This is currently only being used by the STRING parser
    string2piana: that is why only codes that might appear in STRING are
    looked for.
    """
    # (code type, pattern) pairs, in the order in which matches are reported.
    # Raw strings keep "\d" and "\w" from being read as string escapes.
    potential_id_patterns = (
        ("unientry", r"[a-zA-Z0-9]{1,6}_[a-zA-Z0-9]{1,5}"),
        ("refseq", r"[a-zA-Z]{1,3}_\d+"),
        ("uniacc", r"[OPQ][0-9][a-zA-Z0-9]{3}[0-9]"),
        ("accessionNumber", r"[a-zA-Z]{1,3}[0-9]{5,7}"),
        ("geneName", r"\w+"),
    )

    # Every pattern is matched against protein_id; the refseq test used to
    # call re.match() with the pattern alone, which raises TypeError.
    # No separate "nothing matched" branch is needed: each pattern requires a
    # word character at position 0, so when "geneName" fails all others fail
    # too and the list is already empty.
    list_potential_id_types = [
        id_type
        for id_type, pattern in potential_id_patterns
        if re.match(pattern, protein_id)
    ]

    return list_potential_id_types
132
133
134
# NOTE(review): the "def" line of this function is missing from the reviewed
# text; the name and signature are reconstructed from the body and must be
# confirmed against the callers.
def get_clean_sequence(input_sequence):
    """
    Return "input_sequence" without spaces, asterisks, newlines, tabs,
    carriage returns and underscores, leaving only a contiguous string of
    aminoacids.
    """
    cleaned_sequence = input_sequence

    # Characters are removed one after the other, in this fixed order.
    for unwanted_character in (" ", "*", "\n", "\t", "\r", "_"):
        cleaned_sequence = cleaned_sequence.replace(unwanted_character, "")

    return cleaned_sequence
140
141
142
143
144
145
146
147
148
# NOTE(review): the "def" line of this function is missing from the reviewed
# text; the name and parameter names are taken from the example call in the
# docstring. Whether the parameters had default values cannot be told from
# here -- confirm against the callers.
def parse_string_field_value(input_string, separator_field_value, global_separators):
    """
    Parse a string that has field names and values and return a list of pairs
    [[field_name, value], [field_name, value], ...]

    "input_string" is the string to parse. It must follow the format:

       [global_separator]*field[separator_field_value]value[global_separator]*field[separator_field_value]value[global_separator]*.....

    meaning that each field has a value.

    "separator_field_value" is the string that separates a field name from
    its value (e.g. "="). Only its first occurrence inside a group splits the
    group, so a value may itself contain the separator ("a=b=c" is parsed as
    field "a" with value "b=c").

    "global_separators" is a list with all the strings that can act as a
    separator between field/value groups (e.g. [" ", "|", ";"]). If it is
    empty or None, the whole input string is treated as one single group.

    Groups that do not contain "separator_field_value" are ignored.

    For example, stringX

       " ;Name=Ramon ; and Name=Pedro , Synonim=Juan ; "

    is converted into the list [[Name, Ramon], [Name, Pedro], [Synonim, Juan]]
    by calling parse_string_field_value(input_string=stringX,
                                        separator_field_value="=",
                                        global_separators=[" ", ";"])

    Attention!!! Even if space (ie " ") is not in global_separators, a
    strip() is done before returning the pairs, to remove surrounding spaces
    from the field names and field values. So, if those spaces are needed,
    something else has to be done...
    """
    pairs_field_value = []

    if verbose_string_utilities:
        sys.stderr.write("Old input string: %s\n" %input_string)

    if global_separators:
        # Make the string homogeneous: every global separator is replaced by
        # the first one, so that a single split() yields all the groups.
        main_separator = global_separators[0]
        for other_separator in global_separators[1:]:
            input_string = input_string.replace(other_separator, main_separator)

    if verbose_string_utilities:
        sys.stderr.write("New input string: %s\n" %input_string)

    if global_separators:
        string_global_groups = input_string.split(main_separator)
    else:
        # No global separator at all: there is nothing to split on (indexing
        # global_separators[0] would raise IndexError here).
        string_global_groups = [input_string]

    if verbose_string_utilities:
        sys.stderr.write("Global groups: %s\n" %string_global_groups)

    for string_global_group in string_global_groups:
        if separator_field_value in string_global_group:
            # partition() splits on the first separator only, which keeps
            # values that contain the separator from being truncated.
            field_name, _, field_value = string_global_group.partition(separator_field_value)
            pairs_field_value.append([field_name.strip(), field_value.strip()])

    return pairs_field_value
205
206
207
208
209
210
211
# NOTE(review): the "def" line of this function is missing from the reviewed
# text; the name and signature are reconstructed from the body and must be
# confirmed against the callers.
def return_dic_gi_vs_tax(file_name):
    """
    Return a dictionary { gi: tax_id,
                          gi: tax_id,
                          ......
                        }

    filled with info from "file_name" (gis and tax_ids are both integers).

    "file_name" is the name of a file that has two columns per line:
       1st one is gi code
       2nd one is tax id for that gi

    Columns are split on any whitespace (tabs included). Lines that do not
    have exactly two columns are ignored. If a gi appears more than once, the
    last line read for it wins.

    Raises ValueError if a column of a two-column line is not an integer, and
    the usual I/O error if "file_name" cannot be opened.
    """
    dic_gi_tax = {}

    # open() replaces the Python 2-only file() builtin; the "with" block
    # closes the file even if a line cannot be converted to integers.
    with open(file_name, "r") as file_fd:
        for line in file_fd:
            line_fields = line.split()

            if len(line_fields) == 2:
                dic_gi_tax[int(line_fields[0])] = int(line_fields[1])

    return dic_gi_tax
240