1 """
2 BIANA: Biologic Interactions and Network Analysis
3 Copyright (C) 2009 Javier Garcia-Garcia, Emre Guney, Baldo Oliva
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18 """
19
20 import md5
21 from biana.utilities import autoincrement
22
23
25 """
Class to represent a biological sequential entity, such as nucleotide sequences, protein sequences, etc.

This class is supposed to be an abstract class. Only instances of its subclasses should be created.
29 """
30
31
def __init__(self, sequence, sequenceMD5=None, sequenceID=None, sequence_type=None):
    """
    Stores a cleaned copy of the sequence together with its identifiers

    "sequence": the raw sequence text. Spaces, "*", newlines, tabs, carriage returns and "_" are removed, and what remains is stored in upper case

    "sequenceMD5": an already computed MD5 digest for the sequence. It is stored as given; when it is left as None, no digest is stored at construction time

    "sequenceID": the unique identifier for the sequence (None when it is not given)

    "sequence_type": a label for the kind of sequence. It is stored unchanged; the subclasses in this file pass "rna", "dna" and "peptide"
    """

    # Drop every non-sequence character, one kind at a time, before upper-casing
    cleaned = sequence
    for unwanted in (" ", "*", "\n", "\t", "\r", "_"):
        cleaned = cleaned.replace(unwanted, "")
    self.sequence = cleaned.upper()

    self.sequenceMD5_value = sequenceMD5
    self.sequenceID = sequenceID
    self.sequence_type = sequence_type

    # Nothing has been fragmented yet at construction time
    self.fragmented_sequence = None
51
54
56 return len(self.sequence)
57
60
62 return self.sequence_type
63
65 """
66 it returns a list with the ordered indices of the fragments
67 """
68
69 divisions = len(self.sequence) / using_size
70 resta = len(self.sequence) % using_size
71
72 temp_seq = self.sequence
73
74 if resta!=0:
75 divisions += 1
76 for less in xrange(using_size-resta):
77 temp_seq += using_list[0]
78 del less
79
80 return [ using_translation_method(temp_seq[(x*using_size):(x*using_size)+using_size]) for x in xrange(divisions) ]
81
83 return self.fragmented_sequence
84
86 """
Now:
SequenceMD5 is used only internally, and it is represented as a 16-byte string (the raw MD5 digest of the sequence)

Previous:
Returned the MD5 code for sequence "sequence"
(MD5 hex digest of the sequence + its leading 4 chars
+ its last 4 chars)
94 """
95
96 if self.sequenceMD5_value is None:
97 sequence = self.sequence.strip()
98 toconvert = md5.new(sequence)
99
100 digested = toconvert.digest()
101 self.sequenceMD5_value = digested
102
103
104
105
106
107
108
109
110
111 return self.sequenceMD5_value
112
113
115 return self.sequenceID
116
118 """
Sets the sequenceID for a sequence.

If the sequence already has a sequenceID, it is not modified and a ValueError is raised. In the special cases where it has to be replaced, use the parameter force=True
122 """
123
124 if( self.sequenceID is None ) or force is True:
125 self.sequenceID = sequenceID
126 else:
127 raise ValueError("Trying to set an ID to a sequence that previously had one...)")
128
129
131 """
132 Returns a list of sequence objects contained in a file in the specified format
133
134 "format" can be: fasta
135
136 "sequences_type" can be: protein, dna or rna
137 """
138
139 if inputPath.endswith(".gz"):
140 import gzip
141 in_fd = gzip.open(inputPath,'r')
142 else:
143 in_fd = open(inputPath)
144
145 sequences_list = []
146
147 temp_seq = []
148
149 format = format.lower()
150 sequences_type = sequences_type.lower()
151
152 sequenceID = None
153
154 if format=="fasta":
155 for line in in_fd:
156 if line[0]==">":
157 if len(temp_seq)>0:
158 if sequences_type=="protein":
159 sequences_list.append( ProteinSequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) )
160 elif sequences_type=="dna":
161 sequences_list.append( DNASequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) )
162 elif sequences_type=="rna":
163 sequences_list.append( RNASequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) )
164 else:
165 raise ValueError('Sequence type not recognized: %s' %sequences_type)
166 temp_seq = []
167 sequenceID=line[1:].strip()
168 else:
169 temp_seq.append(line.strip())
170
171 else:
172 raise ValueError("Format not recognized: %s" %(format))
173
174 if len(temp_seq)>0:
175 if sequences_type=="protein":
176 sequences_list.append( ProteinSequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) )
177 elif sequences_type=="dna":
178 sequences_list.append( DNASequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) )
179 elif sequences_type=="rna":
180 sequences_list.append( RNASequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) )
181 else:
182 raise ValueError('Sequence type not recognized: %s' %sequences_type)
183
184 return sequences_list
185
186 read_sequences_file = staticmethod( read_sequences_file )
187
188
190
191 translation_dict = None
192 dictionary = "ACUG"
193 window_size = 8
194
def __init__(self, sequence, sequenceMD5=None, sequenceID=None):
    """
    Creates an RNA sequence

    "sequence": the raw sequence text. It is cleaned by Sequence.__init__

    "sequenceMD5" and "sequenceID" are handed over unchanged to Sequence.__init__
    """

    # All the work is done by the base initializer; only the type label is fixed here
    Sequence.__init__(self,
                      sequence=sequence,
                      sequenceMD5=sequenceMD5,
                      sequenceID=sequenceID,
                      sequence_type="rna")
201
202
218
219
226
227
229
230 translation_dict = None
231 dictionary = "ACTG"
232 window_size = 8
233
def __init__(self, sequence, sequenceMD5=None, sequenceID=None):
    """
    Creates a DNA sequence

    "sequence": the raw sequence text. It is cleaned by Sequence.__init__

    "sequenceMD5" and "sequenceID" are handed over unchanged to Sequence.__init__
    """

    # All the work is done by the base initializer; only the type label is fixed here
    Sequence.__init__(self,
                      sequence=sequence,
                      sequenceMD5=sequenceMD5,
                      sequenceID=sequenceID,
                      sequence_type="dna")
242
243
244
260
261
268
269
271
272 translation_dict = None
273 dictionary = "GALMFWKQESPVICYHRNDTXZBUJ"
274 window_size = 3
275
276 aminoacid_codes_3to1 = { "GLY": 'G',
277 "ALA": 'A',
278 "LEU": 'L',
279 "MET": 'M',
280 "PHE": 'F',
281 "TRP": 'W',
282 "LYS": 'K',
283 "GLN": 'Q',
284 "GLU": 'E',
285 "SER": 'S',
286 "PRO": 'P',
287 "VAL": 'V',
288 "ILE": 'I',
289 "CYS": 'C',
290 "TYR": 'Y',
291 "HIS": 'H',
292 "ARG": 'R',
293 "ASN": 'N',
294 "ASP": 'D',
295 "THR": 'T',
296 "MSE": 'M',
297 "CSS": 'C',
298 '2AS':'D', '3AH':'H', '5HP':'E', 'ACL':'R', 'AIB':'A',
299 'ALM':'A', 'ALO':'T', 'ALY':'K', 'ARM':'R', 'ASA':'D',
300 'ASB':'D', 'ASK':'D', 'ASL':'D', 'ASQ':'D', 'AYA':'A',
301 'BCS':'C', 'BHD':'D', 'BMT':'T', 'BNN':'A', 'BUC':'C',
302 'BUG':'L', 'C5C':'C', 'C6C':'C', 'CCS':'C', 'CEA':'C',
303 'CHG':'A', 'CLE':'L', 'CME':'C', 'CSD':'A', 'CSO':'C',
304 'CSP':'C', 'CSS':'C', 'CSW':'C', 'CXM':'M', 'CY1':'C',
305 'CY3':'C', 'CYG':'C', 'CYM':'C', 'CYQ':'C', 'DAH':'F',
306 'DAL':'A', 'DAR':'R', 'DAS':'D', 'DCY':'C', 'DGL':'E',
307 'DGN':'Q', 'DHA':'A', 'DHI':'H', 'DIL':'I', 'DIV':'V',
308 'DLE':'L', 'DLY':'K', 'DNP':'A', 'DPN':'F', 'DPR':'P',
309 'DSN':'S', 'DSP':'D', 'DTH':'T', 'DTR':'W', 'DTY':'Y',
310 'DVA':'V', 'EFC':'C', 'FLA':'A', 'FME':'M', 'GGL':'E',
311 'GLZ':'G', 'GMA':'E', 'GSC':'G', 'HAC':'A', 'HAR':'R',
312 'HIC':'H', 'HIP':'H', 'HMR':'R', 'HPQ':'F', 'HTR':'W',
313 'HYP':'P', 'IIL':'I', 'IYR':'Y', 'KCX':'K', 'LLP':'K',
314 'LLY':'K', 'LTR':'W', 'LYM':'K', 'LYZ':'K', 'MAA':'A',
315 'MEN':'N', 'MHS':'H', 'MIS':'S', 'MLE':'L', 'MPQ':'G',
316 'MSA':'G', 'MSE':'M', 'MVA':'V', 'NEM':'H', 'NEP':'H',
317 'NLE':'L', 'NLN':'L', 'NLP':'L', 'NMC':'G', 'OAS':'S',
318 'OCS':'C', 'OMT':'M', 'PAQ':'Y', 'PCA':'E', 'PEC':'C',
319 'PHI':'F', 'PHL':'F', 'PR3':'C', 'PRR':'A', 'PTR':'Y',
320 'SAC':'S', 'SAR':'G', 'SCH':'C', 'SCS':'C', 'SCY':'C',
321 'SEL':'S', 'SEP':'S', 'SET':'S', 'SHC':'C', 'SHR':'K',
322 'SOC':'C', 'STY':'Y', 'SVA':'S', 'TIH':'A', 'TPL':'W',
323 'TPO':'T', 'TPQ':'A', 'TRG':'K', 'TRO':'W', 'TYB':'Y',
324 'TYQ':'Y', 'TYS':'Y', 'TYY':'Y', 'AGM':'R', 'GL3':'G',
325 'SMC':'C', 'ASX':'B', 'CGU':'E', 'CSX':'C', 'GLX':'Z',
326 'UNK':'X'}
327
def __init__(self, sequence, sequenceMD5=None, sequenceID=None, proteinMW=None, proteinIP=None):
    """
    Creates a protein sequence

    "sequence": the raw sequence text. It is cleaned by Sequence.__init__

    "sequenceMD5" and "sequenceID" are handed over unchanged to Sequence.__init__

    "proteinMW" and "proteinIP" are stored as given (None when they are not supplied)
    """

    # The analyzer attribute starts empty for every new instance
    self.biopython_protein_analyzer = None

    self.proteinMW, self.proteinIP = proteinMW, proteinIP

    # The base initializer receives the fixed type label "peptide"
    Sequence.__init__(self,
                      sequence=sequence,
                      sequenceMD5=sequenceMD5,
                      sequenceID=sequenceID,
                      sequence_type="peptide")
339
340
341
342
344
345 if self.biopython_protein_analyzer is None:
346 try:
347 self.biopython_protein_analyzer = Bio.SeqUtils.ProtParam.ProteinAnalysis(self.get_sequence())
348 except:
349 pass
350
351
352 return self.biopython_protein_analyzer
353
354
356
357 if self.proteinMW is None:
358 try:
359
360 self.proteinMW = self._get_protein_analyzer().molecular_weight()
361 except:
362
363
364 self.proteinMW = 0
365
366 return self.proteinMW
367
368
370
371 if self.proteinIP is None:
372 try:
373 self.proteinIP= analyzed_protein.isoelectric_point(correction_step = 0.001)
374 except:
375
376 self.proteinIP = 0
377
378 return self.proteinIP
379
380
381
397
404
405
414
415 get_aminoacid_code_3to1 = staticmethod(get_aminoacid_code_3to1)
416