Package biana :: Package BianaObjects :: Module Sequence
[hide private]
[frames] | no frames]

Source Code for Module biana.BianaObjects.Sequence

  1  """ 
  2      BIANA: Biologic Interactions and Network Analysis 
  3      Copyright (C) 2009  Javier Garcia-Garcia, Emre Guney, Baldo Oliva 
  4   
  5      This program is free software: you can redistribute it and/or modify 
  6      it under the terms of the GNU General Public License as published by 
  7      the Free Software Foundation, either version 3 of the License, or 
  8      (at your option) any later version. 
  9   
 10      This program is distributed in the hope that it will be useful, 
 11      but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 13      GNU General Public License for more details. 
 14   
 15      You should have received a copy of the GNU General Public License 
 16      along with this program.  If not, see <http://www.gnu.org/licenses/>. 
 17   
 18  """ 
 19   
 20  import md5 
 21  from biana.utilities import autoincrement 
 22   
 23   
24 -class Sequence(object):
25 """ 26 Class to represent a biologic sequencial entity, as nucleotide sequences, protein sequences, etc 27 28 This class is suposed to be an abstract class. Only instances for their subclasses should be created 29 """ 30 31
32 - def __init__(self, sequence, sequenceMD5=None, sequenceID=None, sequence_type=None):
33 """ 34 "sequence": the sequence itself. It is processed to eliminate non sequence elements 35 36 "sequenceMD5" is the digested md5 code of the sequence. It usually has not to be defined as a parameter, as it implements a method to calculate it 37 38 "sequenceID" is the unique identifier for the sequence. A sequence must have a sequenceID only when it has been inserted into database 39 40 "sequence_type" is the type of sequence (protein, dna or rna) 41 """ 42 43 44 self.sequence = sequence.replace(" ", "").replace("*", "").replace("\n", "").replace("\t", "").replace("\r", "").replace("_", "").upper() 45 self.sequenceMD5_value = sequenceMD5 46 47 self.sequenceID = sequenceID 48 self.sequence_type = sequence_type 49 50 self.fragmented_sequence = None
51
52 - def __str__(self):
53 return self.sequence
54
55 - def get_length(self):
56 return len(self.sequence)
57
58 - def get_sequence(self):
59 return self.sequence
60
61 - def get_type(self):
62 return self.sequence_type
63
64 - def _get_fragmented_sequence(self, using_size, using_translation_method, using_list):
65 """ 66 it returns a list with the ordered indices of the fragments 67 """ 68 69 divisions = len(self.sequence) / using_size 70 resta = len(self.sequence) % using_size 71 72 temp_seq = self.sequence 73 74 if resta!=0: 75 divisions += 1 76 for less in xrange(using_size-resta): 77 temp_seq += using_list[0] 78 del less 79 80 return [ using_translation_method(temp_seq[(x*using_size):(x*using_size)+using_size]) for x in xrange(divisions) ]
81
82 - def get_fragmented_sequence(self):
83 return self.fragmented_sequence
84
85 - def get_sequence_MD5(self):
86 """ 87 Now: 88 SequenceMD5 is used only for internal working, and it is represented in a 16-byte string 89 90 Previous: 91 Return MD5 code for sequence "sequence" 92 (MD5 hexdigestion of sequence + its leading 4 chars 93 + its last 4 chars) 94 """ 95 96 if self.sequenceMD5_value is None: 97 sequence = self.sequence.strip() 98 toconvert = md5.new(sequence) 99 #digested = toconvert.digest().replace('\\','\\\\').replace('"','\\"').replace("'","\\'") 100 digested = toconvert.digest() 101 self.sequenceMD5_value = digested 102 103 #if self.sequenceMD5_value is None: 104 # sequence = self.sequence.strip() 105 # head = sequence[:4] 106 # tail = sequence[-4:] 107 # toconvert = md5.new(sequence) 108 # digested = toconvert.hexdigest() 109 # self.sequenceMD5_value = digested + head + tail 110 111 return self.sequenceMD5_value
112 113
114 - def get_sequenceID(self):
115 return self.sequenceID
116
117 - def set_sequenceID(self, sequenceID, force=False):
118 """ 119 Sets the sequenceID for a sequence. 120 121 If it contains a sequence ID it should not be modified. In special cases where it should be able to be modified, use the parameter force=True 122 """ 123 124 if( self.sequenceID is None ) or force is True: 125 self.sequenceID = sequenceID 126 else: 127 raise ValueError("Trying to set an ID to a sequence that previously had one...)")
128 129
130 - def read_sequences_file(inputPath, format="fasta", sequences_type="protein"):
131 """ 132 Returns a list of sequence objects contained in a file in the specified format 133 134 "format" can be: fasta 135 136 "sequences_type" can be: protein, dna or rna 137 """ 138 139 if inputPath.endswith(".gz"): 140 import gzip 141 in_fd = gzip.open(inputPath,'r') 142 else: 143 in_fd = open(inputPath) 144 145 sequences_list = [] 146 147 temp_seq = [] 148 149 format = format.lower() 150 sequences_type = sequences_type.lower() 151 152 sequenceID = None 153 154 if format=="fasta": 155 for line in in_fd: 156 if line[0]==">": 157 if len(temp_seq)>0: 158 if sequences_type=="protein": 159 sequences_list.append( ProteinSequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) ) 160 elif sequences_type=="dna": 161 sequences_list.append( DNASequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) ) 162 elif sequences_type=="rna": 163 sequences_list.append( RNASequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) ) 164 else: 165 raise ValueError('Sequence type not recognized: %s' %sequences_type) 166 temp_seq = [] 167 sequenceID=line[1:].strip() 168 else: 169 temp_seq.append(line.strip()) 170 171 else: 172 raise ValueError("Format not recognized: %s" %(format)) 173 174 if len(temp_seq)>0: 175 if sequences_type=="protein": 176 sequences_list.append( ProteinSequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) ) 177 elif sequences_type=="dna": 178 sequences_list.append( DNASequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) ) 179 elif sequences_type=="rna": 180 sequences_list.append( RNASequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) ) 181 else: 182 raise ValueError('Sequence type not recognized: %s' %sequences_type) 183 184 return sequences_list
185 186 read_sequences_file = staticmethod( read_sequences_file )
187 188
189 -class RNASequence(Sequence):
190 191 translation_dict = None 192 dictionary = "ACUG" 193 window_size = 8 194
195 - def __init__(self, sequence, sequenceMD5=None, sequenceID=None):
196 """ 197 "sequence": the sequence itself. It is processed... 198 """ 199 200 Sequence.__init__( self, sequence = sequence, sequenceMD5 = sequenceMD5, sequenceID = sequenceID, sequence_type="rna" )
201 202
203 - def _get_digested_code(self, window):
204 """ 205 """ 206 207 if RNASequence.translation_dict is None: 208 autoinc = autoincrement(1) 209 RNASequence.translation_dict = dict( [ ("%s%s%s%s%s%s%s%s" %(a,b,c,d,e,f,g,h),autoinc.next()) for a in RNASequence.dictionary 210 for b in RNASequence.dictionary for c in RNASequence.dictionary 211 for d in RNASequence.dictionary for e in RNASequence.dictionary 212 for f in RNASequence.dictionary for g in RNASequence.dictionary for h in RNASequence.dictionary ] ) 213 214 try: 215 return RNASequence.translation_dict[window] 216 except KeyError: 217 return 0
218 219
220 - def _get_fragmented_sequence(self):
221 222 return Sequence._get_fragmented_sequence(self, 223 using_translation_method = self._get_digested_code, 224 using_size = RNASequence.window_size, 225 using_list = RNASequence.dictionary)
226 227
228 -class DNASequence(Sequence):
229 230 translation_dict = None 231 dictionary = "ACTG" 232 window_size = 8 233
234 - def __init__(self, sequence, sequenceMD5=None, sequenceID=None):
235 """ 236 "sequence": the sequence itself. It is processed... 237 """ 238 239 # test to summarize the sequence as a list of numbers 240 241 Sequence.__init__( self, sequence = sequence, sequenceMD5 = sequenceMD5, sequenceID = sequenceID, sequence_type="dna" )
242 243 244
245 - def _get_digested_code(self, window):
246 """ 247 """ 248 249 if DNASequence.translation_dict is None: 250 autoinc = autoincrement(1) 251 DNASequence.translation_dict = dict( [ ("%s%s%s%s%s%s%s%s" %(a,b,c,d,e,f,g,h),autoinc.next()) for a in DNASequence.dictionary 252 for b in DNASequence.dictionary for c in DNASequence.dictionary 253 for d in DNASequence.dictionary for e in DNASequence.dictionary 254 for f in DNASequence.dictionary for g in DNASequence.dictionary for h in DNASequence.dictionary ] ) 255 256 try: 257 return DNASequence.translation_dict[window] 258 except KeyError: 259 return 0
260 261
262 - def _get_fragmented_sequence(self):
263 264 return Sequence._get_fragmented_sequence(self, 265 using_translation_method = self._get_digested_code, 266 using_size = DNASequence.window_size, 267 using_list = DNASequence.dictionary )
268 269
270 -class ProteinSequence(Sequence):
271 272 translation_dict = None 273 dictionary = "GALMFWKQESPVICYHRNDTXZBUJ" 274 window_size = 3 275 276 aminoacid_codes_3to1 = { "GLY": 'G', 277 "ALA": 'A', 278 "LEU": 'L', 279 "MET": 'M', 280 "PHE": 'F', 281 "TRP": 'W', 282 "LYS": 'K', 283 "GLN": 'Q', 284 "GLU": 'E', 285 "SER": 'S', 286 "PRO": 'P', 287 "VAL": 'V', 288 "ILE": 'I', 289 "CYS": 'C', 290 "TYR": 'Y', 291 "HIS": 'H', 292 "ARG": 'R', 293 "ASN": 'N', 294 "ASP": 'D', 295 "THR": 'T', 296 "MSE": 'M', #Selenomethionin 297 "CSS": 'C', 298 '2AS':'D', '3AH':'H', '5HP':'E', 'ACL':'R', 'AIB':'A', 299 'ALM':'A', 'ALO':'T', 'ALY':'K', 'ARM':'R', 'ASA':'D', 300 'ASB':'D', 'ASK':'D', 'ASL':'D', 'ASQ':'D', 'AYA':'A', 301 'BCS':'C', 'BHD':'D', 'BMT':'T', 'BNN':'A', 'BUC':'C', 302 'BUG':'L', 'C5C':'C', 'C6C':'C', 'CCS':'C', 'CEA':'C', 303 'CHG':'A', 'CLE':'L', 'CME':'C', 'CSD':'A', 'CSO':'C', 304 'CSP':'C', 'CSS':'C', 'CSW':'C', 'CXM':'M', 'CY1':'C', 305 'CY3':'C', 'CYG':'C', 'CYM':'C', 'CYQ':'C', 'DAH':'F', 306 'DAL':'A', 'DAR':'R', 'DAS':'D', 'DCY':'C', 'DGL':'E', 307 'DGN':'Q', 'DHA':'A', 'DHI':'H', 'DIL':'I', 'DIV':'V', 308 'DLE':'L', 'DLY':'K', 'DNP':'A', 'DPN':'F', 'DPR':'P', 309 'DSN':'S', 'DSP':'D', 'DTH':'T', 'DTR':'W', 'DTY':'Y', 310 'DVA':'V', 'EFC':'C', 'FLA':'A', 'FME':'M', 'GGL':'E', 311 'GLZ':'G', 'GMA':'E', 'GSC':'G', 'HAC':'A', 'HAR':'R', 312 'HIC':'H', 'HIP':'H', 'HMR':'R', 'HPQ':'F', 'HTR':'W', 313 'HYP':'P', 'IIL':'I', 'IYR':'Y', 'KCX':'K', 'LLP':'K', 314 'LLY':'K', 'LTR':'W', 'LYM':'K', 'LYZ':'K', 'MAA':'A', 315 'MEN':'N', 'MHS':'H', 'MIS':'S', 'MLE':'L', 'MPQ':'G', 316 'MSA':'G', 'MSE':'M', 'MVA':'V', 'NEM':'H', 'NEP':'H', 317 'NLE':'L', 'NLN':'L', 'NLP':'L', 'NMC':'G', 'OAS':'S', 318 'OCS':'C', 'OMT':'M', 'PAQ':'Y', 'PCA':'E', 'PEC':'C', 319 'PHI':'F', 'PHL':'F', 'PR3':'C', 'PRR':'A', 'PTR':'Y', 320 'SAC':'S', 'SAR':'G', 'SCH':'C', 'SCS':'C', 'SCY':'C', 321 'SEL':'S', 'SEP':'S', 'SET':'S', 'SHC':'C', 'SHR':'K', 322 'SOC':'C', 'STY':'Y', 'SVA':'S', 'TIH':'A', 'TPL':'W', 323 'TPO':'T', 'TPQ':'A', 'TRG':'K', 'TRO':'W', 'TYB':'Y', 324 'TYQ':'Y', 'TYS':'Y', 'TYY':'Y', 'AGM':'R', 'GL3':'G', 325 'SMC':'C', 'ASX':'B', 'CGU':'E', 'CSX':'C', 'GLX':'Z', 326 'UNK':'X'} 327
328 - def __init__(self, sequence, sequenceMD5=None, sequenceID=None, proteinMW=None, proteinIP=None):
329 """ 330 "sequence": the sequence itself. It is processed... 331 """ 332 333 self.proteinMW = proteinMW 334 self.proteinIP = proteinIP 335 336 self.biopython_protein_analyzer = None 337 338 Sequence.__init__( self, sequence = sequence, sequenceMD5 = sequenceMD5, sequenceID = sequenceID, sequence_type="peptide" )
339 340 341 342
343 - def _get_protein_analyzer(self):
344 345 if self.biopython_protein_analyzer is None: 346 try: 347 self.biopython_protein_analyzer = Bio.SeqUtils.ProtParam.ProteinAnalysis(self.get_sequence()) 348 except: 349 pass 350 #sys.stderr.write("ERROR! Bio.SeqUtils.ProtParam.ProteinAnalysis cannot be loaded.") 351 352 return self.biopython_protein_analyzer
353 354
355 - def get_proteinMW(self):
356 357 if self.proteinMW is None: 358 try: 359 # calculate the molecular weight 360 self.proteinMW = self._get_protein_analyzer().molecular_weight() 361 except: 362 # any error in the function sets weight to 0 363 # sys.stderr.write("ERROR! Assigning protein MW to 0.") 364 self.proteinMW = 0 365 366 return self.proteinMW
367 368
369 - def get_proteinIP(self):
370 371 if self.proteinIP is None: 372 try: 373 self.proteinIP= analyzed_protein.isoelectric_point(correction_step = 0.001) 374 except: 375 #sys.stderr.write("ERROR! Assigning protein IP to 0.") 376 self.proteinIP = 0 377 378 return self.proteinIP
379 380 381
382 - def _get_digested_code(self, window):
383 """ 384 """ 385 386 if ProteinSequence.translation_dict is None: 387 autoinc = autoincrement(1) 388 ProteinSequence.translation_dict = dict( [ ("%s%s%s" %(a,b,c),autoinc.next()) for a in ProteinSequence.dictionary 389 for b in ProteinSequence.dictionary 390 for c in ProteinSequence.dictionary ] ) 391 392 393 try: 394 return ProteinSequence.translation_dict[window] 395 except KeyError: 396 return 0
397
398 - def _get_fragmented_sequence(self):
399 400 return Sequence._get_fragmented_sequence(self, 401 using_translation_method = self._get_digested_code, 402 using_size = ProteinSequence.window_size, 403 using_list = ProteinSequence.dictionary )
404 405
406 - def get_aminoacid_code_3to1(code):
407 408 try: 409 return ProteinSequence.aminoacid_codes_3to1[code.upper()] 410 except KeyError: 411 print code 412 413 return "X"
414 415 get_aminoacid_code_3to1 = staticmethod(get_aminoacid_code_3to1)
416