biana.BianaObjects.Sequence

1 """ 2 BIANA: Biologic Interactions and Network Analysis 3 Copyright (C) 2009 Javier Garcia-Garcia, Emre Guney, Baldo Oliva 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation, either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. 17 18 """ 19 20 import md5 21 from biana.utilities import autoincrement 22 23

24 -class Sequence(object):

25 """ 26 Class to represent a biologic sequencial entity, as nucleotide sequences, protein sequences, etc 27 28 This class is suposed to be an abstract class. Only instances for their subclasses should be created 29 """ 30 31

32 - def __init__(self, sequence, sequenceMD5=None, sequenceID=None, sequence_type=None):

33 """ 34 "sequence": the sequence itself. It is processed to eliminate non sequence elements 35 36 "sequenceMD5" is the digested md5 code of the sequence. It usually has not to be defined as a parameter, as it implements a method to calculate it 37 38 "sequenceID" is the unique identifier for the sequence. A sequence must have a sequenceID only when it has been inserted into database 39 40 "sequence_type" is the type of sequence (protein, dna or rna) 41 """ 42 43 44 self.sequence = sequence.replace(" ", "").replace("*", "").replace("\n", "").replace("\t", "").replace("\r", "").replace("_", "").upper() 45 self.sequenceMD5_value = sequenceMD5 46 47 self.sequenceID = sequenceID 48 self.sequence_type = sequence_type 49 50 self.fragmented_sequence = None

51

52 - def __str__(self):

53 return self.sequence

54

55 - def get_length(self):

56 return len(self.sequence)

57

58 - def get_sequence(self):

59 return self.sequence

60

61 - def get_type(self):

62 return self.sequence_type

63

64 - def _get_fragmented_sequence(self, using_size, using_translation_method, using_list):

65 """ 66 it returns a list with the ordered indices of the fragments 67 """ 68 69 divisions = len(self.sequence) / using_size 70 resta = len(self.sequence) % using_size 71 72 temp_seq = self.sequence 73 74 if resta!=0: 75 divisions += 1 76 for less in xrange(using_size-resta): 77 temp_seq += using_list[0] 78 del less 79 80 return [ using_translation_method(temp_seq[(x*using_size):(x*using_size)+using_size]) for x in xrange(divisions) ]

81

82 - def get_fragmented_sequence(self):

83 return self.fragmented_sequence

84

85 - def get_sequence_MD5(self):

86 """ 87 Now: 88 SequenceMD5 is used only for internal working, and it is represented in a 16-byte string 89 90 Previous: 91 Return MD5 code for sequence "sequence" 92 (MD5 hexdigestion of sequence + its leading 4 chars 93 + its last 4 chars) 94 """ 95 96 if self.sequenceMD5_value is None: 97 sequence = self.sequence.strip() 98 toconvert = md5.new(sequence) 99 #digested = toconvert.digest().replace('\\','\\\\').replace('"','\\"').replace("'","\\'") 100 digested = toconvert.digest() 101 self.sequenceMD5_value = digested 102 103 #if self.sequenceMD5_value is None: 104 # sequence = self.sequence.strip() 105 # head = sequence[:4] 106 # tail = sequence[-4:] 107 # toconvert = md5.new(sequence) 108 # digested = toconvert.hexdigest() 109 # self.sequenceMD5_value = digested + head + tail 110 111 return self.sequenceMD5_value

112 113

114 - def get_sequenceID(self):

115 return self.sequenceID

116

117 - def set_sequenceID(self, sequenceID, force=False):

118 """ 119 Sets the sequenceID for a sequence. 120 121 If it contains a sequence ID it should not be modified. In special cases where it should be able to be modified, use the parameter force=True 122 """ 123 124 if( self.sequenceID is None ) or force is True: 125 self.sequenceID = sequenceID 126 else: 127 raise ValueError("Trying to set an ID to a sequence that previously had one...)")

128 129

130 - def read_sequences_file(inputPath, format="fasta", sequences_type="protein"):

131 """ 132 Returns a list of sequence objects contained in a file in the specified format 133 134 "format" can be: fasta 135 136 "sequences_type" can be: protein, dna or rna 137 """ 138 139 if inputPath.endswith(".gz"): 140 import gzip 141 in_fd = gzip.open(inputPath,'r') 142 else: 143 in_fd = open(inputPath) 144 145 sequences_list = [] 146 147 temp_seq = [] 148 149 format = format.lower() 150 sequences_type = sequences_type.lower() 151 152 sequenceID = None 153 154 if format=="fasta": 155 for line in in_fd: 156 if line[0]==">": 157 if len(temp_seq)>0: 158 if sequences_type=="protein": 159 sequences_list.append( ProteinSequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) ) 160 elif sequences_type=="dna": 161 sequences_list.append( DNASequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) ) 162 elif sequences_type=="rna": 163 sequences_list.append( RNASequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) ) 164 else: 165 raise ValueError('Sequence type not recognized: %s' %sequences_type) 166 temp_seq = [] 167 sequenceID=line[1:].strip() 168 else: 169 temp_seq.append(line.strip()) 170 171 else: 172 raise ValueError("Format not recognized: %s" %(format)) 173 174 if len(temp_seq)>0: 175 if sequences_type=="protein": 176 sequences_list.append( ProteinSequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) ) 177 elif sequences_type=="dna": 178 sequences_list.append( DNASequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) ) 179 elif sequences_type=="rna": 180 sequences_list.append( RNASequence( sequence = "".join(temp_seq), sequenceID = sequenceID ) ) 181 else: 182 raise ValueError('Sequence type not recognized: %s' %sequences_type) 183 184 return sequences_list

185 186 read_sequences_file = staticmethod( read_sequences_file )

187 188

189 -class RNASequence(Sequence):

190 191 translation_dict = None 192 dictionary = "ACUG" 193 window_size = 8 194

195 - def __init__(self, sequence, sequenceMD5=None, sequenceID=None):

196 """ 197 "sequence": the sequence itself. It is processed... 198 """ 199 200 Sequence.__init__( self, sequence = sequence, sequenceMD5 = sequenceMD5, sequenceID = sequenceID, sequence_type="rna" )

201 202

203 - def _get_digested_code(self, window):

204 """ 205 """ 206 207 if RNASequence.translation_dict is None: 208 autoinc = autoincrement(1) 209 RNASequence.translation_dict = dict( [ ("%s%s%s%s%s%s%s%s" %(a,b,c,d,e,f,g,h),autoinc.next()) for a in RNASequence.dictionary 210 for b in RNASequence.dictionary for c in RNASequence.dictionary 211 for d in RNASequence.dictionary for e in RNASequence.dictionary 212 for f in RNASequence.dictionary for g in RNASequence.dictionary for h in RNASequence.dictionary ] ) 213 214 try: 215 return RNASequence.translation_dict[window] 216 except KeyError: 217 return 0

218 219

220 - def _get_fragmented_sequence(self):

221 222 return Sequence._get_fragmented_sequence(self, 223 using_translation_method = self._get_digested_code, 224 using_size = RNASequence.window_size, 225 using_list = RNASequence.dictionary)

226 227

228 -class DNASequence(Sequence):

229 230 translation_dict = None 231 dictionary = "ACTG" 232 window_size = 8 233

234 - def __init__(self, sequence, sequenceMD5=None, sequenceID=None):

235 """ 236 "sequence": the sequence itself. It is processed... 237 """ 238 239 # test to summarize the sequence as a list of numbers 240 241 Sequence.__init__( self, sequence = sequence, sequenceMD5 = sequenceMD5, sequenceID = sequenceID, sequence_type="dna" )

242 243 244

245 - def _get_digested_code(self, window):

246 """ 247 """ 248 249 if DNASequence.translation_dict is None: 250 autoinc = autoincrement(1) 251 DNASequence.translation_dict = dict( [ ("%s%s%s%s%s%s%s%s" %(a,b,c,d,e,f,g,h),autoinc.next()) for a in DNASequence.dictionary 252 for b in DNASequence.dictionary for c in DNASequence.dictionary 253 for d in DNASequence.dictionary for e in DNASequence.dictionary 254 for f in DNASequence.dictionary for g in DNASequence.dictionary for h in DNASequence.dictionary ] ) 255 256 try: 257 return DNASequence.translation_dict[window] 258 except KeyError: 259 return 0

260 261

262 - def _get_fragmented_sequence(self):

263 264 return Sequence._get_fragmented_sequence(self, 265 using_translation_method = self._get_digested_code, 266 using_size = DNASequence.window_size, 267 using_list = DNASequence.dictionary )

268 269

270 -class ProteinSequence(Sequence):

271 272 translation_dict = None 273 dictionary = "GALMFWKQESPVICYHRNDTXZBUJ" 274 window_size = 3 275 276 aminoacid_codes_3to1 = { "GLY": 'G', 277 "ALA": 'A', 278 "LEU": 'L', 279 "MET": 'M', 280 "PHE": 'F', 281 "TRP": 'W', 282 "LYS": 'K', 283 "GLN": 'Q', 284 "GLU": 'E', 285 "SER": 'S', 286 "PRO": 'P', 287 "VAL": 'V', 288 "ILE": 'I', 289 "CYS": 'C', 290 "TYR": 'Y', 291 "HIS": 'H', 292 "ARG": 'R', 293 "ASN": 'N', 294 "ASP": 'D', 295 "THR": 'T', 296 "MSE": 'M', #Selenomethionin 297 "CSS": 'C', 298 '2AS':'D', '3AH':'H', '5HP':'E', 'ACL':'R', 'AIB':'A', 299 'ALM':'A', 'ALO':'T', 'ALY':'K', 'ARM':'R', 'ASA':'D', 300 'ASB':'D', 'ASK':'D', 'ASL':'D', 'ASQ':'D', 'AYA':'A', 301 'BCS':'C', 'BHD':'D', 'BMT':'T', 'BNN':'A', 'BUC':'C', 302 'BUG':'L', 'C5C':'C', 'C6C':'C', 'CCS':'C', 'CEA':'C', 303 'CHG':'A', 'CLE':'L', 'CME':'C', 'CSD':'A', 'CSO':'C', 304 'CSP':'C', 'CSS':'C', 'CSW':'C', 'CXM':'M', 'CY1':'C', 305 'CY3':'C', 'CYG':'C', 'CYM':'C', 'CYQ':'C', 'DAH':'F', 306 'DAL':'A', 'DAR':'R', 'DAS':'D', 'DCY':'C', 'DGL':'E', 307 'DGN':'Q', 'DHA':'A', 'DHI':'H', 'DIL':'I', 'DIV':'V', 308 'DLE':'L', 'DLY':'K', 'DNP':'A', 'DPN':'F', 'DPR':'P', 309 'DSN':'S', 'DSP':'D', 'DTH':'T', 'DTR':'W', 'DTY':'Y', 310 'DVA':'V', 'EFC':'C', 'FLA':'A', 'FME':'M', 'GGL':'E', 311 'GLZ':'G', 'GMA':'E', 'GSC':'G', 'HAC':'A', 'HAR':'R', 312 'HIC':'H', 'HIP':'H', 'HMR':'R', 'HPQ':'F', 'HTR':'W', 313 'HYP':'P', 'IIL':'I', 'IYR':'Y', 'KCX':'K', 'LLP':'K', 314 'LLY':'K', 'LTR':'W', 'LYM':'K', 'LYZ':'K', 'MAA':'A', 315 'MEN':'N', 'MHS':'H', 'MIS':'S', 'MLE':'L', 'MPQ':'G', 316 'MSA':'G', 'MSE':'M', 'MVA':'V', 'NEM':'H', 'NEP':'H', 317 'NLE':'L', 'NLN':'L', 'NLP':'L', 'NMC':'G', 'OAS':'S', 318 'OCS':'C', 'OMT':'M', 'PAQ':'Y', 'PCA':'E', 'PEC':'C', 319 'PHI':'F', 'PHL':'F', 'PR3':'C', 'PRR':'A', 'PTR':'Y', 320 'SAC':'S', 'SAR':'G', 'SCH':'C', 'SCS':'C', 'SCY':'C', 321 'SEL':'S', 'SEP':'S', 'SET':'S', 'SHC':'C', 'SHR':'K', 322 'SOC':'C', 'STY':'Y', 'SVA':'S', 'TIH':'A', 'TPL':'W', 323 'TPO':'T', 'TPQ':'A', 'TRG':'K', 'TRO':'W', 'TYB':'Y', 324 'TYQ':'Y', 'TYS':'Y', 'TYY':'Y', 'AGM':'R', 'GL3':'G', 325 'SMC':'C', 'ASX':'B', 'CGU':'E', 'CSX':'C', 'GLX':'Z', 326 'UNK':'X'} 327

328 - def __init__(self, sequence, sequenceMD5=None, sequenceID=None, proteinMW=None, proteinIP=None):

329 """ 330 "sequence": the sequence itself. It is processed... 331 """ 332 333 self.proteinMW = proteinMW 334 self.proteinIP = proteinIP 335 336 self.biopython_protein_analyzer = None 337 338 Sequence.__init__( self, sequence = sequence, sequenceMD5 = sequenceMD5, sequenceID = sequenceID, sequence_type="peptide" )

339 340 341 342

343 - def _get_protein_analyzer(self):

344 345 if self.biopython_protein_analyzer is None: 346 try: 347 self.biopython_protein_analyzer = Bio.SeqUtils.ProtParam.ProteinAnalysis(self.get_sequence()) 348 except: 349 pass 350 #sys.stderr.write("ERROR! Bio.SeqUtils.ProtParam.ProteinAnalysis cannot be loaded.") 351 352 return self.biopython_protein_analyzer

353 354

355 - def get_proteinMW(self):

356 357 if self.proteinMW is None: 358 try: 359 # calculate the molecular weight 360 self.proteinMW = self._get_protein_analyzer().molecular_weight() 361 except: 362 # any error in the function sets weight to 0 363 # sys.stderr.write("ERROR! Assigning protein MW to 0.") 364 self.proteinMW = 0 365 366 return self.proteinMW

367 368

369 - def get_proteinIP(self):

370 371 if self.proteinIP is None: 372 try: 373 self.proteinIP= analyzed_protein.isoelectric_point(correction_step = 0.001) 374 except: 375 #sys.stderr.write("ERROR! Assigning protein IP to 0.") 376 self.proteinIP = 0 377 378 return self.proteinIP

379 380 381

382 - def _get_digested_code(self, window):

383 """ 384 """ 385 386 if ProteinSequence.translation_dict is None: 387 autoinc = autoincrement(1) 388 ProteinSequence.translation_dict = dict( [ ("%s%s%s" %(a,b,c),autoinc.next()) for a in ProteinSequence.dictionary 389 for b in ProteinSequence.dictionary 390 for c in ProteinSequence.dictionary ] ) 391 392 393 try: 394 return ProteinSequence.translation_dict[window] 395 except KeyError: 396 return 0

397

398 - def _get_fragmented_sequence(self):

399 400 return Sequence._get_fragmented_sequence(self, 401 using_translation_method = self._get_digested_code, 402 using_size = ProteinSequence.window_size, 403 using_list = ProteinSequence.dictionary )

404 405

406 - def get_aminoacid_code_3to1(code):

407 408 try: 409 return ProteinSequence.aminoacid_codes_3to1[code.upper()] 410 except KeyError: 411 print code 412 413 return "X"

414 415 get_aminoacid_code_3to1 = staticmethod(get_aminoacid_code_3to1)

416

Source Code for Module biana.BianaObjects.Sequence