biana.BianaObjects.SequenceAlignment'

5 """ 6 Object to represent a sequence alignment, multiple or not 7 """

8 - def __init__(self):

9 10 # The alignment is represented as a list of sequence objects 11 # Each sequence in the alignment consists on a list of aligmentBlocks, that define the aligment. 12 # It defines the start position in the global alignment, the start position of the fragment and the length of the fragment 13 # (global_start_position, start_position, length) 14 self.sequence_ids_list = [] 15 self.cross_ids_list = [] 16 self.taxonomy_ids_list = [] # List with the associated taxonomys to aligned sequences 17 self.sequence_fragments = [] 18 self.alignment = [] 19 self.alignment_length = 0 20 self.nseqs = 0 21 self.seq_ids_position_dict = {} 22 self.complete_sequences = {} # Dictionary to store the complete sequences of the alignment (needed to print the alignment) 23 24 # Other parameters extracted from HSSP 25 self.sim = [] 26 self.wsim = [] 27 self.lali = [] 28 self.ngap = [] 29 self.lgap = [] 30 31 self.C_align = None

32

33 - def _get_C_align(self):

34 35 if( self.C_align is None ): 36 37 import pprc 38 39 self.C_align = pprc.C_align(total_proteins = self.nseqs, 40 length = self.alignment_length) 41 42 [ self.C_align.append(self.get_aligned_sequence(seq_pos=x)) for x in xrange(self.nseqs) ] 43 44 return self.C_align

45 46

47 - def set_alignment_length(self, length):

48 self.alignment_length=length

49

50 - def get_alignment_length(self):

51 return self.alignment_length

52

53 - def get_number_of_sequences(self):

54 return self.nseqs

55

56 - def set_number_of_sequences(self, number_of_sequences):

57 self.nseqs = number_of_sequences 58 self.sequence_ids_list = [ None for x in xrange(number_of_sequences) ] 59 self.cross_ids_list = [ None for x in xrange(number_of_sequences) ] 60 self.taxonomy_ids_list = [ None for x in xrange(number_of_sequences) ] 61 self.alignment = [ None for x in xrange(number_of_sequences) ] 62 self.sequence_fragments = [ None for x in xrange(number_of_sequences) ]

63

64 - def get_sequence_fragments_by_id(self, seq_id):

65 return self.sequence_fragments[self.seq_ids_position_dict[str(seq_id)]]

66

67 - def _get_sequence_fragments_from_ascii(self, ascii_fragments):

68 69 return [ ((ord(ascii_fragments[x])*255+ord(ascii_fragments[x+1])), 70 (ord(ascii_fragments[x+2])*255+ord(ascii_fragments[x+3])), 71 (ord(ascii_fragments[x+4])*255+ord(ascii_fragments[x+5]))) for x in xrange(0,len(ascii_fragments),6) ]

72

73 - def set_taxIDlist(self, sequenceID, taxIDslist):

74 75 if len(self.taxonomy_ids_list)==0: 76 self.taxonomy_ids_list = [ [] for x in xrange(number_of_sequences) ] 77 78 try: 79 self.taxonomy_ids_list[self.seq_ids_position_dict[sequenceID]] = taxIDslist 80 except: 81 pass

82 83

84 - def get_sequence_fragments_in_ascii(self,seq_position):

85 86 temp_list = [] 87 for actual_fragment in self.sequence_fragments[seq_position]: 88 temp_list.append(chr(actual_fragment[0]/255)) 89 temp_list.append(chr(actual_fragment[0]%255)) 90 temp_list.append(chr(actual_fragment[1]/255)) 91 temp_list.append(chr(actual_fragment[1]%255)) 92 temp_list.append(chr(actual_fragment[2]/255)) 93 temp_list.append(chr(actual_fragment[2]%255)) 94 return "".join(temp_list).replace('\\','\\\\').replace('"','\\"')

95

96 - def append_sequence(self,sequence_id,aligned_sequence,taxID_list=[],sim=None,wsim=None,lali=None,crossID=None):

97 """ 98 Appends the aligned sequence to the alignment 99 """ 100 101 if( not self.alignment_length ): 102 self.alignment_length = len(aligned_sequence) 103 elif len(aligned_sequence) != self.alignment_length: 104 raise ValueError("Trying to add a protein in the alignment with distinct length (%s!=%s" %(self.alignment_length,len(aligned_sequence)) ) 105 106 self.seq_ids_position_dict[sequence_id] = self.nseqs 107 self.nseqs += 1 108 self.sequence_ids_list.append(sequence_id) 109 self.cross_ids_list.append(crossID) 110 self.taxonomy_ids_list.append(taxID_list) 111 self.alignment.append(aligned_sequence) 112 self.sequence_fragments.append(self._get_sequence_fragments(len(self.sequence_ids_list)-1)) 113 self.complete_sequences[sequence_id] = aligned_sequence.replace('-','').replace('.','') 114 115 self.sim.append(sim) 116 self.wsim.append(wsim) 117 self.lali.append(lali)

118

119 - def append_sequence_fragments(self,sequence_id,sequence_fragments, complete_sequence,crossID=None):

120 121 self.seq_ids_position_dict[sequence_id] = self.nseqs 122 self.nseqs += 1 123 self.sequence_ids_list.append(sequence_id) 124 self.cross_ids_list.append(crossID) 125 self.alignment.append(None) 126 self.sequence_fragments.append(sequence_fragments) 127 self.complete_sequences[sequence_id] = complete_sequence

128

129 - def _get_aligned_sequence(self, sequence_id):

130 131 seq_pos = self.seq_ids_position_dict[sequence_id] 132 133 if self.alignment[seq_pos] is None: 134 seq = [] 135 position_start = 0 136 for actual_fragment in self.sequence_fragments[seq_pos]: 137 seq.extend(["-" for x in xrange(actual_fragment[0]-position_start)]) 138 position_start += actual_fragment[2] 139 seq.append( str(self.complete_sequences[sequence_id])[actual_fragment[1]:actual_fragment[1]+actual_fragment[2]] ) 140 self.alignment[seq_pos] = "".join(seq) 141 142 return self.alignment[seq_pos]

143 144

145 - def add_sequence_to_position(self,sequence_id,sequence_position,taxID_list=[],sequence_fragments_ascii=None,aligned_sequence=None,complete_sequence=None, crossID=None):

146 147 if sequence_fragments_ascii is None and aligned_sequence is None: 148 raise ValueError("At least one of the parameters must be specified") 149 150 self.sequence_ids_list[sequence_position] = sequence_id 151 self.cross_ids_list[sequence_position] = crossID 152 self.taxonomy_ids_list[sequence_position] = taxID_list 153 self.seq_ids_position_dict[sequence_id] = sequence_position 154 155 if sequence_fragments_ascii: 156 self.sequence_fragments[sequence_position] = self._get_sequence_fragments_from_ascii(ascii_fragments = sequence_fragments_ascii) 157 self.complete_sequences[sequence_id] = complete_sequence 158 #print self.sequence_fragments[sequence_position] 159 if aligned_sequence: 160 #print "adding as aligned sequence to position %s" %(sequence_position) 161 self.alignment[sequence_position] = aligned_sequence 162 self.complete_sequences[sequence_id] = aligned_sequence.replace('-','').replace('.','')

163 #print aligned_sequence 164 165 # This could be done in a C function (using PPRC...)

166 - def _get_sequence_fragments(self,aln_position):

167 168 fragments = [] 169 in_gap = None 170 counter = 0 171 aln_start = 0 172 seq_start = 0 173 actual_pos = 0 174 175 if self.alignment[aln_position][0] == "-": 176 in_gap = 1 177 178 for x in xrange(len(self.alignment[aln_position])): 179 if in_gap: 180 if self.alignment[aln_position][x]!="-": 181 in_gap = None 182 counter = 1 183 aln_start = x 184 seq_start = actual_pos 185 actual_pos += 1 186 else: 187 continue 188 else: 189 if self.alignment[aln_position][x]!="-": 190 counter += 1 191 actual_pos += 1 192 else: 193 fragments.append((aln_start,seq_start,counter)) 194 #print "Start Position: %s\tLength: %s" %(aln_start,counter) 195 in_gap = 1 196 197 if( self.alignment[aln_position][x]!="-" ): 198 fragments.append((aln_start,seq_start,counter)) 199 #print "Start Position: %s\tLength: %s" %(aln_start,counter) 200 201 #print self.alignment[aln_position] 202 #print fragments 203 return fragments

204

205 - def get_sequence_fragments(self):

206 return self.sequence_fragments

207

208 - def get_sequence_fragments_by_pos(self,seq_pos):

209 return self.sequence_fragments[seq_pos]

210 211 #def append_sequence_fragment(self,sequence_id,global_start_pos,start_pos,end_pos): 212 # self.sequence_fragments 213

214 - def get_aligned_sequence(self,seq_pos,start_pos=None,end_pos=None):

215 if self.alignment[seq_pos] is None: 216 #print "Trying to get position %s, and maximum is %s" %(seq_pos,self.nseqs) 217 self.alignment[seq_pos] = self._get_aligned_sequence(sequence_id = self.sequence_ids_list[seq_pos]) 218 if start_pos is None: 219 return self.alignment[seq_pos] 220 else: 221 #print "Getting alignment from %s to %s" %(start_pos,end_pos+1) 222 return self.alignment[seq_pos][start_pos:end_pos+1]

223

224 - def print_alignment(self, format="default",fd_out = sys.stdout):

225 """ 226 Prints the alignment with the default identifier 227 """ 228 for x in xrange(len(self.alignment)): 229 #print "%s\t%s" %(self.sequence_ids_list[x],self.get_aligned_sequence(x)) 230 sequence = self.get_aligned_sequence(x) 231 if self.cross_ids_list[x] is not None: 232 id = self.cross_ids_list[x]#[1] 233 #print "%s\t%s" %(self.cross_ids_list[x][1],self.get_aligned_sequence(x)) 234 else: 235 id = "UNKNOWN IDENTIFIER" 236 #print "%s\t%s" %("UNKNOWN IDENTIFIER",self.get_aligned_sequence(x)) 237 print id 238 if format=="fasta": 239 fd_out.write(">%s\n%s\n" %(id, sequence)) 240 else: 241 fd_out.write("%s\t%s\n" %(id, sequence))

242 #control: 243 #print "%s\t%s [%s]" %(self.sequence_ids_list[x],self.get_aligned_sequence(x),self.sequence_fragments[x]) 244 245

246 - def read_alignment(fd_in, format="default"):

247 248 alignment = SequenceAlignment() 249 250 if format=="default": 251 for line in fd_in: 252 line = line.strip() 253 splitted = line.split("\t") 254 alignment.append_sequence(sequence_id=splitted[0], 255 aligned_sequence=splitted[1], 256 crossID=splitted[0]) 257 else: 258 raise ValueError("Not recognized alignment input format") 259 260 return alignment

261 262 read_alignment = staticmethod(read_alignment) 263 264

265 - def get_alignment(self):

266 """ 267 Returns the alignment in the following format: 268 A list where each position contains the string corresponding to the sequence in string format 269 """ 270 return self.alignment

271

272 - def concatenate_by_crossID(self, alignment2):

273 274 print "concatenating %s + %s" %(self.alignment_length, alignment2.alignment_length) 275 276 new_align = SequenceAlignment() 277 278 crossID1_to_position = {} 279 crossID2_to_position = {} 280 281 for current_position in xrange(self.nseqs): 282 crossID1_to_position.setdefault(self.cross_ids_list[current_position],current_position) 283 for current_position in xrange(alignment2.nseqs): 284 crossID2_to_position.setdefault(alignment2.cross_ids_list[current_position],current_position) 285 286 for current_crossID in crossID1_to_position: 287 if current_crossID in crossID2_to_position: 288 new_align.append_sequence(sequence_id = self.sequence_ids_list[crossID1_to_position[current_crossID]], 289 aligned_sequence = self.get_aligned_sequence(seq_pos = crossID1_to_position[current_crossID])+alignment2.get_aligned_sequence(seq_pos = crossID2_to_position[current_crossID]), 290 crossID = current_crossID, 291 taxID_list = self.taxonomy_ids_list[crossID1_to_position[current_crossID]] ) 292 293 return new_align

294

295 - def concatenate_alignments(self, alignment2):

296 """ 297 Concatenate two alginments. They must have the same number of sequences 298 """ 299 300 new_align = SequenceAlignment() 301 302 303 for current_position in xrange(self.nseqs): 304 id = "%s_%s" %(self.sequence_ids_list[current_position],alignment2.sequence_ids_list[current_position]) 305 new_align.append_sequence(sequence_id = id, 306 aligned_sequence = self.get_aligned_sequence(seq_pos = current_position)+alignment2.get_aligned_sequence(seq_pos=current_position), 307 crossID = id, 308 taxID_list = self.taxonomy_ids_list[current_position] ) 309 310 return new_align

311 312

313 - def get_species_ordered_alignment(self, alignment2, method="only_first"):

314 """ 315 Returns an alignment object where sequence positions have been ordered according to its specie 316 317 "sequenceMD5_taxID_correspondence" is a dictionary with the correspondences between sequenceIDs and taxonomy 318 319 method. Possibilities: "all_combinations", "only_first" 320 """ 321 322 al1_taxIDs = {} 323 324 al2_taxIDs = {} 325 326 for x in xrange(len(self.taxonomy_ids_list)): 327 for actual_taxID in self.taxonomy_ids_list[x]: 328 if al1_taxIDs.has_key(actual_taxID): 329 al1_taxIDs[actual_taxID].add(x) 330 else: 331 al1_taxIDs[actual_taxID] = sets.Set([x]) 332 333 for x in xrange(len(alignment2.taxonomy_ids_list)): 334 for actual_taxID in alignment2.taxonomy_ids_list[x]: 335 if al2_taxIDs.has_key(actual_taxID): 336 al2_taxIDs[actual_taxID].add(x) 337 else: 338 al2_taxIDs[actual_taxID] = sets.Set([x]) 339 340 # Create the new alignment 341 new_align1 = SequenceAlignment() 342 new_align2 = SequenceAlignment() 343 344 if method == "all_combinations": 345 for actual_taxID in al1_taxIDs: 346 if al2_taxIDs.has_key(actual_taxID): 347 #print actual_taxID," found in both alignments!" 348 for actual_position1 in al1_taxIDs[actual_taxID]: 349 for actual_position2 in al2_taxIDs[actual_taxID]: 350 new_align1.append_sequence(sequence_id = self.sequence_ids_list[actual_position1], 351 aligned_sequence= self.get_aligned_sequence(seq_pos=actual_position1), 352 crossID = self.cross_ids_list[actual_position1], 353 taxID_list = self.taxonomy_ids_list[actual_position1]) 354 new_align2.append_sequence(sequence_id = alignment2.sequence_ids_list[actual_position2], 355 aligned_sequence = alignment2.get_aligned_sequence(seq_pos=actual_position2), 356 crossID = alignment2.cross_ids_list[actual_position2], 357 taxID_list = alignment2.taxonomy_ids_list[actual_position2]) 358 359 elif method == "only_first": 360 for actual_taxID in al1_taxIDs: 361 if al2_taxIDs.has_key(actual_taxID): 362 for actual_position1 in al1_taxIDs[actual_taxID]: 363 for actual_position2 in al2_taxIDs[actual_taxID]: 364 new_align1.append_sequence(sequence_id = self.sequence_ids_list[actual_position1], 365 aligned_sequence= self.get_aligned_sequence(seq_pos=actual_position1), 366 crossID = self.cross_ids_list[actual_position1], 367 taxID_list = self.taxonomy_ids_list[actual_position1]) 368 new_align2.append_sequence(sequence_id = alignment2.sequence_ids_list[actual_position2], 369 aligned_sequence = alignment2.get_aligned_sequence(seq_pos=actual_position2), 370 crossID = alignment2.cross_ids_list[actual_position2], 371 taxID_list = alignment2.taxonomy_ids_list[actual_position2]) 372 break 373 break 374 375 376 return (new_align1, new_align2)

377 378 379

380 - def get_subalignment(self, fragments):

381 """ 382 "fragments" is a list of tuples with the format (start_position, end_position), with the fragments to select 383 """ 384 385 new_align = SequenceAlignment() 386 387 if len(fragments)==0: 388 raise ValueError("Trying to get a subalignment without specifying fragments") 389 390 for x in xrange(self.nseqs): 391 aligned_seq = "".join([self.get_aligned_sequence(seq_pos=x,start_pos=actual_fragment[0],end_pos=actual_fragment[1]) 392 for actual_fragment in fragments]) 393 394 #if( aligned_seq.count('-') < len(aligned_seq) ): 395 new_align.append_sequence(sequence_id = self.sequence_ids_list[x], 396 aligned_sequence = aligned_seq, 397 crossID = self.cross_ids_list[x], 398 taxID_list = self.taxonomy_ids_list[x]) 399 400 return new_align

401 402

403 - def get_subalignment_for_sequence_without_gaps(self, sequenceID):

404 """ 405 Returns a new alignment, in the same order, but eliminating all the positions in which sequenceID has a gap 406 """ 407 408 fragments = [] 409 410 aligned_seq = self._get_aligned_sequence(sequence_id=sequenceID) 411 412 current_start = None 413 for x in xrange(len(aligned_seq)): 414 if aligned_seq[x]!='-' and aligned_seq[x]!='.' and current_start is None: 415 current_start = x 416 continue 417 if (aligned_seq[x]=='-' or aligned_seq[x]=='.') and current_start is not None: 418 fragments.append((current_start, x-1)) 419 current_start = None 420 continue 421 422 if current_start is not None: 423 fragments.append((current_start,len(aligned_seq)-1)) 424 425 return self.get_subalignment(fragments=fragments)

426 427

428 - def clean_empty_sequences( align1, align2 ):

429 """ 430 431 """ 432 433 new_align1 = SequenceAlignment() 434 new_align2 = SequenceAlignment() 435 436 if( align1.nseqs != align2.nseqs ): 437 raise ValueError("Both sequences must have the same number of proteins") 438 439 for x in xrange(align1.nseqs): 440 aligned_seq1 = align1.get_aligned_sequence(seq_pos=x) 441 aligned_seq2 = align2.get_aligned_sequence(seq_pos=x) 442 if( aligned_seq1.count('-') < len(aligned_seq1) and aligned_seq2.count('-') < len(aligned_seq2) ): 443 new_align1.append_sequence( sequence_id = align1.sequence_ids_list[x], 444 aligned_sequence = aligned_seq1 ) 445 new_align2.append_sequence( sequence_id = align2.sequence_ids_list[x], 446 aligned_sequence = aligned_seq2 ) 447 448 return (new_align1, new_align2)

Source Code for Module biana.BianaObjects.SequenceAlignment'