1 import sets
2 import sys
3
5 """
6 Object to represent a sequence alignment, multiple or not
7 """
9
10
11
12
13
14 self.sequence_ids_list = []
15 self.cross_ids_list = []
16 self.taxonomy_ids_list = []
17 self.sequence_fragments = []
18 self.alignment = []
19 self.alignment_length = 0
20 self.nseqs = 0
21 self.seq_ids_position_dict = {}
22 self.complete_sequences = {}
23
24
25 self.sim = []
26 self.wsim = []
27 self.lali = []
28 self.ngap = []
29 self.lgap = []
30
31 self.C_align = None
32
34
35 if( self.C_align is None ):
36
37 import pprc
38
39 self.C_align = pprc.C_align(total_proteins = self.nseqs,
40 length = self.alignment_length)
41
42 [ self.C_align.append(self.get_aligned_sequence(seq_pos=x)) for x in xrange(self.nseqs) ]
43
44 return self.C_align
45
46
48 self.alignment_length=length
49
51 return self.alignment_length
52
55
57 self.nseqs = number_of_sequences
58 self.sequence_ids_list = [ None for x in xrange(number_of_sequences) ]
59 self.cross_ids_list = [ None for x in xrange(number_of_sequences) ]
60 self.taxonomy_ids_list = [ None for x in xrange(number_of_sequences) ]
61 self.alignment = [ None for x in xrange(number_of_sequences) ]
62 self.sequence_fragments = [ None for x in xrange(number_of_sequences) ]
63
65 return self.sequence_fragments[self.seq_ids_position_dict[str(seq_id)]]
66
68
69 return [ ((ord(ascii_fragments[x])*255+ord(ascii_fragments[x+1])),
70 (ord(ascii_fragments[x+2])*255+ord(ascii_fragments[x+3])),
71 (ord(ascii_fragments[x+4])*255+ord(ascii_fragments[x+5]))) for x in xrange(0,len(ascii_fragments),6) ]
72
74
75 if len(self.taxonomy_ids_list)==0:
76 self.taxonomy_ids_list = [ [] for x in xrange(number_of_sequences) ]
77
78 try:
79 self.taxonomy_ids_list[self.seq_ids_position_dict[sequenceID]] = taxIDslist
80 except:
81 pass
82
83
85
86 temp_list = []
87 for actual_fragment in self.sequence_fragments[seq_position]:
88 temp_list.append(chr(actual_fragment[0]/255))
89 temp_list.append(chr(actual_fragment[0]%255))
90 temp_list.append(chr(actual_fragment[1]/255))
91 temp_list.append(chr(actual_fragment[1]%255))
92 temp_list.append(chr(actual_fragment[2]/255))
93 temp_list.append(chr(actual_fragment[2]%255))
94 return "".join(temp_list).replace('\\','\\\\').replace('"','\\"')
95
96 - def append_sequence(self,sequence_id,aligned_sequence,taxID_list=[],sim=None,wsim=None,lali=None,crossID=None):
97 """
98 Appends the aligned sequence to the alignment
99 """
100
101 if( not self.alignment_length ):
102 self.alignment_length = len(aligned_sequence)
103 elif len(aligned_sequence) != self.alignment_length:
104 raise ValueError("Trying to add a protein in the alignment with distinct length (%s!=%s" %(self.alignment_length,len(aligned_sequence)) )
105
106 self.seq_ids_position_dict[sequence_id] = self.nseqs
107 self.nseqs += 1
108 self.sequence_ids_list.append(sequence_id)
109 self.cross_ids_list.append(crossID)
110 self.taxonomy_ids_list.append(taxID_list)
111 self.alignment.append(aligned_sequence)
112 self.sequence_fragments.append(self._get_sequence_fragments(len(self.sequence_ids_list)-1))
113 self.complete_sequences[sequence_id] = aligned_sequence.replace('-','').replace('.','')
114
115 self.sim.append(sim)
116 self.wsim.append(wsim)
117 self.lali.append(lali)
118
120
121 self.seq_ids_position_dict[sequence_id] = self.nseqs
122 self.nseqs += 1
123 self.sequence_ids_list.append(sequence_id)
124 self.cross_ids_list.append(crossID)
125 self.alignment.append(None)
126 self.sequence_fragments.append(sequence_fragments)
127 self.complete_sequences[sequence_id] = complete_sequence
128
130
131 seq_pos = self.seq_ids_position_dict[sequence_id]
132
133 if self.alignment[seq_pos] is None:
134 seq = []
135 position_start = 0
136 for actual_fragment in self.sequence_fragments[seq_pos]:
137 seq.extend(["-" for x in xrange(actual_fragment[0]-position_start)])
138 position_start += actual_fragment[2]
139 seq.append( str(self.complete_sequences[sequence_id])[actual_fragment[1]:actual_fragment[1]+actual_fragment[2]] )
140 self.alignment[seq_pos] = "".join(seq)
141
142 return self.alignment[seq_pos]
143
144
145 - def add_sequence_to_position(self,sequence_id,sequence_position,taxID_list=[],sequence_fragments_ascii=None,aligned_sequence=None,complete_sequence=None, crossID=None):
146
147 if sequence_fragments_ascii is None and aligned_sequence is None:
148 raise ValueError("At least one of the parameters must be specified")
149
150 self.sequence_ids_list[sequence_position] = sequence_id
151 self.cross_ids_list[sequence_position] = crossID
152 self.taxonomy_ids_list[sequence_position] = taxID_list
153 self.seq_ids_position_dict[sequence_id] = sequence_position
154
155 if sequence_fragments_ascii:
156 self.sequence_fragments[sequence_position] = self._get_sequence_fragments_from_ascii(ascii_fragments = sequence_fragments_ascii)
157 self.complete_sequences[sequence_id] = complete_sequence
158
159 if aligned_sequence:
160
161 self.alignment[sequence_position] = aligned_sequence
162 self.complete_sequences[sequence_id] = aligned_sequence.replace('-','').replace('.','')
163
164
165
167
168 fragments = []
169 in_gap = None
170 counter = 0
171 aln_start = 0
172 seq_start = 0
173 actual_pos = 0
174
175 if self.alignment[aln_position][0] == "-":
176 in_gap = 1
177
178 for x in xrange(len(self.alignment[aln_position])):
179 if in_gap:
180 if self.alignment[aln_position][x]!="-":
181 in_gap = None
182 counter = 1
183 aln_start = x
184 seq_start = actual_pos
185 actual_pos += 1
186 else:
187 continue
188 else:
189 if self.alignment[aln_position][x]!="-":
190 counter += 1
191 actual_pos += 1
192 else:
193 fragments.append((aln_start,seq_start,counter))
194
195 in_gap = 1
196
197 if( self.alignment[aln_position][x]!="-" ):
198 fragments.append((aln_start,seq_start,counter))
199
200
201
202
203 return fragments
204
206 return self.sequence_fragments
207
209 return self.sequence_fragments[seq_pos]
210
211
212
213
215 if self.alignment[seq_pos] is None:
216
217 self.alignment[seq_pos] = self._get_aligned_sequence(sequence_id = self.sequence_ids_list[seq_pos])
218 if start_pos is None:
219 return self.alignment[seq_pos]
220 else:
221
222 return self.alignment[seq_pos][start_pos:end_pos+1]
223
225 """
226 Prints the alignment with the default identifier
227 """
228 for x in xrange(len(self.alignment)):
229
230 sequence = self.get_aligned_sequence(x)
231 if self.cross_ids_list[x] is not None:
232 id = self.cross_ids_list[x]
233
234 else:
235 id = "UNKNOWN IDENTIFIER"
236
237 print id
238 if format=="fasta":
239 fd_out.write(">%s\n%s\n" %(id, sequence))
240 else:
241 fd_out.write("%s\t%s\n" %(id, sequence))
242
243
244
245
247
248 alignment = SequenceAlignment()
249
250 if format=="default":
251 for line in fd_in:
252 line = line.strip()
253 splitted = line.split("\t")
254 alignment.append_sequence(sequence_id=splitted[0],
255 aligned_sequence=splitted[1],
256 crossID=splitted[0])
257 else:
258 raise ValueError("Not recognized alignment input format")
259
260 return alignment
261
262 read_alignment = staticmethod(read_alignment)
263
264
266 """
267 Returns the alignment in the following format:
268 A list where each position contains the string corresponding to the sequence in string format
269 """
270 return self.alignment
271
273
274 print "concatenating %s + %s" %(self.alignment_length, alignment2.alignment_length)
275
276 new_align = SequenceAlignment()
277
278 crossID1_to_position = {}
279 crossID2_to_position = {}
280
281 for current_position in xrange(self.nseqs):
282 crossID1_to_position.setdefault(self.cross_ids_list[current_position],current_position)
283 for current_position in xrange(alignment2.nseqs):
284 crossID2_to_position.setdefault(alignment2.cross_ids_list[current_position],current_position)
285
286 for current_crossID in crossID1_to_position:
287 if current_crossID in crossID2_to_position:
288 new_align.append_sequence(sequence_id = self.sequence_ids_list[crossID1_to_position[current_crossID]],
289 aligned_sequence = self.get_aligned_sequence(seq_pos = crossID1_to_position[current_crossID])+alignment2.get_aligned_sequence(seq_pos = crossID2_to_position[current_crossID]),
290 crossID = current_crossID,
291 taxID_list = self.taxonomy_ids_list[crossID1_to_position[current_crossID]] )
292
293 return new_align
294
296 """
297 Concatenate two alginments. They must have the same number of sequences
298 """
299
300 new_align = SequenceAlignment()
301
302
303 for current_position in xrange(self.nseqs):
304 id = "%s_%s" %(self.sequence_ids_list[current_position],alignment2.sequence_ids_list[current_position])
305 new_align.append_sequence(sequence_id = id,
306 aligned_sequence = self.get_aligned_sequence(seq_pos = current_position)+alignment2.get_aligned_sequence(seq_pos=current_position),
307 crossID = id,
308 taxID_list = self.taxonomy_ids_list[current_position] )
309
310 return new_align
311
312
314 """
315 Returns an alignment object where sequence positions have been ordered according to its specie
316
317 "sequenceMD5_taxID_correspondence" is a dictionary with the correspondences between sequenceIDs and taxonomy
318
319 method. Possibilities: "all_combinations", "only_first"
320 """
321
322 al1_taxIDs = {}
323
324 al2_taxIDs = {}
325
326 for x in xrange(len(self.taxonomy_ids_list)):
327 for actual_taxID in self.taxonomy_ids_list[x]:
328 if al1_taxIDs.has_key(actual_taxID):
329 al1_taxIDs[actual_taxID].add(x)
330 else:
331 al1_taxIDs[actual_taxID] = sets.Set([x])
332
333 for x in xrange(len(alignment2.taxonomy_ids_list)):
334 for actual_taxID in alignment2.taxonomy_ids_list[x]:
335 if al2_taxIDs.has_key(actual_taxID):
336 al2_taxIDs[actual_taxID].add(x)
337 else:
338 al2_taxIDs[actual_taxID] = sets.Set([x])
339
340
341 new_align1 = SequenceAlignment()
342 new_align2 = SequenceAlignment()
343
344 if method == "all_combinations":
345 for actual_taxID in al1_taxIDs:
346 if al2_taxIDs.has_key(actual_taxID):
347
348 for actual_position1 in al1_taxIDs[actual_taxID]:
349 for actual_position2 in al2_taxIDs[actual_taxID]:
350 new_align1.append_sequence(sequence_id = self.sequence_ids_list[actual_position1],
351 aligned_sequence= self.get_aligned_sequence(seq_pos=actual_position1),
352 crossID = self.cross_ids_list[actual_position1],
353 taxID_list = self.taxonomy_ids_list[actual_position1])
354 new_align2.append_sequence(sequence_id = alignment2.sequence_ids_list[actual_position2],
355 aligned_sequence = alignment2.get_aligned_sequence(seq_pos=actual_position2),
356 crossID = alignment2.cross_ids_list[actual_position2],
357 taxID_list = alignment2.taxonomy_ids_list[actual_position2])
358
359 elif method == "only_first":
360 for actual_taxID in al1_taxIDs:
361 if al2_taxIDs.has_key(actual_taxID):
362 for actual_position1 in al1_taxIDs[actual_taxID]:
363 for actual_position2 in al2_taxIDs[actual_taxID]:
364 new_align1.append_sequence(sequence_id = self.sequence_ids_list[actual_position1],
365 aligned_sequence= self.get_aligned_sequence(seq_pos=actual_position1),
366 crossID = self.cross_ids_list[actual_position1],
367 taxID_list = self.taxonomy_ids_list[actual_position1])
368 new_align2.append_sequence(sequence_id = alignment2.sequence_ids_list[actual_position2],
369 aligned_sequence = alignment2.get_aligned_sequence(seq_pos=actual_position2),
370 crossID = alignment2.cross_ids_list[actual_position2],
371 taxID_list = alignment2.taxonomy_ids_list[actual_position2])
372 break
373 break
374
375
376 return (new_align1, new_align2)
377
378
379
381 """
382 "fragments" is a list of tuples with the format (start_position, end_position), with the fragments to select
383 """
384
385 new_align = SequenceAlignment()
386
387 if len(fragments)==0:
388 raise ValueError("Trying to get a subalignment without specifying fragments")
389
390 for x in xrange(self.nseqs):
391 aligned_seq = "".join([self.get_aligned_sequence(seq_pos=x,start_pos=actual_fragment[0],end_pos=actual_fragment[1])
392 for actual_fragment in fragments])
393
394
395 new_align.append_sequence(sequence_id = self.sequence_ids_list[x],
396 aligned_sequence = aligned_seq,
397 crossID = self.cross_ids_list[x],
398 taxID_list = self.taxonomy_ids_list[x])
399
400 return new_align
401
402
404 """
405 Returns a new alignment, in the same order, but eliminating all the positions in which sequenceID has a gap
406 """
407
408 fragments = []
409
410 aligned_seq = self._get_aligned_sequence(sequence_id=sequenceID)
411
412 current_start = None
413 for x in xrange(len(aligned_seq)):
414 if aligned_seq[x]!='-' and aligned_seq[x]!='.' and current_start is None:
415 current_start = x
416 continue
417 if (aligned_seq[x]=='-' or aligned_seq[x]=='.') and current_start is not None:
418 fragments.append((current_start, x-1))
419 current_start = None
420 continue
421
422 if current_start is not None:
423 fragments.append((current_start,len(aligned_seq)-1))
424
425 return self.get_subalignment(fragments=fragments)
426
427
429 """
430
431 """
432
433 new_align1 = SequenceAlignment()
434 new_align2 = SequenceAlignment()
435
436 if( align1.nseqs != align2.nseqs ):
437 raise ValueError("Both sequences must have the same number of proteins")
438
439 for x in xrange(align1.nseqs):
440 aligned_seq1 = align1.get_aligned_sequence(seq_pos=x)
441 aligned_seq2 = align2.get_aligned_sequence(seq_pos=x)
442 if( aligned_seq1.count('-') < len(aligned_seq1) and aligned_seq2.count('-') < len(aligned_seq2) ):
443 new_align1.append_sequence( sequence_id = align1.sequence_ids_list[x],
444 aligned_sequence = aligned_seq1 )
445 new_align2.append_sequence( sequence_id = align2.sequence_ids_list[x],
446 aligned_sequence = aligned_seq2 )
447
448 return (new_align1, new_align2)
449