1 """
2 BIANA: Biologic Interactions and Network Analysis
3 Copyright (C) 2009 Javier Garcia-Garcia, Emre Guney, Baldo Oliva
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18 """
19
20 import sys
21 import re
22 import os
23 import time
24 from sets import *
25 import gzip
26 import tempfile
27 import copy
28
29 from SequenceAlignment import *
30 import biana.biana_globals as biana_globals
31 from BlastResult import BlastResult
32
33
34
36 """
37 Executes clustalw to get the multiple sequence alignment between the proteins in the list
38 """
39
40 temp_file = tempfile.NamedTemporaryFile(bufsize=0)
41 [ temp_file.write(">%s\n%s\n" %(actual_sequence.get_sequenceID(), actual_sequence.get_sequence())) for actual_sequence in sequencesList ]
42
43
44 command = "%s %s > /dev/null" %(biana_globals.CLUSTALW_EXEC, temp_file.name)
45
46 os.system(command)
47
48
49 temp_file.close()
50
51
52 alignment = {}
53 sequence_ids_list = []
54
55 result_re = re.compile("^([^S]+)\s+([\w\-]+)$")
56
57 clustalw_output = open(temp_file.name+".aln",'r')
58
59 seq_done = Set()
60
61 for line in clustalw_output:
62 m = result_re.match(line)
63 if m:
64 if m.group(1) not in seq_done:
65 sequence_ids_list.append(m.group(1))
66 seq_done.add(m.group(1))
67 try:
68 alignment[m.group(1)].append(m.group(2).strip())
69 except:
70 alignment[m.group(1)] = [m.group(2).strip()]
71
72 clustalw_output.close()
73
74
75 os.system("rm %s.*" %temp_file.name)
76 temp_file.close()
77
78 alignmentObject = SequenceAlignment()
79
80 for actual_id in sequence_ids_list:
81 alignmentObject.append_sequence( sequence_id = actual_id,
82 aligned_sequence = "".join(alignment[actual_id]),
83 crossID = actual_id )
84
85 return alignmentObject
86
87
88
90 """
91 Executes CD-HIT with the sequences fasta file and saves the results in the ouput_path
92
93 # TODO: Automate for executing in cluster (now it is done manually)
94 """
95
96 if( sequence_identity_threshold >= 0.7 ):
97 n = 5
98 elif( sequence_identity_threshold >= 0.6 ):
99 n = 4
100 elif( sequence_identity_threshold >= 0.5 ):
101 n = 3
102 elif( sequence_identity_threshold >= 0.4 ):
103 n = 2
104
105 if( sequence_identity_threshold > 1.0 or sequence_identity_threshold<0.4 ):
106 raise ValueError("sequence_identity_threshold must be between 0.4 or 1.0")
107
108 command = "%scd-hit -i %s -o %s -c %s -n %s -B 1 -p 1 -M 700" %(biana_globals.CD_HIT_PATH, fasta_file, output_path, sequence_identity_threshold, n)
109 sys.stderr.write(command)
110
111 cd_hit_result = os.system(command)
112
113 if cd_hit_result:
114 raise ValueError("CD-HIT has cannot be executed correctly")
115
116
117
118
119
120 -def blast_cd_hit_clusters(cd_hit_clusters_file, output_fd, dbaccess, length_blast_db = None, effective_length_space_search = None):
121 """
122 Performs a blast between all the proteins belonging to the same cd-hit cluster
123 """
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139 initial_time = time.time()
140
141 input_file_fd = file(cd_hit_clusters_file, 'r')
142 processed_clusters = 0
143 new_cluster=0
144
145 num_clusters=0
146
147 fd_output_file = output_fd
148
149 for line in input_file_fd:
150
151
152 if re.search(">Cluster \d+",line):
153 if new_cluster:
154 processed_clusters += 1
155 _calculate_similarities(dbaccess=dbaccess, sequenceID_list=cluster_sequences, fd_output_file=fd_output_file, length_blast_db = length_blast_db, effective_length_space_search = effective_length_space_search )
156
157 new_cluster=1
158 cluster_sequences = []
159 num_clusters += 1
160 if num_clusters%100==0:
161 sys.stderr.write("%s clusters done in %seconds\n" %(num_clusters, time.time()-initial_time))
162
163
164
165
166
167
168
169
170
171 else:
172
173 sequence = re.search("\d+\s+\d+aa,\s+\>(\w+)\.+",line)
174 if sequence:
175 cluster_sequences.append(int(sequence.group(1)))
176 else:
177 sys.stderr.write("%s" %line)
178
179
180 if new_cluster:
181
182 _calculate_similarities(sequenceID_list=cluster_sequences,dbaccess=dbaccess,fd_output_file=fd_output_file, length_blast_db = length_blast_db, effective_length_space_search = effective_length_space_search )
183
184
185
186
187
188 -def blast_sequence(blastDatabase, sequenceObject, temporalOutputPath=None):
189 """
190 Does a blast of sequence sequenceObject against database "blastDatabase"
191
192 ATTENTION: "temporalOutputPath" is the temporal gzipped file where the blast results are stored. If it exists, blast is not calculated!
193 If it is None, results are not saved.
194 """
195
196
197 if os.path.exists(temporalOutputPath):
198 if temporalOutputPath.endswith(".gz"):
199 file_fd = gzip.open(temporalOutputPath,'r')
200 else:
201 file_fd = open(temporalOutputPath, 'r')
202 blast_results = parse_blastall_output(file_fd)
203 return blast_results
204
205 command = biana_globals.BLASTALL_EXEC
206
207
208
209
210
211
212 args = ["-p","blastp","-d",blastDatabase,"-F","F","-v","0","-b","10000"]
213
214
215 p_readfd, c_writefd = os.pipe()
216 c_readfd, p_writefd = os.pipe()
217
218 pid1 = os.fork()
219 if pid1:
220
221 for fd in (c_readfd, c_writefd, p_writefd):
222 os.close(fd)
223
224
225 fp = os.fdopen(p_readfd, 'r')
226
227 if temporalOutputPath.endswith(".gz"):
228 blast_results_file = gzip.open(temporalOutputPath,'w')
229 else:
230 blast_results_file = open(temporalOutputPath,'w')
231 blast_results = parse_blastall_output(fp, blast_results_file)
232
233 fp.close()
234
235
236 else:
237
238 try:
239 pid = os.fork()
240 if pid:
241
242
243
244
245 os.write(p_writefd, ">%s\n%s\n\n" %(sequenceObject.get_sequenceID(), sequenceObject.get_sequence()))
246 os.close(p_writefd)
247 else:
248
249 try:
250
251 os.close(0)
252 os.dup(c_readfd)
253
254 os.close(1)
255 os.dup(c_writefd)
256
257 for fd in (c_readfd, c_writefd, p_readfd, p_writefd):
258 os.close(fd)
259
260 os.execv(command, [command] + args)
261 except:
262 safe_traceback()
263 os._exit(127)
264 except:
265 safe_traceback()
266 os._exit(127)
267 else:
268 os._exit(0)
269
270 return blast_results
271
272
273 -def _calculate_similarities(dbaccess, length_blast_db = None, effective_length_space_search = None, sequenceID_list=[], fd_output_file=sys.stdout, representant=None):
274 """
275 Calculates the similarity between all proteins in the list "cluster_sequences".
276
277 "cluster_sequences" must be a list of the sequenceIDs
278
279 It uses bl2seq or blastall to calculate them
280
281 Results are printed to "fd_output_file"
282
283 """
284
285 if len(sequenceID_list)<=1:
286
287 return
288 elif len(sequenceID_list)>10:
289
290
291 def safe_traceback():
292
293
294
295 import traceback
296 sys.stderr.write("Error in child process, pid %d.\n" %os.getpid())
297 sys.stderr.flush()
298 traceback.print_exc()
299 sys.stderr.flush()
300
301
302 file_prefix = "./cluster_%s" %sequenceID_list[0]
303 file_name = file_prefix+".fa"
304 out_fasta_file = open(file_name, 'w')
305
306
307
308
309 load_time = time.time()
310 sequenceObjectDict = dbaccess._load_sequences( sequenceIdList = sequenceID_list, type = "proteinsequence" )
311
312
313 [ out_fasta_file.write(">%s\n%s\n" %(actual_sequence.get_sequenceID(), actual_sequence.get_sequence())) for actual_sequence in sequenceObjectDict.itervalues() ]
314 out_fasta_file.close()
315
316
317
318
319
320
321
322
323
324
325
326 command = "%s -t %s -i %s -l %s.log -p T -a F -o F" %(biana_globals.FORMATDB_EXEC, file_name, file_name, file_prefix)
327
328 format_db = os.system(command)
329
330
331
332
333 command = biana_globals.BLASTALL_EXEC
334
335
336
337
338 args = ["-p","blastp","-d",file_name,"-z","1720800858","-F","F","-v","0","-b","1000000"]
339 if length_blast_db is not None:
340 args.append("-z")
341 args.append(length_blast_db)
342 if effective_length_space_search is not None:
343 args.append("-Y")
344 args.append(effective_length_space_search)
345
346 args = map(str,args)
347
348
349 p_readfd, c_writefd = os.pipe()
350 c_readfd, p_writefd = os.pipe()
351
352 pid1 = os.fork()
353 if pid1:
354
355 for fd in (c_readfd, c_writefd, p_writefd):
356 os.close(fd)
357
358
359 fp = os.fdopen(p_readfd, 'r')
360
361 blastall_iterator = BlastallParserIterator( fd_blastall_output = fp, parse_detailed_alignments = False )
362
363
364
365 try:
366 while True:
367 fd_output_file.write(blastall_iterator.next().__str__())
368 except:
369 pass
370
371 fp.close()
372
373 os.wait()
374
375
376 os.remove(file_prefix+".fa")
377 os.remove(file_prefix+".log")
378 os.remove(file_prefix+".fa.phr")
379 os.remove(file_prefix+".fa.pin")
380 os.remove(file_prefix+".fa.psq")
381 else:
382
383 try:
384 pid = os.fork()
385 if pid:
386
387
388
389
390 if representant is None:
391 [ os.write(p_writefd, ">%s\n%s\n\n" %(x.sequenceID,x.get_sequence())) for x in sequenceObjectDict.values() ]
392 else:
393 raise NotImplementedError("To implement representat similarity searches")
394
395
396 os._exit(0)
397 else:
398
399 try:
400
401 os.close(0)
402 os.dup(c_readfd)
403
404 os.close(1)
405 os.dup(c_writefd)
406
407
408 os.close(2)
409
410
411 for fd in (c_readfd, c_writefd, p_readfd, p_writefd):
412 os.close(fd)
413
414 os.execv(command, [command] + args)
415
416 except:
417 safe_traceback()
418 os._exit(127)
419 except:
420 safe_traceback()
421 os._exit(127)
422 else:
423 os._exit(0)
424
425 else:
426
427
428
429 temp_file_name = "./temp_seqfile_"
430
431 temporal_files = []
432
433 sequenceObjectDict = dbaccess._load_sequences( sequenceIdList = sequenceID_list, type = "proteinsequence" )
434
435
436
437
438 for sequenceID in sequenceID_list:
439
440
441 file_name = temp_file_name+str(sequenceID)
442 temporal_files.append(file_name)
443 output_file_fd = file(file_name, "w")
444 output_file_fd.write(">%s\n%s\n" %(sequenceID,sequenceObjectDict[sequenceID].get_sequence()) )
445 output_file_fd.close()
446
447 for i in xrange(len(sequenceID_list)):
448 for j in xrange(i+1,len(sequenceID_list)):
449
450
451
452
453 command_list = [biana_globals.BL2SEQ_EXEC,"-i",temporal_files[i],"-j",temporal_files[j],"-p","blastp","-F","F"]
454
455 if length_blast_db is not None:
456 command_list.append("-d")
457 command_list.append(length_blast_db)
458 if effective_length_space_search is not None:
459 command_list.append("-Y")
460 command_list.append(effective_length_space_search)
461
462 command = " ".join(map(str,command_list))
463
464
465
466
467
468
469 bl2seq_f = os.popen(command,"r")
470 bl2seq_data = bl2seq_f.read()
471 bl2seq_f.close()
472
473
474
475
476
477
478 parse_bl2seq_output(sequenceID_A = sequenceID_list[i], sequenceID_B = sequenceID_list[j], bl2seq_output=bl2seq_data,fd_output_file=fd_output_file)
479
480
481 map(os.remove, temporal_files)
482
483
485 """
486 Does a blast within all sequences in a file
487
488 If file is very big, it won't work. It is intended for small sets
489 """
490
491 def safe_traceback():
492
493
494
495 import traceback
496 sys.stderr.write("Error in child process, pid %d.\n" %os.getpid())
497 sys.stderr.flush()
498 traceback.print_exc()
499 sys.stderr.flush()
500
501
502 sequences_list = []
503 ids_list = []
504
505 input_file_fd = file(fasta_file, 'r')
506
507 for line in input_file_fd:
508 if line[0] == ">":
509 ids_list.append(line[0:].strip())
510 else:
511 sequences_list.append(line.strip())
512
513 input_file_fd.close()
514
515
516
517
518
519
520
521 command = "%s -t clusterFASTA -i %s -l clusterFASTA.log -p T -a F -o F" %(biana_globals.FORMATDB_EXEC, fasta_file)
522
523 format_db = os.system(command)
524
525 command = biana_globals.BLASTALL_EXEC
526 args = ["-p","blastp","-d",fasta_file,"-z","1720800858","-F","F","-v","0"]
527
528
529
530 p_readfd, c_writefd = os.pipe()
531 c_readfd, p_writefd = os.pipe()
532
533 if os.fork():
534
535 for fd in (c_readfd, c_writefd, p_writefd):
536 os.close(fd)
537
538
539 fp = os.fdopen(p_readfd, 'r')
540
541 blast_results = parse_blastall_output(fp)
542 [ fd_output_file.write(x.__str__()) for x in blast_results ]
543 fp.close()
544
545 else:
546
547 try:
548 if os.fork():
549
550 for current_sequence in sequences_list:
551 os.write(p_writefd, "%s\n\n" %current_sequence)
552 else:
553
554 try:
555
556 os.close(0)
557 os.dup(c_readfd)
558
559 os.close(1)
560 os.dup(c_writefd)
561
562 for fd in (c_readfd, c_writefd, p_readfd, p_writefd):
563 os.close(fd)
564
565 os.execv(command, [command] + args)
566 except:
567 safe_traceback()
568 os._exit(127)
569 except:
570 safe_traceback()
571 os._exit(127)
572 else:
573 os._exit(0)
574
575
577 """
578 """
579
580 query_re = re.compile("Query=\s*(.+)\s*")
581 letters_re = re.compile("\(\s*([\,\d]+)\s*letters\s*")
582 sbjct_re = re.compile(">([\w\d\_\.\|]+)")
583 length_re = re.compile("Length \= ([\,\d]+)")
584 score_re = re.compile("Score\s+=\s+([\.\d]+)\s+bits\s+\((\d+)\),\s+Expect\s+=\s+([\d\.e\-]+)")
585 score_option2_re = re.compile("Score\s+=\s+([\.\d]+)\s+\(([\.\d]+)\s+bits\)\,\s+Expect\s+=\s+([\d\.e\-]+)")
586 identities_re = re.compile("Identities\s+=\s+\d+\/(\d+)\s+\((\d+)%\)")
587 positives_re = re.compile("Positives\s+=\s+\d+\/\d+\s+\((\d+)%\)")
588 gaps_re = re.compile("Gaps\s+=\s+\d+\/\d+\s+\((\d+)%\)")
589 intervals_query_re = re.compile("Query:\s+(\d+)\s+(\S+)\s+(\d+)$")
590 sbjct_intervals_re = re.compile("Sbjct:\s+(\d+)\s+(\S+)\s+(\d+)$")
591
592 - def __init__(self, fd_blastall_output, parse_detailed_alignments=False):
593
594 self.fd = fd_blastall_output
595
596
597
598
599
600 self.current_blastResult_obj = None
601 self.parse_lines = parse_detailed_alignments
602
605
606
608 """
609 """
610
611
612
613 print "Parsing line"
614 print alignment_line_list
615 alignment_line = "".join(alignment_line_list)
616 aligned_query = "".join(aligned_query)
617 aligned_sbjct = "".join(aligned_sbjct)
618
619 if len(alignment_line) != len(aligned_query) or len(aligned_query) != len(aligned_sbjct):
620 print aligned_query
621 print alignment_line
622 print aligned_sbjct
623 raise ValueError("Alignments must be of the same size")
624
625 query_gaps = 0
626 sbjct_gaps = 0
627
628 for x in xrange(len(alignment_line)):
629 value = alignment_line[x]
630 if aligned_query[x]=="-":
631 query_gaps += 1
632 if aligned_sbjct[x]=="-":
633 sbjct_gaps += 1
634 if value == " ":
635 continue
636 else:
637 if value != "+":
638 self.current_blastResult_obj.query_exact_match_list.append(x+self.current_blastResult_obj.query_start-query_gaps)
639 self.current_blastResult_obj.sbjct_exact_match_list.append(x+self.current_blastResult_obj.sbjct_start-sbjct_gaps)
640 self.current_blastResult_obj.query_similar_match_list.append(x+self.current_blastResult_obj.query_start-query_gaps)
641 self.current_blastResult_obj.sbjct_similar_match_list.append(x+self.current_blastResult_obj.sbjct_start-sbjct_gaps)
642 else:
643 self.current_blastResult_obj.query_similar_match_list.append(x+self.current_blastResult_obj.query_start-query_gaps)
644 self.current_blastResult_obj.sbjct_similar_match_list.append(x+self.current_blastResult_obj.sbjct_start-sbjct_gaps)
645
647
648
649 alignment_start_index = None
650 capture_matching_line = False
651 sbjct_matching = False
652 alignment_summary = []
653 aligned_query = []
654 aligned_sbjct = []
655
656
657
658
659
660
661
662 for line in self.fd:
663
664
665 if capture_matching_line:
666 alignment_summary.append(line[alignment_start_index:alignment_start_index+subalignment_length])
667 capture_matching_line = False
668 continue
669
670 m = BlastallParserIterator.query_re.search(line)
671
672 if m:
673 self.current_blastResult_obj = BlastResult(method="blastall",mode="F")
674 self.current_blastResult_obj.sequenceID_A = m.group(1)
675
676 m = BlastallParserIterator.letters_re.search(line)
677 if m:
678 self.current_blastResult_obj.query_length = int(m.group(1).replace(",",''))
679
680 sequenceID_B_search = BlastallParserIterator.sbjct_re.search(line)
681
682 if sequenceID_B_search:
683
684
685 if self.current_blastResult_obj.e_value is not None:
686 if self.current_blastResult_obj.sequenceID_A != self.current_blastResult_obj.sequenceID_B:
687 if self.parse_lines:
688 self.parse_alignment_line(alignment_summary, aligned_query, aligned_sbjct)
689 t = self.current_blastResult_obj
690 self.current_blastResult_obj = BlastResult(method="blastall",mode="F")
691 self.current_blastResult_obj.sequenceID_A = t.sequenceID_A
692 self.current_blastResult_obj.sequenceID_B = sequenceID_B_search.group(1)
693 self.current_blastResult_obj.query_length = t.query_length
694 return t
695 else:
696 self.current_blastResult_obj.sequenceID_B = sequenceID_B_search.group(1)
697
698 m = BlastallParserIterator.length_re.search(line)
699 if m:
700 self.current_blastResult_obj.sbjct_length = int(m.group(1).replace(",",''))
701
702 if re.search("^Matrix",line):
703
704 if self.current_blastResult_obj.e_value is not None:
705 if self.current_blastResult_obj.sequenceID_A != self.current_blastResult_obj.sequenceID_B:
706 if self.parse_lines:
707 self.parse_alignment_line(alignment_summary, aligned_query, aligned_sbjct)
708 t = self.current_blastResult_obj
709 self.current_blastResult_obj = BlastResult(method="blastall",mode="F")
710 return t
711
712 else:
713 get_evalue = BlastallParserIterator.score_re.search(line)
714 if not get_evalue:
715 get_evalue = BlastallParserIterator.score_option2_re.search(line)
716 if get_evalue:
717
718 if self.current_blastResult_obj.e_value is not None:
719 if self.current_blastResult_obj.sequenceID_A != self.current_blastResult_obj.sequenceID_B:
720 if self.parse_lines:
721 self.parse_alignment_line(alignment_summary, aligned_query, aligned_sbjct)
722 t = self.current_blastResult_obj
723 self.current_blastResult_obj = BlastResult(method="blastall",mode="F")
724 self.current_blastResult_obj.sequenceID_A = t.sequenceID_A
725 self.current_blastResult_obj.query_length = t.query_length
726 self.current_blastResult_obj.set_evalue(get_evalue.group(3))
727 self.current_blastResult_obj.score_bits = str(get_evalue.group(1))
728 self.current_blastResult_obj.score = str(get_evalue.group(2))
729 self.current_blastResult_obj.sequenceID_B = t.sequenceID_B
730 self.current_blastResult_obj.sbjct_length = t.sbjct_length
731 return t
732
733 self.current_blastResult_obj.set_evalue(get_evalue.group(3))
734 self.current_blastResult_obj.score_bits = str(get_evalue.group(1))
735 self.current_blastResult_obj.score = str(get_evalue.group(2))
736
737 get_identities = BlastallParserIterator.identities_re.search(line)
738
739 if get_identities:
740 self.current_blastResult_obj.align_length = get_identities.group(1)
741 self.current_blastResult_obj.identities= str(get_identities.group(2))
742
743 get_positives = BlastallParserIterator.positives_re.search(line)
744
745 if get_positives:
746 self.current_blastResult_obj.positives = str(get_positives.group(1))
747
748 get_gaps = BlastallParserIterator.gaps_re.search(line)
749 if get_gaps:
750 self.current_blastResult_obj.gaps = str(get_gaps.group(1))
751
752 get_intervals_query = BlastallParserIterator.intervals_query_re.search(line)
753
754 if get_intervals_query:
755 if self.current_blastResult_obj.query_start is None:
756 self.current_blastResult_obj.query_start = int(get_intervals_query.group(1))
757 alignment_start_index = line.index(get_intervals_query.group(2))
758 subalignment_length = len(get_intervals_query.group(2))
759 capture_matching_line = True
760 sbjct_matching = True
761 aligned_query.append(get_intervals_query.group(2))
762 self.current_blastResult_obj.query_end = int(get_intervals_query.group(3))
763
764 get_intervals_Sbjct = BlastallParserIterator.sbjct_intervals_re.search(line)
765
766 if get_intervals_Sbjct and sbjct_matching:
767 if self.current_blastResult_obj.sbjct_start is None:
768 self.current_blastResult_obj.sbjct_start = int(get_intervals_Sbjct.group(1))
769 self.current_blastResult_obj.sbjct_end = int(get_intervals_Sbjct.group(3))
770 aligned_sbjct.append(get_intervals_Sbjct.group(2))
771 sbjct_matching = False
772
773 raise StopIteration
774
775
776
777 -def parse_blastall_output(fd_blastall_output, temporalOutputFile_fd=None, return_only_ids=False, limit_to_sequenceIDs=sets.Set()):
778 """
779 "fd_blastall_output" is the output fd of the blast process (input for this method)
780
781 "temporalOutputFile" is a file where all the input of fd_blastall_output is saved
782
783 "return_only_ids" is used to store only ids, not complete blast results
784
785 "limit_to_sequenceIDs" is used to filter blast parsing to only those sequenceids
786 """
787
788 blast_results = []
789
790 query_re = re.compile("Query=\s*(.+)\s*")
791 letters_re = re.compile("\(\s*([\,\d]+)\s*letters\s*\)")
792 sbjct_re = re.compile("^>([\w\d\_\.\|]+)")
793 length_re = re.compile("Length \= ([\,\d]+)")
794 score_re = re.compile("Score\s+=\s+([\.\d]+)\s+bits\s+\((\d+)\),\s+Expect\s+=\s+([\d\.e\-]+)")
795 identities_re = re.compile("Identities\s+=\s+\d+\/(\d+)\s+\((\d+)%\)")
796 positives_re = re.compile("Positives\s+=\s+\d+\/\d+\s+\((\d+)%\)")
797 gaps_re = re.compile("Gaps\s+=\s+\d+\/\d+\s+\((\d+)%\)")
798 intervals_query_re = re.compile("Query:\s+(\d+)\s+(\S+)\s+(\d+)$")
799 sbjct_intervals_re = re.compile("Sbjct:\s+(\d+)\s+(\S+)\s+(\d+)$")
800
801
802 alignment_start_index = None
803 capture_matching_line = False
804 sbjct_matching = False
805 alignment_summary = []
806
807 blastResult_obj = BlastResult(method="blastall",mode="F")
808
809 def parse_alignment_line(alignment_line_list, blastResult_obj, aligned_query, aligned_sbjct):
810 """
811
812 """
813
814
815
816 alignment_line = "".join(alignment_line_list)
817 aligned_query = "".join(aligned_query)
818 aligned_sbjct = "".join(aligned_sbjct)
819
820 if len(alignment_line) != len(aligned_query) or len(aligned_query) != len(aligned_sbjct):
821 print aligned_query
822 print alignment_line
823 print aligned_sbjct
824 raise ValueError("Alignments must be of the same size")
825
826 query_gaps = 0
827 sbjct_gaps = 0
828
829 for x in xrange(len(alignment_line)):
830 value = alignment_line[x]
831 if aligned_query[x]=="-":
832 query_gaps += 1
833 if aligned_sbjct[x]=="-":
834 sbjct_gaps += 1
835 if value == " ":
836 continue
837 else:
838 if value != "+":
839 blastResult_obj.query_exact_match_list.append(x+blastResult_obj.query_start-query_gaps)
840 blastResult_obj.sbjct_exact_match_list.append(x+blastResult_obj.sbjct_start-sbjct_gaps)
841 blastResult_obj.query_similar_match_list.append(x+blastResult_obj.query_start-query_gaps)
842 blastResult_obj.sbjct_similar_match_list.append(x+blastResult_obj.sbjct_start-sbjct_gaps)
843 else:
844 blastResult_obj.query_similar_match_list.append(x+blastResult_obj.query_start-query_gaps)
845 blastResult_obj.sbjct_similar_match_list.append(x+blastResult_obj.sbjct_start-sbjct_gaps)
846
847
848
849
850
851 for line in fd_blastall_output:
852
853 if temporalOutputFile_fd:
854 temporalOutputFile_fd.write(line)
855
856 if capture_matching_line:
857 alignment_summary.append(line[alignment_start_index:alignment_start_index+subalignment_length])
858 capture_matching_line = False
859 continue
860
861 m = query_re.search(line)
862 if m:
863 sequenceID_A = m.group(1)
864 blastResult_obj.sequenceID_A = sequenceID_A
865
866 m = letters_re.search(line)
867 if m:
868 blastResult_obj.query_length = int(m.group(1).replace(",",''))
869
870 sequenceID_B_search = sbjct_re.search(line)
871 if sequenceID_B_search:
872
873
874 if blastResult_obj.e_value is not None:
875 if blastResult_obj.sequenceID_A != blastResult_obj.sequenceID_B:
876 parse_alignment_line(alignment_summary, blastResult_obj, aligned_query, aligned_sbjct)
877 if return_only_ids:
878 blast_results.append(blastResult_obj.sequenceID_B)
879 else:
880 if len(limit_to_sequenceIDs)==0 or blastResult_obj.sequenceID_B in limit_to_sequenceIDs:
881 blast_results.append(copy.copy(blastResult_obj))
882
883 alignment_start_index = None
884 alignment_summary = []
885 aligned_query = []
886 aligned_sbjct = []
887
888 blastResult_obj.reset()
889 blastResult_obj.sequenceID_B = sequenceID_B_search.group(1)
890
891 m = length_re.search(line)
892 if m:
893 blastResult_obj.sbjct_length = int(m.group(1).replace(",",''))
894
895 if re.search("^Matrix",line):
896
897 if blastResult_obj.e_value is not None:
898 if blastResult_obj.sequenceID_A != blastResult_obj.sequenceID_B:
899 parse_alignment_line(alignment_summary, blastResult_obj, aligned_query, aligned_sbjct)
900 if return_only_ids:
901 blast_results.append(blastResult_obj.sequenceID_B)
902 else:
903 if len(limit_to_sequenceIDs)==0 or blastResult_obj.sequenceID_B in limit_to_sequenceIDs:
904 blast_results.append(blastResult_obj)
905
906 alignment_start_index = None
907 alignment_summary = []
908 aligned_query = []
909 aligned_sbjct = []
910
911 blastResult_obj = BlastResult(method="blastall",mode="F")
912
913 else:
914 get_evalue = score_re.search(line)
915 if get_evalue:
916
917 if blastResult_obj.e_value is not None:
918 if blastResult_obj.sequenceID_A != blastResult_obj.sequenceID_B:
919 parse_alignment_line(alignment_summary, blastResult_obj, aligned_query, aligned_sbjct)
920 if return_only_ids:
921 blast_results.append(blastResult_obj.sequenceID_B)
922 else:
923 if len(limit_to_sequenceIDs)==0 or blastResult_obj.sequenceID_B in limit_to_sequenceIDs:
924 blast_results.append(copy.copy(blastResult_obj))
925
926 blastResult_obj.reset()
927
928 alignment_start_index = None
929 alignment_summary = []
930 aligned_query = []
931 aligned_sbjct = []
932
933 blastResult_obj.set_evalue(get_evalue.group(3))
934 blastResult_obj.score_bits = str(get_evalue.group(1))
935 blastResult_obj.score = str(get_evalue.group(2))
936
937 get_identities = identities_re.search(line)
938
939 if get_identities:
940 blastResult_obj.align_length = get_identities.group(1)
941 blastResult_obj.identities= str(get_identities.group(2))
942
943 get_positives = positives_re.search(line)
944
945 if get_positives:
946 blastResult_obj.positives = str(get_positives.group(1))
947
948 get_gaps = gaps_re.search(line)
949 if get_gaps:
950 blastResult_obj.gaps = str(get_gaps.group(1))
951
952 get_intervals_query = intervals_query_re.search(line)
953
954 if get_intervals_query:
955 if blastResult_obj.query_start is None:
956 blastResult_obj.query_start = int(get_intervals_query.group(1))
957 alignment_start_index = line.index(get_intervals_query.group(2))
958 subalignment_length = len(get_intervals_query.group(2))
959 capture_matching_line = True
960 sbjct_matching = True
961 aligned_query.append(get_intervals_query.group(2))
962 blastResult_obj.query_end = int(get_intervals_query.group(3))
963
964 get_intervals_Sbjct = sbjct_intervals_re.search(line)
965
966 if get_intervals_Sbjct and sbjct_matching:
967 if blastResult_obj.sbjct_start is None:
968 blastResult_obj.sbjct_start = int(get_intervals_Sbjct.group(1))
969 blastResult_obj.sbjct_end = int(get_intervals_Sbjct.group(3))
970 aligned_sbjct.append(get_intervals_Sbjct.group(2))
971 sbjct_matching = False
972
973 return blast_results
974
975
977
978 score_re = re.compile("Score\s+=\s+([\.\d]+)\s+bits\s+\((\d+)\),\s+Expect\s+=\s+([\d\.e\-]+)")
979 identities_re = re.compile("Identities\s+=\s+\d+\/(\d+)\s+\((\d+)%\)")
980 positives_re = re.compile("Positives\s+=\s+\d+\/\d+\s+\((\d+)%\)")
981 gaps_re = re.compile("Gaps\s+=\s+\d+\/\d+\s+\((\d+)%\)")
982 intervals_query_re = re.compile("Query:\s+(\d+)\s+.+\s(\d+)$")
983 sbjct_intervals_re = re.compile("Sbjct:\s+(\d+)\s+.+\s(\d+)$")
984
985
986 letters_re = re.compile("\(\s*([\d\,]+)\s*letters\s*\)")
987 length_re = re.compile("Length\s+\=\s+([\d\,]+)")
988
989 if fd_output_file is None:
990 fd_output_file = sys.stdout
991
992 if bl2seq_output is None:
993 return
994 else:
995
996
997 bl2seq_lines = bl2seq_output.split("\n")
998
999 blastResult_obj = BlastResult(method="bl2seq",mode="F")
1000 blastResult_obj.sequenceID_A = sequenceID_A
1001 blastResult_obj.sequenceID_B = sequenceID_B
1002
1003 for line in bl2seq_lines:
1004 if re.search("Lambda",line):
1005
1006
1007 if blastResult_obj.e_value is not None:
1008 if blastResult_obj.e_value < 0.1:
1009 fd_output_file.write(str(blastResult_obj))
1010
1011 blastResult_obj.reset()
1012
1013 else:
1014 get_evalue = score_re.search(line)
1015 if get_evalue:
1016
1017 if blastResult_obj.e_value is not None:
1018 if blastResult_obj.e_value < 0.1:
1019
1020 fd_output_file.write(str(blastResult_obj))
1021 blastResult_obj.reset()
1022
1023 blastResult_obj.set_evalue(get_evalue.group(3))
1024 blastResult_obj.score= get_evalue.group(2)
1025 blastResult_obj.score_bits = get_evalue.group(1)
1026
1027 m = letters_re.search(line)
1028 if m:
1029 blastResult_obj.query_length = int(m.group(1).replace(',',''))
1030
1031 m = length_re.search(line)
1032 if m:
1033 blastResult_obj.sbjct_length = int(m.group(1).replace(',',''))
1034
1035 get_identities = identities_re.search(line)
1036 if get_identities:
1037 blastResult_obj.align_length = get_identities.group(1)
1038 blastResult_obj.identities= get_identities.group(2)
1039
1040 get_positives = positives_re.search(line)
1041 if get_positives:
1042 blastResult_obj.positives = get_positives.group(1)
1043
1044 get_gaps = gaps_re.search(line)
1045 if get_gaps:
1046 blastResult_obj.gaps = get_gaps.group(1)
1047
1048 get_intervals_query = intervals_query_re.search(line)
1049 if get_intervals_query:
1050 if blastResult_obj.query_start is None:
1051 blastResult_obj.query_start = get_intervals_query.group(1)
1052 blastResult_obj.query_end = get_intervals_query.group(2)
1053
1054 get_intervals_Sbjct = sbjct_intervals_re.search(line)
1055 if get_intervals_Sbjct:
1056 if blastResult_obj.sbjct_start is None:
1057 blastResult_obj.sbjct_start = get_intervals_Sbjct.group(1)
1058 blastResult_obj.sbjct_end = get_intervals_Sbjct.group(2)
1059