1 """
2 BIANA: Biologic Interactions and Network Analysis
3 Copyright (C) 2009 Javier Garcia-Garcia, Emre Guney, Baldo Oliva
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17
18 """
19
20 from Sequence import ProteinSequence, RNASequence, DNASequence
21 import biana.biana_globals as biana_globals
22
23 import traceback
24 import math
25 import gzip
26 import sys
27 import os
28 import re
29
30
31 temp_acc = 0
33
34 - def __init__(self, name, resolution=None):
35
36 self.name = name
37 self.resolution = resolution
38
39 self.num_atoms = 0
40
41 self.chains = {}
42
43
44
45
46 self.sorted_residues = {}
47 self.sorted_chains = None
48
49 self.C_structure = None
50
51 self.executed_dssp = False
52
53
54
55
56
57
58
59
60 - def read_pdb_file(pdbFile, fragments=[], merge_fragments=False, chain_value='A'):
61 """
62 merge_fragments is used to reenumerate residues in fragments and consider all the fragments as a single chain
63
64 "chain_value" is used only when merging fragments
65 """
66
67
68
69 pdbObject = PDB(name=pdbFile)
70
71
72 requested_chains = [ x.get_chain() for x in fragments ]
73
74
75
76
77
78
79
80
81
82
83
84
85 atom_regex = re.compile("ATOM\s+(\d+)\s+(\w{1,4})\s*(\w+)\s+(\w{0,1})\s+(\-{0,1}\d+)\w*\s+(\-*\d*\.\d{3})\s*(\-*\d*\.\d{3})\s*(\-*\d*\.\d{3})\s*")
86
87
88 if pdbFile.endswith(".gz"):
89 pdb_fd = gzip.open(pdbFile)
90 else:
91 pdb_fd = open(pdbFile)
92
93
94 control_num = 0
95
96 residue_num_value = 0
97
98 current_residue_num = None
99
100 for line in pdb_fd:
101
102 if line.startswith("ATOM"):
103 m = atom_regex.match(line)
104
105 if m:
106
107 chain_id = m.group(4)
108
109 if chain_id.strip() == '':
110 chain_id = " "
111
112 residue_num = int(m.group(5))
113
114 if len(fragments)>0:
115 if True in [ current_fragment.includes( chain=chain_id, res_num = residue_num ) for current_fragment in fragments ]:
116 selected_residue = True
117 else:
118 continue
119 if merge_fragments:
120 if residue_num != current_residue_num:
121 residue_num_value += 1
122 current_residue_num = residue_num
123 chain_id = chain_value
124 else:
125 residue_num_value = residue_num
126
127
128 current_atom = PDBAtom( atom_num = int(m.group(1)),
129 atom_type = m.group(2).strip(),
130 atom_name = m.group(2).strip(),
131 x = float(m.group(6)),
132 y = float(m.group(7)),
133 z = float(m.group(8)) )
134 control_num += 1
135 pdbObject.add_atom( atomObject = current_atom, chain_name = chain_id, residue_num = residue_num_value, residue_type = m.group(3) )
136
137 else:
138 sys.stderr.write("Alert. Following atom does not match with regular expression\n%s" %line)
139
140 if control_num == 0:
141 raise ValueError("No residues were added")
142
143 return pdbObject
144
145
146
147
148 from Bio.PDB.PDBParser import PDBParser
149
150 parser = PDBParser()
151
152 temp_struct = parser.get_structure(pdbFile,pdbFile)
153
154 if len(temp_struct.get_list())>1:
155 print "ALERT. PDB %s has more than a single model" %pdbFile
156
157 pdbObject = PDB(name=pdbFile)
158
159
160 requested_chains = [ x.get_chain() for x in fragments ]
161
162
163
164
165
166 chain_list = temp_struct[0].get_list()
167
168 residue_num_value = 0
169
170 control_num = 0
171
172 for current_chain in chain_list:
173
174
175
176 for current_residue in current_chain.get_list():
177 if current_residue.get_resname() == 'HOH' or current_residue.get_resname() == 'GDP' or current_residue.get_resname()==" MG":
178 continue
179
180
181 residue_num = int(current_residue.get_id()[1])
182
183 if merge_fragments:
184 chain_id = 'A'
185 else:
186 chain_id = current_chain.get_id()
187
188 if len(fragments)>0:
189 if True in [ current_fragment.includes( chain=current_chain.get_id(), res_num = residue_num ) for current_fragment in fragments ]:
190 selected_residue = True
191 else:
192 continue
193
194
195
196
197 control_num = control_num + 1
198
199 if merge_fragments:
200 residue_num_value += 1
201 else:
202 residue_num_value = residue_num
203
204
205 for current_atom in current_residue.get_list():
206 current_atom = PDBAtom( atom_num = current_atom.get_serial_number(),
207 atom_type = current_atom.get_name(),
208 atom_name = current_atom.get_name(),
209 x = current_atom.get_coord()[0],
210 y = current_atom.get_coord()[1],
211 z = current_atom.get_coord()[2])
212 pdbObject.add_atom( atomObject = current_atom, chain_name = chain_id, residue_num = residue_num_value, residue_type = current_residue.get_resname())
213
214 if control_num == 0:
215 raise ValueError("No residues were added")
216
217
218 return pdbObject
219
220 read_pdb_file = staticmethod(read_pdb_file)
221
222
224
225 if self.executed_dssp is False:
226 self._get_dssp_results()
227
228 return [ self.chains[chain_name][x].dssp_rel_accessibility for x in self._get_sorted_residues(chain_name=chain_name) ]
229
230
231
232
233
234
235
236
237
238
239
246
247
248
249
250
251
252
253
254
255
256
263
264
265
266
267
268
269
270
271
272
273
275 self.resolution = float(resolution)
276
278 if self.sorted_residues[chain_name] is None:
279 self.sorted_residues[chain_name] = self.chains[chain_name].keys()
280 self.sorted_residues[chain_name].sort()
281 return self.sorted_residues[chain_name]
282
284 """
285 Returns a list with the chain names sorted
286 """
287 if self.sorted_chains is None:
288 self.sorted_chains = self.chains.keys()
289 self.sorted_chains.sort()
290
291 return self.sorted_chains
292
294 return self.chains.keys()
295
297 return self.num_atoms
298
300 temp = [ x.get_num_atoms() for x in self.chains[chain_name].values() ]
301 return sum(temp)
302
304 return self.chains[chain_name].values()
305
307 if chain_name is None:
308 return sum([len(x) for x in self.chains.values()])
309 else:
310 return len(self.chains[chain_name])
311
314
315 - def add_residue(self, chain_name, residue_num=0, atoms_initial_list=[], residue_type=None,hssp_conservation=None, hssp_entropy=None, hssp_exposure=None, hssp_norm_entropy=None, hssp_variability=None, dssp=None, residue_object = None):
316 """
317 To add residue. Chain_name is mandatory always. residue_num and atoms_initial_list are mandatory only if residue_object is not specified
318
319 if residue_object is specified, the rest of parameters except chain_name are not used
320 """
321
322 residue_num = int(residue_num)
323
324 if residue_object is None:
325 residue_object = PDBResidue(residue_num = residue_num, residue_type=residue_type, atoms_initial_list = atoms_initial_list,
326 hssp_conservation=hssp_conservation, hssp_entropy=hssp_entropy, hssp_exposure=hssp_exposure,
327 hssp_norm_entropy=hssp_norm_entropy, hssp_variability=hssp_variability, dssp=dssp)
328 else:
329 residue_num = residue_object.residue_num
330
331 try:
332 if self.chains[chain_name].has_key(residue_num):
333 print "Trying to insert twice the same residue... error?"
334
335 else:
336 self.chains[chain_name][residue_num] = residue_object
337 self.sorted_residues[chain_name]=None
338 except:
339 self.chains[chain_name] = {residue_num: residue_object}
340 self.sorted_residues[chain_name]=None
341
342
343 - def add_atom(self, chain_name, residue_num, residue_type=None, atomObject=None):
344 """
345 Atom object can be None in order to indicate that it must be added a residue but no atoms information
346 """
347
348 self.num_atoms += 1
349 residue_num = int(residue_num)
350 try:
351 if self.chains[chain_name].has_key(residue_num):
352 self.chains[chain_name][residue_num].add_atom(atomObject)
353 else:
354 self.chains[chain_name][residue_num] = PDBResidue(residue_num = residue_num, residue_type=residue_type, atoms_initial_list = [atomObject])
355 self.sorted_residues[chain_name]=None
356 except:
357 self.chains[chain_name] = {residue_num: PDBResidue(residue_num = residue_num, residue_type=residue_type, atoms_initial_list = [atomObject])}
358 self.sorted_residues[chain_name]=None
359
360
362
363 return "PDB with id %s, with %s chains, %s residues and %s atoms" %(self.get_name(),
364 len(self.chains),
365 sum([len(x) for x in self.chains.values()]),
366 self.num_atoms)
394
395
410
411
415
419
423
427
431
433
434 return "".join([ self.chains[chain_name][x].dssp for x in self._get_sorted_residues(chain_name=chain_name) ])
435
437 """
438 Returns the dssp result in psi-pred nomenclature
439 """
440
441 return self.get_dssp(chain_name=chain_name).replace('T','C').replace('G','C').replace('S','C').replace(' ','C').replace('B','E')
442
443
444
446 """
447 Returns a new PDB object with the residues filtered by binary_list
448
449 The PDB must contain a single chain
450 """
451
452 if len(self.chains) != 1:
453 raise ValueError("Trying to filter a PDB with a more than a single chain")
454
455 chain_name = self.chains.keys()[0]
456
457 new_pdb = PDB(name="filtered_%s" %self.name)
458
459 sorted_residues = self._get_sorted_residues(chain_name)
460
461 if len(sorted_residues) != len(binary_list):
462 raise ValueError("PDB has a distinct number of residues (%s) than binary list (%s)" %(self.get_num_residues(chain_name),len(binary_list)))
463
464 for actual_residue_x in xrange(len(sorted_residues)):
465 if binary_list[actual_residue_x]==1:
466 residue_object = self.chains[chain_name][sorted_residues[actual_residue_x]]
467 new_pdb.add_residue( chain_name = chain_name, residue_object = residue_object )
468
469 return new_pdb
470
471
510
511 - def get_distances(self, pdb2, chains1, chains2, type="min", fragments1=None, fragments2=None):
512 """
513 "type" can be "min", "mean", "ca" or "cb"
514
515 "fragments" is used to determine which fragments are going to be used to calculate distances
516 TODO!!!
517
518 For the moment, it returns a dictionary with the wcm of the different chain combinations
519 """
520
521
522
523
524 global temp_acc
525 temp_acc = 0
526
527 try:
528 import pprc
529 except:
530 raise ValueError("It is necessary pprc module to execute this method")
531
532 if type.lower() == "min":
533 method = PDBResidue.get_min_distance
534 elif type.lower() == "mean":
535 method = PDBResidue.get_mean_distance
536 elif type.lower() == "ca":
537 method = PDBResidue.get_ca_distance
538 elif type.lower() == "cb":
539 method = PDBResidue.get_cb_distance
540 else:
541 raise "Trying to calculate distances with a method unknown (%s)" %type
542
543 distances_list = []
544
545 distances = {}
546
547 for actual_chain1 in chains1:
548 residues1 = self._get_sorted_residues(chain_name = actual_chain1)
549 distances[actual_chain1] = {}
550 distances_list = []
551 for actual_chain2 in chains2:
552
553
554 residues2 = pdb2._get_sorted_residues(chain_name = actual_chain2)
555
556 distances_list.extend( [ method(self.chains[actual_chain1][actual_residue1],residue2=pdb2.chains[actual_chain2][actual_residue2])
557 for actual_residue1 in residues1 for actual_residue2 in residues2 ] )
558
559
560
561
562
563
564
565
566
567
568 distances[actual_chain1][actual_chain2] = pprc.wcm(rows = self.get_num_residues(chain_name = actual_chain1),
569 columns = pdb2.get_num_residues(chain_name = actual_chain2),
570 values = distances_list,
571 description = "Distances between cb atoms between structure %s chain %s and %s chain %s" %(self.name,
572 actual_chain1,
573 pdb2.name,
574 actual_chain2 ),
575 type="DISTANCE_TYPE")
576
577 return distances
578
579
580
582
583
584
585
586
587 surface = {'A': 115,
588 'C': 149,
589 'D': 170,
590 'E': 207,
591 'F': 230,
592 'G': 86,
593 'H': 206,
594 'I': 187,
595 'K': 222,
596 'L': 192,
597 'M': 210,
598 'N': 184,
599 'P': 140,
600 'Q': 208,
601 'R': 263,
602 'S': 140,
603 'T': 164,
604 'V': 161,
605 'W': 269,
606 'Y': 257}
607
608
609 start_regex = re.compile(" # RESIDUE AA STRUCTURE BP1 BP2")
610 in_results = None
611
612 temp_seq = []
613
614 for line in fp:
615
616 if in_results is None:
617 if re.search(start_regex,line):
618 in_results = 1
619 continue
620
621
622 if line[13]=="!":
623 continue
624
625 temp_seq.append(line[13])
626 acc = int(line[35:38])
627
628 chain = line[11]
629 res_num = int(line[5:10].strip())
630 res_type = line[13]
631
632 residue_object = self.chains[chain][res_num]
633
634 residue_object.dssp_accessibility = acc
635 if ProteinSequence.get_aminoacid_code_3to1(residue_object.get_residue_type())!=res_type and res_type!="X":
636 print ProteinSequence.get_aminoacid_code_3to1(residue_object.get_residue_type())
637 print res_type
638 sys.stderr.write("DSSP LINE: %s" %line)
639 raise ValueError("Discordance between PDB and DSSP")
640
641
642 try:
643 residue_object.dssp_rel_accessibility = float(acc)*100/surface[line[13]]
644
645 except:
646 print "The code arrives here? if yes... why??"
647 pass
648
649
650
651
652 if line[16]==' ':
653 residue_object.dssp_ss = 'N'
654
655 else:
656 residue_object.dssp_ss = line[16]
657
658
659
660
661
663
664
665
666
667 sys.stderr.write("Error in child process, pid %d.\n" %os.getpid())
668 sys.stderr.flush()
669 traceback.print_exc()
670 sys.stderr.flush()
671
673 """
674 Executes the program dssp to get the accessibility
675 """
676
677 import popen2
678 import readline
679
680 dssp_program = biana_globals.DSSP_EXEC
681 args = ["--"]
682
683 print dssp_program
684
685
686 p_readfd, c_writefd = os.pipe()
687 c_readfd, p_writefd = os.pipe()
688
689 if os.fork():
690
691 for fd in (c_readfd, c_writefd, p_writefd):
692 os.close(fd)
693
694
695 fp = os.fdopen(p_readfd, 'r')
696
697
698 self._parse_dssp_results(fp)
699 fp.close()
700
701
702
703 else:
704
705 try:
706 if os.fork():
707
708 os.write(p_writefd,self.get_in_pdb_format())
709 else:
710
711 try:
712
713 os.close(0)
714 os.dup(c_readfd)
715
716 os.close(1)
717 os.dup(c_writefd)
718
719
720 os.close(2)
721
722
723 for fd in (c_readfd, c_writefd, p_readfd, p_writefd):
724 os.close(fd)
725
726 print dssp_program
727 print args
728 os.execv(dssp_program, [dssp_program] + args)
729 except:
730 self.safe_traceback()
731 os._exit(127)
732 except:
733 self.safe_traceback()
734 os._exit(127)
735 else:
736 os._exit(0)
737
738 self.executed_dssp = True
739
740
742
743 - def __init__(self, residue_num, residue_type, atoms_initial_list=[], hssp_conservation=None, hssp_entropy=None, hssp_exposure=None, hssp_norm_entropy=None, hssp_variability=None, dssp=None):
744 self.residue_num = int(residue_num)
745 self.residue_type = residue_type
746
747 self.atoms = []
748
749 for current_atom in atoms_initial_list:
750 self.add_atom(current_atom)
751
752 self.ca_index = None
753 self.cb_index = None
754
755
756 self.hssp_conservation = hssp_conservation
757 self.hssp_entropy = hssp_entropy
758 self.hssp_exposure = hssp_exposure
759 self.hssp_norm_entropy = hssp_norm_entropy
760 self.hssp_variability = hssp_variability
761 self.dssp = dssp
762
763 self.dssp_accessibility = None
764 self.dssp_rel_accessibility = None
765 self.dssp_ss = None
766
769
771 return self.hssp_conservation
772
774 return self.hssp_entropy
775
777 return self.hssp_exposure
778
780 return self.hssp_norm_entropy
781
783 return self.hssp_variability
784
786 if atomObject.atom_name == "CA":
787 self.ca_index = len(self.atoms)
788 elif atomObject.atom_name == "CB":
789 self.cb_index = len(self.atoms)
790 self.atoms.append(atomObject)
791
793 return len(self.atoms)
794
796 return self.residue_num
797
800
802 return self.residue_type
803
805 return len(self.atoms)
806
808
809 return min([x.get_distance(y) for x in self.atoms for y in residue2.atoms])
810
812
813 raise "TODO"
814 return sum([ x.get_distance(atom2=y) for x in self.atoms for y in residue2.atoms ])/(len(self.atoms)*len(residue2.atoms))
815
817 if self.ca_index and residue2.ca_index:
818 return self.atoms[self.ca_index].get_distance(atom2 = residue2.atoms[residue2.ca_index])
819 else:
820 print "Trying to calculate distances between alpha atoms and coordinates are not known"
821 return None
822
824
825 if self.cb_index and residue2.cb_index:
826 distance= self.atoms[self.cb_index].get_distance(atom2 = residue2.atoms[residue2.cb_index])
827 elif self.ca_index and residue2.cb_index:
828 distance= self.atoms[self.ca_index].get_distance(atom2 = residue2.atoms[residue2.cb_index])
829 elif self.cb_index and residue2.ca_index:
830 distance= self.atoms[self.cb_index].get_distance(atom2 = residue2.atoms[residue2.ca_index])
831 elif self.ca_index and residue2.ca_index:
832 distance= self.atoms[self.ca_index].get_distance(atom2 = residue2.atoms[residue2.ca_index])
833 else:
834 print "Trying to calculate distances between beta atoms and coordinates are not known for cb and ca: Residue %s %s" %(self.residue_num, self.residue_type)
835 distance= None
836
837 global temp_acc
838
839
840
841
842 if( distance <= 8 ):
843
844 temp_acc = temp_acc+1
845
846 return distance
847
848
850
851 - def __init__(self, atom_num, atom_type, atom_name, x, y, z):
852
853 self.atom_num = int(atom_num)
854 self.atom_type = atom_type
855 self.atom_name = atom_name
856 self.x = x
857 self.y = y
858 self.z = z
859
861 return (self.x,self.y,self.z)
862
865
868
871
873 return self.atom_name
874
876 return self.atom_type
877
880
882 dx = self.x - atom2.x
883 dy = self.y - atom2.y
884 dz = self.z - atom2.z
885 return math.sqrt(dx*dx+dy*dy+dz*dz)
886
887
889 """
890 Class to represent a pdb fragment
891 """
892
893 - def __init__(self, start_residue=None, end_residue=None, chain=None):
894
895
896 self.start_residue = start_residue
897 self.end_residue = end_residue
898 self.chain = chain
899
901 return "PDB Fragment. Chain: %s. Start: %s. End: %s" %(self.chain, self.start_residue, self.end_residue)
902
905
907 return self.start_residue
908
910 return self.end_residue
911
914
916 """
917 Determines if the residue belongs to this fragment or not
918 """
919
920 if self.chain is None or self.chain==chain:
921 if self.start_residue is None and self.end_residue is None:
922 return True
923 elif self.start_residue is None and self.end_residue>=res_num:
924 return True
925 elif self.end_residue is None and self.start_residue<=res_num:
926 return True
927
928
929
930
931 return False
932
933
935
936 frag_regex = re.compile("(\w*)\:*(\-*\d*)\-*(\-*\d*)")
937
938
939 splitted = fragment_str.split(separator)
940
941
942 fragments = []
943
944 for current_splitted in splitted:
945
946 if current_splitted.strip()=='':
947 continue
948
949 if current_splitted=="-":
950 current_splitted=''
951 fragments.append(PDBFragment( chain = None, start_residue = None, end_residue = None ))
952 continue
953
954 m = frag_regex.match(current_splitted)
955
956 if m:
957 if m.group(1) == '':
958 chain = None
959 else:
960 chain = m.group(1)
961
962 if m.group(2)=='':
963 start_residue = None
964 else:
965 start_residue = int(m.group(2))
966
967 if m.group(3)=='':
968 end_residue = None
969 else:
970 end_residue = int(m.group(3))
971
972 fragments.append(PDBFragment( chain = chain, start_residue = start_residue, end_residue = end_residue ))
973 else:
974 print "Fragment %s does not match with regular expresion" %current_splitted
975
976 return fragments
977
978 fragment_parser = staticmethod(fragment_parser)
979