"""
File        : parse_scop_domains_distribution.py
Author      : Ramon Aragues
Creation    : 01.03.2006
Contents    : parses a fasta file where the title line describes the scop domains
              distribution for a proteinPiana
Called from : 

=======================================================================================================


"""

# parse_scop_domains_distribution.py: parses a fasta file where the title
# line describes the scop domains distribution for a proteinPiana
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt

import re
import readline
import MySQLdb

import utilities

from PianaDBaccess import *

from Bio import Fasta  # needed to read the pdbaa file (which is in fasta format)

# ---------------------------------------------------------------
# Set here the default values for command line arguments
# ---------------------------------------------------------------
#

# Values the program uses when it is called directly from build_database.py
# (input_file and verbose are overwritten by parseArguments() when the
# corresponding command line options are given):

verbose = 0
verbose_detailed = 0

input_file = None


# coverage percentages to test: one results file is produced for each of them
per_covered_thresholds_to_test = [50, 60, 70, 80, 90]

# ----------------------
# Function usage()
# ----------------------
def usage():
    """Print the command line help of this script to stdout.

    Each message is printed with a single parenthesised argument so that the
    function behaves the same under Python 2 and Python 3.
    """
    print("--------------------------------------------------------------------------------------------------------------")
    print("This program parses a fasta file whose title lines describe the scop domains distribution of each proteinPiana")
    print("and writes, for each coverage threshold tested, a file input_file_name.thres_<threshold> listing the")
    print("proteinPianas whose percentage of sequence covered by scop domains is higher than that threshold \n")
    print("Usage: python parse_scop_domains_distribution.py  --input-file=input_file_name   [--help] [--verbose]")
    print("\nwhere:")
    print("      input_file_name  : file name of input file containing the fasta file describing the scop distribution across proteinPianas")
    print("     --help            : prints this message and exits")
    print("     --verbose         : prints process info to stdout")
    print("---------------------------------------------------------------------------------------------------------------")
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """Parse the command line options and store them in module-level globals.

    Recognised options:
        --input-file=<name> : stored in the global input_file
        --verbose           : sets the global verbose to 1
        --help              : prints the usage message and exits

    Positional arguments are accepted by getopt but ignored.

    Exits with status 2 (after printing the error and the usage message)
    when an unknown or malformed option is given; --help also exits with
    status 2.
    """
    global input_file
    global verbose

    try:
        opts, _ignored_args = getopt.getopt(sys.argv[1:], "", ["verbose", "help", "input-file="])
    except getopt.GetoptError as bad_opt:
        # report what was wrong (on its own line), print help information and exit
        sys.stderr.write(str(bad_opt) + "\n")
        usage()
        sys.exit(2)

    for option, value in opts:

        if option == "--input-file":
            input_file = value

        elif option == "--verbose":
            verbose = 1

        elif option == "--help":
            # print help information and exit
            usage()
            sys.exit(2)

    # END OF for option,value in opts:

# --------
# --------
#  Main()               
# --------                               
# --------

def _scop_covered_length(scop_descriptions):
    """Return the summed length of the domains in the given scop descriptions.

    scop_descriptions is a list of strings such as
        "7,157 d1juqa_ a.118.9.2 cl=46456 cf=48370 sf=48464 fa=48468 sp=69098"
        "168,230 d1hcz_2 b.84.2.2 cl=48724 cf=51229 sf=51246 fa=51256 sp=51258"
    where the first whitespace-separated field is 'residue1,residue2'.
    Descriptions that are empty (or only whitespace) are ignored.
    """
    covered_length = 0
    for one_scop_description in scop_descriptions:
        fields_one_scop_description = one_scop_description.split()

        if not fields_one_scop_description:
            # some proteins are not described at all... skip them
            continue

        # fields_one_scop_description[0] is 'residue1,residue2'
        residue_range = fields_one_scop_description[0].split(",")
        init_res = int(residue_range[0])
        end_res = int(residue_range[1])

        # NOTE(review): if residue1 and residue2 are both inclusive, the domain
        # length would be end_res - init_res + 1. Kept as in the original
        # implementation -- confirm against the input file format.
        covered_length += (end_res - init_res)

    return covered_length


input_file = None

# parsing arguments from the command line
parseArguments()

if input_file is None:
    # without an input file there is nothing to parse and no base name for the results files
    sys.stderr.write("ERROR: option --input-file is mandatory\n")
    usage()
    sys.exit(2)

# results files, indexed by threshold: threshold -> (file name, open file object)
threshold_files = {}
scop_domains_input_file = None

try:
    # initialize the files that will hold results: they are created (or emptied)
    # once and kept open until all records have been processed
    for one_threshold in per_covered_thresholds_to_test:
        if one_threshold not in threshold_files:
            file_name = input_file + ".thres_" + str(one_threshold)
            threshold_files[one_threshold] = (file_name, open(file_name, "w"))
    # END OF for one_threshold in per_covered_thresholds_to_test:

    scop_domains_parser = Fasta.RecordParser()
    scop_domains_input_file = utilities.open_file(input_file, 'r')
    scop_domains_iterator = Fasta.Iterator(scop_domains_input_file, scop_domains_parser)

    scop_domains_record = scop_domains_iterator.next()

    if verbose:
        print("Processing file")

    # while record read is not None, parse the record and write the protein to
    # the results file of each threshold it exceeds
    while scop_domains_record is not None:

        protein_title_line = scop_domains_record.title
        sequence_length = len(scop_domains_record.sequence)  # this is the total length of the protein

        title_fields = protein_title_line.split(":")

        # title_fields[0] is proteinPiana_proteinPiana
        this_proteinPiana = title_fields[0].split("_")[0]

        if sequence_length == 0:
            # the percentage covered is undefined for an empty sequence: skip
            # the record instead of failing with a division by zero
            sys.stderr.write("skipping %s because its sequence is empty\n" %(this_proteinPiana))
        else:
            # title_fields[1:] is the list of scop descriptions of this protein
            covered_length = _scop_covered_length(title_fields[1:])

            # calculate the percentage of the protein sequence that is covered by SCOP
            per_scop_covered = 100 * covered_length / float(sequence_length)

            for one_threshold in per_covered_thresholds_to_test:
                if per_scop_covered > one_threshold:
                    # the percentage of the protein covered is higher than this
                    # threshold: write the protein to the results file of the threshold
                    file_name, file_fd = threshold_files[one_threshold]
                    file_fd.write("%s\n" %(this_proteinPiana))
                    sys.stderr.write("writing %s to file %s because %s was higher than %s\n" %(this_proteinPiana, file_name, per_scop_covered, one_threshold))
            # END OF for one_threshold in per_covered_thresholds_to_test:

        scop_domains_record = scop_domains_iterator.next()

    # END OF while scop_domains_record is not None

finally:
    # make sure results are flushed and all files are released, even on error
    for file_name, file_fd in threshold_files.values():
        file_fd.close()

    if scop_domains_input_file is not None:
        # assumes utilities.open_file returns a file-like object -- TODO confirm
        scop_domains_input_file.close()

