"""
File        : scop_ints2piana.py
Author      : Ramon Aragues
Creation    : 15.02.2006
Contents    : script that fills up tables in database piana related to external DB "scop_ints"
Called from : 

=======================================================================================================

This program parses a text file (following scop_ints format) with interactions and inserts them into piana

"""

# scop_ints2piana.py: script that fills up tables in database piana related to external DB "scop_ints"
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt

import re
import readline

from sets import *

import MySQLdb

import utilities

from PianaDBaccess import *

# verbosity flag: 0 = quiet, 1 = write progress messages (set to 1 by --verbose)
verbose = 0

# ---------------------------------------------------------------
# Set here the default values for command line arguments
# ---------------------------------------------------------------

# set to 1 by --no-species: interactions are then inserted regardless of the species of the proteins
no_species = 0 # by default 0, will not insert interactions between proteins of different species

# ----------------------
# Function usage()
# ----------------------
def usage():
    """
    Writes the command line help of this script to standard output.

    The text is emitted through sys.stdout.write (one line at a time, each followed
    by a newline) so that the function does not depend on the print statement.

    The synopsis shows --no-species between brackets because parseArguments()
    treats it as an optional flag.
    """
    rule = "--------------------------------------------------------------------------------------------------------------"

    help_lines = (
        rule,
        "This program fills up tables in database piana related to external DB 'scop_ints'\n",
        "Usage: python scop_ints2piana.py --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost",
        "                                 --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass ",
        "                                 --scop_ints-file=scop_ints_file [--no-species] [--help] [--verbose]",
        "\nwhere:",
        "     piana_dbname   : name of database piana to be used (required)",
        "     piana_dbhost   : name of host where database piana to be used is placed (required)",
        "     piana_dbuser   : username accessing the database (not required in most systems)",
        "     piana_dbpass   : password of username accessing the database (not required in most systems)",
        "     scop_ints_file : the name of the scop_ints input file",
        "     --no-species   : inserts all interactions, regardless of the species of proteins",
        "     --help         : prints this message and exits",
        "     --verbose      : prints process info to stdout",
        rule,
        "                 REMEMBER TO REMOVE THE HEADER LINE FROM THE INPUT FILE!!!!",
        rule,
    )

    for help_line in help_lines:
        sys.stdout.write(help_line + "\n")
        

# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """
    Parses sys.argv[1:] with getopt and stores the values found in module-level globals.

    Globals written (only when the matching option is present on the command line):
        piana_dbname, piana_dbhost, piana_dbuser, piana_dbpass : connection values for piana
        scop_ints_file : name of the scop_ints input file
        no_species     : set to 1 by --no-species
        verbose        : set to 1 by --verbose

    Exits with status 2 (after printing the usage text) when:
        - an option cannot be parsed; the getopt error message is written to stderr first,
          so that the user can see which option was rejected
        - --help is given
        - no scop_ints file name has been set
    """
    
    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass
    
    global scop_ints_file
    
    global no_species
    
    global verbose
    
    try:
        opts, args = getopt.getopt(sys.argv[1:], "", ["verbose","help","scop_ints-file=","no-species",
                                                      "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass="])
    except getopt.GetoptError:
        # report what went wrong before printing the help and exiting.
        # sys.exc_info() is used to fetch the exception so that no version-specific
        # "except ... , err" / "except ... as err" syntax is needed
        sys.stderr.write("Error parsing command line: %s\n" %(sys.exc_info()[1],))
        usage()
        sys.exit(2)
     
    for option,value in opts:
        
        if option == "--scop_ints-file":
            scop_ints_file = value
            
        elif option == "--piana-dbname":
            piana_dbname = value
            
        elif option == "--piana-dbhost":
            piana_dbhost = value
            
        elif option == "--piana-dbuser":
            piana_dbuser = value
            
        elif option == "--piana-dbpass":
            piana_dbpass = value
            
        elif option == "--no-species":
            no_species = 1
            
        elif option == "--verbose":
            verbose = 1
            
        elif option == "--help":
            # print help information and exit
            usage()
            sys.exit(2)
            
    # END OF for option,value in opts:
    
    # the input file is the only argument checked here: nothing can be done without it
    if scop_ints_file is None:
        sys.stderr.write("You didn't set a scop_ints file name\n")
        usage()
        sys.exit(2)
        
        
# --------
# --------
#  Main()               
# --------                               
# --------

# Values filled in by parseArguments() from the command line
piana_dbname = None
piana_dbuser = None
piana_dbhost = None
piana_dbpass = None

# file containing external DB "scop_ints" must be set on command line
scop_ints_file = None


# method and source database passed to insert_interaction() for every interaction inserted by this script
method_name = "scop_fa_pred"
source_db = "expansion"

# parsing arguments from the command line
parseArguments()

# Initialising connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost=piana_dbhost, dbuser=piana_dbuser, dbpassword= piana_dbpass)


scop_ints_fd = open(scop_ints_file,"r")

#
# Reading external DB "scop_ints" and inserting its data into piana DB
# 

if verbose:
    sys.stderr.write( "Reading data from scop_ints file...\n")

number_of_interactions_added = 0
number_of_interactions_dif_species = 0
number_of_interactions_no_id = 0


already_processed = {} # dictionary used to know which pairs scop_1-scop_2 have already been processed

try:
    # try/finally makes sure the input file is closed even if the database access raises
    for line in scop_ints_fd:

        # Fields in line (whitespace separated, 17 in total):
        #   PDB chain1 class1 fold1 sf1 fa1 dm1 sp1 px1 chain2 class2 fold2 sf2 fa2 dm2 sp2 px2
        # i.e. line_fields[5] is fa1 and line_fields[13] is fa2
        line_fields = line.split()

        if len(line_fields) != 17:
            sys.stderr.write("Line was not well formatted! Check format used for file! README.expansion_data_format\n")
            continue

        # the number of fields matches the format we are following...
        # do:
        #    1. get all proteins that have scop fa1
        #    2. get all proteins that have scop fa2
        #    3. insert interactions between all proteins with fa1 and all proteins with fa2, checking for species
        scop_fa_1 = line_fields[5]
        scop_fa_2 = line_fields[13]

        # check that this pair has not already been processed (key is independent of the order of the pair)
        if scop_fa_1 <= scop_fa_2:       key = "%s.%s" %(scop_fa_1, scop_fa_2)
        else:                            key = "%s.%s" %(scop_fa_2, scop_fa_1)

        if key in already_processed:
            continue

        already_processed[key] = None

        # 1. and 2.
        list_proteins_fa_1 = piana_access.get_proteins_with_scop(cf=None, sf=None, fa=scop_fa_1)
        list_proteins_fa_2 = piana_access.get_proteins_with_scop(cf=None, sf=None, fa=scop_fa_2)

        if not list_proteins_fa_1 or not list_proteins_fa_2:
            # nothing can be inserted for this pair: count it (regardless of verbose) and go to next line
            number_of_interactions_no_id += 1

            if verbose:
                sys.stderr.write("proteinPiana not found for scop fa %s (%s) or scop fa %s (%s) \n" %(scop_fa_1, list_proteins_fa_1,
                                                                                                      scop_fa_2, list_proteins_fa_2))
            continue

        # taxonomy ids of the fa2 proteins are looked up once per line instead of once per (fa1, fa2) pair
        # NOTE(review): assumes inserting an interaction does not change the result of get_protein_taxonomy_ids
        species_of_proteins_fa_2 = [ (protein_fa_2, Set(piana_access.get_protein_taxonomy_ids(proteinPiana_value = protein_fa_2)))
                                     for protein_fa_2 in list_proteins_fa_2 ]

        # 3.
        for protein_fa_1 in list_proteins_fa_1:
            set_species_1 = Set(piana_access.get_protein_taxonomy_ids(proteinPiana_value = protein_fa_1))

            for protein_fa_2, set_species_2 in species_of_proteins_fa_2:

                # check if proteins are of the same species. If not, interaction not inserted (except command line option says other)
                same_species = 0
                if set_species_1 and set_species_2:
                    if set_species_1.intersection(set_species_2):
                        same_species=1
                else:
                    if verbose:
                        sys.stderr.write( "Warning! No taxonomy id found for proteinPiana %s (%s) or proteinPiana %s (%s)\n" %(protein_fa_1,
                                                                                                                               set_species_1,
                                                                                                                               protein_fa_2,
                                                                                                                               set_species_2))

                if same_species or no_species:
                    # inserting the interaction if proteins belong to same species, or if command line specified to insert all interactions

                    number_of_interactions_added += 1

                    if verbose:
                        sys.stderr.write( "inserting interaction: %s --> %s with method %s\n" %(protein_fa_1, protein_fa_2 , method_name))

                    interactionPiana = piana_access.insert_interaction(proteinPianaA_value = protein_fa_1,
                                                                       isSourceA_value = 1,
                                                                       proteinPianaB_value = protein_fa_2,
                                                                       isSourceB_value = 1,
                                                                       interactionConfidence_value = 1,
                                                                       methodDescription_value = method_name,
                                                                       sourceDBDescription_value = source_db,
                                                                       confidenceAssignedSourceDB_value=1      )

                else:
                    # different species for proteins
                    number_of_interactions_dif_species +=1
                    if verbose:
                        sys.stderr.write( "proteinA %s (species = %s) and proteinB %s (species =%s) are of different species\n" %(protein_fa_1,
                                                                                                                                  set_species_1,
                                                                                                                                  protein_fa_2,
                                                                                                                                  set_species_2) )

            # END OF for protein_fa_2, set_species_2 in species_of_proteins_fa_2:
        # END OF for protein_fa_1 in list_proteins_fa_1:

    # END OF for line in scop_ints_fd:

finally:
    scop_ints_fd.close()


if verbose:
    sys.stderr.write( "\n------------------------------------------------------------------------\n")
    sys.stderr.write( "All done! Number of ints added: %s.\n" %(number_of_interactions_added))
    sys.stderr.write( "Number of interactions not inserted: between different species: %s\n" %(number_of_interactions_dif_species))
    sys.stderr.write( "Number of interactions with proteins unknown to PIANA: %s\n" %(number_of_interactions_no_id))
    sys.stderr.write( "------------------------------------------------------------------------\n")
