"""
File        : expansion2piana.py
Author      : Ramon Aragues
Creation    : 18.02.2004
Contents    : script that fills up tables in database piana related to external DB "expansion"
Called from : 

=======================================================================================================

This program parses a text file (following expansion format) with interactions and inserts them into piana

The format that this parser accepts is described in README.expansion_data_format

"""

# expansion2piana.py: script that fills up tables in database piana related to external DB "expansion"
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt

import re
import readline

import MySQLdb

import utilities

from PianaDBaccess import *

# verbosity flag: 0 means quiet. parseArguments() sets it to 1 when --verbose is given,
# which makes the script write progress and summary information to stderr
verbose = 0

# ---------------------------------------------------------------
# Set here the default values for command line arguments
# ---------------------------------------------------------------

# parseArguments() sets no_species to 1 when --no-species is given; in that case
# interactions are inserted regardless of the species of the proteins
no_species = 0 # by default 0, will not insert interactions between proteins of different species

# ----------------------
# Function usage()
# ----------------------
def usage():
    """
    Prints the command line help of this script to standard output

    The help text is kept as a sequence of lines and emitted with a single print:
    each line ends up on its own output line, exactly as if it were printed separately
    """
    separator = "--------------------------------------------------------------------------------------------------------------"

    help_lines = (
        separator,
        "This program fills up tables in database piana related to external DB 'expansion' \n",
        "Usage: python expansion2piana.py --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass ",
        "             --expansion-file=expansion_file --num-expansions=num_expansions --code-type-name=code_type_name --no-species  [--help] [--verbose]",
        "\nwhere:",
        "     piana_dbname   : name of database piana to be used (required)",
        "     piana_dbhost   : name of host where database piana to be used is placed (required)",
        "     piana_dbuser   : username accessing the database (not required in most systems)",
        "     piana_dbpass   : password of username accessing the database (not required in most systems)",
        "     expansion_file : the name of the expansion input file",
        "     code_type_name : the code type being used in expansion input file (normally, it should be md5)",
        "     num_expansions : number of expansions performed to get the expansions file (1 or 2)",
        "     --no-species   : inserts all interactions, regardless of the species of proteins",
        "     --help         : prints this message and exits",
        "     --verbose      : prints process info to stdout",
        separator,
    )

    # a single parenthesised string keeps this line valid as a print statement
    print("\n".join(help_lines))
        

# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """
    Parses the command line (sys.argv[1:]) and stores each option in its module-level variable

    Variables written (only when the corresponding option is present on the command line):
       piana_dbname, piana_dbhost, piana_dbuser, piana_dbpass  <-- --piana-db* options
       expansion_file, code_type_name, num_expansions          <-- input file options (kept as strings)
       no_species, verbose                                     <-- set to 1 by --no-species and --verbose

    The function prints the usage message and exits with status 2 when:
       - the command line cannot be parsed (the getopt error is written to stderr first)
       - --help is given
       - one of --expansion-file, --num-expansions or --code-type-name is missing
    """
    
    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass
    
    global expansion_file
    global code_type_name
    global num_expansions
    
    global no_species
    
    global verbose
    
    try:
        opts, args = getopt.getopt(sys.argv[1:], "", ["verbose","help","expansion-file=","code-type-name=","num-expansions=","no-species",
                                                      "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass="])
    except getopt.GetoptError:
        # tell the user what was wrong with the command line before printing the help
        # (sys.exc_info() retrieves the exception without needing any special except syntax)
        sys.stderr.write("Error parsing command line: %s\n" %(sys.exc_info()[1]))
        usage()
        sys.exit(2)
     
    for option,value in opts:
        
        if option == "--expansion-file":
            expansion_file = value
            
        elif option == "--code-type-name":
            code_type_name = value
            
        elif option == "--num-expansions":
            # kept as a string: the main script concatenates it into the method name
            num_expansions = value
            
        elif option == "--piana-dbname":
            piana_dbname = value
            
        elif option == "--piana-dbhost":
            piana_dbhost = value
            
        elif option == "--piana-dbuser":
            piana_dbuser = value
            
        elif option == "--piana-dbpass":
            piana_dbpass = value
            
        elif option == "--no-species":
            no_species = 1
            
        elif option == "--verbose":
            verbose = 1
            
        elif option == "--help":
            # print help information and exit
            usage()
            sys.exit(2)
            
    # END OF for option,value in opts:
    
    if expansion_file is None:
        sys.stderr.write("You didn't set a expansion file name\n")
        usage()
        sys.exit(2)
        
    if num_expansions is None:
        sys.stderr.write("You didn't set a num_expansions\n")
        usage()
        sys.exit(2)
        
    if code_type_name is None:
        # the main script always uses code_type_name to look up the proteins of the input file,
        # so a missing value is reported here instead of failing later
        sys.stderr.write("You didn't set a code_type_name\n")
        usage()
        sys.exit(2)
        
        
        
# --------
# --------
#  Main()               
# --------                               
# --------

# connection parameters for database piana: filled in by parseArguments()
piana_dbname = None
piana_dbuser = None
piana_dbhost = None
piana_dbpass = None

# file containing external DB "expansion" must be set on command line
expansion_file = None
num_expansions = None
code_type_name = None


def _share_species(list_species_a, list_species_b):
    """
    Returns 1 if both taxonomy id lists are non-empty and have at least one id in common

    Returns 0 otherwise (including when any of the lists is empty)
    """
    if list_species_a and list_species_b:
        for species_a in list_species_a:
            if species_a in list_species_b:
                return 1
    return 0


# parsing arguments from the command line
parseArguments()

# Initialising connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost=piana_dbhost, dbuser=piana_dbuser, dbpassword= piana_dbpass)


expansion_fd = open(expansion_file,"r")

# statistics reported at the end of the run (only when verbose)
number_of_interactions_added = 0
number_of_interactions_dif_species = 0
number_of_interactions_no_id = 0

#
# Reading external DB "expansion" and inserting its data into piana DB
#
# the try/finally makes sure the input file is closed even if something fails half way
try:

    if verbose:
        sys.stderr.write( "Reading data from expansion file...\n")

    code_type = utilities.get_code_column(code_type_name)

    line_number = 0

    for line in expansion_fd:

        line_number += 1

        # Retrieving different fields in line
        line_fields = line.split()   # line_fields[0] is protein_code_a
                                     # line_fields[1] is protein_code_b
                                     # line_fields[2] is expansion_name
                                     # line_fields[3] is source_edge_db_id (not used by this script)
                                     # line_fields[4] is source_protein_code

        if len(line_fields) != 5:
            # the line number is given so that the offending line can be found in the input file
            sys.stderr.write("Line %s was not well formatted! Check format used for file! README.expansion_data_format\n" %(line_number))
            continue

        protein_code_a = line_fields[0]
        protein_code_b = line_fields[1]
        protein_code_source = line_fields[4]

        # the method name is built from the expansion type and the number of expansions (both strings)
        expansion_type = piana_access.get_from_dict(dict_name= PianaGlobals.expansion_types, description_value=line_fields[2] )
        method_name = "exp" + expansion_type + num_expansions

        # each external code can map to several proteinPiana identifiers
        list_proteinPiana_a = piana_access.get_list_protein_piana(proteinCode_value= protein_code_a,
                                                                  proteinCodeType_value= code_type,
                                                                  tax_id_value= 0,
                                                                  source_db_info= "no")

        list_proteinPiana_b = piana_access.get_list_protein_piana(proteinCode_value= protein_code_b,
                                                                  proteinCodeType_value= code_type,
                                                                  tax_id_value= 0,
                                                                  source_db_info= "no")

        list_proteinPiana_source = piana_access.get_list_protein_piana(proteinCode_value= protein_code_source,
                                                                       proteinCodeType_value= code_type,
                                                                       tax_id_value= 0,
                                                                       source_db_info= "no")

        if not list_proteinPiana_a or not list_proteinPiana_b:
            # the counter is updated regardless of verbose: only the message depends on it
            number_of_interactions_no_id += 1

            if verbose:
                sys.stderr.write("proteinPiana not found for %s (%s) or %s (%s) \n" %(protein_code_a, list_proteinPiana_a,
                                                                                      protein_code_b, list_proteinPiana_b))
            continue

        if not list_proteinPiana_source:
            # one insertion is attempted per source protein: without source proteins there is nothing to do
            continue

        for proteinPiana_a in list_proteinPiana_a:
            for proteinPiana_b in list_proteinPiana_b:

                # the species check only depends on the pair (a, b): it is done once per pair
                # instead of being repeated for every source protein
                list_species_a = piana_access.get_protein_taxonomy_ids(proteinPiana_value = proteinPiana_a)
                list_species_b = piana_access.get_protein_taxonomy_ids(proteinPiana_value = proteinPiana_b)

                # check if proteins are of the same species. If not, interaction not inserted (except command line option says other)
                same_species = _share_species(list_species_a, list_species_b)

                if verbose and not (list_species_a and list_species_b):
                    sys.stderr.write( "Warning! No taxonomy id found for proteinPiana %s (%s) or proteinPiana %s (%s)\n" %(proteinPiana_a,
                                                                                                                           list_species_a,
                                                                                                                           proteinPiana_b,
                                                                                                                           list_species_b))

                for proteinPiana_source in list_proteinPiana_source:

                    if same_species or no_species:
                        # inserting the interaction if proteins belong to same species, or if command line specified to insert all interactions

                        number_of_interactions_added += 1

                        if verbose:
                            sys.stderr.write( "inserting interaction: %s --> %s with method %s\n" %(proteinPiana_a, proteinPiana_b , method_name))

                        last_interaction_id = piana_access.insert_interaction(proteinPianaA_value = proteinPiana_a,
                                                                              isSourceA_value = 1,
                                                                              proteinPianaB_value = proteinPiana_b,
                                                                              isSourceB_value = 1,
                                                                              interactionConfidence_value = 1,
                                                                              methodDescription_value = method_name,
                                                                              sourceDBDescription_value = "expansion",
                                                                              confidenceAssignedSourceDB_value=1)

                        # as we have kept the interactionPiana, we now insert specific source protein for it
                        if last_interaction_id is not None:
                            piana_access.insert_interaction_protein_source(interactionPiana_value = last_interaction_id,
                                                                           proteinPianaSource_value = proteinPiana_source,
                                                                           sourceDBDescription_value = "expansion")
                        else:
                            sys.stderr.write("Weird: last_intearction_id is None. How can this be?\n")

                    else:
                        # counted once per source protein, i.e. once per insertion that was skipped
                        number_of_interactions_dif_species +=1
                        if verbose:
                            sys.stderr.write( "proteinA %s (species = %s) and proteinB %s (species =%s) are of different species\n" %(proteinPiana_a,
                                                                                                                                      list_species_a,
                                                                                                                                      proteinPiana_b,
                                                                                                                                      list_species_b) )
                    # END OF else: (if same_species or no_species:)

                # END OF for proteinPiana_source in list_proteinPiana_source:
            # END OF for proteinPiana_b in list_proteinPiana_b:
        # END OF for proteinPiana_a in list_proteinPiana_a:

    # END OF for line in expansion_fd:

finally:
    expansion_fd.close()


if verbose:
    sys.stderr.write( "\n------------------------------------------------------------------------\n")
    sys.stderr.write( "All done! Number of ints added: %s.\n" %(number_of_interactions_added))
    sys.stderr.write( "Number of interactions not inserted: between different species: %s\n" %(number_of_interactions_dif_species))
    sys.stderr.write( "Number of interactions with proteins unknown to PIANA: %s\n" %(number_of_interactions_no_id))
    sys.stderr.write( "------------------------------------------------------------------------\n")
