"""
File        : links_pavlidis2links_proteinPiana.py
Author      : Ramon Aragues
Creation    : 27.04.06 (in NY - MSKCC)
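Contents    : for every coexpressed gene pair of a pavlidis coexpression file, prints scores describing how similar the PIANA interaction partners of both genes are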
Called from : command line

=======================================================================================================

"""

# links_pavlidis2links_proteinPiana.py: transforms a pavlidis coexpression file 
#                                       to proteinPianas coexpression file
#
# Copyright (C) 2006  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt
import os
import time
import re
import readline
import MySQLdb

from sets import *

import copy

from Bio.Blast import NCBIStandalone

import utilities

from PianaDBaccess import *

verbose = 0
verbose_detailed = 0

# ----------------------
# Function usage()
# ----------------------
def usage():
    print "\n--------------------------------------------------------------------------------------------------------------"
    print "\n"
    print "\nUsage: python links_pavlidis2links_proteinPiana.py --input-file=input_file "
    print "      --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass"
    print "       --input-proteins-type=input-proteins-type-value (NOT ACTIVE --output-proteins-type=output-proteins-type-value)"
    print "       --input-proteins-species=input-proteins-species-value  --remove-redundancy [--help] [--verbose] \n" 
    print "\nwhere:"
    print "     input_file   : name of the file with coexpression (format is one coexpression per line, tab separated: (+ or -) num_microarrays gene1 gene2"
    print "     input_proteins_type   : type of identifier used for proteins in input file (eg. geneName, uniacc, unientry, gi, ...)"
    print "     input_proteins_species: species of proteins in input file (eg. human, yeast, ...)"
    print "     --remove-redundancy: set flag if you want to remove homology redundancy from the sets of partners"
    print "     piana_dbname   : name of database piana to be used (required)"
    print "     piana_dbhost   : name of host where database piana to be used is placed (required)"
    print "     piana_dbuser   : username accessing the database (not required in most systems)"
    print "     piana_dbpass   : password of username accessing the database (not required in most systems)"
    print "     --help       : prints this message and exits"
    print "     --verbose    : prints process info to stdout"
    print "--------------------------------------------------------------------------------------------------------------"
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    
    global input_file
    global remove_redundancy

    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass
    
    global input_proteins_type
    global input_proteins_species
    global output_proteins_type

    global verbose
    
    try:
        opts, args = getopt.getopt(sys.argv[1:], "", ["verbose", "help", "input-file=", "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass=",
                                      "input-proteins-type=","input-proteins-species=", "output-proteins-type=", "remove-redundancy"])
    except getopt.GetoptError, bad_opt:
        # print help information and exit:
        sys.stderr.write( bad_opt.__str__() )
        usage()
        sys.exit(2)
     
    for option,value in opts:
        
        if option == "--input-file":
            input_file = value
            
        elif option == "--piana-dbname":
            piana_dbname = value
            
        elif option == "--piana-dbhost":
            piana_dbhost = value
            
        elif option == "--piana-dbuser":
            piana_dbuser = value
            
        elif option == "--piana-dbpass":
            piana_dbpass = value
            
        elif option == "--input-proteins-type":
            input_proteins_type = value
            
        elif option == "--input-proteins-species":
            input_proteins_species= value
            
        elif option == "--output-proteins-type":
            output_proteins_type = value
             
        elif option == "--remove-redundancy":
            remove_redundancy = 1
             
        elif option == "--verbose":
            verbose = 1
            
        elif option == "--help":
            # print help information and exit
            usage()
            sys.exit(2)

    # check arguments
    if input_file is None:
        raise ValueError("trying to run the program without giving an input file")


# --------
# --------
#  Main()               
# --------                               
# --------
# Every setting that parseArguments() can fill in starts as None, so that
# a setting missing from the command line can be told apart afterwards.
input_file = None
remove_redundancy = None

# connection settings for the piana database
piana_dbname = piana_dbhost = piana_dbuser = piana_dbpass = None

# description of the protein identifiers
input_proteins_type = input_proteins_species = output_proteins_type = None

# read the settings given by the user on the command line
parseArguments()

# -------
# settings passed to piana_access.get_all_partners() when the interaction
# partners of a protein are retrieved
#
use_self_ints = "no"
hub_threshold = 0

list_source_dbs = list_source_methods = "all"
inverse_dbs = inverse_methods = "no"

# settings passed to utilities.get_non_redundant_sets_blast() when
# --remove-redundancy was requested (used to decide if proteins are homologous)
#
min_length = 30
min_ident = 0.70
max_evalue= 0.0000000001

#
# -------


# Initialising connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname,
                             dbhost=piana_dbhost,
                             dbuser=piana_dbuser,
                             dbpassword=piana_dbpass)

# counters reported on stderr when the whole input file has been read
number_of_links = num_pairs_no_proteinPiana = num_pairs_too_many_partners = 0

# translate the identifier types and the species given by the user
input_id_column = utilities.get_code_column(input_proteins_type)
output_id_column = utilities.get_code_column(output_proteins_type)
input_tax_id = piana_access.get_taxonomies_from_species_name(species_name_value=input_proteins_species)[0]

# partners of the proteins already looked up (redundancy not removed),
# indexed by the protein code that appears in the input file
dic_protein_partners = {}

# pairs where any of the two proteins has more partners than this are skipped
max_num_partners = 200

# For every protein code already looked up: 1 if piana returned at least one
# proteinPiana for it, 0 otherwise. It is filled together with
# dic_protein_partners, so that the "no proteinPiana" check of a pair always
# refers to the proteins of that pair, also when their partners come from
# the cache.
dic_protein_has_proteinPiana = {}


def _get_partners_for_protein(protein_code):
    """
    Returns a tuple (has_proteinPiana, set_partners) for protein_code

    has_proteinPiana is 1 if piana_access.get_list_protein_piana() returned
    at least one proteinPiana for protein_code, 0 otherwise

    set_partners is the union of the results of piana_access.get_all_partners()
    over all those proteinPianas

    Each protein code is looked up in piana only once: the result of the
    lookup is kept in dic_protein_partners and dic_protein_has_proteinPiana
    """
    if dic_protein_partners.has_key(protein_code):
        # this protein was processed before: reuse what was found then
        return (dic_protein_has_proteinPiana[protein_code], dic_protein_partners[protein_code])

    # no partners have been retrieved for this protein before... do it now

    #  get all proteinPianas associated to the protein code
    list_proteinPiana = piana_access.get_list_protein_piana(proteinCode_value= protein_code,
                                                            proteinCodeType_value= input_id_column,
                                                            tax_id_value= input_tax_id,
                                                            source_db_info= "no")

    #  put together in one set the partners of all those proteinPianas
    set_partners = Set([])
    for one_proteinPiana in list_proteinPiana:
        set_partners.union_update(piana_access.get_all_partners(proteinPiana_value= one_proteinPiana,
                                                                use_self_ints=use_self_ints,
                                                                list_source_dbs= list_source_dbs,
                                                                inverse_dbs=inverse_dbs,
                                                                list_source_methods= list_source_methods,
                                                                inverse_methods=inverse_methods,
                                                                threshold= hub_threshold ) )
    # END OF for one_proteinPiana in list_proteinPiana:

    if list_proteinPiana:
        has_proteinPiana = 1
    else:
        has_proteinPiana = 0

    # save the results for using them in case we process again this protein
    dic_protein_partners[protein_code] = copy.deepcopy(set_partners)
    dic_protein_has_proteinPiana[protein_code] = has_proteinPiana

    return (has_proteinPiana, set_partners)


for one_line in file(input_file, "r"):

    line_fields = one_line.split()

    if len(line_fields) != 4:
        # protect against bad lines
        continue

    number_of_links += 1

    # line_fields[0] is kind of coexpression (+ or -)
    # line_fields[1] is number of microarrays confirming the coexpression
    # line_fields[2] is gene name 1
    # line_fields[3] is gene name 2
    #
    protein_1 = line_fields[2]
    protein_2 = line_fields[3]

    # --
    # 1. and 2. get partners for both proteins of the pair
    # --
    (has_proteinPiana_1, set_partners_1) = _get_partners_for_protein(protein_1)
    (has_proteinPiana_2, set_partners_2) = _get_partners_for_protein(protein_2)

    if not has_proteinPiana_1 or not has_proteinPiana_2:
        # if there was no proteinPiana for any of the proteins in the pair... skip it
        num_pairs_no_proteinPiana += 1
        continue

    if len(set_partners_1) > max_num_partners or len(set_partners_2) > max_num_partners:
        # do not use proteins that have 'too many' partners
        num_pairs_too_many_partners += 1
        continue

    if remove_redundancy:
        # 3. get the non_redundant sets from set_partners_1 and set_partners_2
        (non_redundant_set_1,
         non_redundant_set_2,
         intersection_set)    = utilities.get_non_redundant_sets_blast(piana_access= piana_access,
                                                                       set_1= set_partners_1,
                                                                       set_2= set_partners_2,
                                                                       min_length= min_length, min_identity= min_ident, max_evalue= max_evalue,
                                                                       prefix = "analisis.")
    else:
        # redundancy is not to be removed
        non_redundant_set_1 = set_partners_1
        non_redundant_set_2 = set_partners_2
        intersection_set = set_partners_1.intersection(set_partners_2)

    # 4. calculate all the scores: min_per, max_per, combined_per, num_ints
    dic_scores = {}
    for one_sim_mode in ("min_per", "max_per", "combined_per", "num_ints"):
        dic_scores[one_sim_mode] = utilities.get_partners_similarity(set_a=non_redundant_set_1,
                                                                     set_b=non_redundant_set_2,
                                                                     intersection_set=intersection_set,
                                                                     sim_mode= one_sim_mode)

    # 5. write to stdout protein1 protein2 (+ or -) num_microarrays num_ints min_per max_per combined_per  num_ints_1 num_ints_2
    sys.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %(protein_1, protein_2,
                                                                  line_fields[0], line_fields[1],
                                                                  dic_scores["num_ints"],
                                                                  dic_scores["min_per"],
                                                                  dic_scores["max_per"],
                                                                  dic_scores["combined_per"],
                                                                  len(non_redundant_set_1),
                                                                  len(non_redundant_set_2)  ) )

    # 6. analyze? how?


# END OF for one_line in file(input_file, "r"):

sys.stderr.write("Num pairs processed: %s\n" %(number_of_links))
sys.stderr.write("Num pairs skipped (no proteinPiana found): %s\n" %(num_pairs_no_proteinPiana))
sys.stderr.write("Num pairs skipped (too many partners): %s\n" %(num_pairs_too_many_partners))
