"""
File        : CirClusteringSimilarityFunction.py
Author      : Ramon Aragues
Creation    : 9.2.2006
Contents    : similarity function that determines similarity between clusters with IR terms
Called from : 

=======================================================================================================

"""

# CirClusteringSimilarityFunction.py: similarity function that determines
#                                     similarity between clusters with IR terms
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import numarray

import PianaGlobals

from GraphCluster import *
from Clustering import *
from ClusteringSimilarityFunction import *

# Module-level debug switches (0 = off, any true value = on).
# "verbose" is not read anywhere in the part of this file shown here.
verbose = 0
# "verbose_detailed" enables the stderr traces written while
# CirClusteringSimilarityFunction.calculate_similarity() runs.
verbose_detailed = 0

#------------------------------------------------------------------------------------------------
class CirClusteringSimilarityFunction(ClusteringSimilarityFunction):
    """
    Similarity function that determines similarity between clusters with IR terms
    (ie. between two CIRs), based on the protein partners that the clusters share.
    """

    # Names accepted for the mode that reports the raw number of common protein partners.
    # "total_num" is the name used by the documentation of this class, "num_ints" is the
    # name the code has historically checked for: both are accepted so that callers
    # following either one keep working.
    _COUNT_MODES = ("num_ints", "total_num")

    def __init__(self, piana_access= None, path_length_threshold= None, similarity_mode=None):

        """

        "piana_access" is a database accession object used to access information from a PIANA database
           (forwarded to ClusteringSimilarityFunction as "dbaccess")

        "path_length_threshold": Maximum distance between two clusters to be joined
           (forwarded to ClusteringSimilarityFunction as "path_length_threshold_value")

        "similarity_mode" determines how the similarity between two clusters will be calculated

            All modes are gated by the number of proteins that the partner clusters of cluster1 and
            cluster2 have in common: when that number is 0, the similarity is 0 (this is a way to
            control that clusters are only fused if they interact with the same proteins).

            When the gate is passed, the similarity returned is the mode-dependent term below, where

                common = number of protein partners (in the original graph) shared by the proteins
                         in cluster1 and the proteins in cluster2
                n1     = number of protein partners of the proteins in cluster1
                n2     = number of protein partners of the proteins in cluster2

            - 'total_num' (also accepted as 'num_ints'): global number of common interaction partners
                            ->  common

            - 'combined_per': average of the percentages of partners of each cluster that are common partners
                            ->  1/2 ( common x 100 / n1  +  common x 100 / n2 )

            - 'min_per': percentage of partners that are shared (taking the cluster with fewer partners)
                            ->  common x 100 / min(n1, n2)

            - 'max_per': percentage of partners that are shared (taking the cluster with more partners)
                            ->  common x 100 / max(n1, n2)

            NOTE(review): earlier documentation described the similarity as the gate value multiplied by
            the mode-dependent term. The code returns only the mode-dependent term; confirm which
            one is intended.

            The value of similarity_mode is not validated here: an invalid value raises ValueError
            from calculate_similarity() the first time the mode-dependent term has to be computed.
        """

        self.similarity_mode = similarity_mode

        # for CIR clustering, mode is irrelevant (similarity is always calculated looking at the cluster as a whole)
        ClusteringSimilarityFunction.__init__(self, dbaccess = piana_access,
                                              mode = None,
                                              path_length_threshold_value = path_length_threshold)


    def get_proteinPianas_list(self, list_node_attribute=None):
        """
        Method that returns the list of distinct proteinPianas found in a list of node attributes

        "list_node_attribute": list of attributes of a cluster. Each attribute must provide a
                               get_proteinPiana() method. None is treated as an empty list.

        Returns a list without repeated values (an empty list if there are no attributes)
        """
        # a dictionary is used to remove repeated proteinPianas
        proteinPiana_dic = {}

        for node_attribute in list_node_attribute or ():
            proteinPiana_dic[node_attribute.get_proteinPiana()] = None

        # list(...) guarantees a real list regardless of the python version
        return list(proteinPiana_dic)


    def _write_debug(self, template, *values):
        """
        Writes "template" % values to stderr, only if the module flag verbose_detailed is set
        (the message is not even formatted when the flag is off)
        """
        if verbose_detailed:
            # local import: this module does not import sys explicitly
            import sys
            sys.stderr.write(template % values)


    def _get_proteins_in_partner_clusters(self, clustered_graph, cluster_id):
        """
        Returns the set of proteinPianas contained in the clusters that are neighbours
        of cluster "cluster_id" in "clustered_graph"
        """
        proteins_in_partners = set()

        for partner_id in clustered_graph.get_node(identifier=cluster_id).get_neighbour_ids():
            list_ir_attributes = clustered_graph.get_node(identifier=partner_id).get_node_attribute_object().get_list_elements()
            proteins_in_partners.update(self.get_proteinPianas_list(list_node_attribute=list_ir_attributes))

        return proteins_in_partners


    def _get_partners_of_proteins(self, original_graph, list_node_attributes):
        """
        Returns the set of ids of the nodes that are neighbours (in "original_graph")
        of the proteins referenced by "list_node_attributes"
        """
        partners_of_proteins = set()

        for proteinPiana in self.get_proteinPianas_list(list_node_attribute=list_node_attributes):
            partners_of_proteins.update(original_graph.get_node(identifier=proteinPiana).get_neighbour_ids())

        return partners_of_proteins


    def calculate_similarity(self, list_node_attributes1, list_node_attributes2,
                             cluster1_id= None, cluster2_id=None,
                             clustered_graph=None, original_graph=None  ):
        """
        Method that returns similarity score from two lists of attributes of nodes that are being clustered
         ( called from Clustering.cluster_graph() to fill the similarity matrix)

        "list_node_attributes1" is a list of node attributes that belong to cluster1
        "list_node_attributes2" is a list of node attributes that belong to cluster2

        "cluster1_id" is the id for cluster 1 (used to get protein partners of cluster1)
        "cluster2_id" is the id for cluster 2 (used to get protein partners of cluster2)

        "clustered_graph" is the current ClusterGraph (used to get protein partners of clusters)

        "original_graph" is the PianaGraph that originated this clustering

        This method calculates how similar two IR clusters (ie two CIRs) are using the similarity_mode
        established when initializing the class

        Returns:

           - 0 if the partner clusters of cluster1 and cluster2 do not share any protein
           - 0 if the proteins of cluster1 (or of cluster2) have fewer than 2 partners in original_graph
           - otherwise, the mode-dependent term described in __init__ (an integer count for
             'total_num'/'num_ints', a float percentage for the other modes)

        Raises ValueError if similarity_mode is not one of the supported modes (only detected when
        none of the "return 0" conditions above applies)
        """

        # 1. Gate: number of proteins in common between the partners of cluster1 and the partners
        #    of cluster2, obtained by looking at the current clustered graph
        #    --> proteins in the clusters that interact with cluster1 (proteins_in_partner_c1)
        #    --> proteins in the clusters that interact with cluster2 (proteins_in_partner_c2)
        #    --> num_intersection_clusters_partners = len(intersection of both)
        proteins_in_partner_c1 = self._get_proteins_in_partner_clusters(clustered_graph, cluster1_id)
        proteins_in_partner_c2 = self._get_proteins_in_partner_clusters(clustered_graph, cluster2_id)

        num_intersection_clusters_partners = len(proteins_in_partner_c1.intersection(proteins_in_partner_c2))

        self._write_debug("\n======================================================\n"
                          "IN CLUSTER %s, protein partners are %s\n"
                          "IN CLUSTER %s, protein partners are %s\n"
                          "num_intersection_clusters_partners: %s\n",
                          cluster1_id, proteins_in_partner_c1,
                          cluster2_id, proteins_in_partner_c2,
                          num_intersection_clusters_partners)

        if not num_intersection_clusters_partners:
            # the clusters do not interact with any common protein: no need to calculate anything else
            return 0

        # 2. Mode-dependent term, obtained by looking at the original PPI network
        #    --> proteins that interact (in original graph) with proteins in cluster1 (partners_of_proteins_in_c1)
        #    --> proteins that interact (in original graph) with proteins in cluster2 (partners_of_proteins_in_c2)
        #
        # The numbers of partners are kept as floats so that the percentages below use real division

        partners_of_proteins_in_c1 = self._get_partners_of_proteins(original_graph, list_node_attributes1)
        number_partners_proteins_in_c1 = float(len(partners_of_proteins_in_c1))
        if number_partners_proteins_in_c1 < 2:
            # if there is only one partner for this protein (or 0), return similarity 0
            # because we avoid fusing those proteins for which only one interaction has been described
            #   --> this is done to avoid considering a 100% overlap for proteins for which only
            #       one interaction appears in the network
            # (checked before looking at cluster2 to avoid unnecessary graph lookups)
            return 0

        partners_of_proteins_in_c2 = self._get_partners_of_proteins(original_graph, list_node_attributes2)
        number_partners_proteins_in_c2 = float(len(partners_of_proteins_in_c2))
        if number_partners_proteins_in_c2 < 2:
            # same reasoning as for cluster1
            return 0

        num_intersection_protein_partners = len(partners_of_proteins_in_c1.intersection(partners_of_proteins_in_c2))

        self._write_debug("protein partners for proteins in cluster %s are %s\n"
                          "protein partners for proteins in cluster %s are %s\n",
                          cluster1_id, partners_of_proteins_in_c1,
                          cluster2_id, partners_of_proteins_in_c2)

        if self.similarity_mode in self._COUNT_MODES:
            # in this mode, we are just interested in the total number of common partners
            second_term = num_intersection_protein_partners

        elif self.similarity_mode == "min_per":
            # percentage of partners that are common with the other cluster (using the cluster with fewer partners)
            second_term = 100*num_intersection_protein_partners/ min(number_partners_proteins_in_c1, number_partners_proteins_in_c2)

        elif self.similarity_mode == "max_per":
            # percentage of partners that are common with the other cluster (using the cluster with more partners)
            second_term = 100*num_intersection_protein_partners/ max(number_partners_proteins_in_c1, number_partners_proteins_in_c2)

        elif self.similarity_mode == "combined_per":
            # combination of percentages for both clusters
            second_term = (1/2.0) * (100*num_intersection_protein_partners/number_partners_proteins_in_c1 +
                                     100*num_intersection_protein_partners/number_partners_proteins_in_c2)
        else:
            raise ValueError("Invalid value given to similarity_mode argument of the CirClusteringSimilarityFunction\n")

        self._write_debug("second term for similarity_mode %s is: %s\n"
                          "============================================================\n",
                          self.similarity_mode, second_term)

        # if we have reached this point, the gate (first term) was not 0: the similarity is the
        # value of the mode-dependent term
        return second_term
