"""
File        : ClusteringSimilarityFunction.py
Author      : Pablo Boixeda & Ramon Aragues
Creation    : 4.2005
Contents    : implements the template for a clustering similarity function
Called from : Clustering.py

=======================================================================================================

This class implements the template that has to be followed by clustering similarity functions.

A similarity function determines how similar two clusters are.
"""

# ClusteringSimilarityFunction.py: implements the template for a clustering similarity function
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

from Graph import *
from GraphCluster import *

# Module-level debug switches: 0 disables, any true value enables.
verbose = 0           # when true, get_similarity_matrix() finishes by writing a newline to stderr
verbose_detailed = 0  # not read by any code visible in this file
verbose_shallow = 0   # when true, get_similarity_matrix() writes "<row>of<total>--" progress to stderr

#------------------------------------------------------------------------------------------------
class ClusteringSimilarityFunction(object):
    """
    Template that has to be followed by clustering similarity functions.

    A similarity function determines how similar two clusters are.

    This base class builds the similarity matrix of a cluster graph (see
    get_similarity_matrix) and remembers the highest score found and where it
    was found. The score for a pair of clusters is produced by
    calculate_similarity(), which has to be overridden by the particular
    clustering being implemented.
    """

    def __init__(self, dbaccess= None, mode= None, path_length_threshold_value = None):
        """
        "dbaccess" is a database accession object used to access information from a database, in case the similarity
        function needs information that is not contained in the node itself. If not being used, just set to None

        "mode" defines how to evaluate the distance between two clusters:
          - random takes a random element from each cluster and evaluates similarity between them
          - min takes the minimal distance between elements of each cluster
          - max takes the maximal distance between elements of each cluster
          - average takes the average distance between all elements of each cluster
          (this base class only stores the value; it is up to the overriding class to use it)

        "path_length_threshold_value" is the maximum distance allowed between two clusters in order to cluster them

             --> valid values are positive integers and -1 (do not set any restriction)

                 --> for example, if we want to fuse clusters only when they are directly connected, this argument
                     should be set to 1
                 --> in clusterings where we do not want to restrict clustering to fuse clusters that are connected,
                     set this argument to -1
        """
        self.similarity_matrix = None
        self.max_value = None
        self.index_to_node_id = {}      # matrix index -> node id of the cluster graph

        self.positions_max_value = []   # a list of tuples (i,j) that had max_value

        self.mode = mode
        self.graph = None
        self.dbaccess = dbaccess

        self.path_length_threshold = path_length_threshold_value

        # Dictionary that keeps similarities already calculated, so that they do not
        # need to be recalculated at each step.
        # The keys are formed by the elements in each cluster, as returned by _get_pair_key():
        #     key = "elem1_c1.elem2_c1.elem3_c1....-elem1_c2.elem2_c2.elem3_c2...."
        # The content is the similarity value.
        self.similarities_already_calculated = {}

    def _get_pair_key(self, list_elems_c1, list_elems_c2):
        """
        Returns the key to be used on dictionary self.similarities_already_calculated

        "list_elems_c1" is the list of elements of the first cluster (each one must implement get_node_id())

        "list_elems_c2" is the list of elements of the second cluster (each one must implement get_node_id())

        The key has the format "id1_c1.id2_c1....-id1_c2.id2_c2....": ids of each cluster joined by "."
        and both clusters separated by "-".

        Attention: the key depends on the order of the arguments and on the order of the elements in each list.
                   An empty list contributes an empty string to the key.
        """
        ids_c1 = ".".join(["%s" %elem.get_node_id() for elem in list_elems_c1])
        ids_c2 = ".".join(["%s" %elem.get_node_id() for elem in list_elems_c2])

        return "%s-%s" %(ids_c1, ids_c2)

    def get_proteinPianas_dic(self, list_node_attribute=None):
        """
        Method that returns a dictionary with the proteinPianas found in a list of attributes

        "list_node_attribute": list of attributes from the original nodes of the graph we are clustering
                               (each one must implement get_proteinPiana()). None is treated as an empty list.

        Returns a dictionary whose keys are the proteinPianas and whose values are all None
        (the dictionary is only used for membership tests).
        """
        if list_node_attribute is None:
            return {}

        return dict.fromkeys([node_attribute.get_proteinPiana() for node_attribute in list_node_attribute])

    def _update_max_value(self, i, j):
        """
        Updates self.max_value and self.positions_max_value with the value in self.similarity_matrix[i][j]

        - a value higher than the current maximum becomes the new maximum and (i, j) the only position holding it
        - a value equal to the current maximum just adds (i, j) to the list of positions
        - a lower value changes nothing
        """
        current_value = self.similarity_matrix[i][j]

        if self.max_value < current_value:
            self.max_value = current_value
            self.positions_max_value = [(i, j)]
        elif self.max_value == current_value:
            self.positions_max_value.append((i, j))

    def get_similarity_matrix(self, cluster_graph, dbaccess= None, original_graph=None, root_protein=None):
        """
        Method that returns similarity matrix from a given cluster graph "cluster_graph"

        Each position in the matrix indicates how similar two clusters are. Only positions (i, j) with j > i are
        calculated: the rest of the matrix is left to 0.

        "cluster_graph" is the graph whose nodes are the clusters to compare. Raises ValueError if it is None.

        "dbaccess" is not used by this method (TO CHECK! why is dbaccess here?)

        "original_graph" is passed untouched to calculate_similarity()

        If "root_protein" is different from None, then the similarity matrix will only contain values (ie will only
        have values != 0) in those positions where the root protein is involved (ie. similarities between clusters
        that contain the root protein)

        Besides returning the matrix, this method:
          - keeps the maximum score found in the matrix in self.max_value (starting from 0)
          - keeps the positions (i, j) holding that maximum in self.positions_max_value
          - keeps the correspondences between matrix indexes and node ids in self.index_to_node_id
          - keeps calculated similarities in self.similarities_already_calculated, to reuse them in later calls
        """
        # TO DO! This is the general
        #        clustering similarity class: we cannot have
        #        references to proteins here!
        #        Move everything related to proteins
        #        to the classes where methods
        #        are particular to proteins...

        if cluster_graph is None:
            raise ValueError("Error: there isn't a Graph")

        self.graph = cluster_graph

        # Start every call from a clean state: results of a previous (possibly bigger) matrix must not leak
        # into this one. The index dictionary is cleared in place so references handed out by
        # get_index_to_node_id() keep pointing to the live dictionary.
        self.max_value = 0
        self.positions_max_value = []
        self.index_to_node_id.clear()

        node_list = cluster_graph.get_node_object_list()
        number_nodes = len(node_list)
        self.similarity_matrix = numarray.zeros( ( number_nodes, number_nodes), numarray.Float32)

        # -1 means that no distance restriction is set on which clusters can be fused
        # NOTE(review): with the default threshold (None) the restriction is considered active and None is then
        #               compared against the distances below -- confirm that callers always set a valid value
        restrict_by_distance = self.path_length_threshold != -1

        for i in range(number_nodes):

            if verbose_shallow:
                sys.stderr.write("%sof%s--" %(i, number_nodes))

            node_1 = node_list[i]
            node_1_id = node_1.get_node_id()
            self.index_to_node_id[i] = node_1_id
            attributes_list1 = node_1.get_node_attribute_object().get_list_elements()

            distances_dic = None
            if restrict_by_distance:
                # distances_dic is a dictionary for node_1 that has the other node ids as keys, and distances
                # as values (these distances refer to the number of edges between two nodes)
                distances_dic = self.graph.get_distances(node_1_id)

            # whether the root protein is in cluster 1 does not depend on j: find it out only once per row
            root_in_cluster1 = False
            if root_protein:
                root_in_cluster1 = root_protein in self.get_proteinPianas_dic(list_node_attribute=attributes_list1)

            for j in range(i+1, number_nodes):
                node_2 = node_list[j]
                node_2_id = node_2.get_node_id()

                if restrict_by_distance:
                    distance_to_node = distances_dic[node_2_id]

                    # NOTE(review): a distance equal to the threshold is rejected as well, while the constructor
                    #               describes the threshold as the maximum distance allowed -- confirm against
                    #               the values returned by get_distances()
                    if distance_to_node and self.path_length_threshold <= distance_to_node:
                        # if restriction is not respected, similarity is 0
                        self.similarity_matrix[i][j] = 0
                        continue

                attributes_list2 = node_2.get_node_attribute_object().get_list_elements()

                if root_protein and not root_in_cluster1:
                    # we must only calculate scores for those positions where the root protein is involved
                    if root_protein not in self.get_proteinPianas_dic(list_node_attribute=attributes_list2):
                        # the root protein does not appear in any of the clusters: leave similarity = 0
                        continue

                # get score for this position
                #  --> in most clusterings, the attributes (ie the original nodes of the graph to cluster)
                #      are enough to calculate the similarity between two clusters. But in some clusterings
                #      (eg. CIR) we need to know as well the configuration of the PPI network and the
                #      configuration of the current cluster graph (eg. the partners of the clusters)
                #
                # Attention: remember that the PPI network is not necessarily the graph to cluster: for example,
                #            when clustering GO, the graph to cluster is the graph of GO terms. The PPI network
                #            is the protein interaction network that was used to generate that first GO graph that
                #            is going to be used afterwards to do the clustering.

                the_key = self._get_pair_key(list_elems_c1= attributes_list1, list_elems_c2= attributes_list2)

                if the_key in self.similarities_already_calculated:
                    # if it had already been calculated, take it from the dictionary
                    self.similarity_matrix[i][j] = self.similarities_already_calculated[the_key]
                else:
                    # if it is a new comparison, find the similarity
                    self.similarity_matrix[i][j] = self.calculate_similarity(list_node_attributes1= attributes_list1,
                                                                             list_node_attributes2= attributes_list2,
                                                                             cluster1_id= node_1_id,
                                                                             cluster2_id= node_2_id,
                                                                             clustered_graph = cluster_graph,
                                                                             original_graph = original_graph)

                    if self.similarity_matrix[i][j] != 0 or root_protein:
                        # in order to save memory, save the similarity only if different from 0
                        # except in mode root_protein, where memory explosion is not a problem
                        self.similarities_already_calculated[the_key] = self.similarity_matrix[i][j]

                self._update_max_value(i, j)
            # END OF for j in range(i+1, number_nodes):
        # END OF for i in range(number_nodes):

        if verbose:
            sys.stderr.write("\n")

        return self.similarity_matrix

    def get_max_value(self):
        """
        Method that returns the max value found in the similarity matrix
        (None if get_similarity_matrix() has not been called yet)
        """
        return self.max_value

    def get_positions_max_value(self):
        """
        Method that returns pairs (i, j) with matrix positions where a max_value was found
        """
        return self.positions_max_value

    def get_index_to_node_id(self):
        """
        Method that returns the correspondence between
        matrix indexes and GraphNode ids
        """
        return self.index_to_node_id

    def get_node_id_by_index(self,number):
        """
        Method that returns the node_id that corresponds to a given matrix index

        Raises KeyError if the index is not in the correspondence dictionary
        """
        return self.index_to_node_id[number]

    def calculate_similarity(self, list_node_attributes1, list_node_attributes2,
                             cluster1_id= None, cluster2_id=None, clustered_graph=None, original_graph=None):
        """
        Method that returns similarity score from two lists of attributes of nodes that are being clustered

        "list_node_attributes1" is a list of node attributes that belong to the same cluster

        "list_node_attributes2" is a list of node attributes that belong to the same cluster

        "cluster1_id" is the id for cluster 1 (might be of use for some similarity functions: set it to None if not useful)
        "cluster2_id" is the id for cluster 2 (might be of use for some similarity functions: set it to None if not useful)

        "clustered_graph" is the current ClusterGraph (might be of use for some similarity functions: set it to None if not useful)

        "original_graph" is a Graph (might be of use for some similarity functions: set it to None if not useful)

        This method calculates how similar two clusters are

        Attention: this is the method that has to be overridden by the specific clustering you are implementing.
                   The template implementation does nothing and returns None.
        """
        pass  # to be overwritten by the particular similarity function of the clustering being implemented
