"""
File        : Clustering.py
Author      : Pablo Boixeda & Ramon Aragues
Creation    : 5.2005
Contents    : Methods to clusterize Graphs using a ClusteringStopCondition object and a ClusteringSimilarityFunction object
Called from : Class to cluster graphs

=======================================================================================================
"""

# Clustering.py: Implements a class that clusterizes nodes of Graph objects
#
# Copyright (C) 2005  Ramon Aragues (University Pompeu Fabra)
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import copy
import sys

from GraphCluster import *
from Clustering import *
from GraphClusterNodeAttribute import *

# Module-level switches for the diagnostics written to sys.stderr (0 = off, 1 = on):
#   verbose         -> one message per edge examined in Clustering._build_cluster_graph()
#   verbose_process -> detailed per-level progress messages of Clustering.cluster_graph()
#   verbose_shallow -> terse per-level progress tokens of Clustering.cluster_graph()
verbose= 0
verbose_process = 0
verbose_shallow = 1

#------------------------------------------------------------------------------------------------
class Clustering(object):
#------------------------------------------------------------------------------------------------
    """
    Implements a class that clusterizes nodes of Graph objects

    Methods to clusterize Graphs using a ClusteringStopCondition object and a ClusteringSimilarityFunction object
    """

    def __init__(self):
        """
        Creates a Clustering object whose node id counter starts at 0
        """
        # next identifier that _get_new_node_id() will hand out
        self.node_id_counter = 0

    def initialize_from_graph(self, graph_to_cluster=None, clustered_graph_generator= None):
        """
        Builds the initial cluster graph from a Graph object

        Every node of "graph_to_cluster" is wrapped into its own cluster node
        (one node attribute per cluster), and the connectivities of the
        original graph are recreated between the corresponding cluster nodes.
        The resulting cluster graph is the one clusterized by cluster_graph()

        "graph_to_cluster": graph to transform into a cluster graph

        "clustered_graph_generator": object that creates the empty cluster graph
                                     (get_new_graph_cluster()) and the attributes
                                     of its nodes (get_new_graph_cluster_node_attribute())

        Returns the graph obtained from clustered_graph_generator.get_new_graph_cluster(),
        populated by _build_cluster_graph()

        Raises ValueError if "graph_to_cluster" or "clustered_graph_generator" is None
        """
        if graph_to_cluster is None:
            raise ValueError("Error:No graph to init clustering Given")

        if clustered_graph_generator is None:
            # fail early with a clear message instead of an AttributeError further down
            raise ValueError("Error:No clustered graph generator to init clustering Given")

        node_list = graph_to_cluster.get_node_object_list()
        graphInit = clustered_graph_generator.get_new_graph_cluster()

        node_dictionary = {}                       # old node id -> new (cluster) node id
        new_node_to_old_node_list_dictionary = {}  # new (cluster) node id -> [old node ids]
        new_node_list = []                         # cluster nodes, in the order of node_list

        # For each node, create a new graph node holding a cluster node attribute
        # that contains the attribute of the original node, and record the
        # correspondence between old and new node ids in both directions
        for node in node_list:
            cluster_node = GraphNode(nodeID=self._get_new_node_id(), graph=graphInit)
            attribute = clustered_graph_generator.get_new_graph_cluster_node_attribute()
            attribute.add_element_list(list_node_attribute_object=[node.get_node_attribute_object()])
            cluster_node.set_attribute(attribute)

            old_node_id = node.get_node_id()
            cluster_node_id = cluster_node.get_node_id()

            node_dictionary[old_node_id] = cluster_node_id
            # setdefault replaces the former has_key() test, which does not exist in Python 3
            new_node_to_old_node_list_dictionary.setdefault(cluster_node_id, []).append(old_node_id)

            new_node_list.append(cluster_node)

        # the old edges are taken as they are from the original graph
        edge_dictionary = graph_to_cluster.get_nodes_connectivities()

        return Clustering._build_cluster_graph(node_list=new_node_list,
                                               old_edges_dictionary=edge_dictionary,
                                               old_node_2_new_node_dictionary=node_dictionary,
                                               new_node_to_old_node_list_dictionary=new_node_to_old_node_list_dictionary,
                                               newGraph=graphInit)

    def _get_new_node_id(self):
        """
        Hands out the next unused node id

        The id returned is the current value of the instance counter, which
        is then advanced by one, so that no two nodes created through this
        object share the same id
        """
        issued_id = self.node_id_counter
        self.node_id_counter = issued_id + 1
        return issued_id




    def _build_cluster_graph(node_list=None, old_edges_dictionary=None,
                             old_node_2_new_node_dictionary=None,
                             new_node_to_old_node_list_dictionary= None, newGraph=None):
        """
        Method that builds a new cluster_graph from:

        "node_list": a list of new nodes to be added to the new cluster_graph

        "old_edge_dictionary": a dictionary that in each position there is a list of nodes that had edge with this node

        "old_node_2_new_node_dictionary": a dictionary that references old nodes_id to new_node_id e.g: old_node_2_new_node[old_node_id]=new_node_id

        "node_to_old_node_list_dictionary": a dictionary that references new nodes_id to a list of old nodes_id that contents inside
                                            eg: node_to_old_node_list_dictionary[new_node_id]=[old_node_id1,old_node_id2...]

        """
        if node_list is None or old_edges_dictionary is None or old_node_2_new_node_dictionary is None:
            raise ValueError("Error: bad use of function Clustering.build_cluster_graph")

        for node in node_list:
            # Graph object require that all nodes are inserted before the edge insertion
            newGraph.add_node(node)

        for node in node_list:

            associated_old_nodes = new_node_to_old_node_list_dictionary[node.get_node_id()]

            for associated_old_node in associated_old_nodes:

                for old_node_to_link in old_edges_dictionary[associated_old_node]:
                    """
                    Here we search each old node edges to make new edges
                    old_node_2_new_node_dictionary conteGGnt the references between new nodes created and old nodes in old graph
                    """
                    # add the edge
                    if verbose:
                        sys.stderr.write("creating new edge betwee node %s and node %s\n" %(node.get_node_id(),
                                                                                            old_node_2_new_node_dictionary[old_node_to_link]) )
                    if node.get_node_id()!=old_node_2_new_node_dictionary[old_node_to_link]:        
                        edge=GraphEdge(node1_id= node.get_node_id(),
                                       node2_id= old_node_2_new_node_dictionary[old_node_to_link],
                                       graph=newGraph)
                        
                        newGraph.add_edge(edge)
                # END OF for old_node_to_link in old_edges_dictionary[associated_old_node]:
            # END OF for associated_old_node in associated_old_nodes:
        # END OF for node in node_list:

        return newGraph
        
    _build_cluster_graph = staticmethod(_build_cluster_graph)


    def print_similarity_matrix(self, output_target, similarity_function, similarity_matrix):
        """
        prints similarity matrix using node ids
        (only prints terms which are different from 0
        """
        similarity_matrix_size = len(similarity_matrix)
            
        output_target.write("similarities different from 0 are:\n")
        for i in range(similarity_matrix_size):
            for j in range(i+1, similarity_matrix_size):

                if similarity_matrix[i][j] != 0:
                    node_id1= similarity_function.get_node_id_by_index(i)
                    node_id2= similarity_function.get_node_id_by_index(j)
                    
                    output_target.write("similarity between %s and %s: %.2f\n" %(node_id1, node_id2, similarity_matrix[i][j]))

                    

    def _create_results_files(self, current_graph, output_dir, output_prefix, label, similarity_function, similarity_matrix, max_similarity, root_protein):
        """
        Creates the three files with results (DOT file, clusters composition and clusters interactions)

        "current_graph" is the cluster graph to print out
        
        "output_dir": directory where results files will be created (must end with slash eg ./)
        "output_prefix" is the prefix that will be given to files

        "label" is the specific label that will distinguish these result files from others generated in this clustering (eg. level)
        
        """
        this_level_dot_file_name= "%s%s_%s.dot" %(output_dir, output_prefix, label)
        this_level_dot_file_fd = file(this_level_dot_file_name, "w")
        current_graph.output_dot_file(output_target= this_level_dot_file_fd, use_alternative_id="no")
        this_level_dot_file_fd.close()

        this_level_composition_file_name =  "%s%s_%s.cluster_composition" %(output_dir, output_prefix, label)
        this_level_composition_file_fd = file( this_level_composition_file_name , "w")
        current_graph.print_cluster_composition(output_target= this_level_composition_file_fd)
        this_level_composition_file_fd.close()

        this_level_ints_file_name = "%s%s_%s.cluster_ints" %(output_dir, output_prefix, label)
        this_level_ints_file_fd = file( this_level_ints_file_name, "w")
        current_graph.print_cluster_interactions(output_target= this_level_ints_file_fd)
        this_level_ints_file_fd.close()

        #similarity_file_name = "%s%s_%s.sim_matrix" %(output_dir, output_prefix, label)
        #similarity_file_fd = file( similarity_file_name, "w")
        #self.print_similarity_matrix(output_target=similarity_file_fd, similarity_function=similarity_function, similarity_matrix=similarity_matrix)
        #similarity_file_fd.close()
        
        results_file_name = "%s%s_%s.results" %(output_dir, output_prefix, label)
        results_file_fd = file(results_file_name , "w")

        results_file_fd.write("description\tprotein=%s\tlabel=%s\tsimilarity=%s\n" %(root_protein, label, max_similarity))
        current_graph.print_pairs_same_cluster(output_target=results_file_fd, root_protein=root_protein) # method from CirGraphCluster
        current_graph.print_pairs_interactions(output_target=results_file_fd, root_protein=root_protein) # method from CirGraphCluster
        results_file_fd.close()




    def cluster_graph(self, graph_to_cluster=None, clustered_graph_generator=None,
                      similarity_function=None, stop_condition=None,
                      clustering_print_mode=None, output_dir=None, output_prefix=None, root_protein=None, 
                      original_graph = None, call_do_action= 0):
        """
        Method that clusters a given Graph graph_to_cluster

        Returns the clustered graph at the point where the stop condition has been met
        
        "graph to cluster": a graph witch we want to cluster

        "clustered_graph_generator" is a Class that returns an empty GraphCluster (or a child class) when called with get_new_graph_cluster()
           -> this allows the user to control which kind of GraphCluster is going to use

        "similarity_function": a ClusteringSimilarityFunction object

        "stop_condition": a ClusteringStopCondition object

        "clustering_print_mode" is used to tell this method which results have to be printed out

           - "all"  : will print to files with output_prefix the network, clusters composition and their interactions at all levels
           - None: won't print anything to output_prefix files
           - "final": will print to files with output_prefix  the clusters composition and their interactions at the final level
                     (ie. when stop condition met)

        "output_dir": directory where results files will be created (must end with slash eg ./)
        "output_prefix": prefix that will be appended to results files
        "root_protein": when training, this is the protein for which the results are being generated
        
        "original_graph" is a Graph (might be of use for some similarity functions: set it to None if not useful)

        if "call_do_action" is 1, then the method do_action() of the GraphCluster being used is called
        """

        if graph_to_cluster is None or similarity_function is None or stop_condition is None:
            raise ValueError("Error: Impossible to cluster, there isn't similarity function")

        # initialising GraphCluster (will be of the type generated by clustered_graph_generator)
        current_cluster_graph= self.initialize_from_graph(graph_to_cluster= graph_to_cluster,
                                                          clustered_graph_generator= clustered_graph_generator)

        clustering_level = 0
        while 1:
            # while nothing breaks the loop (with a return), do clustering
            if verbose_shallow:
                sys.stderr.write("%s." %(clustering_level))
                
            if verbose_process:
                sys.stderr.write("=====================================\n")
                sys.stderr.write("========NEW CLUSTERING LEVEL %s=========\n" %(clustering_level))
                sys.stderr.write("=====================================\n")
            
            next_level_graph = clustered_graph_generator.get_new_graph_cluster() # initialize the next level graph cluster

            
            if verbose_process:
                sys.stderr.write("calculating similarity matrix\n")
            
            if verbose_shallow:
                sys.stderr.write("s.")
                
            similarity_matrix= similarity_function.get_similarity_matrix(cluster_graph= current_cluster_graph,
                                                                         dbaccess= None,
                                                                         original_graph= original_graph,
                                                                         root_protein= root_protein)    # TO CHECK! why is dbaccess here?



            if verbose_process:
                sys.stderr.write("max value in similarity matrix= %s\n" %(similarity_function.get_max_value()))

            if verbose_shallow:
                sys.stderr.write("m=%.0f." %(similarity_function.get_max_value()))
            
            if clustering_print_mode == "all":
                self._create_results_files(current_graph= current_cluster_graph,
                                           output_dir= output_dir,
                                           output_prefix= output_prefix,
                                           label= clustering_level,
                                           similarity_function= similarity_function,
                                           similarity_matrix=similarity_matrix,
                                           max_similarity=similarity_function.get_max_value(),
                                           root_protein = root_protein)

            if stop_condition.condition_is_met():
                # when stop condition met, stop clustering and return clustered graph
                if verbose_process:
                    sys.stderr.write("STOP CONDITION MET with max value %s\n" %(similarity_function.get_max_value()))

                if clustering_print_mode == "final":
                    self._create_results_files(current_graph=current_cluster_graph,
                                               output_dir= output_dir,
                                               output_prefix=output_prefix,
                                               label="final",
                                               similarity_function= similarity_function,
                                               similarity_matrix=similarity_matrix,
                                               max_similarity=similarity_function.get_max_value(),
                                               root_protein = root_protein )

                    
                return current_cluster_graph
            # END OF if stop_condition.condition_is_met():

            # if stop condition not met, continue with clustering:
                
            next_level_nodes= []   # keeps nodes that have to be added to next level cluster graph
            
            used_nodes={}   # keeps track of nodes that have already been clustered
                            #  -> used to make sure that a node is not clustered into
                            #     two different clusters
                            
            old_node_2_new_node_dic={} # keeps the relationship between new nodes and old nodes
                                       # old_node_2_new_node_dic[old_node_id]=new_node_id
                                       
            new_node_to_old_node_dic={} # keeps the relationship between new nodes and old nodes
                                        # new_node_to_old_node_dic[new_node_id]=[old_node1,old_node2...]
                            

            similarity_matrix_size = len(similarity_matrix)
            
            if verbose_shallow:
                sys.stderr.write("n.len=%s." %(similarity_matrix_size))


            clustered_nodes_dic = {}  # dic that keeps as keys the current node ids and as values the
                                      # cluster object for next level graph
                                      #   -> used to add several clusters into the same one of the new level
                                      #      All clusters with the max_value that are transitive between
                                      #      them (ie. all of them have max_value in between) are clustered
                                      #      into the same cluster in just one clustering step
                                      
            # create the clusters with the previous clusters that had maximal similarity
            for pair_i_j in similarity_function.get_positions_max_value():

                i = pair_i_j[0]
                j = pair_i_j[1]

                # before: if used_nodes.has_key(j) or used_nodes.has_key(i): continue


                # new from here to...
                
                if used_nodes.has_key(j) and used_nodes.has_key(i):
                    # dont cluster nodes when both have already been clustered
                    continue
                
                elif used_nodes.has_key(j) or used_nodes.has_key(i):
                    # one of the two nodes was already clustered: place the other one on that cluster

                    if used_nodes.has_key(j):
                        used_nodes[i] = None
                        to_add_node_id =  similarity_function.get_node_id_by_index(i)
                        to_add_index = i
                        already_added_index = j
                    else:
                        # ie. used node is 'i'
                        used_nodes[j] = None
                        to_add_node_id =  similarity_function.get_node_id_by_index(j)
                        to_add_index = j
                        already_added_index = i
                    # END OF else: (if used_nodes.has_key(j):)
                        

                    # get the cluster where already_added_node_id was placed...
                    previous_cluster= clustered_nodes_dic[already_added_index]

                    clustered_nodes_dic[to_add_index] = previous_cluster  # update the dic with clusters for this index_to_add
                    
                    # get the list of elements that have to be added to previous_cluster
                    to_add_node_list_elements = current_cluster_graph.get_node(to_add_node_id).get_node_attribute_object().get_list_elements()
                    
                    # add to previous_cluster the elements of to_add_node_id
                    previous_cluster.get_node_attribute_object().add_element_list(to_add_node_list_elements)
                    previous_cluster_id = previous_cluster.get_node_id()
                    old_node_2_new_node_dic[to_add_node_id]= previous_cluster_id
                    new_node_to_old_node_dic[previous_cluster_id].append(to_add_node_id)
                    continue
                # END OF elif used_nodes.has_key(j) or used_nodes.has_key(i):

                # ... here
                
                # both elements are new: create a new cluster and update list of nodes correspondences
                used_nodes[i]= None
                used_nodes[j]= None

                node_id1= similarity_function.get_node_id_by_index(i)
                node_id2= similarity_function.get_node_id_by_index(j)

                clustered_node= next_level_graph.create_grouped_node(node_id1= node_id1,
                                                                     node_id2= node_id2,
                                                                     new_node_id= self._get_new_node_id(),
                                                                     old_graph= current_cluster_graph)

                next_level_nodes.append(clustered_node)
                
                clustered_nodes_dic[i] = clustered_node 
                clustered_nodes_dic[j] = clustered_node 
                

                new_cluster_id=clustered_node.get_node_id()
                old_node_2_new_node_dic[node_id1]= new_cluster_id
                old_node_2_new_node_dic[node_id2]= new_cluster_id

                new_node_to_old_node_dic[new_cluster_id]= [node_id1, node_id2]
            # END OF for pair_i_j in similarity_function.get_positions_max_value():

            if not next_level_nodes:
                # if there are no nodes to be clustered, then we exit the clustering
                if verbose_process:
                    sys.stderr.write( "NO MORE NODES TO CLUSTER: exit clustering\n")
                return current_cluster_graph
                
            aux_dictionary=similarity_function.get_index_to_node_id()

            if verbose_shallow:
                sys.stderr.write("a.")
            # ---
            # Adding nodes that were not "clustered" to the nodes of the next level
            # ---
            for i in range(similarity_matrix_size):
                if not used_nodes.has_key(i):
                    node_id= aux_dictionary[i]

                    node= current_cluster_graph.get_node(node_id)
                    
                    new_node_id = self._get_new_node_id()
                    
                    old_node_2_new_node_dic[node_id]= new_node_id
                    new_node_to_old_node_dic[new_node_id]=[node_id]
                     
                    newNode=GraphNode(nodeID=new_node_id)
                    newNodeAttribute=clustered_graph_generator.get_new_graph_cluster_node_attribute()
                    newNodeAttribute.add_element_list(list_node_attribute_object=node.get_node_attribute_object().get_list_elements())
                    newNode.set_attribute(newNodeAttribute)
                    
                    next_level_nodes.append(newNode)
            #END OF for i in range(similarity_matrix_size):

            if verbose_shallow:
                sys.stderr.write("x.")
                
            next_level_graph= Clustering._build_cluster_graph(node_list= next_level_nodes,
                                                              old_edges_dictionary= current_cluster_graph.get_nodes_connectivities(),
                                                              old_node_2_new_node_dictionary=old_node_2_new_node_dic,
                                                              new_node_to_old_node_list_dictionary= new_node_to_old_node_dic,
                                                              newGraph=next_level_graph)

            if call_do_action:
                next_level_graph.do_action() # this is a generic method that can be used by particular GraphCluster subclasses to do
                                             # something after the clustering of this level.
                                             # In those GraphClusters that nothing has to be done
                                             # there is a method call do_action() that doesn't do anything
            
            if verbose_shallow:
                sys.stderr.write("e--")
                
            current_cluster_graph = next_level_graph
            clustering_level += 1

        # END OF while 1:  (the process never goes out of this loop.... it will always break the loop with a return)
    

