"""
File        : PatchDecomposition.py
Author      : Ramon Aragues
Creation    : 09.2004
Contents    : implementation of PatchDecomposition.py
Called from : programs/classes that handle a PatchDecomposition
Subclass    : 

=======================================================================================================

This file implements class PatchDecomposition

PatchDecomposition handles the process of going from a PianaGraph to a GraphPatchGroup

It has several modules that can be applied to use different techniques to achieve the decomposition of proteins into patches

"""
import sys
from numarray import *
import cPickle


from PianaGraph import *
from GraphPatchGroup import *
from GraphPatchGroupNodeAttribute import *
from GraphPatchGroupEdgeAttribute import *
from Patch import *

import ScoringFunctionBenchmark

import PianaGlobals


# Size limit for GraphPatchGroup objects. In training mode, decompose() compares the
# get_size() of each newly created level graph against this limit and, when it is
# exceeded, records the protein as skipped and stops clustering for it.
MAX_SIZE_ALLOWED = 120000   # max size allowed for GraphPatchGroup objects (it was previously 150000);
                            # used to skip proteins whose networks are too big (ie. cases for which there isn't enough memory)

# Verbosity switches (1 enables, 0 disables). The descriptions below reflect how the flags
# are used in decompose(); other parts of the file may consult them as well.
verbose_process = 1             # general progress messages written to stderr
verbose_process_detailed = 1    # extra per-step messages (new clustering step, writing comparison file, saving graph)
verbose_print_matrix = 0        # print the clustering scores array, the positions to be clustered and the new level graph
verbose_show_patches = 0        # print the level 0 decomposition, clusters and patchgroup interactions after initialisation
verbose_comparison = 1          # in patch_mode "train", announce the level whose comparison results are being produced
verbose_clustered_elements = 0  # list each pair of patchgroups to be clustered together with their proteins

#----------------------------------------------------------------------------------------------
class PatchDecomposition:
#----------------------------------------------------------------------------------------------
    """
    This class is used to decompose proteins of a network into their functional sites (ie. patches)

        Note: a bit of terminology used in PatchDecomposition...

            1. Patch is a class that describes one of the initial patches in the protein, from the first decomposition
            2. Patchgroup is a class that basically contains Patch objects that are similar according to scoring function
            3. The graph nodes attributes are patchgroup objects. Even the first network! In this first network, each patchgroup will
               only have one patch.

    """
    def __init__(self, piana_graph = None, clustering_mode = None, scoring_function= None, clustering_steps = None, score_threshold= 1,
                 save_mode = "memory", disc_name= None, gold_std= None, comparison_results_file_name = None, stop_condition= None):
        
        """
        Initialize the PatchDecomposition with values that will determine how the decomposition is made


        "piana_graph" is the graph that will be used to decompose proteins into patches

        "clustering_mode" can be: (NOT WORKING! Only one mode implemented!!!)

            - greedy clustering

        "scoring_function" is a object that implements the calculation of how similar two patches are. See class ScoringFunction

        "clustering_steps" sets a maximum of clustering steps that can be performed
           TO DO!!! Introduce some kind of infinite number till condition stop met

        "score_threshold" sets the minimum score value (default is 1) required to cluster two patches

        "save_mode" determines whether the patchgroup_graphs of the different clustering levels are saved on disc or in memory
        
          Valid values are:
           - "memory": all the patchgroup graphs will be saved in memory
           - "disc"  : all (except the current level graph ) the patchgroup graphs will be saved on disc

        "disc_name" determines the file name path prefix used for saving to disc the patchgroup_graphs for clustering levels
           - for example, if you want your temporary graphs to be saved under directory /home/pepito/graphs, and their names be temp_graph,
             disc_name must be /home/pepito/graphs/temp_graph (and graphs will be saved to temp_graph_1, temp_graph_2, etc)
           - if save_mode is memory, this argument is ignored
           
        "gold_std" is used in benchmarking for setting the file name that holds the "gold standard" of protein decomposition
           - it must be None if not doing benchmarking
           - the file contains patchgroup graph result from dumping an object of class GraphPatchGroup

        "comparison_results_file_name" is the file name where comparison results will be saved. The first part (separated by .) of the name must be the protein code
           -> for example, path_to_directory_with_comp_files/11111.whatever_sufix_you_want_to_add

        "stop_condition" has format type_of_stop_condition=value and it is used to stop the clustering at the optimal level

           -> currently, types of stop conditions are:
               - 'score_clusters': ratio between the max score in the similarity matrix and the number of clusters in the clustering level
               - 'score_links': ratio between the max score in the similarity matrix and the number of links (ie interactions) in the clustering level
               - 'clusters_links': ratio between the max score in the similarity matrix and the number of links (ie interactions) in the clustering level
               - 'max_score_range': range of the max score in the similarity matrix  in the clustering level

           -> values of ratios stop conditions are ranges obtained from utilities.get_ratio_group(): current groups for ratios used are:
                      "-0-" "0-01" "01-02" "02-03" "03-04" "04-05" "05-06" "06-07" "07-08" "08-09" "09-1" "1-2" "2-3"
                      "3-4" "4-5" "5-6" "6-7" "7-8" "8-9" "9-10" "10-20" "20-30" "huge"
                      
           -> values of max score range stop conditions are ranges obtained from utilities.get_range_group(): current groups for max score ranges used are:
                      "1-10" "20-30" "30-40" "40-50" "50-60" "60-70" "70-80" "80-90" "90-100" "100-110" "110-120" "120-130" "130-140" "140-150"
                      "150-160" "160-170" "170-180" "180-190" "190-200" "200-250" "250-300" "huge" 
        """

        if piana_graph is None:
            raise ValueError("piana_graph is needed to decompose proteins into patches")
        
        if clustering_steps is None:
            raise ValueError("clustering_steps is needed to decompose proteins into patches")
        
        
        if scoring_function is None:
            raise ValueError("scoring_function is needed in order to decompose proteins into patches")
        
        self.piana_graph = piana_graph
        self.scoring_function = scoring_function
        self.clustering_steps = clustering_steps
        self.score_threshold = score_threshold
        self.save_mode = save_mode
        self.disc_name = disc_name

        if stop_condition is not None:
            temp = stop_condition.split("=")
            self.stop_condition_type= temp[0]
            self.stop_condition_value= temp[1]
        else:
            self.stop_condition_type= None
            self.stop_condition_value=  None
            

        if gold_std is not None:
            self.gold_std_patchgroup_graph = utilities.get_patchgroup_graph_from_file_name(file_name= gold_std)
        else:
            self.gold_std_patchgroup_graph = None

        self.comparison_results_file_name = comparison_results_file_name
            


        self.patchgroup_graphs_levels = {}   # dictionary with keys "level number" and content the patchgroup_graph for that level
                                             # only used in save_mode "memory"

        self.patchgroup_graph_filenames = {} # dictionary with keys "level number" and content the file name with pickle data for that level patchgroup_graph
                                             # only used in save_mode "disc"

        self.current_patchgroup_graph = None  # in save_mode "disc", will hold the current patchgroup_graph

        self.current_level_number = 0   # holds the current level of clustering
        self.patch_id_counter = 0       # holds a new patch id 
        self.patchgroup_id_counter = 0  # holds a new patchgroup id

        if self.save_mode != "memory" and self.save_mode != "disc":
            raise ValueError("save_mode must be 'memory' or 'disc' (instead of wrong value %s)\n" %(save_mode))

        
    def decompose(self, comparison_mode= None, root_protein= None, similar_proteins_dic={}, hub_threshold=0, patch_mode= None):
        """

        Method that performs the decomposition of proteins in patches, using parameters fixed when initializing the class

        Returns 1 if the stop condition is met before the end of the clustering, 0 otherwise.

        "comparison_mode" can be:

           - 'standard': all against all, we are considering that both patchgroup graphs should be identical
           - 'training': If other_patchgroup_graph (ie. the gold standard) has no info about a protein, it is ignored (ie. no differences computed).
                          -> if there is no info about any of the proteins in other_patch_group_graph (ie. the gold standard) then this function returns None
           - 'root':     Only checking composition of root protein.
                          -> If there is no info of the root protein in other_patch_group_graph (ie. the gold standard) then this function returns None
                          -> if comparison_mode is set to root, then parameter "root_protein" is required
                          -> parameter "similar_proteins_dic" is used to avoid using redundant proteins when calculating statistics

        "root_protein" sets which is the root protein that generated the patchgroup graph. Only used when comparison_mode == "root"


        "similar_proteins_dic" is required for comparison_mode root: it is a list of proteins that are similar to root_protein (ie. are the same protein
                                                                                                                                with different sequence)

        "hub_threshold" is just used to print out in the comparison file the threshold used for building the graph. This will be useful info
        when analyzing which threshold did find an answer...

        "patch_mode" is used to set what is the decomposition being done for
             - train: creating comparison files used by ScoringFunctionBenchmark (needs a gold std and benchmark conf file)
             - eval: finding patches and evaluating how good the results are (needs a gold std and benchmark conf file) (ie. prints accuracy file) 
             - exec: finding patches and printing decomposition for the proteins (ie. prints clustering results file)

        
        GENERAL PATCHES DIVISION ALGORITHM
        ----------------------------------
        
        input: the protein-protein interaction network
        
        output: a domain-domain interaction network
        
        
        1. subdivide each protein into as many patches as interactions where it is involved
        2. create a patch-patch interaction network (it is actually a GraphPatchGroupNodeAttribute-GraphPatchGroupNodeAttribute network)
            --> if protein A interacts with protein B then: each patch "ai" of a protein "A" interacts with all patches (b1, ..., bn) of proteins B
        3. calculate similarity scores between each patch
        4. cluster patches with highest similarity scores


        
        """
        stop_condition_met = 0    # this will be set to 1 when the stop condition is met.
        
        if verbose_process:   sys.stderr.write("Initialising graph patchgroup\n")

        # 1. subdivide each protein into as many patches as interactions where it is involved
        # 2  create a patch-patch interaction network (it is actually a GraphPatchGroupNodeAttribute-GraphPatchGroupNodeAttribute network)
        initial_patchgroup_graph = self.initialize_graph_patch_group()

        if self.save_mode == "memory":
            self.patchgroup_graphs_levels[0] = initial_patchgroup_graph
        else:
            self.current_patchgroup_graph  = initial_patchgroup_graph


        if verbose_show_patches:
            sys.stderr.write("---------------------INIT---------------------------------------\n")
            self.print_protein_decomposition_from_level_number(level_number= 0, output_target= sys.stderr,
                                             protein_type_name= "proteinPiana" , alternative_type_names=[] )
            sys.stderr.write("\n")
            self.print_proteins_in_clusters_from_level_number(level_number= 0,output_target= sys.stderr,
                                             protein_type_name= "proteinPiana" , alternative_type_names=[] )
            sys.stderr.write("\n")
            self.print_patchgroups_interactions_from_level_number(level_number= 0, output_target= sys.stderr)
            sys.stderr.write("---------------------END OF INIT---------------------------------------\n")


        for i in range(self.clustering_steps):
            # the algorithm will do clustering steps till reaching the number set by the user (unless stop_condition is met before (and patch_mode != train))


            if verbose_process_detailed: sys.stderr.write("-------\nNew clustering step: %s: " %(i))
            
            current_patchgroup_graph = self.get_patchgroup_graph_from_level_number(level_number= self.current_level_number)
            current_patchgroup_ids = current_patchgroup_graph.get_node_ids_list()
            number_of_current_patchgroup_ids = len(current_patchgroup_ids)
            
            # ---
            # 3. calculate similarity scores between each patch
            # ---

            # calculating arrays that will be used by scoring function:
            #
            # These arrays count, for each pair of patchgroups, the following statistics:
            #
            # array_interacting_patchgroups_in_common:
            # array_interacting_proteins_in_common:
            # array_belong_to_same_protein:
            
            if verbose_process:   sys.stderr.write("calculating arrays...")
            
            (array_interacting_patchgroups_in_common, array_interacting_proteins_in_common, array_belong_to_same_protein) = self.get_arrays_scores(
                patchgroup_graph= current_patchgroup_graph,
                all_patchgroup_ids= current_patchgroup_ids,
                number_of_patchgroup_ids=number_of_current_patchgroup_ids)
            
            # calculate clustering scores using the scoring function chosen by user
            if verbose_process:   sys.stderr.write("Calculating clustering scores...")
            array_clustering_scores = self.scoring_function.get_scores_array( array_interacting_patchgroups_shared = array_interacting_patchgroups_in_common,
                                                                              array_interacting_proteinPianas_shared = array_interacting_proteins_in_common,
                                                                              array_belong_to_same_protein = array_belong_to_same_protein)
            if verbose_print_matrix:   print array_clustering_scores

            # ---
            # 4. cluster patches with highest similarity scores
            # ---
            """
            At this point, we have the scoring matrix telling us how similar two patchgroups are.

            Get the patchgroups that have to be clustered by retrieving the highest scores from the matrix
            """
            
            
            if verbose_process:   sys.stderr.write("Getting matrix positions to be clustered: ")

            # get the indices of score matrix  that hold maximum values (ie. the ones that are to be clustered)
            #  4.1. get max value in matrix
            #  4.2. get positions in matrix with max value by setting to 0 all those different from max value, and then retrieving those different from 0
            positions_to_be_clustered = []

            if len(array_clustering_scores) > 0:
                max_score = array_clustering_scores.max()  # 4.1.
            else:
                max_score = 0

            if max_score > self.score_threshold:
                # clustering will be perfomed only if the max score is higher than threshold set by user
                if verbose_process:   sys.stderr.write("(max score was=%s) " %(max_score) )
                max_score_cells = equal(array_clustering_scores, max_score).nonzero()  # 4.2.
                rows = max_score_cells[0]
                columns = max_score_cells[1]
                for i in range( rows.nelements() ):
                   positions_to_be_clustered.append([rows[i],columns[i]])  # positions_to_be_clustered is a list of matrix cell coordinates
                                                                           # where each coordinate (ie. [row,colum]) represents two
                                                                           # patchgroups that have to be clustered
            # END OF if max_score > self.score_threshold:
           
                
            if verbose_print_matrix:   sys.stderr.write(" Num of matrix positions to be clustered: %s\n" %(positions_to_be_clustered))

            if not positions_to_be_clustered:
                # if no patchgroups were clustered for the given score_threshold, then we can exit the loop
                if verbose_process: sys.stderr.write("Execution stopped for these parameters: score %s was lower than threshold %s\n" %(max_score,
                                                                                                                                        self.score_threshold) )
                break

            if verbose_process:   sys.stderr.write("Number of patchgroup pairs to be clustered =%s\n" %( len(positions_to_be_clustered)        )    )

            # transforming the list of patchgroup_ids into a list of patchgroups: going from matrix positions to actual patchgroups
            patchgroups_to_be_clustered = []
            for position in positions_to_be_clustered:

                patchgroup_0 = current_patchgroup_graph.get_patchgroup(patchgroup_id= current_patchgroup_ids[position[0]])
                patchgroup_1 = current_patchgroup_graph.get_patchgroup(patchgroup_id= current_patchgroup_ids[position[1]])

                patchgroups_to_be_clustered.append( [patchgroup_0, patchgroup_1] )
            # END OF for position in positions_to_be_clustered:

            if verbose_clustered_elements:
                sys.stderr.write("Patchgroups to be clustered are: \n")
                for patchgroup_pair in patchgroups_to_be_clustered:
                    sys.stderr.write(" [%s, %s] " %( patchgroup_pair[0].get_patch_group_id(), patchgroup_pair[1].get_patch_group_id() )  )
                    sys.stderr.write(" %s has proteins: %s\n" %(patchgroup_pair[0].get_patch_group_id(), patchgroup_pair[0].get_list_proteinPiana()) )
                    sys.stderr.write(" %s has proteins: %s\n" %(patchgroup_pair[1].get_patch_group_id(), patchgroup_pair[1].get_list_proteinPiana()) )
                sys.stderr.write("\n")

            """
            
            With the list of patchgroups to cluster, create a new network where those patches are clustered (and their interactions)
            
            """
            if patch_mode == "train" and verbose_comparison:
                sys.stderr.write("comparison results for level %s: \n" %(self.current_level_number + 1))

            if verbose_process:   sys.stderr.write("Creating next level graph (%s)\n" %(self.current_level_number + 1 ))

            next_level_graph = self.create_next_level_graph( current_level = self.current_level_number,
                                                             patchgroup_graph= current_patchgroup_graph ,
                                                             patchgroup_ids = current_patchgroup_ids,
                                                             patchgroups_to_be_clustered = patchgroups_to_be_clustered )

            # --------------
            # TRAINING MODE
            # -------------- 
            if patch_mode == "train":

                print "In training, comparing with parameters root=%s, comparison_mode=%s and similar_prots=%s" %(root_protein, comparison_mode,
                                                                                                                  similar_proteins_dic)
                # if we are in benchmarking mode, compare next_level_graph to the gold standard graph
                comparison_results = next_level_graph.compare_to( other_patchgroup_graph= self.gold_std_patchgroup_graph,
                                                                  comparison_mode = comparison_mode,
                                                                  root_protein= root_protein,
                                                                  similar_proteins_dic= similar_proteins_dic)

                number_of_patches_divergence = comparison_results[0]
                spec_shared_patches = comparison_results[1]
                sens_shared_patches = comparison_results[2]
                spec_patches_int = comparison_results[3]
                sens_patches_int = comparison_results[4]
                shared_tps = comparison_results[5]
                shared_fps = comparison_results[6]
                shared_fns = comparison_results[7]
                int_tps = comparison_results[8]
                int_fps = comparison_results[9]
                int_fns = comparison_results[10] 
                
                

                if verbose_process_detailed: sys.stderr.write("Writing results to comparison file...")
                
                comparison_file = file(self.comparison_results_file_name, "a")
                
                comparison_file.write("%s=%s\t%s=%s\t%s=%s\t%s=%s\t%s=%s\t%s=%s\t%s=%s\t%s=%s\t%s=%s\t%s=%s\t%s=%s\t%s=%s\t%s=%s\t%s=%s\t%s=%s\t%s=%s\t%s=%s\t%s=%s\t%s=%s\t%s=%s\n" %(
                    ScoringFunctionBenchmark.level_title,                    self.current_level_number + 1,
                    ScoringFunctionBenchmark.num_clusters_title,             number_of_current_patchgroup_ids,
                    ScoringFunctionBenchmark.num_links_title,                len(next_level_graph.get_edge_ids_list()),
                    ScoringFunctionBenchmark.max_score_title,                max_score,
                    ScoringFunctionBenchmark.name_title,                     self.scoring_function.get_name(),
                    ScoringFunctionBenchmark.w_patches_title,                self.scoring_function.get_w_patches(), 
                    ScoringFunctionBenchmark.w_prots_title,                  self.scoring_function.get_w_prots(), 
                    ScoringFunctionBenchmark.w_belong_title,                 self.scoring_function.get_w_belong(),
                    ScoringFunctionBenchmark.num_patches_title,              number_of_patches_divergence,
                    ScoringFunctionBenchmark.spec_shared_title,              spec_shared_patches,
                    ScoringFunctionBenchmark.sens_shared_title,              sens_shared_patches,
                    ScoringFunctionBenchmark.spec_int_title,                 spec_patches_int,
                    ScoringFunctionBenchmark.sens_int_title,                 sens_patches_int,
                    ScoringFunctionBenchmark.hub_threshold_title,            hub_threshold,    
                    ScoringFunctionBenchmark.tps_shared_title,               shared_tps,       
                    ScoringFunctionBenchmark.fps_shared_title,               shared_fps,        
                    ScoringFunctionBenchmark.fns_shared_title,               shared_fns,      
                    ScoringFunctionBenchmark.tps_int_title,                  int_tps,       
                    ScoringFunctionBenchmark.fps_int_title,                  int_fps,        
                    ScoringFunctionBenchmark.fns_int_title,                  int_fns                            )    )
                
                comparison_file.close()

                """
                This code was here to make training faster... in case two Nones where found, going to next training

                However, this has a side effect that is not good: it doesn't penalize cases where tp=0, fp=0, fn= 0 and then fp becomes>0

                At the point of 0,0,0, the training is stop. But, when mode "eval", it keeps looking for stop condition, and it can make new
                predictions that are FP. Therefore, in many eval cases, spec= 0, sens= None, because FPs appear.

                Removing this code, the training will go ahead with all situations till score is 0, meaning that cases where FPs increase will penalize
                those stop conditions

                
                if spec_shared_patches is None and sens_shared_patches is None:
                    # if there is no shared patches info for this protein, exit decomposition
                    # Note: if we were interested in finding accuracy for interactions, we would need to check as well for *_int figures...
                    
                    if verbose_process: sys.stderr.write("\nStop condition met: no info for protein %s at hub threshold %s\n" %(root_protein, hub_threshold )  )
                    no_info_file_fd = file("/home/raragues/phd/piana/code/execs/temp/" + str(root_protein) + "." + str(hub_threshold) + ".no_info_available", "w")
                    no_info_file_fd.write("no info found for protein %s at hub threshold %s\n" %(root_protein, hub_threshold))
                    no_info_file_fd.close()
                    break

                """
                    
                """
                THis code was here to stop training when no more good results could be found... However, it doesn't make sense to stop the training when
                in this case (FN==0), since at the time of evaluating (or executing) the algorithm, we only exit the process when the stop condition
                is met... and how could we ever get the good stop condition if the training didn't search for all the possibilities?
                Continuing the training even when FN is 0 could mean a lot of things, mainly that the stop condition will vary...
              
                elif shared_fns == 0:
                    if verbose_process: sys.stderr.write("Stopping training: no FNs found\n")
                    break

                """
                    
                if next_level_graph.get_size() > MAX_SIZE_ALLOWED:
                    # this condition is needed due to limited memory on my machine... when the network is too big, just write the protein
                    # to a file (to keep track of proteins ignored) and break the loop
                    if verbose_process: sys.stderr.write("Size of GraphPatchGroup is too big: %s. Skipping protein\n" %(next_level_graph.get_size()))
                    file_skipped_prots = file( "/home/raragues/phd/piana/code/execs/skipped_proteins.txt", "a")
                    file_skipped_prots.write("skipped_protein=%s\tsize=%s\tthreshold=%s" %(root_protein, next_level_graph.get_size(),
                                                                                           hub_threshold ))
                    file_skipped_prots.write("\tw_patches=%s\tw_prots=%s\tw_belong=%s\n" %( self.scoring_function.get_w_patches(),
                                                                                            self.scoring_function.get_w_prots(),
                                                                                            self.scoring_function.get_w_belong()))
                    file_skipped_prots.close()
                    break
            # END OF if patch_mode == "train":  (ie END OF benchmarking mode)

            # --------------------
            # END OF TRAINING MODE
            # -------------------- 

            if verbose_process_detailed: sys.stderr.write(" ...and saving graph (size %s) with mode %s: ..." %(next_level_graph.get_size(),
                                                                                                               self.save_mode))

            if self.save_mode == "memory":
                self.patchgroup_graphs_levels[self.current_level_number + 1]= next_level_graph
                
            elif self.save_mode == "disc":
                sys.stderr.write(" ...1...")
                self.current_patchgroup_graph =  copy.deepcopy(next_level_graph)
                sys.stderr.write(" ...2...")
                graph_file_name = "%s_%s" %(self.disc_name, self.current_level_number + 1 )
                sys.stderr.write(" ...3...")
                cPickle.dump(next_level_graph, file(graph_file_name, "wb"))
                sys.stderr.write(" ...4...")
                self.patchgroup_graph_filenames[self.current_level_number + 1] = graph_file_name

            if verbose_process_detailed: sys.stderr.write("graph saved!\n")

            if verbose_print_matrix:
                sys.stderr.write( "New Graph is: \n" )
                self.print_protein_decomposition_from_level_number(level_number=self.current_level_number , output_target= sys.stderr,
                                                 protein_type_name= "proteinPiana" , alternative_type_names=[] )
                sys.stderr.write("\n")
                self.print_proteins_in_clusters_from_level_number(level_number= self.current_level_number, output_target= sys.stderr,
                                                protein_type_name= "proteinPiana" , alternative_type_names=[] )
                sys.stderr.write("\n")
                self.print_patchgroups_interactions_from_level_number(level_number= self.current_level_number, output_target= sys.stderr)
        
            # END OF if verbose_print_matrix:

            self.current_level_number += 1

            if patch_mode != "train":
                # Checking stop condition
                #  - get the type of stop condition being used and its value
                #  - calculate the value for this type of stop condition
                #  - check if condition is true
                num_links = len(next_level_graph.get_edge_ids_list())

                if self.stop_condition_type == "score_clusters":
                    this_stop_value = utilities.get_ratio_group(numerator= max_score, denominator= number_of_current_patchgroup_ids)

                elif self.stop_condition_type == "score_links":
                    this_stop_value   = utilities.get_ratio_group(numerator= max_score, denominator= num_links)

                elif self.stop_condition_type == "clusters_links":
                    this_stop_value = utilities.get_ratio_group(numerator= number_of_current_patchgroup_ids, denominator= num_links)

                elif self.stop_condition_type == "max_score_range":
                    this_stop_value = utilities.get_range_group(value= max_score)

                else:
                    raise ValueError("Using a stop condition of unknown type")

                if this_stop_value == self.stop_condition_value:
                    stop_condition_met = 1
                    sys.stderr.write("STOP CONDITION MET!!! ratio value %s for type %s\n" %(this_stop_value, self.stop_condition_type))

                    if patch_mode == "eval":

                        print "doing comparison for parameters: root=%s, comparison_mode=%s, similar_prots=%s" %(root_protein,
                                                                                                                 comparison_mode,
                                                                                                                 similar_proteins_dic)
                        

                        
                        comparison_results = next_level_graph.compare_to( other_patchgroup_graph= self.gold_std_patchgroup_graph,
                                                                          comparison_mode = comparison_mode,
                                                                          root_protein= root_protein,
                                                                          similar_proteins_dic= similar_proteins_dic,
                                                                          piana_access = self.piana_graph.piana_access)


                        spec_shared_patches = comparison_results[1]
                        sens_shared_patches = comparison_results[2]
                        spec_patches_int = comparison_results[3]
                        sens_patches_int = comparison_results[4]

                        results_fd = file("/home/raragues/phd/piana/code/execs/temp_results/" + str(root_protein) + ".accuracy", "w")

                        results_fd.write("RESULTS FOR PROTEIN %s HAVE AN ACCURACY: spec_shared %s and sens_shared %s and spec_int %s and sens_int %s\n" %(
                            root_protein,
                            spec_shared_patches, sens_shared_patches,
                            spec_patches_int, sens_patches_int          ))

                    # END OF if patch_mode == "eval":

                    # in modes other than train, once the stop condition is met, we break the loop
                    break
                # END OF if this_stop_value == self.stop_condition_value:
                
                else:
                    # stop condition not met: continue execution
                    if verbose_process:
                        sys.stderr.write("Continue execution: iteration range= %s and stop_condition value= %s (of type %s)\n" %(this_stop_value,
                                                                                                                                 self.stop_condition_value,
                                                                                                                                 self.stop_condition_type))
                # END OF else: (if this_stop_value == self.stop_condition_value:)
                
                
            # END OF if patch_mode != "train":
  
        # END OF for i in range(clustering_steps):

        return stop_condition_met
    
    
    def get_patchgroup_graph_from_level_number(self, level_number = None):
        """
        returns the patchgroup_graph of a given level_number
        """

        if level_number < 0:
            # negative numbers refer to last clustering steps (current + negative_level_number)
            level_number = self.current_level_number + level_number +1  # a dirty trick... -1 means current level...
                                                                        
            
        if level_number > self.current_level_number or level_number < 0:
            # if the level required doesn't exist, return None and warn user
            return None

        if self.save_mode == "memory":
            # if save mode is memory, just retrieve it from memory
            return self.patchgroup_graphs_levels[level_number]
        
        elif self.save_mode == "disc":
            # if save mode is disc, get it from the disc (unless it is current graph, which are in memory)
            #   we keep current graph in memory to speed up processing in the clustering...
            if level_number == self.current_level_number:
                return self.current_patchgroup_graph
            else:
                # get it from disc
                print "existing file names are: %s and tryng to get %s" %(self.patchgroup_graph_filenames, level_number)
                
                return utilities.get_patchgroup_graph_from_file_name( file_name= self.patchgroup_graph_filenames[level_number] )
            

    # ------------------------------------------
    #  PatchDecomposition  output Methods
    # ------------------------------------------
    #
    # There are two kinds of output methods: those that take a level number as argument and those that take a patchgroup_graph
    #
    # Those taking a level_number will print the level results associated to the object calling the method
    #    
    #
    # Those taking a patchgroup_graph will print directly results for the graph passed as argument
    #   
    #
    #
    #
    


    # # #
    # printing protein decomposition
    # # #
    def print_protein_decomposition_from_graph(self, patchgroup_graph=None, output_target= None,
                                                     protein_type_name= None, alternative_type_names= []):
        """
        print the protein decomposition of a given patchgroup_graph

        (ie prints functional site identifiers for each protein)
        """
 
        if patchgroup_graph is None:
            raise ValueError("No patchgroup_graph given to print the protein decomposition")

        
        protein_decomposition_dict = patchgroup_graph.get_protein_decomposition(piana_access= self.piana_graph.piana_access, protein_type_name= protein_type_name,
                                                                               alternative_type_names= alternative_type_names                   )
        

        for protein_code in protein_decomposition_dict:  

            output_target.write("protein %s has patchgroups:" %(protein_code) )

            for patchgroup_id in protein_decomposition_dict[protein_code]:
                output_target.write("\t%s" %(patchgroup_id) )

            output_target.write("\n")

        # END OF for protein_code in protein_decomposition_dict:


    def print_protein_decomposition_from_level_number(self, level_number= None,output_target= None,
                                    protein_type_name= None, alternative_type_names= []):
        """
        prints the protein decomposition for all patchgroups in a given level number

        (ie prints functional site identifiers for each protein)

        if level_number is negative, then it prints (last clustering step done - level_number)
        """                
        patchgroup_graph = self.get_patchgroup_graph_from_level_number(level_number = level_number)

        if patchgroup_graph is None:
            output_target.write("Level number not existing in current decomposition\n")
        else:
            self.print_protein_decomposition_from_graph(patchgroup_graph= patchgroup_graph, output_target= output_target,
                                                        protein_type_name= protein_type_name, alternative_type_names= alternative_type_names)


    
    # # #
    # printing clusters composition (ie. proteins in each patchgroup)
    # # #
    
    def print_proteins_in_clusters_from_graph(self, patchgroup_graph= None, output_target= None,
                                              protein_type_name= None, alternative_type_names= []):
        """
        prints the proteins in each patchgroup cluster for a given patchgroup_graph

        (ie prints which proteins share a functional site)
        """

        if patchgroup_graph is None:
            raise ValueError("No patchgroup_graph given to print the clusters composition")


        # get a dictionary with keys the patchgroup ids, and contents the proteins external codes that have that patchgroup id
        patchgroup_proteins_dict = patchgroup_graph.get_patchgroup_proteins(piana_access= self.piana_graph.piana_access, protein_type_name= protein_type_name,
                                                                            alternative_type_names= alternative_type_names                   )


        for patchgroup_id in patchgroup_proteins_dict:  

            output_target.write("these proteins share patchgroup_id %s:" %(patchgroup_id))
            for protein_in_cluster in patchgroup_proteins_dict[patchgroup_id]:
                output_target.write("\t%s" %(protein_in_cluster) )
                
            output_target.write("\n")
        # END OF for patchgroup_id in patchgroup_proteins_dict:


    def print_proteins_in_clusters_from_level_number(self, level_number= None, output_target= None,
                                                     protein_type_name= None, alternative_type_names= []):
        """
        prints which are the proteins that have patches for each cluster in a given level number

        (ie prints which proteins share a functional site)

        if level_number is negative, then it prints (last clustering step done - level_number)
        """

        patchgroup_graph = self.get_patchgroup_graph_from_level_number(level_number = level_number)


        if patchgroup_graph is None:
            output_target.write("Level number not existing in current decomposition\n")
        else:
            self.print_proteins_in_clusters_from_graph(patchgroup_graph= patchgroup_graph, output_target= output_target,
                                                       protein_type_name= protein_type_name, alternative_type_names=alternative_type_names )


    
    # # #
    # printing patchgroups interactions
    # # #
    
    def print_patchgroups_interactions_from_graph(self, patchgroup_graph= None, output_target= None):
        """
        prints the patchgroup-patchgroup interaction network for a given level number

        (ie prints functional sites interactions)

        if level_number is -1, then prints the last clustering level achieved
        """

        if patchgroup_graph is None:
            raise ValueError("No patchgroup_graph given to print the patchgroups interactions")

        # get a list of pathgroup pairs [patchgroup_id, patchgroup_id] that interact
        patchgroup_pathgroup_links = patchgroup_graph.get_node_node_links()


        for patchgroup_pathgroup_link in patchgroup_pathgroup_links:

            output_target.write("%s\t%s\n" %(patchgroup_pathgroup_link[0], patchgroup_pathgroup_link[1]))
        


    def print_patchgroups_interactions_from_level_number(self, level_number= None, output_target= None):
        """
        prints the patchgroup-patchgroup interaction network for a given level number

        (ie prints functional sites interactions)

        if level_number is negative, then it prints (last clustering step done - level_number)
        """

        patchgroup_graph = self.get_patchgroup_graph_from_level_number(level_number = level_number)


        if patchgroup_graph is None:
            output_target.write("Level number not existing in current decomposition\n")
        else:
            self.print_patchgroups_interactions_from_graph(patchgroup_graph= patchgroup_graph, output_target= output_target )

    
    # # #
    # printing patchgroups network in .dot format
    # # #
    
    def print_patchgroups_network_from_graph(self, patchgroup_graph= None, output_target= None):
        """
        prints the .dot patchgroup-patchgroup interaction network for a given level number

        (ie prints functional sites network in .dot format)

        if level_number is -1, then prints the last clustering level achieved
        """

        if patchgroup_graph is None:
            raise ValueError("No patchgroup_graph given to print the patchgroups network")

        patchgroup_graph.output_dot_file(filter_mode="all", output_target=output_target)


    def print_patchgroups_network_from_level_number(self, level_number= None, output_target= None):
        """
        prints the patchgroup-patchgroup interaction network for a given level number

        (ie prints functional sites network)

        if level_number is negative, then it prints (last clustering step done - level_number)
        """

        patchgroup_graph = self.get_patchgroup_graph_from_level_number(level_number = level_number)


        if patchgroup_graph is None:
            output_target.write("Level number not existing in current decomposition\n")
        else:
            self.print_patchgroups_network_from_graph(patchgroup_graph= patchgroup_graph, output_target= output_target )


    
    # # #
    # printing all information
    # # #
 
        
    def print_all_info_from_graph(self, patchgroup_graph= None, output_target= None,
                                  protein_type_name= None, alternative_type_names= []):
        """
        prints all info for patchgroup_graph
        """
        self.print_protein_decomposition_from_graph(patchgroup_graph = patchgroup_graph, output_target=output_target,
                                                    protein_type_name= protein_type_name,
                                                    alternative_type_names=alternative_type_names)
        
        self.print_proteins_in_clusters_from_graph(patchgroup_graph = patchgroup_graph, output_target= output_target,
                                                   protein_type_name= protein_type_name,
                                                   alternative_type_names=alternative_type_names)
        
        self.print_patchgroups_interactions_from_graph(patchgroup_graph = patchgroup_graph, output_target= output_target)



    def print_all_info_from_level_number(self, level_number= None, output_target= None,
                                         protein_type_name= None, alternative_type_names= []):
        """
        prints all info for a given level number
        """

        patchgroup_graph = self.get_patchgroup_graph_from_level_number(level_number = level_number)

        
        
        if patchgroup_graph is None:
            output_target.write("Level number not existing in current decomposition\n")
        else:
            self.print_all_info_from_graph(patchgroup_graph= patchgroup_graph, output_target= output_target,
                                           protein_type_name= protein_type_name, alternative_type_names=alternative_type_names )


   
    # # #
    # printing a table with CIRs identifiers and the proteins that have that CIR
    # # #
 
        
    def print_table_cir_assigned_from_graph(self, patchgroup_graph= None, output_target= None, cir_prefix = None, similar_proteins_dic= [],
                                            protein_type_name= None, alternative_type_names= [], root_protein = None, gold_std= None):
        """
        prints table_cir_assigned  for patchgroup_graph

        this table follows format:

        CIR_id<TAB>protein1_with_this_CIR<TAB>protein2_with_this_CIR<TAB>....
        """

        if patchgroup_graph is None:
            raise ValueError("No patchgroup_graph given to print the table_cir_assigned")


        patchgroup_graph.output_table_cir_assigned(output_target=output_target, piana_access= self.piana_graph.piana_access,
                                                   protein_type_name= protein_type_name, alternative_type_names= alternative_type_names,
                                                   cir_prefix= cir_prefix, root_protein=root_protein, gold_std= gold_std,
                                                     similar_proteins_dic= similar_proteins_dic )



    def print_table_cir_assigned_from_level_number(self, level_number= None, output_target= None, cir_prefix = None, similar_proteins_dic= [],
                                                   protein_type_name= None, alternative_type_names= [], root_protein = None, gold_std= None):
        """
        prints table_cir_assigned for a given level number

        this table follows format:

        CIR_id<TAB>protein1_with_this_CIR<TAB>protein2_with_this_CIR<TAB>....
        """

        patchgroup_graph = self.get_patchgroup_graph_from_level_number(level_number = level_number)

        
        if patchgroup_graph is None:
            output_target.write("Level number not existing in current decomposition\n")
        else:
            self.print_table_cir_assigned_from_graph(patchgroup_graph= patchgroup_graph, output_target= output_target,
                                                     protein_type_name= protein_type_name, alternative_type_names=alternative_type_names,
                                                     cir_prefix =cir_prefix, root_protein=root_protein, gold_std= gold_std,
                                                     similar_proteins_dic= similar_proteins_dic)




   
    # # #
    # printing a table with CIR interactions
    # # #
 
        
    def print_table_cir_int_from_graph(self, patchgroup_graph= None, output_target= None, cir_prefix = None, root_protein = None, gold_std= None):
        """
        prints table_cir_ints  for patchgroup_graph

        this table follows format:

        CIR_id1<TAB>CIR_id2
        """

        if patchgroup_graph is None:
            raise ValueError("No patchgroup_graph given to print the table_cir_ints")

        patchgroup_graph.output_table_cir_ints( output_target= output_target, cir_prefix= cir_prefix , root_protein=root_protein, gold_std= gold_std)



    def print_table_cir_int_from_level_number(self, level_number= None, output_target= None, cir_prefix = None, root_protein = None, gold_std= None):
        """
        prints  table_cir_ints for a given level number

        this table follows format:

        CIR_id1<TAB>CIR_id2
        
        """

        patchgroup_graph = self.get_patchgroup_graph_from_level_number(level_number = level_number)

        
        if patchgroup_graph is None:
            output_target.write("Level number not existing in current decomposition\n")
        else:
            self.print_table_cir_int_from_graph(patchgroup_graph= patchgroup_graph, output_target= output_target, cir_prefix=cir_prefix,
                                                root_protein=root_protein , gold_std= gold_std)



    
    # ------------------------------------------
    #  PatchDecomposition  processing Methods
    #------------------------------------------

    def initialize_graph_patch_group(self):
        """
        builds the initial patchgroup graph from the protein interaction graph in self.piana_graph

        1. subdivides each protein into as many patches as neighbours it has in the protein interaction graph.
           Each patch is placed alone in a new patchgroup, and each patchgroup is wrapped in a GraphNode
        2. creates a patch-patch interaction network (it is actually a
           GraphPatchGroupNodeAttribute-GraphPatchGroupNodeAttribute network): every patch node of a protein
           is linked to every patch node of each of its neighbouring proteins

        returns the GraphPatchGroup (created with graph_id "0") that holds this initial network
        """

        initial_protein_patchnodes = {}  # initial patch nodes are stored in a dictionary that follows the structure:
                                         #
                                         # initial_protein_patchnodes = { protein_id1: list of patch nodes for protein_id1,
                                         #                                protein_id2: list of patch nodes for protein_id2,
                                         #                                .................................................
                                         #                              }

        if verbose_show_patches:
            sys.stderr.write("ready to show patches")

        # 1.
        # for each protein in the protein interaction graph, divide it into patches, keeping them in initial_protein_patchnodes
        for protein_id in self.piana_graph.get_node_ids_list():

            protein_patchnodes = []  # patch nodes created for current protein
            initial_protein_patchnodes[protein_id] = protein_patchnodes

            protein_object = self.piana_graph.get_node(identifier= protein_id)
            protein_neighbour_ids = protein_object.get_neighbour_ids()

            if verbose_show_patches:
                sys.stderr.write("neighbours of %s are: %s ||" %(protein_id, protein_neighbour_ids))

            # one new node (with patchgroup attribute) per neighbour of the protein: only the number of
            # neighbours matters here, the neighbour id itself is not used
            for _neighbour_id in protein_neighbour_ids:

                new_patch = Patch(patch_id= self.get_patch_id_counter(), proteinPiana=protein_id)

                patchgroup_id = self.get_patchgroup_id_counter()
                protein_patch_att = GraphPatchGroupNodeAttribute(patch_group_id = patchgroup_id)

                # in this initial stage, each patchgroup only has one protein patch.
                protein_patch_att.add_list_patch_objects( [new_patch] )

                protein_patchnodes.append( GraphNode(nodeID= patchgroup_id, attribute= protein_patch_att ) )
            # END OF for _neighbour_id in protein_neighbour_ids:
        # END OF for protein_id in self.piana_graph.get_node_ids_list():

        # 2.
        # create a patch-patch interaction network

        # temp_patchgroup_graph will hold the GraphPatchGroup for this initial stage
        temp_patchgroup_graph = GraphPatchGroup(graph_id= "0")

        # for each patch of each protein in the network, create an interaction with all patches in neighbouring proteins
        for protein_id in self.piana_graph.get_node_ids_list():

            protein_patchnodes = initial_protein_patchnodes[protein_id]

            # get neighbours of protein, needed to create interactions afterwards
            protein_object = self.piana_graph.get_node(identifier= protein_id)
            protein_neighbour_ids = protein_object.get_neighbour_ids()

            if not protein_patchnodes:
                # a protein without patch nodes contributes nothing to the network
                continue

            # neighbours_patchnodes holds the patch nodes of all neighbouring proteins, ie. the nodes that
            # "interact" with the patch nodes of protein_id. It only depends on the protein, so it is built
            # once here instead of once per patch node of the protein
            neighbours_patchnodes = []
            for neighbour_id in protein_neighbour_ids:
                neighbours_patchnodes.extend(initial_protein_patchnodes[neighbour_id])
            # END OF for neighbour_id in protein_neighbour_ids:

            for protein_patchnode in protein_patchnodes:

                temp_patchgroup_graph.add_node(protein_patchnode)

                # for each neighbour patch node add an edge with current protein patch node
                for neighbour_patchnode in neighbours_patchnodes:

                    temp_patchgroup_graph.add_node(neighbour_patchnode)

                    current_edge_attribute = GraphPatchGroupEdgeAttribute() # empty attribute to maintain compatibility with Graph class

                    current_edge = temp_patchgroup_graph.get_edge(identifier1=protein_patchnode,
                                                                  identifier2=neighbour_patchnode,
                                                                  attribute_object=current_edge_attribute,
                                                                  get_mode="new")
                    temp_patchgroup_graph.add_edge(current_edge)
                # END OF for neighbour_patchnode in neighbours_patchnodes:
            # END OF for protein_patchnode in protein_patchnodes:
        # END OF for protein_id in self.piana_graph.get_node_ids_list():

        if verbose_show_patches:
            temp_patchgroup_graph.output_dot_file(output_target= sys.stderr)

        return temp_patchgroup_graph
        

    def get_current_level_number(self):
        """
        returns the current level of clustering

        It doesn't modify the level!! Just returns current number
        """
        return self.current_level_number
    
    def get_patch_id_counter(self):
        """
        returns the current patch id counter, increasing it by one to keep it unique
        """

        current_patch_id_counter = self.patch_id_counter
        self.patch_id_counter +=1
        return current_patch_id_counter
        
    def get_patchgroup_id_counter(self):
        """
        returns the current patch id counter, increasing it by one to keep it unique
        """

        current_patchgroup_id_counter = self.patchgroup_id_counter
        self.patchgroup_id_counter +=1
        return current_patchgroup_id_counter


    def create_next_level_graph(self, current_level, patchgroup_graph, patchgroup_ids, patchgroups_to_be_clustered):
        """
        returns a new graph that results from clustering pairs of patchgroups of the current level graph

        "current_level" is the level number of "patchgroup_graph": the new graph is created with
        graph_id current_level + 1
        "patchgroup_graph" is the patchgroup graph of current level
        "patchgroup_ids" are the patchgroup ids of "patchgroup_graph"
        "patchgroups_to_be_clustered" is a sequence of pairs of patchgroup attribute objects. The pairs are
        processed in order, and a pair is merged only if none of its two patchgroups was merged by a previous pair

        In the new graph, merged patchgroups are replaced by a node with a new patchgroup id, patchgroups that
        were not merged keep their id, and every edge of the current graph is recreated between the ids its
        end nodes have after the merging
        """

        # next_id_of describes the transition of patchgroups from one level to the other:
        #   { patchgroup_id_level_n : patchgroup_id_level_n+1, ... }
        # every patchgroup starts transiting to itself; merged ones are redirected below
        next_id_of = dict([(known_id, known_id) for known_id in patchgroup_ids])

        already_merged = {}    # keys are the patchgroup ids that have been merged into a new patchgroup
        merged_attributes = [] # attributes of the patchgroups created by the merging

        # Create new patchgroups
        for pair in patchgroups_to_be_clustered:

            id_a = pair[0].get_patch_group_id()
            id_b = pair[1].get_patch_group_id()

            if id_a in already_merged or id_b in already_merged:
                # one of the patchgroups was already used by a previous pair: skip this pair
                continue

            merged_id = self.get_patchgroup_id_counter()
            merged_attribute = GraphPatchGroupNodeAttribute(patch_group_id= merged_id)

            # the new patchgroup contains the patches of both patchgroups of the pair
            for member in (pair[0], pair[1]):
                merged_attribute.add_list_patch_objects(member.get_list_patch())

            # remember that both ids were merged, and where they transit to
            for member_id in (id_a, id_b):
                already_merged[member_id] = None
                next_id_of[member_id] = merged_id

            merged_attributes.append(merged_attribute)
        # END OF for pair in patchgroups_to_be_clustered:

        # merged_attributes and next_id_of are enough to derive the new graph from the current one
        next_level_graph = GraphPatchGroup(graph_id = current_level + 1)

        # Add new patchgroups
        for merged_attribute in merged_attributes:
            next_level_graph.add_node(GraphNode(nodeID= merged_attribute.get_patch_group_id(),
                                                attribute = merged_attribute))

        # Add old patchgroups that were not merged
        for old_node in patchgroup_graph.get_node_object_list():

            if old_node.get_node_id() in already_merged:
                continue

            # the old node is not added directly: a new node is created with the characteristics of the old one
            # but attached to the new graph. Its neighbours and edges are filled in by add_edge below
            carried_node = GraphNode(nodeID=old_node.get_node_id(),
                                     attribute = old_node.get_node_attribute_object(),
                                     isRoot = old_node.is_root(),
                                     graph = next_level_graph,
                                     expanded_from = old_node.is_expanded()[0],
                                     expansion_type = old_node.is_expanded()[1],
                                     ishidden= old_node.is_hidden() )
            next_level_graph.add_node(carried_node)
        # END OF for old_node in patchgroup_graph.get_node_object_list():

        # Add edges taking into account the id transitions that have occurred
        for old_edge in patchgroup_graph.get_edge_object_list():
            next_level_graph.add_edge(GraphEdge(node1_id=next_id_of[old_edge.get_start_node_id()],
                                                node2_id=next_id_of[old_edge.get_end_node_id()],
                                                attribute_object=old_edge.get_edge_attribute_object(),
                                                graph=next_level_graph,
                                                original=old_edge.is_original(),
                                                propagated=old_edge.is_propagated(),
                                                extended=old_edge.is_extended(),
                                                hidden=old_edge.is_hidden()  ))
        # END OF for old_edge in patchgroup_graph.get_edge_object_list():

        return next_level_graph
                                                         
        
    # -------------------------------------------------------
    # Methods for calculating similarity between patchgroups
    # -------------------------------------------------------



    def get_arrays_scores(self, patchgroup_graph= None, all_patchgroup_ids=None,
                          number_of_patchgroup_ids=None):
        
        """
        returns arrays needed by scoring function to calculate the scores

        "patchgroup_graph" is the patchgroup graph over which the arrays will be calculated
        "all_patchgroup_ids" is a list with all patchgroup ids in the patchgroup graph. The position of a patchgroup id
             in this list is the row/column index used for that patchgroup in the returned arrays
        "number_of_patchgroup_ids" determines the dimension of the arrays. If None, len(all_patchgroup_ids) is used


        returns three Float32 arrays of dimension number_of_patchgroup_ids X number_of_patchgroup_ids as
        (array_interacting_patchgroups_in_common, array_interacting_proteins_in_common, array_belong_to_same_protein)

           [0] is array containing number of interacting patchgroups (ie. neighbour patchgroup ids) in common between each pair of patchgroups
           [1] is array containing number of interacting proteins (ie. proteinPianas of neighbour patchgroups) in common between each pair of patchgroups
           [2] is array containing 1 if both patchgroups have at least one proteinPiana in common (ie. belong to the same protein), 0 otherwise

        Attention: only the positions to the right of the diagonal (ie. column > row) are calculated. The diagonal and the
                   positions to the left of the diagonal are left with their initial value 0
        """
        # when the dimension is not given, there is one row/column per patchgroup id
        if number_of_patchgroup_ids is None:
            number_of_patchgroup_ids = len(all_patchgroup_ids)

        # initialize arrays to zeros, of dimension number of patchgroups X number of patchgroups
        array_interacting_patchgroups_in_common = zeros((number_of_patchgroup_ids, number_of_patchgroup_ids), Float32)
        array_interacting_proteins_in_common = zeros((number_of_patchgroup_ids, number_of_patchgroup_ids), Float32)
        array_belong_to_same_protein= zeros((number_of_patchgroup_ids, number_of_patchgroup_ids), Float32)


        # --
        # first of all, populate the dictionaries of Sets (keys are patchgroup ids) that will be used later to do calculations
        #
        #  --> sets_interacting_patchgroup_ids: ids of the patchgroups that interact with the patchgroup
        #                                       (used to calculate array_interacting_patchgroups_in_common)
        #  --> sets_interacting_proteinPianas: proteinPianas of the patchgroups that interact with the patchgroup
        #                                       (used to calculate array_interacting_proteins_in_common)
        #  --> sets_have_proteinPianas: proteinPianas of the patchgroup itself
        #                                       (used to calculate array_belong_to_same_protein)
        # --
        if verbose_process:
            sys.stderr.write(" - filling sets - ")
        sets_interacting_patchgroup_ids = {} 
        sets_interacting_proteinPianas = {}
        sets_have_proteinPianas = {}

        # for each patchgroup, calculate its Sets
        for current_patchgroup_id in all_patchgroup_ids:

            current_patchnode = patchgroup_graph.get_node(identifier=current_patchgroup_id)
            current_patchnode_att = current_patchnode.get_node_attribute_object()
            sets_have_proteinPianas[current_patchgroup_id] = Set( current_patchnode_att.get_dict_proteinPianas() )

            # temporary dictionaries for this patchgroup only: the important thing is to set the key (value None is never accessed)
            # no need to check if key exists... it is faster just to set the value
            # They are discarded after the Sets are built, instead of being kept in memory for all patchgroups
            neighbour_patchgroup_ids = {}
            neighbour_proteinPianas = {}

            for current_patchnode_neighbour_id in current_patchnode.get_neighbour_ids():

                neighbour_patchgroup_ids[current_patchnode_neighbour_id] = None

                current_patchnode_neighbour_att = (patchgroup_graph.get_node(identifier=current_patchnode_neighbour_id)).get_node_attribute_object()

                for current_patchnode_neighbour_proteinPiana in current_patchnode_neighbour_att.get_dict_proteinPianas():
                    neighbour_proteinPianas[current_patchnode_neighbour_proteinPiana] = None
                # END OF for current_patchnode_neighbour_proteinPiana in current_patchnode_neighbour_att.get_dict_proteinPianas():
            # END OF for current_patchnode_neighbour_id in current_patchnode.get_neighbour_ids():

            sets_interacting_patchgroup_ids[current_patchgroup_id] = Set(neighbour_patchgroup_ids)
            sets_interacting_proteinPianas[current_patchgroup_id] = Set(neighbour_proteinPianas)
        # END OF for current_patchgroup_id in all_patchgroup_ids:


        # --
        # go through all elements to the right of the diagonal of the matrix calculating arrays positions for each pair
        # --
        if verbose_process:
            sys.stderr.write(" - calc arrays - \n")
        for row in range(0, number_of_patchgroup_ids):

            # the Sets of the row patchgroup do not change while moving through the columns: look them up only once per row
            row_patchgroup_id = all_patchgroup_ids[row]
            row_interacting_patchgroup_ids = sets_interacting_patchgroup_ids[row_patchgroup_id]
            row_interacting_proteinPianas = sets_interacting_proteinPianas[row_patchgroup_id]
            row_have_proteinPianas = sets_have_proteinPianas[row_patchgroup_id]

            for column in range(row+1, number_of_patchgroup_ids):

                column_patchgroup_id = all_patchgroup_ids[column]

                # ---
                # calculating array_interacting_patchgroups_in_common
                # ---
                array_interacting_patchgroups_in_common[row][column] = len(
                    row_interacting_patchgroup_ids.intersection( sets_interacting_patchgroup_ids[column_patchgroup_id] ) )

                # ---
                # calculating array_interacting_proteins_in_common
                # ---
                array_interacting_proteins_in_common[row][column] = len(
                    row_interacting_proteinPianas.intersection( sets_interacting_proteinPianas[column_patchgroup_id] ) )

                # ---
                # calculating array_belong_to_same_protein: a non-empty intersection means both patchgroups share a proteinPiana
                # ---
                if row_have_proteinPianas.intersection( sets_have_proteinPianas[column_patchgroup_id] ):
                    array_belong_to_same_protein[row][column] = 1
                
            # END OF for column in range(row+1, number_of_patchgroup_ids):
        # END OF for row in range(0, number_of_patchgroup_ids):
    
        return (array_interacting_patchgroups_in_common, array_interacting_proteins_in_common, array_belong_to_same_protein)
