"""
File        : PianaGraph.py
Author      : Ramon Aragues
Creation    : 2003
Contents    : implementation of graph handling protein interaction networks (subclass of Graph.py)
Called from : programs/classes that need to use piana graphs

=======================================================================================================

This file implements class PianaGraph
"""

# PianaGraph.py: implements a class for managing protein-protein interaction networks
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import types
import copy

from Graph import *
from PianaDBaccess import *
from PianaGraphEdgeAttribute import *
from PianaGraphNodeAttribute import *

# Debug switches: leave them at 0 for normal use.
#   "verbose" makes add_interaction report on stderr each edge that it adds or that it finds already in the network
#   NOTE(review): the remaining flags presumably switch on debugging output for the feature named in each flag
#                 (output of the graph, patches, spots, expression) -- confirm against the methods that read them
verbose = 0
verbose_output_graph = 0
verbose_patches = 0
verbose_spots = 0
verbose_expression= 0


#----------------------------------------------------------------------------------------------
class PianaGraph(Graph):
#----------------------------------------------------------------------------------------------
    """
    Implementation of graph handling protein interaction networks (subclass of Graph.py)
    """
    def __init__(self, graphID = None, piana_access_object = None):
        """
        Initializes an empty PianaGraph

        "graphID" is the identifier of the graph: it is kept in self.graph_id and handed to Graph.__init__

        "piana_access_object" is the object through which the methods of this class query the piana database
        (kept in self.piana_access)
        """

        self.graph_id = graphID
        self.piana_access = piana_access_object

        self.ext_code_connecting = {}    # used in connecting mode to keep the linkers and the roots
                                         # that they connect (using ext_code instead of proteinPiana)

        # These variables are used to unify the names used for the proteins:

        self.last_key_list = -1               # initializing to a key that does not match any list of proteins
        self.number_of_nodes_names_filled = 0 # used to know if nodes were added since dic_protein_naming was filled
        self.last_output_type= None
        self.dic_protein_naming = {}    # used for controlling which name is assigned to each protein
                                        # has structure: { protein_name: unique protein name to be used,
                                        #                  protein_name: unique protein name to be used,
                                        #                  .......................
                                        #                }


        self.unified_type_name= None      # used to check whether the unification was already done or not
        self.first_unified_node= None     # used to check whether the unification was already done or not
        self.last_unified_node= None      # used to check whether the unification was already done or not
        self.number_of_unified_nodes = 0  # used to check whether the unification was already done or not

        self.ext_code_unified_node_correspondence = {}  # a dictionary used for creating unified nodes
                                                        # follows structure:
                                                        #                     { ext_code: unified_node,
                                                        #                       ext_code: unified_node,
                                                        #                       ...................... }

        self.number_of_unified_edges = 0
        self.edge_key_unified_edge_correspondence = {}  # a dictionary used for creating unified edges
                                                        # follows structure:
                                                        #                     { edge_key: unified_edge,
                                                        #                       edge_key: unified_edge,
                                                        #                       ...................... }


        # These variables are used to keep track of over/infra expressed proteins:

        self.over_expressed_proteinPianas = {}   # proteinPianas that are over expressed according to an expression file
        self.infra_expressed_proteinPianas = {}  # proteinPianas that are infra expressed according to an expression file


        # initialize graph (done last, once all the PianaGraph attributes exist)
        Graph.__init__(self, graphID= self.graph_id)

    # ---------
    # Methods needed for pickling and unpickling 
    # ---------

    def __getstate__(self):
        """
        Returns the state that will be pickled: a shallow copy of the instance dictionary
        """
        # hand out a copy, so that entries can be removed from the pickled state
        # without touching the live object (eg. del state['db'] to drop a filehandle entry)
        state = dict(self.__dict__)
        return state

    def __setstate__(self, dict):
        """
        Restores the object from the state produced by __getstate__

        "dict" is the unpickled state: it becomes the instance dictionary as it is (no copy is made)
        """
        restored_state = dict
        self.__dict__ = restored_state

        # calls the initializer of the state's own class without arguments
        #   NOTE(review): for a plain dictionary this leaves the contents untouched -- confirm whether
        #                 re-initializing the PianaGraph itself was what was intended here
        state_class = restored_state.__class__
        state_class.__init__(restored_state)
    
    def __getnewargs__(self):
        """
        Returns the tuple (graph_id, piana_access) to be used as arguments when the object is re-created on unpickling
        """
        new_args = (self.graph_id, self.piana_access)
        return new_args

    # ------------------------------------------
    #  PianaGraph Methods: get information about the graph and its components
    #------------------------------------------

    def get_nodes_molecular_weights(self):
        """
        Collects the molecular weight of every node of the graph

        returns a dictionary that has the nodeIDs (ie proteinPianas) as keys and the molecular weights as values:
                                                      { nodeID1:molecularWeight1, nodeID2:molecularWeight2 , ...}
        """
        weight_of_node = {}

        for graph_node in self.get_node_object_list():
            # the weight is kept in the attribute object of the node
            weight = graph_node.get_node_attribute_object().get_molecular_weight()
            weight_of_node[graph_node.get_node_id()] = weight

        return weight_of_node

    def get_nodes_isoelectric_points(self):
        """
        Collects the isoelectric point of every node of the graph

        returns a dictionary that has the nodeIDs (ie proteinPianas) as keys and the isoelectric points as values:
                                                       {nodeID1:isoelectricPoint1, nodeID2:isoelectricPoint2 , ...}
        """
        point_of_node = {}

        for graph_node in self.get_node_object_list():
            # the isoelectric point is kept in the attribute object of the node
            point = graph_node.get_node_attribute_object().get_isoelectric_point()
            point_of_node[graph_node.get_node_id()] = point

        return point_of_node


    def add_interaction(self, protein_a= None, protein_b=None, protein_type_name= None, tax_id_value=0 , source_db=None, method=None, confidence=None):
        """
        Adds to the current PianaGraph object the interaction between two proteins given by their external codes

        "protein_a" and "protein_b" are the protein codes of the two sides of the interaction

        "protein_type_name" is the type of protein code being used for protein_a and protein_b
           --> valid types of protein code are listed in PianaGlobals.valid_protein_types

        "tax_id_value" sets the species of the interactions that are being added (can be used for eliminating ambiguities between codes across species)
           --> valid values are 0 (do not take into account the species) and those taxonomy ids provided by ncbi taxonomy

        "source_db", "method" and "confidence" are not being used: that data is obtained directly from the database instead

        A code can correspond to several proteinPianas: an edge is added for each pair (proteinPiana of protein_a, proteinPiana of protein_b)
        that is not yet in the graph. If one of the codes has no proteinPiana, nothing is added and a message is written to stderr
        """

        code_column = utilities.get_code_column(protein_type_name)

        # Translate both codes into database identifiers (proteinPianas)
        # Interactions between proteins that are not in the database cannot be added
        candidates_a = self.piana_access.get_list_protein_piana(proteinCode_value= protein_a,
                                                                proteinCodeType_value= code_column,
                                                                tax_id_value= tax_id_value,
                                                                source_db_info="no")
        candidates_b = self.piana_access.get_list_protein_piana(proteinCode_value= protein_b,
                                                                proteinCodeType_value= code_column,
                                                                tax_id_value= tax_id_value,
                                                                source_db_info="no")

        for piana_a in candidates_a:
            for piana_b in candidates_b:

                # the edge needs an attribute object, even if we still do not know whether the edge is new
                edge_attribute = PianaGraphEdgeAttribute(interactionPiana_value= None, mem_mode = "inMemory", piana_access = self.piana_access)

                # create_edge answers with [edge_object, new], where new==1 means that the graph did not have the edge
                #             -> the edge is created, but it is not added to the graph
                creation_result = self.create_edge(node_id1= piana_a,
                                                   node_id2= piana_b,
                                                   attribute_object= edge_attribute)

                if creation_result[1] != 1:
                    # nothing to add for this pair
                    if verbose:
                        sys.stderr.write(" edge between %s (%s) and %s (%s) already existed in network\n" %(piana_a,
                                                                                                            protein_a,
                                                                                                            piana_b,
                                                                                                            protein_b))
                    continue

                if verbose:
                    sys.stderr.write(" adding edge between %s (%s) and %s (%s) \n" %(piana_a, protein_a, piana_b, protein_b))

                # new edge: build both nodes, add them to the graph and finally add the edge
                attribute_a = PianaGraphNodeAttribute(proteinPiana_value=piana_a, mem_mode = "inMemory",
                                                      piana_access = self.piana_access)
                attribute_b = PianaGraphNodeAttribute(proteinPiana_value=piana_b, mem_mode = "inMemory",
                                                      piana_access = self.piana_access)

                node_a = self.get_node(identifier=piana_a, attribute=attribute_a)
                node_b = self.get_node(identifier=piana_b, attribute=attribute_b)

                # the nodes might be in the graph already: add_node takes care of merging attributes if different
                self.add_node(node_a)
                self.add_node(node_b)
                self.add_edge(creation_result[0])
            # END OF for piana_b in candidates_b:
        # END OF for piana_a in candidates_a:

        if not candidates_a or not candidates_b:
            sys.stderr.write("No proteinPiana found for protein (%s->%s) or protein (%s->%s)\n" %(protein_a,
                                                                                                  candidates_a,
                                                                                                  protein_b,
                                                                                                  candidates_b))
 
    # ----------------------------------------------------------------------------------------------
    #  PianaGraph Methods: Internal methods for unifying edges and nodes, deciding names to use, etc
    #-----------------------------------------------------------------------------------------------
   
    def _get_edge_key(self, this_edge, protein_type_name, list_alternative_type_names ):
        """
        Builds the key that identifies an edge by the external codes of its nodes (instead of by their proteinPianas)

        returns a tuple (edge key, node_1_ext_code, node_2_ext_code)

        The external codes are the unified names obtained through self.get_unified_name (ie. using dic_protein_naming)
        The key is "<code>.<code>" with the lower code first, so it does not depend on the direction of the edge
        """
        start_id, end_id = this_edge.get_node_ids()

        code_of_start = self.get_unified_name( proteinPiana= start_id,
                                               protein_type_name=protein_type_name,
                                               list_alternative_type_names=list_alternative_type_names)
        code_of_end = self.get_unified_name( proteinPiana= end_id,
                                             protein_type_name=protein_type_name,
                                             list_alternative_type_names=list_alternative_type_names)

        # the codes are returned in the order of the edge: only the key is normalized
        ordered_codes = (code_of_start, code_of_end)
        if not (code_of_start <= code_of_end):
            ordered_codes = (code_of_end, code_of_start)

        return ("%s.%s" % ordered_codes, code_of_start, code_of_end)

    def get_unified_name(self, proteinPiana, protein_type_name, list_alternative_type_names):
        """
        returns the name (of code type "protein_type_name") that is assigned to "proteinPiana" as the unified name

        "list_alternative_type_names" are the types of code to be used in case no code of type "protein_type_name" is found

        returns the string "no_code_found" when the database has no code for the proteinPiana

        raises KeyError if the code found is not a key of self.dic_protein_naming
           (ie. the naming dictionary was not filled for this protein before calling this method)
        """

        # get the ext_code that is going to be used for this proteinPiana
        one_code = self.piana_access.get_list_protein_external_codes(proteinPiana= proteinPiana,
                                                                     protein_type_name= protein_type_name,
                                                                     alternative_type_names= list_alternative_type_names,
                                                                     answer_mode="single")

        # check for a missing code before touching it: an empty answer that is not a string (eg. None)
        # has no upper() and would make the method fail instead of returning "no_code_found"
        if not one_code:
            return "no_code_found"

        # convert to uppercase in case it is not an int (we always work with uppercase names...)
        if isinstance(one_code, int):
            code_to_use = one_code
        else:
            code_to_use = one_code.upper()

        # get the unified name
        return self.dic_protein_naming[code_to_use]
                
        
    def _create_unified_network(self, protein_type_name, list_alternative_type_names, user_keywords=None, list_proteinPianas= None):
        """

        Attention!!! It is mandatory to call self._fill_dic_protein_naming before calling this method!!!!
                       (and self._fill_expression_dics  in case you are calling it from a method that requires expression information)



        This method is here to solve the following problems:

           - N nodes with a different node_id (ie proteinPiana) can have the same ext_code
             Each of these nodes can have different characteristics (be root or not, have a keyword or not, ...)
             If we print the nodes directly to output, the characteristics that will be used for the output for all nodes with the same ext_code
             will be those of the first node printed, since we avoid repeating the same ext_code twice
             However, the correct thing to do is to print just one ext_code, but one that has all the characteristics of the proteinPianas that correspond
             to it
             Therefore, we must unify the nodes by their ext_codes, and the new nodes must have all the characteristics of the corresponding proteinPianas

           - something similar happens with the edges
             Edges are defined by the proteinPianas that they join. However, proteinPianas can correspond to the same ext_codes, and therefore, if we
             only print one edge for two ext_codes, we are losing the characteristics of the other edges

             eg:  edge 1 is between proteinPianas 11 and 12, and is given by DIP
                  edge 2 is between proteinPianas 21 and 22, and is given by STRING
                  edge 3 is between proteinPianas 31 and 32, and is given by HPRD

                  then, if proteinPiana 11 has the same ext_code as proteinPiana 31, and proteinPiana 12 has the same ext_code as proteinPiana 32
                        we should be printing the edge for DIP and HPRD. If this method didn't exist, we would only consider DIP, since the second
                        edge would be ignored because it is between the same ext_codes

        What this method does is create a list of unified nodes and another one of unified edges, and creates correspondences between the
        piana_graph nodes and the unified nodes and the same for the edges. Then, when printing, we will use the characteristics of the
        unified nodes and edges instead of using the piana_graph nodes and edges. There are a number of node characteristics that are not filled
        up when creating the network (eg. taxonomies) but are used for printing. These characteristics are stored in PianaGraphNodeAttribute but
        are only filled up in this method. If we have a network in memory, its nodes will not contain those characteristics, they will only be
        used when creating 'unified' nodes to store info associated to several proteinPianas that correspond to the same ext_code

        "protein_type_name" is the preferred type of code for the ext_codes and "list_alternative_type_names" are the types to use
        when the preferred one is not found (both are passed to self.get_unified_name)

        "user_keywords" is a list of keywords that the user wants to highlight when printing information
        (None, the default, is treated as an empty list)


        "list_proteinPianas" is usually None, but... if you are printing information for proteins that are not in the network (ie. using
        method output_list:*) then, since there are no Nodes associated to the proteins that we want to print, we need to create the nodes
        from scratch and not based on a previous one. In those cases, set here the list of proteinPianas for which you want to do the unification.
        Setting this parameter also makes the method skip the edge unification, since there are no edges associated to these proteins.

        The results are left in self.ext_code_unified_node_correspondence and self.edge_key_unified_edge_correspondence

        returns 0 when the unification was skipped (it had already been done) or when only nodes were unified (list_proteinPianas was set),
        and None after unifying the nodes and edges of the network

        """
        if user_keywords is None:
            # a fresh list for each call, instead of a default list shared between calls
            user_keywords = []

        if list_proteinPianas:
            # we are unifying nodes that do not appear in a network...(they are being provided via a list of proteins)
            #   -> create GraphNode objects for these proteinPianas with the corresponding attributes

            if len(list_proteinPianas) == self.number_of_unified_nodes and list_proteinPianas[0]== self.first_unified_node and (
                list_proteinPianas[-1]== self.last_unified_node and protein_type_name== self.unified_type_name):
                # do not redo the unification if we already did it for this list of proteinPianas
                # the way of checking it is not very nice, but I think it works almost always and doesn't take too much time to do the check
                # In order to do a perfect check on whether the unification must be done again or not, we would have to create a unique
                # identifier for the list for which the unification is happening, and redo it only if the unique identifier changes
                # In the way I am doing the reunification it will fail (ie. it won't do the unification when in fact, it was needed) in the following
                # case: the number of elements in the new list is the same as in the old list, and the elements 0, and last of the list
                # are identical in both lists
                return 0

            self.first_unified_node= list_proteinPianas[0]
            self.last_unified_node= list_proteinPianas[-1]

            list_of_nodes = []
            for proteinPiana in list_proteinPianas:
                new_attribute = PianaGraphNodeAttribute(proteinPiana_value= proteinPiana,  piana_access= self.piana_access, mem_mode = "onDemand")
                new_node = GraphNode(nodeID=proteinPiana,
                                     attribute = new_attribute,
                                     isRoot =1, graph = None, expanded_from=None, expansion_type=None, ishidden=0, alternative_id= None)

                list_of_nodes.append(new_node)

        else:
            # unifying nodes of the network
            number_of_nodes_in_current_graph = len(self.get_node_ids_list())
            number_of_edges_in_current_graph = len(self.get_edge_ids_list())

            if len(self.ext_code_unified_node_correspondence) > 0 and number_of_nodes_in_current_graph == self.number_of_unified_nodes and (
                number_of_edges_in_current_graph == self.number_of_unified_edges and protein_type_name== self.unified_type_name):
                # do not redo the unification if it was already done (unless there has been a change in the number of nodes/edges in the graph)

                # TO DO!!! This is not exactly correct: if there has been a change in the graph, but the number of
                #          nodes and edges is still the same (eg. an edge was added and another edge was removed), this 'if' will
                #          consider that dics self.*_unified_*_correspondence do not need recalculating, which is false...
                #          This is not very likely to happen, but it would be nice to think of a way that checks in a perfect manner
                #          whether the dics have to be recalculated or not: how do I create a unique key of the graph that was used
                #          to fill the dics? A string with all node ids and edge ids? Ummm.... too complicated and costly...
                return 0

            list_of_nodes = self.get_node_object_list()
        # END OF else: (if list_proteinPianas:)

        # network has to be unified: initialize variables

        self.number_of_unified_nodes = 0
        self.number_of_unified_edges = 0
        self.unified_type_name = protein_type_name
        self.ext_code_unified_node_correspondence = {}
        self.edge_key_unified_edge_correspondence = {}

        # ----
        # node unification
        # ---
        for current_node in list_of_nodes:
            # for each node in the graph, create (or merge attributes with a previous node) a unique node for the ext_code that corresponds to it

            node_proteinPiana = current_node.get_node_id()
            node_attribute = current_node.get_node_attribute_object()

            # set the characteristics of the attribute that are not filled up when creating the network
            #   -> in case we afterwards find out that there was no node already created for this ext_code, these characteristics will be the
            #      characteristics of the initial unified node. Otherwise, they will be merged with the characteristics that existed previously
            #   -> these _set_* methods change the node in the PianaGraph, but it is not a problem because the modifications are made to
            #      variables that are only used by the unification process
            node_attribute._set_associated_proteinPianas(proteinPiana= node_proteinPiana)
            node_attribute._set_dic_functions()
            node_attribute._set_dic_descriptions()
            node_attribute._set_dic_taxonomies()
            node_attribute._set_keywords_appearing(user_keywords=user_keywords)
            node_attribute._set_fitness_scores()
            node_attribute._set_union_neighbours(neighbours_dic=current_node.neighbour_ids_dic)

            if node_proteinPiana in self.over_expressed_proteinPianas:
                node_expression = PianaGlobals.over_expressed_protein
            elif node_proteinPiana in self.infra_expressed_proteinPianas:
                node_expression = PianaGlobals.infra_expressed_protein
            else:
                node_expression = None

            node_attribute._set_expression(expression=node_expression)


            # get the ext_code that is going to be used for this proteinPiana
            node_ext_code = self.get_unified_name( proteinPiana= node_proteinPiana,
                                                   protein_type_name=protein_type_name, list_alternative_type_names=list_alternative_type_names)

            if node_ext_code in self.ext_code_unified_node_correspondence:
                # a node for this ext_code was already created: merge unique node characteristics with the current node
                self.ext_code_unified_node_correspondence[node_ext_code].merge_nodes(node_object= current_node, ignore_ids=1)

            else:
                # no node was created for this ext_code: create a unique node with the characteristics of the current node
                #   - we set node id to 0, because we are not going to use the ids and in this way, we avoid conflicts when merging nodes
                #   - other characteristics will be added to this node when another proteinPiana has the same ext_code

                current_attribute= copy.deepcopy(node_attribute)   # to avoid modifying the attribute of a node in the network
                                                                   # PianaGraphNodeAttribute makes sure of not deepcopying the
                                                                   # mysql connection piana_access

                expansion_info = current_node.is_expanded()        # (expanded_from, expansion_type)

                # create new node and associate it to the ext_code
                new_node = GraphNode(nodeID= 0,
                                     attribute = current_attribute,
                                     isRoot = current_node.is_root(),
                                     graph = None,
                                     expanded_from= expansion_info[0],
                                     expansion_type= expansion_info[1],
                                     ishidden= current_node.is_hidden(),
                                     alternative_id= node_ext_code)

                self.ext_code_unified_node_correspondence[node_ext_code] = new_node
            # END OF else: (if node_ext_code in self.ext_code_unified_node_correspondence:)

            # counts the nodes processed (not the unified nodes created): this is the number that is
            # compared with the size of the graph (or of the list) to decide whether the unification has to be redone
            self.number_of_unified_nodes += 1

        # END OF for current_node in list_of_nodes:

        if list_proteinPianas:
            # in case we have received a list of proteinPianas, skip the edge unification, since we are only interested in
            # unifying nodes because we are unifying nodes that do not appear in a network..
            return 0


        # ----
        # edge unification
        # ---
        for current_edge in self.get_edge_object_list():

            (edge_key, node_1_ext_code, node_2_ext_code) = self._get_edge_key(this_edge= current_edge,
                                                                              protein_type_name= protein_type_name,
                                                                              list_alternative_type_names=list_alternative_type_names )

            if edge_key in self.edge_key_unified_edge_correspondence:
                # an edge was already created for this pair of ext_codes: merge current edge with the one already created
                self.edge_key_unified_edge_correspondence[edge_key].merge_edges(new_edge= current_edge, ignore_ids= 1)
            else:
                # no edge has been created for this pair of ext_codes: create one and set it as the unified edge
                # in the unified edge, the node ids are never used: we set them to 0 (unified edges are used just to unify characteristics
                # of different edges that have the same ext_codes)
                current_attribute = current_edge.get_edge_attribute_object()

                new_attribute = copy.deepcopy(current_attribute)  # to avoid modifying the attribute of an edge in the network
                                                                  # PianaGraphEdgeAttribute makes sure of not deepcopying the
                                                                  # mysql connection piana_access

                new_edge = GraphEdge(node1_id= 0,
                                     node2_id= 0,
                                     attribute_object= new_attribute,
                                     graph= None,
                                     original= current_edge.is_original(),
                                     propagated= current_edge.is_propagated() ,
                                     extended= current_edge.is_extended() ,
                                     hidden = current_edge.is_hidden() )

                self.edge_key_unified_edge_correspondence[edge_key] = new_edge
            # END OF else:  (if edge_key in self.edge_key_unified_edge_correspondence:)

            # counts the edges processed (not the unified edges created), for the same reason as with the nodes
            self.number_of_unified_edges += 1

        # END OF for current_edge in self.get_edge_object_list():
                

    def _fill_dic_protein_naming(self, protein_type_name, list_alternative_type_names, list_proteinPianas= None, key_list= None,
                                 user_protein_names = {}):
        """
        fills the naming dictionary, used to be coherent with the names used and avoid using different codes for the same protein

        it is called whenever protein names have to be used.

        "protein_type_name" is the preferred type of code to be used and list_alternative_type_names are others to use in case the preferred is not found
          -> valid type names are found in PianaGlobals.valid_protein_types

        "list_proteinPianas" is normally None, but can be set to a list of proteinPianas for those cases in which we are not printing proteins of the network
        but just proteins that the user asked information about. In case list_proteins is not None, then the names are unified for the proteins in the list
        instead of doing it for proteins of the network. This will not have an effect on successive calls asking information about proteins of the network,
        because the dictionary will be filled again when this method detects that the counter self.number_of_nodes_names_filled does not coincide with
        the number of nodes in the network (to make sure, we set self.number_of_nodes_names_filled to -1 when a protein list is passed as argument)
         (Attention! the list_proteins is a python list of proteinPianas. You can get a list of proteinPianas using PianaDBaccess.get_list_protein_pianas())

        "user_protein_names" can be used to fix a set of protein names that have preference over other names when a protein has several names
           (eg. if proteinPiana has gene names CXCL1 and MGSA, and this dictionary has CXCL1, it is guaranteed that CXCL1 will be used for output)
           --> user_protein_names is a dictionary that follows the structure: { protein_name:None, protein_name:None, ...}

        If the method was already called (and no more nodes have been added to the network) this call is ignored
        """

        # FIXING WHICH NAME WILL BE USED FOR EACH PROTEIN
        # -----------------------------------------------
        # In order to fix the naming problems with proteins, we create a dictionary that tries to set which will be the unique protein name to be
        # used for a protein that is represented by several nodes
        # Attention: this is not enterely correct... the unique name that will be taken for a protein depends on the order in which the nodes are
        #            read, since many codes do not have "primary name", and the first returned from the database is the one that will be used.
        #            I do not know how to fix this arbitrary way of chosing the protein names... somebody would have to look to all protein names
        #            and decide which one to use... but still, not all biologists use the same name for the same protein...
        #            In any case, I guarantee that if one of those possible names was given by the user as a root protein, that will be the one used
        #
        # This looks very complicated (and it is) but choosing the name to be used for output is important...


        # ---------------
        # 1. First of all, decide whether we are going to unify names for all proteins in the network or just for those in a list given by the user
        #    --> detect as well if the unification was already established... if it was, exit the method
        # ---------------
        if list_proteinPianas is None or not list_proteinPianas:
            # --> Unifying names for all proteins in the network
            
            number_of_nodes_in_current_graph = len(self.get_node_ids_list())

            if len(self.dic_protein_naming)> 0 and number_of_nodes_in_current_graph == self.number_of_nodes_names_filled and (
                self.last_output_type == protein_type_name):
                # no nodes have been added and self.dic_protein_naming has already been populated: don't do anything
                # TO DO!!! This is not exactly perfect: if there has been a change in the graph, but the number of
                #          nodes is still the same (eg. a node was added and another node was removed), this 'if' will
                #          consider that dic_protein_naming does not need recalculating, which is false...
                #          This is not very likely to happen, but it would be nice to think of a way that checks in a perfect manner
                #          whether self.dic_protein_naming has to be recalculated or not: how do I create a unique key of the graph that was used
                #          to fill the dic? A string with all node ids and edge ids? Ummm.... too complicated and costly...
                return 0

            # initialize
            self.last_output_type = protein_type_name
            self.number_of_nodes_names_filled = number_of_nodes_in_current_graph
            self.dic_protein_naming = {}    # has structure: { protein_name: unique protein name to be used,
                                            #                  protein_name: unique protein name to be used,
                                            #                  .......................
                                            #                }
            list_proteinPianas = []
            for node in self.get_node_object_list():
                list_proteinPianas.append(node.get_node_id())

        # END OF if list_proteinPianas is None:
        
        else:
            # --> Unifying names for proteins in list_proteinPianas
            
            if self.number_of_nodes_names_filled == -1 and key_list == self.last_key_list and self.last_output_type == protein_type_name:
                # if the dic was previosly filled for this list, skip this method
                return 0
            
            # TO DO!!! Change the way list of proteins are handled
            #          This is completely inneficient, since the dictionary is going to be filled as many times as proteins there are in the list.
            #          Something has to be done to detect that the dictionary was already filled for this list of proteins. If it is a different list
            #          of proteins, it has to be filled again... Right now, I have solve it using the key_list argument, but this is not very clean
            #          and I am obliging the user to set unique keys for lists...
            self.last_output_type = protein_type_name
            self.number_of_nodes_names_filled = -1
            self.last_key_list = key_list
            self.dic_protein_naming = {} 
        # END OF else:(if list_proteinPianas is None or not list_proteinPianas:)
            
        # ---------------
        # 2. Now, for each proteinPiana, fix which 'unified' external code will be used
        # ---------------
        for proteinPiana in list_proteinPianas:

            dic_unique_protein_name = {}   # although this should have only one element, we use a dic to make sure no more than one name
                                           # is assigned to same protein


            # 
            # 2.1 get all the protein names that can be given to this proteinPiana
            #
            temp_list_protein_names = self.piana_access.get_list_protein_external_codes(proteinPiana= proteinPiana,
                                                                                        protein_type_name= protein_type_name,
                                                                                        alternative_type_names= list_alternative_type_names,
                                                                                        answer_mode="list")
            list_protein_names = []
            # 2.1.1 convert all names to uppercase (to avoid considering two names as different when they just differ in case)
            for protein_name in temp_list_protein_names:
                # avoid errors when trying to do upper case over protein names that are ints
                if not isinstance(protein_name, int):      list_protein_names.append(protein_name.upper())
                else:                                      list_protein_names.append(protein_name)

            #
            # 2.2 Check if any of the names that can be given to this protein was already 'assigned to be another unique name'
            #     -> in case it was, keep it in dic_unique_protein_name so later on we know which is the 'unified name'
            #             -> we also initialize to 0 the frequency: later on it will be used to count the frequency for each unique name
            #
            for protein_name in list_protein_names:
                if self.dic_protein_naming.has_key(protein_name):
                    dic_unique_protein_name[ self.dic_protein_naming[protein_name] ] = 0 # we set it to Zero, it will be used later in cases with name conflict.
                                                                                         # the name with highest frequency is the one used... 
                                                                                         # if there are no conflicts, the number will not be used for anything  

            # END OF for protein_name in list_protein_names:


            #
            # 2.3 Check whether there is more than one unique name already fixed by another protein (2.3.1), there is only one (2.3.2) or None (2.3.3)
            # 
            number_of_unique_protein_names = len(dic_unique_protein_name)
            
            if number_of_unique_protein_names > 1:
                # 2.3.1
                # if there are more than one name, this means that two proteins with different lists of ext codes do share at least one ext code
                # My approach to this problem is a bit chapucero, but I cannot think of another way of doing it... I will take as unique code for the current 
                # protein and for all the names that previously had that unique name, the unique name that was associated to a larger number of ext codes
                # Of course, this doesn't apply if one of those unique names is in the list of names given originally by the user... user_protein_names
                # ... in this case, the name given by the user has preference over the other regardless of their frequencies of use
                # ... if more than one unique name is a user protein name, then frequency rules over the decision on which one to use
                unique_names_that_are_user_names = {}
                # check if one name in dic_unique_protein_name appear in the user_protein_names dictionary
                for protein_unique_name in dic_unique_protein_name:
                    if user_protein_names.has_key(protein_unique_name):
                        unique_names_that_are_user_names[protein_unique_name] = None
                
                number_of_unique_names_that_are_user_names=  len(unique_names_that_are_user_names)
                
                if number_of_unique_names_that_are_user_names == 1:
                    # only one of the names is a user name, set that protein as the one to be used
                    protein_highest_freq = unique_names_that_are_user_names.popitem()[0] 
                else:
                    # none of the unique_names appears in the list of user names or more than one of the unique names is a user name (ie. freq rules)


                    # if there were no unique names that were user names, check frequency for all proteins in dic_unique_protein_name
                    # otherwise (ie. 2 or more unique names were user names) check frequency only for those
                    if number_of_unique_names_that_are_user_names > 1:
                        dic_unique_protein_name = unique_names_that_are_user_names  # change the dictionary to use for calculating freqs

                    # calculate frequencies
                    for ext_code in self.dic_protein_naming:
                        # self.dic_protein_naming[ext_code] is the unique code assigned to ext_code

                        if dic_unique_protein_name.has_key(self.dic_protein_naming[ext_code]):
                            dic_unique_protein_name[self.dic_protein_naming[ext_code]] += 1


                    # END OF for ext_code in self.dic_protein_naming:

                    # find the name with the highest frequency
                    max_freq= 0
                    protein_highest_freq= None
                    for name in dic_unique_protein_name:
                        if dic_unique_protein_name[name] > max_freq:
                            max_freq = dic_unique_protein_name[name]
                            protein_highest_freq = name
                    # END OF for name in dic_unique_protein_name:
                # END OF else: (if) DECIDING WHICH OF THE UNIQUE NAMES WILL BE USED

                # set all proteins associated to any of the unique codes in dic_unique_protein_name, to the highest frequency name found
                for name_from_complete_list in self.dic_protein_naming:
                    # self.dic_protein_naming[name_from_complete_list] is the unique name formerly assigned to name_from_complete_list

                    if dic_unique_protein_name.has_key( self.dic_protein_naming[name_from_complete_list] ):
                        self.dic_protein_naming[name_from_complete_list]= protein_highest_freq
                # END OF for name_from_complete_list in self.dic_protein_naming:

                
                # fix the unique protein name for all ext codes of this protein
                for protein_name in list_protein_names:
                    self.dic_protein_naming[protein_name] =  protein_highest_freq

            
            elif number_of_unique_protein_names == 1:
                # 2.3.2
                # there is just one name to be used: set all the ext codes to use this name

                # get the only pair (key, value) and fix the unique protein name for this ext code
                unique_name = dic_unique_protein_name.popitem()[0] 
                for protein_name in list_protein_names:
                    self.dic_protein_naming[protein_name] =  unique_name

            elif number_of_unique_protein_names == 0:
                # 2.3.3
                # in case no code was set already as unique code for any of the names of this protein , get a new single code and set it for the other names

                user_name_coincidences = {}
                # if any of the names given to this protein is a user name, use it. Otherwise, get one from the database using answer_mode 'single'
                for protein_name in list_protein_names:
                    if user_protein_names.has_key(protein_name):
                        user_name_coincidences[protein_name] = None

                number_of_coincidences = len(user_name_coincidences)

                if number_of_coincidences == 0 or number_of_coincidences > 1:
                    # if there are no coincidences (none of the protein names is a user name) or there are more than 1 (more than one of the names of the
                    # proteins is a user name), use answer_mode 'single' to retrieve which would be the preferred name to use according to the database
                    single_name = self.piana_access.get_list_protein_external_codes(proteinPiana= proteinPiana,
                                                                                    protein_type_name= protein_type_name,
                                                                                    alternative_type_names= list_alternative_type_names,
                                                                                    answer_mode="single")
                    if number_of_coincidences > 1:
                        # if there are 2 or more coincidences, choose which one to use:

                        if not isinstance(single_name, int):     name_to_use = single_name.upper()
                        else:                                    name_to_use = single_name
                        
                        if user_name_coincidences.has_key(name_to_use):
                            #   - if one of those coincidences corresponds to the db name, take that one
                            unique_name = name_to_use
                        else:
                            #   - if none of the coincidences corresponds to the db name, take a random one
                            #     (the user name has preference over the db preference over names)
                            unique_name = user_name_coincidences.popitem()[0]
                    else:
                        # no coincidences--> take the database single name
                        if not isinstance(single_name, int):     unique_name = single_name.upper()
                        else:                                    unique_name = single_name

                # END OF if coincidences == 0 or coincidences >1:
                
                else:
                    # only one of the protein names is also a user name: use it
                    unique_name = user_name_coincidences.popitem()[0]
                # END OF else: (if coincidences == 0 or coincidences >1:)

                    
                # fix the unique protein name for all ext codes of this protein
                for protein_name in list_protein_names:
                    self.dic_protein_naming[protein_name] =  unique_name
            
        # END OF for node in self.get_node_object_list(): (FIXING WHICH NAME WILL BE USED FOR EACH PROTEIN)

    def _fill_expression_dics(self, file_over_expressed= None, file_infra_expressed= None, expression_protein_type= None, tax_id_value=0):
        """
        fills self dictionaries that keep the proteinPianas that are over of infra expressed
        """

        if len(self.over_expressed_proteinPianas) != 0 or len(self.infra_expressed_proteinPianas) != 0 or (
                                                           file_over_expressed is None and file_infra_expressed is None):
            # do not do anything if the dictionaries are already filled or if no files were passed as argument
            return 0
          
        if expression_protein_type is None:
            raise ValueError("You must give a protein code type for your expression files")  

        self.over_expressed_proteinPianas = {}  
        self.infra_expressed_proteinPianas = {} 

        protein_column_type = utilities.get_code_column(expression_protein_type)
        

        if file_over_expressed is not None:
            over_fd = file(file_over_expressed, "r")
            list_proteins = utilities.return_proteins_from_file(file_object= over_fd, proteins_type=expression_protein_type)

            for protein_name in list_proteins:

                for proteinPiana in self.piana_access.get_list_protein_piana(proteinCode_value= protein_name,
                                                                             proteinCodeType_value= protein_column_type,
                                                                             tax_id_value= tax_id_value,
                                                                             source_db_info="no"):
                    self.over_expressed_proteinPianas[proteinPiana] = None
                    
            over_fd.close()
        # END OF if file_over_expressed is not None:

        if file_infra_expressed is not None:
            infra_fd = file(file_infra_expressed, "r")
            list_proteins = utilities.return_proteins_from_file(file_object= infra_fd, proteins_type=expression_protein_type)

            for protein_name in list_proteins:

                for proteinPiana in self.piana_access.get_list_protein_piana(proteinCode_value= protein_name,
                                                                             proteinCodeType_value= protein_column_type,
                                                                             tax_id_value= tax_id_value,
                                                                             source_db_info="no"):
                    self.infra_expressed_proteinPianas[proteinPiana] = None
                    
            infra_fd.close()
        # END OF if file_infra_expressed is not None:


    # ------------------------------------------
    #  PianaGraph Methods: Output methods
    #------------------------------------------

    def output_nodes_molecular_weights(self):

        string_molecular_weights = "Molecular weights for proteins in the network"

        node_molecular_weights = self.get_nodes_molecular_weights()

        for node_molecular_weight_key in node_molecular_weights.keys():
            
            string_molecular_weights += "Molecular weight for node %s is %s\n" %(node_molecular_weight_key,
                                                                                 node_molecular_weights[node_molecular_weight_key])
        # END OF for node_molecular_weight_key in node_molecular_weights.keys():
        
        return string_molecular_weights

    def output_nodes_isoelectric_points(self):

        string_isoelectric_points = "Isoelectric points for proteins in the network"

        node_isoelectric_points = self.get_nodes_isoelectric_points()

        for node_isoelectric_point_key in node_isoelectric_points.keys():
            
            string_isoelectric_points += "Isoelectric point for node %s is %s\n" %(node_isoelectric_point_key,
                                                                                   node_isoelectric_points[node_isoelectric_point_key])
        # END OF for node_isoelectric_point_key in node_isoelectric_points.keys():
        
        return string_isoelectric_points


    def _passes_species_test(self, required_protein_tax_id, list_protein_taxonomies ):
        """
        internal function used to check if a given species "required_protein_tax_id" is in a list of taxonomies "list_protein_taxonomies"

        "required_protein_tax_id" is the taxonomy id we require the protein to be (eg. human, yeast, ...)
            --> valid tax id values are 0 (do not take into account the species) and those taxonomy ids provided by ncbi taxonomy  

        "list_protein_taxonomies" is the list of current protein taxonomies
        
        returns 0 if species test is not passed. 1 otherwise
        """
        # TO DO!!! This can be made faster using dics
        passes_species_test =0
        
        if required_protein_tax_id == 0:
            passes_species_test = 1
        else:
            if required_protein_tax_id in list_protein_taxonomies:
                passes_species_test = 1
        # END OF else: (if required_protein_tax_id == 0:)
        
        return passes_species_test

    def _get_unified_neighbours(self, neighbours_proteinPianas, protein_type_name= None, list_alternative_type_names=[]):
        """
        given a list of proteinPianas "neighbours_proteinPianas", returns a list with their unified names
        """
        dic_neighbours = {}
        for neighbour_proteinPiana in neighbours_proteinPianas:
            # get the unified name for each neighbour and added it to the dictionary
            dic_neighbours[self.get_unified_name( proteinPiana= neighbour_proteinPiana,
                                                   protein_type_name=protein_type_name, list_alternative_type_names=list_alternative_type_names)] = None

        return dic_neighbours.keys()

    
    def _create_ext_code_connecting(self, protein_type_name= None, list_alternative_type_names=[], connecting_proteinPianas= {}):
        """
        using the unified nodes and names, returns a dictionary {protein:[root_protein, root_protein, ...], protein: [], ...]
        using protein_type_name as type of protein code


        connecting_proteinPianas is the dictionary returned by Graph.get_connecting_nodes_dic()


        it is mandatory that _create_unified_network has been called before calling this method
        """
        number_of_nodes_in_current_graph = len(self.get_node_ids_list())
        number_of_edges_in_current_graph = len(self.get_edge_ids_list())
        
        if len(self.ext_code_connecting) > 0 and number_of_nodes_in_current_graph == self.number_of_unified_nodes and (
            number_of_edges_in_current_graph == self.number_of_unified_edges):
            # do not create the dictionary if it was already done (unless there has been a change in the number of nodes/edges in the graph)
            
            # TO DO!!! This is not exactly correct: if there has been a change in the graph, but the number of
            #          nodes and edges is still the same (eg. an edge was added and another edge was removed), this 'if' will
            #          consider that dics self.*_unified_*_correspondence do not need recalculating, which is false...
            #          This is not very likely to happen, but it would be nice to think of a way that checks in a perfect manner
            #          whether the dics have to be recalculated or not: how do I create a unique key of the graph that was used
            #          to fill the dics? A string with all node ids and edge ids? Ummm.... too complicated and costly...
            return 0
        
        new_dictionary = {}
        
        for connect_proteinPiana in connecting_proteinPianas:



            # get the ext_code that is going to be used for this proteinPiana
            node_ext_code = self.get_unified_name( proteinPiana= connect_proteinPiana,
                                                   protein_type_name=protein_type_name, list_alternative_type_names=list_alternative_type_names)


            if not new_dictionary.has_key(node_ext_code):
                new_dictionary[node_ext_code] = []

            for root_proteinPiana in connecting_proteinPianas[connect_proteinPiana]:

                root_ext_code = self.get_unified_name( proteinPiana= root_proteinPiana,
                                                       protein_type_name=protein_type_name, list_alternative_type_names=list_alternative_type_names)

                if root_ext_code not in new_dictionary[node_ext_code]:
                    new_dictionary[node_ext_code].append(root_ext_code)
                    
            # END OF for root_proteinPiana in connecting_proteinPianas[connect_proteinPiana]:
            
        # END OF for connect_proteinPiana in connecting_proteinPianas:

        # Now, remove those instances where the connecting node is only connected to one proteinPiana
        #   (this occurs when a node is connected to two nodes with different proteinPianas (and therefore, identified as connecting node)
        #    but those proteinPianas all correspond to the same ext_code... therefore, it is not a connecting node anymore)
        for connecting_ext_code in new_dictionary:
            if len(new_dictionary[connecting_ext_code]) > 1:
                self.ext_code_connecting[connecting_ext_code] = new_dictionary[connecting_ext_code]

                            
    def _passes_connecting_mode_test(self, id_a, is_root_a, id_b, is_root_b, connecting_prots_dic ):
        """
        internal function used to check is a protein passes the print mode='connecting' test

        this means interactions will be printed only in these cases:
           - interactions between root nodes
           - interaction between a root node and a node that connects more than 1 root nodes


        the only restriction is that id_a, id_b and connecting_prots_dic keys must use the same type of protein code

        returns 0 if connecting mode test is passed. 1 otherwise
        """
        if is_root_a and is_root_b:
            # both proteins are root: print the interaction
            result_test = 1
        
        elif not (is_root_a or is_root_b):
            # none of the proteins is root: do not print
            result_test = 0
        
        elif self.ext_code_connecting.has_key(id_a) and is_root_b:
            result_test = 1
            
        elif self.ext_code_connecting.has_key(id_b) and is_root_a:
            result_test = 1
            
        elif self.ext_code_connecting.has_key(id_a) and self.ext_code_connecting.has_key(id_b):
            result_test = 1

        else:
            # this is an interaction between a root protein and some other protein (that is only connected to this root protein)
            # do not print it
            result_test = 0

         
        return result_test
        
    def _passes_root_mode_test(self, is_root_a, is_root_b, print_mode=None):
        """
        
        
        internal function used to check is a protein passes the print mode test

        if print_mode is all: all proteins pass it: return 1
        if print_mode is "all_root": one of the proteins must be root
        if print_mode is "only_root": both proteins must be root
        if print_mode is "connecting": error!!! This method should never be called in connecting mode: use instead _passes_connecting_mode_test
        
        """

        if print_mode == "all":
            return 1
        
        elif print_mode == "all_root":
            if is_root_a == 1 or is_root_b == 1:
                return 1
            else:
                return 0
            
        elif print_mode == "only_root":
            if is_root_a ==1 and is_root_b == 1:
                return 1
            else:
                return 0
            
        elif print_mode == "connecting":
            if (is_root_a or is_linker_a ) and (is_root_b == 1 or is_linker_b):
                return 1
            else:
                return 0

        else:
            raise ValueError("Incorrect print mode (%s) given. Valid values are: 'all', 'all_root', 'only_root' and 'connecting'" %(print_mode))

    def output_protein_information(self, proteinPiana_value, protein_type_name, list_alternative_type_names= None, output_target= sys.stdout,
                                   output_mode= "compact", format_mode="txt", filter_mode="all", list_keywords= [],
                                   tax_id_value= 0, user_protein_names = {}, list_connects_nodes= None, 
                                    list_proteinPianas= None, key_list= -1, 
                                   file_over_expressed= None, file_infra_expressed= None, expression_protein_type= None):
        """
        writes information about protein "proteinPiana_value" to "output_target", following format "output_mode"


        "output_target" is a file object (sys.stdout to print to screen)


        "protein_type_name" is the easy-to-remember type name that will be used for printing the protein
          -> valid protein_type_name are those listed in PianaGlobals.valid_protein_types.keys()
      
        "filter_mode" not working at the minute: it will be used for hidden, unhidden, all

        "output_mode" can be:
           - 'compact': all relevant information in one line
           - 'extended': all information in text paragraphs
            
        "format_mode" sets the type of format that will be used for output
               - 'txt' will print flat text
               - 'html' will print html 

        "list_connects_nodes" is a list of root nodes that are connected through protein 'proteinPiana_value'. It comes from
        one of the dictionary rows of self.get_connecting_nodes_dic(). If it is set to None, the method considers that the user does not
        want to know information about connected root nodes

        "list_alternative_type_names" can be used to set a list of alternative types in case no protein_type_name code is found
         --> user must provide pairs a list of valid easy-to-remember type names
             list_alternative_types can for example look like this: ["gi", "uniacc", "md5"]
             I suggest always placing md5 at the end of alternative types, so you never get a None in the output

        "tax_id_value" sets the species of the proteins being printed (can be used for eliminating ambiguities between codes across species)
           --> valid tax ids are 0 (do not take into account the species) and those taxonomy ids provided by ncbi taxonomy

        "user_protein_names" can be used to fix a set of protein names that have preference over other names when a protein has several names
           (eg. if proteinPiana has gene names CXCL1 and MGSA, and this dictionary has CXCL1, it is guaranteed that CXCL1 will be used for output)
           --> user_protein_names is a dictionary that follows the structure: { protein_name:None, protein_name:None, ...}
           
        "list_keywords" sets a list of keywords that will be used to highlight important proteins in the network
           -> when printing protein information, it prints in the protein information line user_keyword=word_found user_keyword=word_found ...
           -> currently, it is not used when printing the interactions in a table
           -> If you are interested in highlighting proteins related to cancer, list_keywords could be: ['cancer', 'onco', 'carcinoma', 'tumor']

        "list_proteinPianas" is normally None, but can be set to a list of proteinPianas for those cases in which we are not printing proteins of the network
        but just proteins that the user asked information about. In case list_proteins is not None, then the names are unified for the proteins in the list
        instead of doing it for proteins of the network. Look at PianaApi.print_list_proteins to better understand how this works

        "key_list" is a unique key for this list of proteins, used to know when the unification of names has to be performed again
           -> if you are making calls for different lists of proteins, just set the key to consecutive numbers so this method can
              differentiate between the lists
           -> a user key_list  cannot be -1, because -1 is used as default to indicate that no key was used

        "file_over_expressed" and "file_infra_expressed" are names of files that contain proteins that are either over expressed or infra expressed

        "expression_protein_type" is the type of protein code used in files file_over_expressed and file_infra_expressed

        """
        

        # make sure self.dic_protein_naming is filled
        self._fill_dic_protein_naming(protein_type_name=protein_type_name, list_alternative_type_names=list_alternative_type_names,
                                      list_proteinPianas= list_proteinPianas, key_list=key_list, user_protein_names= user_protein_names)

        
        self._fill_expression_dics(file_over_expressed= file_over_expressed, file_infra_expressed= file_infra_expressed,
                                   expression_protein_type= expression_protein_type, tax_id_value = tax_id_value)
        
        # make sure the unified nodes and edges are created
        self._create_unified_network(list_proteinPianas= list_proteinPianas,
                                     protein_type_name=protein_type_name, list_alternative_type_names=list_alternative_type_names, user_keywords=list_keywords )


        protein_user_type = self.get_unified_name( proteinPiana= proteinPiana_value,
                                                   protein_type_name=protein_type_name, list_alternative_type_names=list_alternative_type_names)

        
        if self.already_printed_proteins.has_key(protein_user_type):
            # do not print information for a protein that has already been printed (the unified node contained all the information associated to this ext_code
            return 0
        
        self.already_printed_proteins[protein_user_type] = None

        # Important!
        # get the unified node associated to this ext_code (the unique node for this ext_code, created by merging info from all proteinPianas with that ext_code 
        node = self.ext_code_unified_node_correspondence[protein_user_type]


        dic_associated_proteinPianas = node.get_node_attribute_object().get_associated_proteinPianas()
        
        # getting all external codes of all types for proteinPianas associated to this unified node
        dic_ext_codes = {}
        for proteinPiana_value in dic_associated_proteinPianas:
            list_protein_ext_code = self.piana_access.get_list_protein_external_codes(proteinPiana= proteinPiana_value, protein_type_name= "all")
            for temp_ext_code in list_protein_ext_code:
                dic_ext_codes[temp_ext_code] = None
        list_protein_ext_code = dic_ext_codes.keys()

        # get taxonomies for this unified node
        list_protein_taxonomies = node.get_node_attribute_object().get_dic_taxonomies().keys()

        # check if the protein being processed respects the species restriction
        passes_species_test = self._passes_species_test(required_protein_tax_id= tax_id_value,
                                                        list_protein_taxonomies= list_protein_taxonomies)

        if passes_species_test == 1:
            # passes the species restriction: continue processing information for the protein
            list_protein_description = node.get_node_attribute_object().get_descriptions()
            list_protein_function = node.get_node_attribute_object().get_functions()
            list_protein_subcellularLocation = node.get_node_attribute_object().get_dic_cell_location().keys()
            list_protein_user_keywords = node.get_node_attribute_object().get_keywords_appearing()
            list_fitness_scores = node.get_node_attribute_object().get_fitness_scores()
            if list_fitness_scores: cell_fitness = PianaGlobals.positive_fitness_value[format_mode]
            else:                   cell_fitness = PianaGlobals.negative_fitness_value[format_mode]
            protein_expression = node.get_node_attribute_object().get_expression()

            if list_protein_user_keywords:
                # if there were user keywords in this protein, write it underlined in the html table
                html_prefix = "<u><font color=CC0000>"
                html_sufix = "</font></u>"
            else:
                html_prefix = ""
                html_sufix = ""

            if cell_fitness == "yes":
                fitness_html_prefix="<font color=0033FF>"
                fitness_html_sufix="</font>"
            else:
                fitness_html_prefix=""
                fitness_html_sufix=""

            if protein_expression == "over_expressed":
                expression_html_prefix="<font color=FF33FF>"
                expression_html_sufix="</font>"
                
            elif protein_expression == "infra_expressed":
                expression_html_prefix="<font color=33FF00>"
                expression_html_sufix="</font>"
            else:
                expression_html_prefix=""
                expression_html_sufix=""



            if output_mode == "extended":

                
                
                output_target.write("========================================================================%s" %PianaGlobals.line_separators[format_mode] )
                output_target.write("Information for protein %s (proteinPianas = %s)%s%s" %(protein_user_type,
                                                                                           dic_associated_proteinPianas.keys(),
                                                                                           PianaGlobals.line_separators[format_mode],
                                                                                           PianaGlobals.line_separators[format_mode] ))
                if format_mode == "txt":
                    output_target.write("(other equivalent codes: %s)%s" %(list_protein_ext_code, PianaGlobals.line_separators[format_mode]) )
                elif format_mode == "html":
                    output_target.write("equivalent codes: %s%s" %(list_protein_ext_code, PianaGlobals.line_separators[format_mode] ))
            # END OF if output_mode == "extended":

            # in case a list of connecting nodes was passed , print information for the protein as well as which are the root nodes that it connects
            # list_connects_nodes can be None (no connecting node info desired) or a list (it could be an empty list. (ie. we wanted information but list is empty))
            if list_connects_nodes is not None:

                dic_connected_roots_extcode = {}
                dic_all_dbs = {}

                for connected_root_proteinPiana in list_connects_nodes:

                    # get the ext_code that is going to be used for this proteinPiana
                    connected_root_ext_code = self.get_unified_name( proteinPiana= connected_root_proteinPiana,
                                                                     protein_type_name=protein_type_name, list_alternative_type_names=list_alternative_type_names)

                    # get the unified node for the connected root ext_code
                    connected_root_node = self.ext_code_unified_node_correspondence[connected_root_ext_code]

                    dic_connected_roots_extcode[connected_root_ext_code] = None

                    # Important!
                    # get the unified edge for the edge that joins the root node and the connecting node
                    temp_connecting_edge= self.get_edge(identifier1= proteinPiana_value, identifier2= connected_root_proteinPiana)
                    edge_key = self._get_edge_key(this_edge= temp_connecting_edge, protein_type_name= protein_type_name,
                                                  list_alternative_type_names=list_alternative_type_names)

                    connecting_edge = self.edge_key_unified_edge_correspondence[edge_key[0]]
                    
                    # getting sourceDB of the unified edge, to print the source dbs of the interaction between the linker and the root
                    # TO DO!!! Right now, I am collecting the source dbs together, not specifying for which of the roots each db had the interaction
                    list_of_dbs = connecting_edge.get_edge_attribute_object().get_list_source_db()

                    for db_name in list_of_dbs:
                        dic_all_dbs[db_name] = None
                # END OF for connects_node in list_connects_nodes:
                
                number_of_connected_nodes = len(dic_connected_roots_extcode)
                if number_of_connected_nodes > 1:
                    if output_mode == "compact":
                        if format_mode == "txt":
                            output_target.write("%s%s connects %s root_proteins%s%s%sdatabases %s%sdescription=%s%sfunction=%s%sexpression=%s%sfitness=%s%s" %(
                                PianaGlobals.line_separators[format_mode],
                                protein_user_type,                             number_of_connected_nodes,
                                PianaGlobals.tab_separators[format_mode],
                                dic_connected_roots_extcode.keys(),                       PianaGlobals.tab_separators[format_mode],
                                dic_all_dbs.keys(),                                       PianaGlobals.tab_separators[format_mode],
                                list_protein_description,                                 PianaGlobals.tab_separators[format_mode],
                                list_protein_function,                                    PianaGlobals.tab_separators[format_mode],
                                protein_expression,                                       PianaGlobals.tab_separators[format_mode],
                                cell_fitness,                                             PianaGlobals.line_separators[format_mode]) )
                            
                        elif format_mode == "html":
                            output_target.write("\n<tr><td>%s%s%s</td><td>%s</td><td>%s</td><td>expression=%s%s%s fitness=%s%s%s</td><td>%s</td></tr>\n" %(
                                html_prefix,
                                protein_user_type,
                                html_sufix,
                                list_protein_description,
                                list_protein_function,
                                expression_html_prefix, protein_expression, expression_html_sufix,
                                fitness_html_prefix, cell_fitness, fitness_html_sufix,
                                dic_connected_roots_extcode.keys() ))
                            
                        # END OF elif format_mode == "html":
                    # END OF if output_mode == "compact":
                    elif output_mode == "extended":

                        if format_mode == "txt":
                            # this is printing the same as in 'compact' output_mode, but I prefer to keep it separate in case in the future
                            # we want to add some extra information in output_mode 'all'
                            output_target.write("%s%s connects %s root_proteins%s%s%sdatabases %s%sdescription=%s%sfunction=%s%sexpression=%s%sfitness=%s%s" %(
                                PianaGlobals.line_separators[format_mode],
                                protein_user_type,                             number_of_connected_nodes,
                                PianaGlobals.tab_separators[format_mode],
                                dic_connected_roots_extcode.keys(),                       PianaGlobals.tab_separators[format_mode],
                                dic_all_dbs.keys(),                                       PianaGlobals.tab_separators[format_mode],
                                list_protein_description,                                 PianaGlobals.tab_separators[format_mode],
                                list_protein_function,                                    PianaGlobals.tab_separators[format_mode],
                                protein_expression,                                       PianaGlobals.tab_separators[format_mode],
                                cell_fitness,                                             PianaGlobals.line_separators[format_mode]) )

                        elif format_mode == "html":
                            output_target.write("%sLinker %s%s%s -- description %s -- function %s (expression=%s%s%s fitness=%s%s%s) connects root proteins:%s%s\n" %(
                                PianaGlobals.line_separators[format_mode],
                                html_prefix,
                                protein_user_type,
                                html_sufix,
                                list_protein_description,
                                list_protein_function,
                                expression_html_prefix, protein_expression, expression_html_sufix, 
                                fitness_html_prefix, cell_fitness, fitness_html_sufix,
                                dic_connected_roots_extcode.keys(),
                                PianaGlobals.line_separators[format_mode] ))

                    # END OF elif output_mode == "extended":
                # END OF if number_of_connected_nodes > 1:
            # END OF if list_connects_nodes is not None:

            else:
                # there is no list of connecting nodes (i.e info requested about all proteins, not just the linker proteins)

                if output_mode == "compact":
                    if format_mode == "txt":
                        # output in compact mode is (txt) external_code<TAB>protein description<TAB>protein function<TAB>alternative external codes<NEW LINE>
                        output_target.write("%s%s%s%s%s%sroot=%s%sexpression=%s%sfitness=%s" %(protein_user_type,
                                                                                      PianaGlobals.tab_separators[format_mode],
                                                                                      list_protein_description,
                                                                                      PianaGlobals.tab_separators[format_mode],
                                                                                      list_protein_function,
                                                                                      PianaGlobals.tab_separators[format_mode],
                                                                                      node.is_root(),
                                                                                      PianaGlobals.tab_separators[format_mode],
                                                                                      protein_expression,
                                                                                      PianaGlobals.tab_separators[format_mode],
                                                                                      cell_fitness))

                        for user_keyword in list_protein_user_keywords:
                            output_target.write("%suser_keyword=%s" %(PianaGlobals.tab_separators[format_mode], user_keyword))

                        for protein_ext_code in list_protein_ext_code:
                            type = protein_ext_code.split(":")[0]
                            if type != PianaGlobals.proteinSequence_col and type != PianaGlobals.proteinMD5_col:
                                # do not print sequence and md5
                                output_target.write("%s%s" %(PianaGlobals.tab_separators[format_mode], protein_ext_code ))

                            
                        output_target.write("%s" %PianaGlobals.line_separators[format_mode])

                        
                    elif format_mode == "html":

                            
                        output_target.write("<tr><td>%s%s%s</td><td>%s</td><td>%s</td><td>is_root=%s expression=%s%s%s fitness=%s%s%s</td></tr>\n" %(
                            html_prefix,
                            protein_user_type,
                            html_sufix,
                            list_protein_description,
                            list_protein_function,
                            node.is_root(),
                            expression_html_prefix, protein_expression, expression_html_sufix,
                            fitness_html_prefix, cell_fitness, fitness_html_sufix ))


                elif output_mode == "extended":

                    for protein_taxonomy in list_protein_taxonomies:
                        output_target.write("%sprotein %s taxonomy: %s%s" %(PianaGlobals.line_separators[format_mode],
                                                                            protein_user_type, protein_taxonomy,
                                                                            PianaGlobals.line_separators[format_mode]) )

                    for description in list_protein_description:
                        output_target.write("%sprotein %s description: %s%s" %(PianaGlobals.line_separators[format_mode],
                                                                               protein_user_type, description,
                                                                               PianaGlobals.line_separators[format_mode]) )

                    for function in list_protein_function:
                        output_target.write("%sprotein %s function: %s%s" %(PianaGlobals.line_separators[format_mode],
                                                                            protein_user_type, function,
                                                                            PianaGlobals.line_separators[format_mode]) )

                    for subcellularLocation in list_protein_subcellularLocation:
                        output_target.write("%sprotein %s subCellularLocation: %s%s" %(PianaGlobals.line_separators[format_mode],
                                                                                       protein_user_type, subcellularLocation,
                                                                                       PianaGlobals.line_separators[format_mode]) )
                        
                    for user_keyword in list_protein_user_keywords:
                        output_target.write("%sprotein %s user keyword: %s%s" %(PianaGlobals.line_separators[format_mode],
                                                                                protein_user_type, user_keyword, PianaGlobals.line_separators[format_mode]) )

                        
                    output_target.write("========================================================================%s" %PianaGlobals.line_separators[format_mode])
                # END OF elif output_mode == "extended": 
            # END OF else: (if list_connects_nodes is not None:)
        # END OF if passes_species_test == 1:

    def _print_table_headers_protein_information(self, output_target=None, connecting_nodes_dic= None, user_keywords=[]):
        """
        prints the table headers for protein information to output_target
        """
        output_target.write("<html><body><br><br>\n")
        output_target.write("Keywords used to highlight proteins: %s<br><br>\n" %user_keywords)
        output_target.write("<table border=1>\n")

        if connecting_nodes_dic is None:
            output_target.write("<tr><td><b>protein</b></td><td><b>description</b></td><td><b>function</b></td><td><b>other</b></td></tr>\n")
        else:
            output_target.write("<tr><td><b>linker protein</b></td><td><b>description</b></td><td><b>function</b></td><td><b>other</b></td><td><b>roots connected</b></td></tr>\n")

            
    def _print_table_closure_protein_information(self, output_target=None):
        """
        prints the table headers for protein information to output_target
        """
        output_target.write("\n</table></body></html>\n")


                
    def output_all_proteins_information(self,  protein_type_name, output_target, output_mode= "compact", format_mode= "txt",
                                        filter_mode="all", list_alternative_type_names= [], tax_id_value= 0, list_keywords= [],
                                        user_protein_names = {}, file_over_expressed= None, file_infra_expressed= None, expression_protein_type= None):
        """
        Writes to "output_target" the information of every protein (node) of the network

        Each node returned by get_node_object_list() is handed to output_protein_information(), which does the printing.
        The arguments are passed on untouched: their description is the same as in method output_protein_information()

        When "output_mode" is 'compact' and "format_mode" is 'html', the table headers are written before the first
        protein and the table closure after the last one
        """
        # NOTE(review): the [] and {} defaults are objects shared between calls. This method only forwards them,
        #               confirm that the methods receiving them never modify them

        # the html table wrapper is only written for compact html output
        wrap_in_html_table = (output_mode == "compact" and format_mode == "html")

        if wrap_in_html_table:
            self._print_table_headers_protein_information(output_target= output_target, connecting_nodes_dic=None, user_keywords=list_keywords)

        # output_protein_information() checks this dictionary to avoid printing information for the same protein many times
        self.already_printed_proteins = {}

        # arguments that are the same for all the proteins being printed
        shared_arguments = dict(protein_type_name= protein_type_name,
                                output_target= output_target,
                                output_mode= output_mode,
                                format_mode= format_mode,
                                filter_mode= filter_mode,
                                list_alternative_type_names= list_alternative_type_names,
                                list_keywords= list_keywords,
                                tax_id_value= tax_id_value,
                                list_connects_nodes= None,
                                list_proteinPianas= None,
                                key_list= -1,
                                user_protein_names= user_protein_names,
                                file_over_expressed= file_over_expressed,
                                file_infra_expressed= file_infra_expressed,
                                expression_protein_type= expression_protein_type)

        for network_node in self.get_node_object_list():
            self.output_protein_information(proteinPiana_value= network_node.get_node_id(), **shared_arguments)

        if wrap_in_html_table:
            self._print_table_closure_protein_information(output_target=output_target)

            
    def output_root_proteins_information(self, protein_type_name,  output_target, output_mode= "compact", format_mode= "txt", filter_mode="all",
                                         list_alternative_type_names= [], tax_id_value= 0, list_keywords= [],
                                         user_protein_names = {}, file_over_expressed= None, file_infra_expressed= None, expression_protein_type= None):
        """
        Writes to "output_target" the information of the root proteins of the network

        Each node id returned by get_root_node_ids() is handed to output_protein_information(), which does the printing.
        The arguments are passed on untouched: their description is the same as in method output_protein_information()

        When "output_mode" is 'compact' and "format_mode" is 'html', the table headers are written before the first
        protein and the table closure after the last one
        """
        # NOTE(review): the [] and {} defaults are objects shared between calls. This method only forwards them,
        #               confirm that the methods receiving them never modify them

        # the html table wrapper is only written for compact html output
        wrap_in_html_table = (output_mode == "compact" and format_mode == "html")

        if wrap_in_html_table:
            self._print_table_headers_protein_information(output_target= output_target, connecting_nodes_dic=None, user_keywords=list_keywords)

        # output_protein_information() checks this dictionary to avoid printing the same protein many times
        self.already_printed_proteins = {}

        # arguments that are the same for all the root proteins being printed
        shared_arguments = dict(protein_type_name= protein_type_name,
                                output_target= output_target,
                                output_mode= output_mode,
                                format_mode= format_mode,
                                filter_mode= filter_mode,
                                list_alternative_type_names= list_alternative_type_names,
                                list_keywords= list_keywords,
                                tax_id_value= tax_id_value,
                                list_connects_nodes= None,
                                list_proteinPianas= None,
                                key_list= -1,
                                user_protein_names= user_protein_names,
                                file_over_expressed= file_over_expressed,
                                file_infra_expressed= file_infra_expressed,
                                expression_protein_type= expression_protein_type)

        for root_id in self.get_root_node_ids():
            self.output_protein_information(proteinPiana_value= root_id, **shared_arguments)

        if wrap_in_html_table:
            self._print_table_closure_protein_information(output_target=output_target)



    def output_connecting_proteins_information(self, protein_type_name,  output_target, output_mode= "compact", format_mode= "txt", filter_mode="all",
                                               list_alternative_type_names= [], tax_id_value = 0, list_keywords= [],
                                               user_protein_names = {}, file_over_expressed= None, file_infra_expressed= None, expression_protein_type= None):
        """
        Writes to file object "output_target" the information of the linker (connecting) proteins of the network

        Each key of the dictionary returned by get_connecting_nodes_dic() is handed to output_protein_information(),
        together with the value stored for that key (passed as "list_connects_nodes").
        The rest of arguments are passed on untouched: their description is the same as in method output_protein_information()

        When "output_mode" is 'compact' and "format_mode" is 'html', the table headers (with the extra column for the
        connected roots) are written before the first protein and the table closure after the last one
        """
        # NOTE(review): the [] and {} defaults are objects shared between calls. This method only forwards them,
        #               confirm that the methods receiving them never modify them

        # dictionary with proteinPiana:[root_nodes]
        linkers_and_their_roots = self.get_connecting_nodes_dic()

        # the html table wrapper is only written for compact html output
        wrap_in_html_table = (output_mode == "compact" and format_mode == "html")

        if wrap_in_html_table:
            self._print_table_headers_protein_information(output_target= output_target, connecting_nodes_dic=linkers_and_their_roots,
                                                          user_keywords=list_keywords)

        # output_protein_information() checks this dictionary to avoid printing the same protein many times
        self.already_printed_proteins = {}

        # arguments that are the same for all the linker proteins being printed
        shared_arguments = dict(protein_type_name= protein_type_name,
                                list_alternative_type_names= list_alternative_type_names,
                                output_target= output_target,
                                output_mode= output_mode,
                                format_mode= format_mode,
                                filter_mode= filter_mode,
                                list_keywords= list_keywords,
                                tax_id_value= tax_id_value,
                                user_protein_names= user_protein_names,
                                list_proteinPianas= None,
                                key_list= -1,
                                file_over_expressed= file_over_expressed,
                                file_infra_expressed= file_infra_expressed,
                                expression_protein_type= expression_protein_type)

        for linker_id in linkers_and_their_roots:
            # the roots connected by this linker are the only argument (apart from its id) that changes between proteins
            self.output_protein_information(proteinPiana_value= linker_id,
                                            list_connects_nodes= linkers_and_their_roots[linker_id],
                                            **shared_arguments)

        if wrap_in_html_table:
            self._print_table_closure_protein_information(output_target=output_target)


    def output_list_proteins_information(self,  list_proteins, input_proteins_type, output_proteins_type_name, output_target, output_mode= "compact",
                                         format_mode= "txt", filter_mode="all", list_alternative_type_names= [], list_keywords= [], 
                                         tax_id_value= 0, key_list=-1, 
                                         file_over_expressed= None, file_infra_expressed= None, expression_protein_type= None):
        """
        Writes to file object "output_target" the information of all proteins in list "list_proteins" (codes of type "input_proteins_type")

        This method is used to print information about proteins that are not in a network!!!!
        (ie. it won't print information about interactions of the proteins)

        The codes in "list_proteins" are translated to proteinPianas through self.piana_access, and each proteinPiana found
        is handed to output_protein_information() using "output_proteins_type_name" as its protein_type_name

        The description of the rest of the arguments of this method is the same as in method output_protein_information()

        When "output_mode" is 'compact' and "format_mode" is 'html', the table headers are written before the first
        protein and the table closure after the last one
        """
        # NOTE(review): the [] defaults are objects shared between calls. This method only forwards them,
        #               confirm that the methods receiving them never modify them

        # the html table wrapper is only written for compact html output
        wrap_in_html_table = (output_mode == "compact" and format_mode == "html")

        if wrap_in_html_table:
            self._print_table_headers_protein_information(output_target= output_target, connecting_nodes_dic=None, user_keywords=list_keywords)

        # output_protein_information() checks this dictionary to avoid printing the same protein many times
        self.already_printed_proteins = {}

        # in output_list_* the user protein names are the same as those in the list: that is why they are not
        # an argument of this method and the dictionary is populated here
        names_given_by_user = {}

        # dictionary (used as a set) with all the proteinPianas associated to the proteins in the list
        proteinPianas_found = {}

        code_column = utilities.get_code_column(input_proteins_type)

        for protein_code in list_proteins:
            # will be used to fix preference of user names over the other possible names
            names_given_by_user[protein_code] = None

            for found_proteinPiana in self.piana_access.get_list_protein_piana(proteinCode_value= protein_code,
                                                                               proteinCodeType_value= code_column,
                                                                               tax_id_value= tax_id_value,
                                                                               source_db_info="no" ):
                proteinPianas_found[found_proteinPiana] = None
        # END OF for protein_code in list_proteins:

        # arguments that are the same for all the proteins being printed
        # -> list_proteinPianas is used to unify nodes and decide which names will be used for the proteins given by the user
        shared_arguments = dict(protein_type_name= output_proteins_type_name,
                                output_target= output_target,
                                output_mode= output_mode,
                                format_mode= format_mode,
                                filter_mode= filter_mode,
                                list_connects_nodes= None,
                                list_alternative_type_names= list_alternative_type_names,
                                list_keywords= list_keywords,
                                tax_id_value= tax_id_value,
                                list_proteinPianas= proteinPianas_found.keys(),
                                key_list= key_list,
                                user_protein_names= names_given_by_user,
                                file_over_expressed= file_over_expressed,
                                file_infra_expressed= file_infra_expressed,
                                expression_protein_type= expression_protein_type)

        for one_proteinPiana in proteinPianas_found:
            self.output_protein_information(proteinPiana_value= one_proteinPiana, **shared_arguments)
        # END OF for one_proteinPiana in proteinPianas_found:

        if wrap_in_html_table:
            self._print_table_closure_protein_information(output_target=output_target)


            
    def print_table_interaction(  output_target= None,
                                  format_mode= "txt",
                                  root_connectivity= None,
                                  extcode_start= None,
                                  num_neigh_start= None,
                                  node_type_start= None,
                                  expression_start= None,
                                  appears_fitness_start= None,
                                  list_protein_user_keywords_start= None,
                                  extcode_end= None,
                                  num_neigh_end= None,
                                  node_type_end= None,
                                  expression_end= None,
                                  appears_fitness_end= None,
                                  list_protein_user_keywords_end= None,
                                  same_loc= None,
                                  same_species= None,
                                  list_interaction_dbs= None,
                                  list_interaction_methods= None ):
        """

        Prints one interaction between "extcode_start" and "extcode_end" as one table row to "output_target"
          -> html table row if "format_mode" is 'html' or one line of tabulated text if "format_mode" is 'txt'

        "output_target" is a file object with writing rights

        "root_connectivity" is written in the 'connectivity' field of a 'txt' row (None and -1 are both written as None)

        "extcode_*", "num_neigh_*", "node_type_*", "expression_*" and "appears_fitness_*" describe the start
        and the end protein of the interaction and are written as given
          -> in 'html' mode, expressions 'over_expressed' and 'infra_expressed' and fitness 'yes' are coloured

        "list_protein_user_keywords_*": when not empty, the protein name is underlined and coloured in 'html' mode

        "same_loc" and "same_species" are written as given

        "list_interaction_dbs" is a list of the database names where this interaction appears (None is treated as an empty list)

        "list_interaction_methods" is a list of the methods of this interaction (None is treated as an empty list)

        in format_mode 'html', the table is simplified to fit to the screen (connectivity and number of neighbours are not written)

        raises ValueError if "format_mode" is neither 'txt' nor 'html'
 
        """
        # the default value (None) of both lists means "nothing to print": normalize it so the loops below do not fail
        if list_interaction_dbs is None:
            list_interaction_dbs = []
        if list_interaction_methods is None:
            list_interaction_methods = []

        if format_mode == "txt":
            # printing root connectivity of the protein that is not the root protein (-1 is written as None)
            if root_connectivity is not None and root_connectivity != -1:
                row_fields = ["connectivity=%s" %(root_connectivity,)]
            else:
                row_fields = ["connectivity=None"]

            # NOTE(review): fitness is written only when node_type_* was given (not when appears_fitness_* was given)
            #               this looks unintended, but it is kept as it was to preserve the current output: confirm
            if node_type_start is None:   fitness_start = None
            else:                         fitness_start = appears_fitness_start

            if node_type_end is None:     fitness_end = None
            else:                         fitness_end = appears_fitness_end

            # start protein: name, number of neighbours, root info, expression and fitness
            # TO DO!!! Number of neighbours information is not correct: it is not merging neighbours in the unified node...
            row_fields.append("\tprotein_1=%s\tneighbours_1=%s" %(extcode_start, num_neigh_start) )
            row_fields.append("\troot_1=%s" %(node_type_start,) )
            row_fields.append("\texpression_1=%s" %(expression_start,) )
            row_fields.append("\tfitness_1=%s" %(fitness_start,) )

            # end protein: name, number of neighbours, root info, expression and fitness
            # TO DO!!! Number of neighbours information is not correct: it is not merging neighbours in the unified node...
            row_fields.append("\tprotein_2=%s\tneighbours_2=%s" %(extcode_end, num_neigh_end) )
            row_fields.append("\troot_2=%s" %(node_type_end,) )
            row_fields.append("\texpression_2=%s" %(expression_end,) )
            row_fields.append("\tfitness_2=%s" %(fitness_end,) )

            # printing cellular localization and species information
            row_fields.append("\tlocation=%s\tspecies=%s" %(same_loc, same_species) )

            # printing source databases: one 'db' field per database
            for interaction_database in list_interaction_dbs:
                row_fields.append("\tdb=%s" %(interaction_database,) )

            # printing methods: one 'method' field per method
            for interaction_method in list_interaction_methods:
                row_fields.append("\tmethod=%s" %(interaction_method,) )

            output_target.write("".join(row_fields) + "\n")
        # END OF if format_mode == "txt":

        elif format_mode == "html":

            def _tags_if(condition, html_prefix, html_sufix):
                # returns the pair of html tags when "condition" holds and a pair of empty strings otherwise
                if condition:
                    return (html_prefix, html_sufix)
                return ("", "")

            def _expression_tags(expression):
                # returns the pair of html tags used to colour over expressed and infra expressed proteins
                if expression == "over_expressed":
                    return ("<font color=FF33FF>", "</font>")
                if expression == "infra_expressed":
                    return ("<font color=33FF00>", "</font>")
                return ("", "")

            # if there were user keywords in a protein, its name is written underlined in the html table
            (start_html_prefix, start_html_sufix) = _tags_if(list_protein_user_keywords_start, "<u><font color=CC0000>", "</font></u>")
            (end_html_prefix, end_html_sufix) = _tags_if(list_protein_user_keywords_end, "<u><font color=CC0000>", "</font></u>")

            (fitness_html_prefix_start, fitness_html_sufix_start) = _tags_if(appears_fitness_start == "yes", "<font color=0033FF>", "</font>")
            (fitness_html_prefix_end, fitness_html_sufix_end) = _tags_if(appears_fitness_end == "yes", "<font color=0033FF>", "</font>")

            (expression_html_prefix_start, expression_html_sufix_start) = _expression_tags(expression_start)
            (expression_html_prefix_end, expression_html_sufix_end) = _expression_tags(expression_end)

            row_cells = ["\n<tr>"]
            row_cells.append("<td>%s%s%s</td>" %(start_html_prefix, extcode_start, start_html_sufix) )
            row_cells.append("<td>is_root=%s expression=%s%s%s fitness=%s%s%s</td>" %(node_type_start,
                                                                                      expression_html_prefix_start, expression_start, expression_html_sufix_start,
                                                                                      fitness_html_prefix_start, appears_fitness_start, fitness_html_sufix_start) )
            row_cells.append("<td>%s%s%s</td>" %(end_html_prefix, extcode_end, end_html_sufix) )
            row_cells.append("<td>is_root=%s expression=%s%s%s fitness=%s%s%s</td>" %(node_type_end,
                                                                                      expression_html_prefix_end, expression_end, expression_html_sufix_end,
                                                                                      fitness_html_prefix_end, appears_fitness_end, fitness_html_sufix_end) )
            # both <center> tags must be closed: an unclosed one leaks the centering into the following cells
            row_cells.append("<td><center>%s</center></td><td><center>%s</center></td>" %(same_loc, same_species) )

            # printing source databases
            row_cells.append("<td>")
            for interaction_database in list_interaction_dbs:
                row_cells.append(" %s " %(interaction_database,) )
            row_cells.append("</td>")

            # printing detection method
            row_cells.append("<td>")
            for interaction_method in list_interaction_methods:
                row_cells.append(" %s " %(interaction_method,) )
            row_cells.append("</td>")

            row_cells.append("</tr>\n")
            output_target.write("".join(row_cells))
        # END OF elif format_mode == "html":

        else:
            raise ValueError("Incorrect format mode given for outputting table")

    print_table_interaction = staticmethod(print_table_interaction)   



    def print_dot_interaction(output_target= None,
                              format_mode = "dot", 
                              edge_object= None,
                              piana_access = None,
                              extcode_start= None,
                              extcode_end= None,
                              list_interaction_dbs= None ):
        """
        prints to "output_target" one interaction between "extcode_start" and "extcode_end" in format defined by "format_mode"

        "output_target" is a file object with write rights

        "format_mode" sets the format used for printing the network interaction (right now, the only valid format is 'dot')
           -> nothing is written for any other value

        "edge_object" is the GraphEdge object of this interaction
           -> it is only consulted when the interaction has no source database, to choose between the line styles
              'extended', 'propagated' and 'normal' (a missing edge_object gets the 'normal' line style)

        "piana_access" is the object that provides the colors and line styles, through its methods
           get_interaction_source_database_color and get_interaction_line_style

        "list_interaction_dbs" are the database names where this interaction appears
           -> None is treated as an empty list (ie. the interaction does not belong to any database)

        files written by this method can be used to create an image of the network using neato from package Graphviz:

           $> cat "output_of_this_method" | neato -Tgif -o output_file.gif
        
        """
        # the default value (None) means that no source database is known for this interaction
        if list_interaction_dbs is None:
            list_interaction_dbs = []

        # setting the edge color and the edge line style
        number_of_dbs = len(list_interaction_dbs)
        
        if number_of_dbs == 1:
            # if there is just one db with this interaction, print its colour
            interaction_style = piana_access.get_interaction_line_style("normal")
            interaction_color = piana_access.get_interaction_source_database_color(list_interaction_dbs[0])
            
        elif number_of_dbs == 0:
            # If interaction does not belong to any database, then default color given... (it is a prediction made just now)
            interaction_color = piana_access.get_interaction_source_database_color("")

            # the line style tells how the edge was predicted: extended has preference over propagated
            if edge_object is None:
                line_style_name = "normal"
            elif edge_object.is_extended():
                line_style_name = "extended"
            elif edge_object.is_propagated():
                line_style_name = "propagated"
            else:
                line_style_name = "normal"

            interaction_style = piana_access.get_interaction_line_style(line_style_name)

        else:
            # if it is an interaction that appears in two or more different databases, print a special colour to indicate it
            interaction_style = piana_access.get_interaction_line_style("normal")
            interaction_color = piana_access.get_interaction_source_database_color("intersection")
            

        if format_mode == "dot":
            output_target.write(""""%s" -- "%s" [len=%s, color=%s, style=%s];\n""" %(extcode_start, extcode_end, PianaGlobals.dot_edge_lenght,
                                                                                     interaction_color, interaction_style ))

    print_dot_interaction = staticmethod(print_dot_interaction)

    def _check_special_node(self,  node_object, keywords):
        """
        tells whether "node_object" is a special node

        returns 1 as soon as one of these checks succeeds (they are evaluated in this order):
           - the node is a root protein
           - the first element returned by node_object.is_expanded() is true
           - self.piana_access.check_keywords_in_protein() returns a true value for the node id and "keywords"

        returns 0 otherwise
        """
        # checks answered by the node itself go first: the keyword check is only reached if both of them fail
        if node_object.is_root():
            return 1

        expansion_info = node_object.is_expanded()
        if expansion_info[0]:
            return 1

        protein_has_keywords = self.piana_access.check_keywords_in_protein(list_proteinPiana=[node_object.get_node_id()],
                                                                           keywords=keywords)
        if protein_has_keywords:
            return 1

        return 0

    def _print_node_dot(self, output_target, node_object, node_ext_code, keywords= [], expression = None, format_mode = "dot"  ):
        """
        writes to "output_target" the dot attributes (fill color and border color) of one node of the network

        "output_target" is a file object (set to sys.stdout if you want the output printed to your screen)

        "node_object" is the node being printed: its root status, its expansion information and the keywords
           appearing in its attribute object decide which colors are used

        "node_ext_code" is the external code that you want to use for this node

        "keywords" is the list of keywords used to highlight nodes (see argument list_keywords in output_interactions)
           -> this method does not read it: it uses the keywords already stored in the node attribute object

        "expression" indicates if the protein is over or infra expressed. Valid values are:
           - 'over_expressed'
           - 'infra_expressed'
           - None --> no info available

        "format_mode" sets the format used for printing the node (right now, the only valid format is 'dot')
           -> nothing is written for any other value

        Only one line is written per node, for the first category that applies:
           root node (with or without keywords), node with keywords, node with expression info, expanded node.
        Nothing is written for nodes that do not belong to any of those categories.
        """

        node_has_keywords = node_object.get_node_attribute_object().get_keywords_appearing()

        # the border color tells the expression of the protein ("normal" when there is no expression info)
        if not expression:
            border_color = self.piana_access.get_node_border_color("normal")
        else:
            border_color = self.piana_access.get_node_border_color(expression)
            if verbose_expression:
                sys.stderr.write("protein %s has expression %s\n" %(node_ext_code, expression))

        is_dot_output = (format_mode == "dot")

        # choose the fill color category of the node (None for those nodes that are not printed with a fill color)
        fill_category = None

        if node_object.is_root():
            if node_has_keywords:
                fill_category = "root_keyword"
            else:
                fill_category = "root"

        elif node_has_keywords:
            fill_category = "keyword"

        elif expression:
            fill_category = "normal"

        elif node_object.is_expanded()[0] is not None:
            # expanded nodes only get a special border color
            if is_dot_output:
                output_target.write('"%s" [color = %s]\n' %(node_ext_code, self.piana_access.get_node_border_color("expanded")))

        if fill_category is not None and is_dot_output:
            output_target.write('"%s" [fillcolor = %s, color= %s]\n' %(node_ext_code,
                                                                        self.piana_access.get_node_fill_color(fill_category),
                                                                        border_color ))

        
    def output_interactions(self,  protein_type_name= None,
                            output_target= None, output_format= None, format_mode= "txt", intersection_dbs=None,
                            filter_mode="all", print_mode= "all", list_alternative_type_names=[],
                            tax_id_value= 0, list_keywords= [], file_over_expressed= None,
                            file_infra_expressed= None, expression_protein_type= None, user_protein_names = {} ):
        """

        writes a (table or network in dot format) in file "output_target" describing graph information in the following formats:

        dot format: follows the standard dot language for describing networks
        table format:follows the format described in method print_table_interaction

        "protein_type_name" is the easy-to-remember type name that will be used for printing the proteins in the network.
           --> Valid protein_type_name are those listed in PianaGlobals.valid_protein_types
        
        "output_target" is a file object (set to sys.stdout if you want the output printed to your screen)

        "output_format" is the format that will be followed for the output
           -"table": prints interactions in table format
           -"network": prints interactions in a network
            
        "format_mode" sets the type of format that will be used for output
               - 'txt' will print flat text (only valid for output_format 'table')
               - 'html' will print html  (only valid for output_format 'table')
               - 'dot' will print a DOT file (only valid for output_format 'network')

        "intersection_dbs" sets intersection mode, which only prints out interactions that appear in all dbs of the list being passed
            -> it can be None (no intersection mode applied) or a list of database names
            -> valid database names are those in PianaGlobals.interaction_databases

        "filter_mode" not being used at the minute... will be used for hidden, unhidden, all

        "print_mode" sets which proteins will be printed
           -> "all" will print all interactions in the network
           -> "all_root" will print all interactions in the network where at least one partner is a root protein
           -> "only_root" will print only interactions between root proteins
           -> "connecting" will print only interactions between root proteins and those proteins that connect more than one root protein

        "list_alternative_type_names" can be used to set a list of alternative types in case no protein_type_name code is found
         --> user must provide a list of valid easy-to-remember type names
             list_alternative_types can for example look like this: ["gi", "uniacc", "md5"]
             I suggest always placing md5 at the end of alternative types, so you never get a None in the output

        "tax_id_value" sets the species of the interactions that are being added (can be used for eliminating ambiguities between codes across species)
           --> valid species names are 0 (do not take into account the species) and those taxonomy ids provided by ncbi taxonomy  

        "list_keywords" sets a list of keywords that will be used to highlight important proteins in the network
           -> currently, it highlights in red the proteins in the DOT file that contain at least one keyword in the function, description or name
           -> currently, it is not used when printing the interactions in a table
           -> If you are interested in highlighting proteins related to cancer, list_keywords could be: ['cancer', 'onco', 'carcinoma', 'tumor']
           
        "file_over_expressed" and "file_infra_expressed" are names of files that contain proteins that are either over expressed or infra expressed

        "expression_protein_type" is the type of protein code used in files file_over_expressed and file_infra_expressed

        "user_protein_names" can be used to fix a set of protein names that have preference over other names when a protein has several names
           (eg. if proteinPiana has gene names CXCL1 and MGSA, and this dictionary has CXCL1, it is guaranteed that CXCL1 will be used for output)
           --> user_protein_names is a dictionary that follows the structure: { protein_name:None, protein_name:None, ...}
        """

        # make sure self.dic_protein_naming is filled
        self._fill_dic_protein_naming(protein_type_name=protein_type_name, list_alternative_type_names=list_alternative_type_names,
                                      user_protein_names = user_protein_names)
        
        self._fill_expression_dics(file_over_expressed= file_over_expressed, file_infra_expressed= file_infra_expressed,
                                   expression_protein_type= expression_protein_type, tax_id_value= tax_id_value)

        self._create_unified_network(protein_type_name=protein_type_name, list_alternative_type_names=list_alternative_type_names, user_keywords=list_keywords )
        
        connecting_prots = self.get_connecting_nodes_dic() # returns a dictionary with proteinPiana:[root_nodes] where root_nodes are proteinPianas

        # create the connecting dictionary using ext codes instead of proteinPianas
        self._create_ext_code_connecting(protein_type_name= protein_type_name,
                                         list_alternative_type_names= list_alternative_type_names,
                                         connecting_proteinPianas= connecting_prots) 

        if output_format == "network":

            if format_mode== "dot":
                # -----------------
                # PRINT DOT HEADERS
                # -----------------
                output_target.write("graph G { graph [orientation=%s, pack=%s, overlap=%s]" %(PianaGlobals.dot_orientation,
                                                                                              PianaGlobals.dot_pack,
                                                                                              PianaGlobals.dot_overlap) )
            
                output_target.write(" node [shape=%s, fontsize=%s, width=%s, height=%s,style=%s, fillcolor=%s];\n" %(PianaGlobals.dot_node_shape,
                                                                                                                     PianaGlobals.dot_node_font_size,
                                                                                                                     PianaGlobals.dot_node_width,
                                                                                                                     PianaGlobals.dot_node_height,
                                                                                                                     PianaGlobals.dot_node_style,
                                                                                                                     PianaGlobals.dot_standard_node_fill_color))
            # END OF if format_mode== "dot":

            # -------------------------------------
            # PRINT SPECIAL PROTEINS IN DOT FORMAT
            # -------------------------------------
            # for dot output format, set the root nodes to be of a different colour
            #                        set the nodes with keywords to be of a different colour
            #                        other nodes will have the standard color
            #
            

            already_printed_nodes = {} # used in output_protein_information to avoid printing same unified node many times in the dot file
                                       

            # setting nodes output parameters depending on their attributes 
            for this_node in self.get_node_object_list():

                proteinPiana = this_node.get_node_id()

                # get the ext_code that is going to be used for this proteinPiana
                node_ext_code = self.get_unified_name( proteinPiana= proteinPiana,
                                                       protein_type_name=protein_type_name, list_alternative_type_names=list_alternative_type_names)
            
                if already_printed_nodes.has_key(node_ext_code):
                    # if node was printed, no need to continue: the unified node has printed all the info associated to proteinPianas associated to this ext_code
                    continue

                already_printed_nodes[node_ext_code] = None # set as already printed

                # get the unified node associated to this ext_code
                node = self.ext_code_unified_node_correspondence[node_ext_code]

                # check if the proteins of the interaction respect the species restriction: if not, skip protein
                list_protein_taxonomies = node.get_node_attribute_object().get_dic_taxonomies().keys()
                passes_species_test = self._passes_species_test(required_protein_tax_id= tax_id_value,
                                                                list_protein_taxonomies= list_protein_taxonomies)

                expression = node.get_node_attribute_object().get_expression()
                
                if not passes_species_test:
                    continue

                # check if it passes print_mode
                #          TO DO!!! in print_mode "all_root" I am printing all nodes, even if they don't have any interaction afterwards
                #                   this is wrong! I should only print the node if it has an interaction to a root node
                if print_mode == "only_root" and not node.is_root():
                    continue
                #elif print_mode == "all_root" and interaction doesn't have at least one root node:
                #    continue # The problem is that at this point I do not know whether this node will be connected to a root
                #               or not... If I want this to work, I have to either print the nodes when printing the edges
                #               or keep a list of nodes that are connected to root nodes and then check if this
                #               node is in that list
                elif print_mode == "connecting" and not ( node.is_root() or self.ext_code_connecting.has_key(node_ext_code) ):
                    continue

                self._print_node_dot(output_target=output_target, format_mode=format_mode, node_object= node, node_ext_code=node_ext_code,
                                     keywords= list_keywords, expression=expression )

            # END OF  for node in self.nodes:

        # END OF if output_format == "dot":
            
        root_connectivity = 0
        print_databases = "no"
        
        already_printed_interactions = {} # keeps track of interactions already printed to skip them
           
        # print html table header
        if output_format == "table":
            if format_mode == "html":
                output_target.write("<html><body><br><br>\n")
                output_target.write("Keywords used to highlight proteins: %s<br><br>\n" %list_keywords)
                output_target.write("<table border=1>\n")
                output_target.write("<tr><td><center><b>protein A</b></td><td><center><b>protein A information</b></center></td><td><center><b>protein B</b></center></td><td><center><b>protein B information</b></center></td><td><center><b>same cell location</b></center></td><td><center><b>same species</b></center></td><td><center><b>source databases</b></center></td><td><center><b>methods</b></center></td></tr>\n")


        for edge in self.get_edge_object_list():
            # ------------------
            # PRINT INTERACTIONS
            # ------------------
            # -----
            # Getting external codes for edge proteins and the edge_key
            # -----

            (edge_key, protein_extcode_start , protein_extcode_end) = self._get_edge_key(this_edge=edge,
                                                                                         protein_type_name= protein_type_name,
                                                                                         list_alternative_type_names= list_alternative_type_names )

            # skip interactions that have already been printed
            if already_printed_interactions.has_key(edge_key):
                continue
                
            already_printed_interactions[edge_key] = None

            # Important! setting the edge for this key to be the unified edge: this guarantees that all
            #            edges between these ext_codes (even if they come from different proteinPianas)
            #            have the same characteristics (source dbs, method, ...) and therefore
            #            they are not lost from the fact that we are using this type of code for the output
            edge = self.edge_key_unified_edge_correspondence[edge_key]

            # Important! setting the nodes for the edge nodes to be their unified nodes: this guarantees that the nodes
            #            have all the characteristics of the proteinPianas that have the same ext_codes
            protein_node_start = self.ext_code_unified_node_correspondence[protein_extcode_start]
            protein_node_end = self.ext_code_unified_node_correspondence[protein_extcode_end]

            list_protein_taxonomies_start = protein_node_start.get_node_attribute_object().get_dic_taxonomies().keys()
            list_protein_taxonomies_end = protein_node_end.get_node_attribute_object().get_dic_taxonomies().keys()
            
            # -----
            # Checking if interaction respects species restrictions
            # -----
            if tax_id_value != 0:
                # check if the proteins of the interaction respect the species restriction: if not, skip interaction
                passes_species_test_start = self._passes_species_test(required_protein_tax_id= tax_id_value,
                                                                      list_protein_taxonomies= list_protein_taxonomies_start)

                passes_species_test_end = self._passes_species_test(required_protein_tax_id= tax_id_value,
                                                                    list_protein_taxonomies= list_protein_taxonomies_start)

                if not passes_species_test_start or not passes_species_test_end:
                    continue
            # END OF if tax_id_value != 0:

            # -----
            # Checking if interaction respects print_mode restrictions
            # -----
            if print_mode != "all":
                if print_mode == "connecting":
                    
                    # returns the connecting prots using ext_code protein_type_name
                    
                    connecting_mode_test = self._passes_connecting_mode_test(id_a= protein_extcode_start,
                                                                             is_root_a = protein_node_start.is_root(),
                                                                             id_b= protein_extcode_end,
                                                                             is_root_b= protein_node_end.is_root(),
                                                                             connecting_prots_dic= self.ext_code_connecting )

                        
                    if not connecting_mode_test:
                        continue
                    
                else:
                    # print mode is not "connecting", check for root restrictions
                    interaction_print_mode_test = self._passes_root_mode_test( is_root_a= protein_node_start.is_root(),
                                                                               is_root_b= protein_node_end.is_root(),
                                                                               print_mode= print_mode )
                    
                    if not interaction_print_mode_test:
                        continue
                # END OF else: (if print_mode == "connecting":)

            # -----
            # Checking database intersection: used when intersection_dbs is not None
            # -----
            # if interaction doesn't appear in all dbs, continue with next loop iteration
            edge_attribute = edge.get_edge_attribute_object()


            list_interaction_dbs = edge_attribute.get_list_source_db()

            # remove duplicates
            #   -> for printing the network, we consider that a completion db xxx_c is the same as its father db xxx
            #   -> for printing the table, we consider the completion db as a different db (so the user can look to the real origin of the interaction)
            dic_network_interaction_dbs = {}
            dic_table_interaction_dbs = {}
            for interaction_db in list_interaction_dbs:
                dic_network_interaction_dbs[ interaction_db.replace("_c", "") ] = None
                dic_table_interaction_dbs[ interaction_db] = None
                
            list_network_interaction_dbs = dic_network_interaction_dbs.keys()
            list_table_interaction_dbs = dic_table_interaction_dbs.keys()

            if intersection_dbs is not None:
                # intersection mode activated... print only interactions that appear in all dbs in intersection_dbs
                
                appears_in_all_dbs = 1
                for intersection_db in intersection_dbs:
                    if intersection_db not in list_table_interaction_dbs:
                        appears_in_all_dbs = 0
                        break
                # END OF for intersection_db in intersection_dbs:

                if appears_in_all_dbs == 0:
                    continue
                
            # END OF if intersection_dbs is not None:

            # getting detection methods for this interaction
            list_interaction_methods = edge_attribute.get_list_method_id()
            
            # -----
            # Checking if both proteins are in the same subcellular location
            # -----
            same_subcellularLocation = PianaGlobals.negative_location_value[format_mode]
                
            list_protein_start_subcellularLocations = protein_node_start.get_node_attribute_object().get_dic_cell_location().keys()
            list_protein_end_subcellularLocations   = protein_node_end.get_node_attribute_object().get_dic_cell_location().keys()


            for protein_start_subcellularLocation in list_protein_start_subcellularLocations:

                if protein_start_subcellularLocation in list_protein_end_subcellularLocations:
                    same_subcellularLocation =  PianaGlobals.positive_location_value[format_mode]
                    break
                # END OF if protein_start_subcellularLocation in protein_end_subcellularLocations:
            # END OF for protein_start_subcellularLocation in protein_start_subcellularLocations:


            # -----
            # Checking if both proteins are in the same species
            # -----
            same_species = PianaGlobals.negative_species_value[format_mode]
                
            for protein_start_taxonomy in list_protein_taxonomies_start:
                if protein_start_taxonomy in list_protein_taxonomies_end:
                    same_species = PianaGlobals.positive_species_value[format_mode]
                    break
                # END OF if protein_start_one_species in list_protein_end_species:
            # END OF for protein_start_one_species in list_protein_start_species:

            # -----
            # Calculating root connectivity
            # -----
            # for the protein of the pair that is not the root protein (if none of them is a root protein, connectivity=-1)
            # find how many other root proteins it is connecting. It is an indication on how important is the role of
            # this non-root protein in the network
 
            if protein_node_start.is_root() and protein_node_end.is_root():
                root_connectivity = max( self.get_node_root_connectivity(node=protein_node_start), self.get_node_root_connectivity(node=protein_node_end))

            elif not protein_node_start.is_root() and not protein_node_end.is_root():
                root_connectivity = -1

            elif protein_node_start.is_root():
                if self.ext_code_connecting.has_key(protein_extcode_end):  root_connectivity = len(self.ext_code_connecting[protein_extcode_end])
                else:                                                      root_connectivity = 0

            elif protein_node_end.is_root():
                if self.ext_code_connecting.has_key(protein_extcode_start): root_connectivity = len(self.ext_code_connecting[protein_extcode_start])
                else:                                                       root_connectivity = 0

            else:
                raise ValueError("Root connectivity cannot be calculated: case not taken into account")


            # -----
            # Retrieving fitness score 
            # -----
            appears_fitness_start = None
            appears_fitness_end   = None
            type_start = None
            type_end = None
            
            list_fitness_score_reaction_conditions_start = protein_node_start.get_node_attribute_object().get_fitness_scores()
            list_fitness_score_reaction_conditions_end   = protein_node_end.get_node_attribute_object().get_fitness_scores()

            # TO DO!!! These values must be centralized somewhere instead of hardcoded here...
            # initialising to "not found in fitness table"

            if list_fitness_score_reaction_conditions_start:       appears_fitness_start= PianaGlobals.positive_fitness_value[format_mode]
            else:                                                  appears_fitness_start= PianaGlobals.negative_fitness_value[format_mode]


            if list_fitness_score_reaction_conditions_end:         appears_fitness_end= PianaGlobals.positive_fitness_value[format_mode]
            else:                                                  appears_fitness_end= PianaGlobals.negative_fitness_value[format_mode]

            if protein_node_start.is_root():                       node_type_start = PianaGlobals.positive_root_value[format_mode]
            else:                                                  node_type_start = PianaGlobals.negative_root_value[format_mode]

            if protein_node_end.is_root():                         node_type_end = PianaGlobals.positive_root_value[format_mode]
            else:                                                  node_type_end = PianaGlobals.negative_root_value[format_mode]
   
               
            list_protein_user_keywords_start = protein_node_start.get_node_attribute_object().get_keywords_appearing()
            list_protein_user_keywords_end = protein_node_end.get_node_attribute_object().get_keywords_appearing()


            # get number of neighbours for the unified start and end nodes

            neighbours_start_proteinPianas = protein_node_start.get_node_attribute_object().get_union_neighbours()
            start_neighbours_ext_codes= self._get_unified_neighbours(neighbours_proteinPianas= neighbours_start_proteinPianas,
                                             protein_type_name= protein_type_name, list_alternative_type_names=list_alternative_type_names)
            num_neigh_start = len(start_neighbours_ext_codes)

            neighbours_end_proteinPianas = protein_node_end.get_node_attribute_object().get_union_neighbours()
            end_neighbours_ext_codes= self._get_unified_neighbours(neighbours_proteinPianas= neighbours_end_proteinPianas,
                                             protein_type_name= protein_type_name, list_alternative_type_names=list_alternative_type_names)
            num_neigh_end = len(end_neighbours_ext_codes)
            
            # -----
            # Printing interaction with info collected 
            # -----
            if output_format == "table":
                # TO DO!!! Number of neighbours information is not correct: it is not merging neighbours in the unified node...
                PianaGraph.print_table_interaction(output_target=output_target,
                                                   format_mode = format_mode,
                                                   root_connectivity= root_connectivity,
                                                   extcode_start= protein_extcode_start,
                                                   num_neigh_start= num_neigh_start,
                                                   node_type_start= node_type_start,
                                                   expression_start= protein_node_start.get_node_attribute_object().get_expression(),
                                                   appears_fitness_start= appears_fitness_start,
                                                   list_protein_user_keywords_start= list_protein_user_keywords_start ,
                                                   extcode_end= protein_extcode_end,
                                                   num_neigh_end= num_neigh_end,
                                                   node_type_end= node_type_end,
                                                   expression_end= protein_node_end.get_node_attribute_object().get_expression(),
                                                   appears_fitness_end= appears_fitness_end,
                                                   list_protein_user_keywords_end= list_protein_user_keywords_end ,
                                                   same_loc= same_subcellularLocation,
                                                   same_species=same_species,
                                                   list_interaction_dbs= list_table_interaction_dbs,
                                                   list_interaction_methods= list_interaction_methods)
            elif output_format == "network":

                PianaGraph.print_dot_interaction(output_target= output_target,
                                                 format_mode = format_mode,
                                                 edge_object= edge,
                                                 piana_access = self.piana_access,
                                                 extcode_start= protein_extcode_start,
                                                 extcode_end= protein_extcode_end,
                                                 list_interaction_dbs= list_network_interaction_dbs)
                
            else:
                sys.stderr.write("unknown output format (%s) set" %(output_format))
        # END OF for edge in self.edges:
            

        # PRINT HTML TABLE CLOSURE
        if output_format == "table":
            if format_mode == "html":
                output_target.write("\n</table></body></html>\n\n")
                
        if output_format == "network":
            if format_mode == "dot":
                # closing statement for  dot file
                output_target.write("}")
    
    def match_spots_to_proteins(self, spots_file_object= None, mw_error_bounds= None, ip_error_bounds= None, match_mode="all"):
        """

        matches spots in a spot file with proteins in the network

        "spots_file_object": a file object with spots from a Gel, and their Molecular Weight and Isoelectric Point

               text file with spots must follow this format (one spot per line):

                    spot_id<TAB>Molecular Weight<TAB>Isoelectric Point

               Attention!!! - Numbers must be in american style: 234234.45 and not 234234,45
                            - No headers or footers allowed (empty lines are skipped)
                            - Molecular weights read from the file are multiplied by 1000 before being
                              compared with the molecular weights of the proteins in the network


        "mw_error_bounds" and "ip_error_bounds" are lists of error bounds (they must have the same number of elements)

            an error bound is the relative error admitted when matching a spot mw or ip to the mw or ip of a protein
            (a protein matches a spot when its value lies between (1 - bound)*spot_value and (1 + bound)*spot_value)
            for example:

                    mw_error_bounds   = [0.0, 0.0025, 0.005, 0.01]

                    ip_error_bounds   = [0.0, 0.0025, 0.005, 0.01]

            position n of both lists defines "error level" n, regardless of the actual values


        "match_mode" can be (default is "all")
          - "all" : matching performed by comparing molecular weight and isoelectric point
          - "mw"  : matching performed by comparing only molecular weight
          - "ip"  : matching performed by comparing only isoelectric point


        returns a dictionary (with error levels 0, 1, 2, ... as keys) with a list of pairs [spot_id, proteinPiana]

               { 0: [ [spot_id1, proteinPiana1], [spot_id2, proteinPiana2], ...],
                 1: [ [spot_id1, proteinPiana1], [spot_id2, proteinPiana2], ...],
                 ......................
               }

               (one spot can be assigned to several proteinPianas and viceversa (this is just a matching by mw and ip...)


        raises ValueError if an argument is missing, if both lists of error bounds do not have the same length
        or if match_mode is not one of "mw", "ip" or "all"
        """

        if spots_file_object is None or mw_error_bounds is None or ip_error_bounds is None or match_mode is None:
            raise ValueError("all arguments needed to match spots to proteins: spots file=%s, mw errors=%s, ip errors=%s and match_mode=%s" %(spots_file_object,
                                                                                                                                        mw_error_bounds,
                                                                                                                                        ip_error_bounds,
                                                                                                                                        match_mode))

        # Important! both lists of error bounds must have the same number of elements: the resulting matches are
        # given for "error levels", and mw_error_bounds[n] belongs to the same error level as ip_error_bounds[n]
        if len(mw_error_bounds) != len(ip_error_bounds):
            raise ValueError("mw_error_bounds and ip_error_bounds must have the same number of elements")

        # checked before reading the file, so a wrong mode is reported even when the lists of error bounds are empty
        if match_mode not in ("mw", "ip", "all"):
            raise ValueError('match_mode must have a value "mw", "ip" or "all"')


        def value_within_bounds(node_value, lower_bound, upper_bound):
            # a node value can be a single number or (according to the comments describing
            # get_nodes_isoelectric_points -- TO CHECK!!!) a list of numbers: a list matches when any of its
            # elements lies inside the bounds. Unknown values (None) never match.
            if isinstance(node_value, (list, tuple)):
                candidate_values = node_value
            else:
                candidate_values = [node_value]

            for candidate_value in candidate_values:
                if candidate_value is not None and lower_bound <= candidate_value <= upper_bound:
                    return True

            return False

        def find_assignments(spots_values, nodes_values, error_bound, truncate_bounds):
            # returns the list of pairs [spot_id, proteinPiana] where the protein value lies inside the error
            # window around the spot value
            assignments = []

            for spot_id in spots_values:
                # the bounds depend only on the spot: calculate them once per spot
                lower_bound = (1.0 - error_bound) * spots_values[spot_id]
                upper_bound = (1.0 + error_bound) * spots_values[spot_id]

                if truncate_bounds:
                    lower_bound = int(lower_bound)
                    upper_bound = int(upper_bound)

                for proteinPiana in nodes_values:
                    if value_within_bounds(nodes_values[proteinPiana], lower_bound, upper_bound):
                        assignments.append( [spot_id, proteinPiana] )
                # END OF for proteinPiana in nodes_values:
            # END OF for spot_id in spots_values:

            return assignments


        # -----------------------------
        # 1. Reading spots from the input file
        # -----------------------------
        spots_mw = {}  # dic used to store spots mol weights (keys are spot_id)
        spots_ip = {}  # dic used to store spots isoelectric points (keys are spot_id)

        for line in spots_file_object.readlines():
            line = line.rstrip()

            if not line:
                # tolerate empty lines (typically a trailing newline at the end of the file)
                continue

            values = line.split("\t")

            if verbose_spots:
                sys.stderr.write("for spot_id %s reading mw=%s (type %s) and ip=%s (type %s)\n" %(values[0],values[1], type(values[1]),values[2], type(values[2]) ))

            spots_mw[values[0]] = int( float(values[1]) * 1000) # conversion to units used by piana molecular weight
                                                               # piana has weights like: 10575.9, 132796, ...
                                                               # and the spots file 10.575, 132.796, ...
            spots_ip[values[0]] = float(values[2])
        # END OF for line in spots_file_object.readlines():

        # -----------------------------
        # 2. Retrieving values of the proteins in the network
        # -----------------------------

        # dictionary with proteinPianas as keys and molecular weights as values
        nodes_molecular_weights = self.get_nodes_molecular_weights()

        # dictionary with proteinPianas as keys and isoelectric points as values
        nodes_isoelectric_points = self.get_nodes_isoelectric_points()

        # -----------------------------
        # 3. Matching spots to proteins for each error level
        # -----------------------------
        matchings_to_return = {}  # keys are error levels, values are lists of pairs [spot_id, proteinPiana]

        for error_level in range(len(mw_error_bounds)):

            assignments_molecular_weight = []
            assignments_isoelectric_point = []

            if match_mode != "ip":
                # molecular weight bounds are truncated to integers (units are large: it has always been done like this)
                assignments_molecular_weight = find_assignments(spots_values= spots_mw,
                                                                nodes_values= nodes_molecular_weights,
                                                                error_bound= mw_error_bounds[error_level],
                                                                truncate_bounds= 1)

            if match_mode != "mw":
                # Attention!!! isoelectric point bounds must NOT be truncated to integers: values are small (eg. 5.6)
                # and truncating collapses the window to a single integer (eg. [5, 5]), losing real matches and
                # producing false ones
                assignments_isoelectric_point = find_assignments(spots_values= spots_ip,
                                                                 nodes_values= nodes_isoelectric_points,
                                                                 error_bound= ip_error_bounds[error_level],
                                                                 truncate_bounds= 0)

            if match_mode == "mw":
                matchings_to_return[error_level] = assignments_molecular_weight

            elif match_mode == "ip":
                matchings_to_return[error_level] = assignments_isoelectric_point

            else:
                # match_mode is "all": keep only pairs that passed both filters (order of the mw filter is kept)
                dic_isoelectric_pairs = {}
                for assignment_isoelectric_point in assignments_isoelectric_point:
                    dic_isoelectric_pairs[ (assignment_isoelectric_point[0], assignment_isoelectric_point[1]) ] = None

                matchings_to_return[error_level] = []

                for assignment_molecular_weight in assignments_molecular_weight:
                    if (assignment_molecular_weight[0], assignment_molecular_weight[1]) in dic_isoelectric_pairs:
                        matchings_to_return[error_level].append(assignment_molecular_weight)
                # END OF for assignment_molecular_weight in assignments_molecular_weight:

        # END OF for error_level in range(len(mw_error_bounds)):

        return matchings_to_return


    def print_spots_matches(self, spots_matches= None, output_target= None, format_mode = "txt",
                            protein_type_name= None, mw_error_bound= None, ip_error_bound= None, list_alternative_type_names=[],
                            user_protein_names = {} ):
        """
        prints to "output_target" (a file object) the spots maches for each level, using protein codes indicated by "protein_type_name"

        "spots_matches" were obtained with method PianaGraph.match_spots_to_proteins and follow structure:

                                   { error_level1: [ [spot_id, proteinPiana], [spot_id, proteinPiana], ...],
                                     error_level2: [ [spot_id, proteinPiana], [spot_id, proteinPiana], ...],
                                     .....................................
                                   }

        "format_mode" sets the type of format that will be used for output
               - 'txt' will print flat text
               - 'html' will print html (a table: only the table header and the closing tags are written by this method)

        "protein_type_name" is the easy-to-remember type name that will be used for printing the proteins in the network.
        Valid protein_type_name are those listed in PianaGlobals.valid_protein_types.keys()

        "mw_error_bound" and "ip_error_bound" are the lists of error bounds used to obtain spots_matches: they are
        indexed with the error levels (keys of spots_matches) to print the error values of each match

        "list_alternative_type_names" can be used to set a list of alternative types in case no protein_type_name code is found
        --> user must provide pairs a list of valid easy-to-remember type names
            list_alternative_types can for example look like this: ["gi", "uniacc", "md5"]

            I suggest always placing md5 at the end of alternative types, so you never get a None in the output

        "user_protein_names" can be used to fix a set of protein names that have preference over other names when a protein has several names
           (eg. if proteinPiana has gene names CXCL1 and MGSA, and this dictionary has CXCL1, it is guaranteed that CXCL1 will be used for output)
           --> user_protein_names is a dictionary that follows the structure: { protein_name:None, protein_name:None, ...}


        Each combination spot-protein is printed only once: error levels are visited in ascending order, so the
        combination is reported for the lowest error level where it appears

        """
        # make sure self.dic_protein_naming is filled
        self._fill_dic_protein_naming(protein_type_name=protein_type_name, list_alternative_type_names=list_alternative_type_names,
                                      user_protein_names = user_protein_names)

        self._create_unified_network(protein_type_name=protein_type_name, list_alternative_type_names=list_alternative_type_names, user_keywords=[])

        dic_already_printed = {}   # used to avoid printing the same combination spot--> protein more than once
                                   # keys are tuples (spot_id, protein_ext_code)

        if format_mode == "html":
            # print table headers
            output_target.write("<table border=1><tr><td>mw error</td><td>ip error</td><td>spot id</td><td>protein id</td></tr>")

        # error levels must be visited in a fixed (ascending) order: since a combination spot-protein is printed only
        # once, visiting the levels in arbitrary dictionary order could report a match under a looser error level
        # than the strictest one where it was found
        list_error_levels = list(spots_matches.keys())
        list_error_levels.sort()

        for spot_match_error_level in list_error_levels:

            for spot_match in spots_matches[spot_match_error_level]:

                spot_id = spot_match[0]

                protein_ext_code = self.get_unified_name( proteinPiana= spot_match[1],
                                                          protein_type_name=protein_type_name, list_alternative_type_names=list_alternative_type_names)

                # a tuple is used as key (instead of joining both strings with a ".") so that different combinations
                # cannot collide (eg. "a.b"+"c" vs "a"+"b.c") and a protein without name (None) does not break the join
                key = (spot_id, protein_ext_code)
                if key in dic_already_printed:
                    # avoid printing the same combination spot-protein
                    continue

                dic_already_printed[key] = None

                if verbose_spots:
                    sys.stderr.write("at error level %s (mw_error %s - ip_error %s) spot_id %s matched protein %s\n" %(spot_match_error_level,
                                                                                                                     mw_error_bound[spot_match_error_level],
                                                                                                                     ip_error_bound[spot_match_error_level],
                                                                                                                     spot_id,
                                                                                                                     protein_ext_code) )
                if format_mode == "txt":
                    output_target.write("error level %s (mw_error %s - ip_error %s) spot_id %s matches protein %s\n" %(spot_match_error_level,
                                                                                                                       mw_error_bound[spot_match_error_level],
                                                                                                                       ip_error_bound[spot_match_error_level],
                                                                                                                       spot_id,
                                                                                                                       protein_ext_code) )
                elif format_mode == "html":
                    output_target.write("<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n" %(mw_error_bound[spot_match_error_level],
                                                                                                    ip_error_bound[spot_match_error_level],
                                                                                                    spot_id,
                                                                                                    protein_ext_code))
            # END OF for spot_match in spots_matches[spot_match_error_level]:
        # END OF for spot_match_error_level in list_error_levels:

        if format_mode == "html":
            # print table closure
            # (body and html tags are closed here but not opened in this method: presumably the caller
            #  has already written the opening tags -- TO CHECK!!!)
            output_target.write("\n</table></body></html>\n")

          
        
    def output_shortest_route(self, protein_a_name, protein_b_name, input_proteins_type,
                              output_proteins_type, list_alternative_type_names, output_file_object,
                              format_mode, tax_id_value):
        """

        Prints to "output_file_object" the route from "protein_a_name" to "protein_b_name"

        Each protein name can correspond to several proteinPianas: routes are searched for all combinations of
        proteinPianas of both proteins, and the shortest one is printed as:

             protein_a=<protein_a_name><TAB>protein_b=<protein_b_name><TAB>distance=<distance>
             START=<code><--><code><--> ... <-->END=<code>

        If no route is found (or one of the proteins has no proteinPiana), the distance is printed as None and
        the second line is "no route found"


        "input_proteins_type" is the type of code used for protein_a_name and protein_b_name
             -> valid protein-type values are those defined in PianaGlobals.valid_protein_types

        "output_proteins_type" is the type of code that will be used to print proteins
             -> valid protein-type values are those defined in PianaGlobals.valid_protein_types

        "list_alternative_type_names" can be used to set a list of alternative types in case no output_proteins_type code is found
         --> user must provide a list of valid easy-to-remember type names
             list_alternative_type_names can for example look like this: ["gi", "uniacc", "md5"]
             I suggest always placing md5 at the end of alternative types, so you never get a None in the output

        "output_file_object"is the file object (sys.stdout to print to screen) where the route will be printed


        "format_mode" sets the type of format that will be used for output
               - 'txt' will print flat text
               - 'html' will print html
               Attention!!! this argument is currently not used by this method: output is always flat text

        "tax_id_value" sets the species of the protein that will be printed (can be used for eliminating ambiguities between codes across species)
           --> valid tax ids are 0 (do not take into account the species) and those taxonomy ids provided by ncbi taxonomy
        """

        # make sure self.dic_protein_naming is filled
        # (user_protein_names is described in this class as a dictionary { protein_name:None, ...}: it is passed
        #  as a dictionary and not as a list, so both membership tests and has_key work on it)
        self._fill_dic_protein_naming(protein_type_name=output_proteins_type, list_alternative_type_names=list_alternative_type_names,
                                      user_protein_names = {protein_a_name: None, protein_b_name: None})

        self._create_unified_network(protein_type_name=output_proteins_type, list_alternative_type_names=list_alternative_type_names, user_keywords=[])

        protein_column_type = utilities.get_code_column(input_proteins_type)

        list_proteinPiana_a = self.piana_access.get_list_protein_piana(proteinCode_value= protein_a_name,
                                                                     proteinCodeType_value= protein_column_type,
                                                                     tax_id_value=tax_id_value,
                                                                     source_db_info="no")

        list_proteinPiana_b = self.piana_access.get_list_protein_piana(proteinCode_value= protein_b_name,
                                                                     proteinCodeType_value= protein_column_type,
                                                                     tax_id_value=tax_id_value,
                                                                     source_db_info="no")


        min_shortest_route = (1000, []) # init to very high value so it is changed at the first route found
                                        # (routes of distance 1000 or more are therefore never selected)

        # find shortest routes for all combinations of proteinPianas: we will keep the shortest one of them
        for proteinPiana_a in list_proteinPiana_a:
            for proteinPiana_b in list_proteinPiana_b:
                this_shortest_route= self.find_shortest_route(start_node_id=proteinPiana_a, end_node_id=proteinPiana_b)

                if this_shortest_route[0] < min_shortest_route[0]:
                    min_shortest_route = this_shortest_route

        # END OF for proteinPiana_a in list_proteinPiana_a:

        route_nodes = min_shortest_route[1]

        if not route_nodes:
            # nothing was found: report it instead of printing the initialisation distance and then failing
            # when trying to access the first node of an empty route
            output_file_object.write("protein_a=%s\tprotein_b=%s\tdistance=%s\n" %(protein_a_name, protein_b_name, None))
            output_file_object.write("no route found\n")
            return

        output_file_object.write("protein_a=%s\tprotein_b=%s\tdistance=%s\n" %(protein_a_name, protein_b_name, min_shortest_route[0]))

        # translate each node of the route (in order) to its external code
        route_ext_codes = [ self.get_unified_name( proteinPiana= node.get_node_id(),
                                                   protein_type_name= output_proteins_type,
                                                   list_alternative_type_names=list_alternative_type_names )
                            for node in route_nodes ]

        output_file_object.write( "START=%s" %(route_ext_codes[0]))

        for ext_code in route_ext_codes[1:-1]:
            # intermediate nodes of the route
            output_file_object.write( "<-->%s" %(ext_code))
        # END OF for ext_code in route_ext_codes[1:-1]:

        # a route with a single node prints that node both as START and as END
        output_file_object.write( "<-->END=%s\n" %(route_ext_codes[-1]))
    

    
    def get_distance_groups(self, protein_ext_code=None, input_protein_type_name= None, output_protein_type_name= None, list_alternative_type_names=[],
                            user_protein_names= {}, tax_id_value= 0):
        """

        gets proteins that are at a certain distance from a given protein protein_ext_code

        returns a dictionary keyed by distance. Each value is itself a dictionary whose keys are the
        unified protein ext codes found at that distance (the values are always None: the inner
        dictionary is only used to keep the codes unique):

                            { 1: {ext_code_x: None, ext_code_y: None, ...},
                              2: {ext_code_z: None, ...},
                              ...
                            }

        The distances that appear as keys are whatever self.get_distances() reports for the
        proteinPianas associated to protein_ext_code. If no proteinPiana is found for
        protein_ext_code, an empty dictionary is returned.

        "protein_ext_code" is the protein for which we want to find proteins at a certain distance

        "input_protein_type_name" is the easy-to-remember protein type used for protein_ext_code
          --> valid values are those listed in PianaGlobals.valid_protein_types

        "output_protein_type_name" is the easy-to-remember protein type that you want to use for results
          --> valid values are those listed in PianaGlobals.valid_protein_types

        "list_alternative_type_names" can be used to set a list of alternative types in case no protein_type_name code is found
        --> user must provide a list of valid easy-to-remember type names
            list_alternative_type_names can for example look like this: ["gi", "uniacc", "md5"]

            I suggest always placing md5 at the end of alternative types, so you never get a None in the output

        "user_protein_names" is passed unchanged to self._fill_dic_protein_naming()

        "tax_id_value" sets the species of the proteins being returned (can be used for eliminating ambiguities between codes across species)
           --> valid tax ids are 0 (do not take into account the species) and those taxonomy ids provided by ncbi taxonomy

        """
        # make sure self.dic_protein_naming is filled
        self._fill_dic_protein_naming(protein_type_name=output_protein_type_name, list_alternative_type_names=list_alternative_type_names,
                                      user_protein_names=user_protein_names)

        self._create_unified_network(protein_type_name=output_protein_type_name, list_alternative_type_names=list_alternative_type_names, user_keywords=[])

        distance_dic = {} # the dictionary of distances that will be returned

        protein_column_type = utilities.get_code_column(input_protein_type_name)

        list_proteinPiana = self.piana_access.get_list_protein_piana(proteinCode_value= protein_ext_code,
                                                                     proteinCodeType_value= protein_column_type,
                                                                     tax_id_value=tax_id_value,
                                                                     source_db_info="no")

        for proteinPiana in list_proteinPiana:
            # for each proteinPiana associated to the query protein, find which proteins in the network are connected to it

            # get_distances returns a dictionary { proteinPiana: distance, proteinPiana: distance, ....}
            protein_distances = self.get_distances(query_node_id= proteinPiana)

            for distant_proteinPiana, distance in protein_distances.items():
                # for each protein that is at a certain distance from the query protein, get the unified ext code and
                # add it to the group of proteins found at that distance

                ext_code = self.get_unified_name( proteinPiana= distant_proteinPiana,
                                                  protein_type_name= output_protein_type_name,
                                                  list_alternative_type_names= list_alternative_type_names)

                # setdefault creates the group for this distance the first time it is seen
                # (dict.has_key() is not used because it no longer exists in python 3)
                distance_dic.setdefault(distance, {})[ext_code] = None
            # END OF for distant_proteinPiana, distance in protein_distances.items():
        # END OF for proteinPiana in list_proteinPiana:

        return distance_dic


    def print_distance_group(self, distance_group,  input_protein_type_name,  output_target, info, format_mode="txt",
                             user_protein_names= {}, tax_id_value=0):
        """
        prints a distance group to output_target, one protein per line (txt) or per table row (html)

        "distance_group" is an iterable of protein codes of type input_protein_type_name

        "input_protein_type_name" is the easy-to-remember protein type used for the codes in distance_group
          --> valid values are those listed in PianaGlobals.valid_protein_types

        "output_target" is a file object (sys.stdout to print to screen)

        "info" determines which information will be printed next to the proteins in the distance group
          (info will be preceded by a header stating the type of info being printed)
           - all: scop and cath codes
           - scop: scop codes
           - cath: cath codes
           - any other value (eg. blank): nothing

        "format_mode" sets the type of format that will be used for output
               - 'txt' will print flat text (tab-separated fields)
               - 'html' will print an html table
               - any other value prints nothing

        "user_protein_names" is currently not used by this method (kept so that existing callers keep working)

        "tax_id_value" sets the species of the proteins that will be printed (can be used for eliminating ambiguities between codes across species)
           --> valid values are 0 (do not take into account the species) and those taxonomy ids provided by ncbi taxonomy

        Nothing is returned: all output goes to output_target.

        NOTE(review): in html mode this method closes </body> and </html> without opening them;
                      presumably the caller writes the opening tags -- confirm against callers
        """

        # The protein naming dictionary and the unified network are not rebuilt here: this method only
        # prints codes that were already obtained when the distance group was created.

        protein_column_type = utilities.get_code_column(input_protein_type_name)

        if format_mode == "html":
            # header row (a stray backslash that used to be written before the cath column has been removed)
            output_target.write("<table border=1><tr><td>protein</td><td>scop codes</td><td>cath codes</td></tr>\n")

        for protein in distance_group:
            list_proteinPiana = self.piana_access.get_list_protein_piana(proteinCode_value= protein,
                                                                         proteinCodeType_value= protein_column_type,
                                                                         tax_id_value=tax_id_value,
                                                                         source_db_info="no")

            # dictionaries are used to remove duplicates across the proteinPianas of this protein
            dic_scop = {}   # "cf.sf.fa" --> (cf, sf, fa)
            dic_cath = {}   # cath code  --> None

            for proteinPiana in list_proteinPiana:

                list_scop= self.piana_access.get_protein_scop_cf_sf_fa(proteinPiana_value=proteinPiana)

                for one_scop in list_scop:
                    scop_key = str(one_scop[0]) + "." + str(one_scop[1]) + "." + str(one_scop[2])
                    dic_scop[scop_key] = (one_scop[0], one_scop[1], one_scop[2] )

                list_cath= self.piana_access.get_protein_cath(proteinPiana_value=proteinPiana, residue_value=None)
                for one_cath in list_cath:
                    dic_cath[one_cath] = None

            # END OF for proteinPiana in list_proteinPiana:

            # -- protein code --
            if format_mode == "txt":
                output_target.write("%s" %(protein) )

            elif format_mode == "html":
                output_target.write("<tr><td>%s</td>" %(protein) )

            # -- scop codes --
            if info== "all" or info == "scop":

                if format_mode == "html":
                    output_target.write("<td>")

                for scop_key in dic_scop:
                    if format_mode == "txt":
                        output_target.write("\tscopcf:%s\tscopsf:%s\tscopfa:%s" %(dic_scop[scop_key][0], dic_scop[scop_key][1], dic_scop[scop_key][2]) )
                    elif format_mode == "html":
                        output_target.write(" scop cf:%s scop sf:%s scop fa:%s " %(dic_scop[scop_key][0], dic_scop[scop_key][1], dic_scop[scop_key][2]) )

                if format_mode == "html":
                    output_target.write("</td>")

            # -- cath codes --
            if info== "all" or info == "cath":
                if format_mode == "html":
                    output_target.write("<td>")

                for cath in dic_cath:
                    if format_mode == "txt":
                        output_target.write("\tcath:%s" %(cath))
                    elif format_mode == "html":
                        output_target.write(" cath:%s " %(cath))
                if format_mode == "html":
                    output_target.write("</td>")

            # -- end of the line/row for this protein --
            if format_mode == "txt":
                output_target.write("\n")
            elif format_mode == "html":
                output_target.write("</tr>\n")

        # END OF for protein in distance_group:

        if format_mode == "html":
            output_target.write("\n</table></body></html>\n")
