"""
File       : PianaGraphBuilder.py
Author     : R. Aragues & D. Jaeggi
Creation   : 31.07.2003
Contents   : class for building a piana graph from a database
Called from: PianaGraph and piana.py
Comments :
=========================================================================

This class is used to populate a PianaGraph object with interactions and proteins from a piana database

"""

# PianaGraphBuilder.py: implements a class for building protein-protein interaction networks from a piana database
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys

from GraphBuilder import *
from PianaDB import *
from PianaDBaccess import *
from PianaGraphNodeAttribute import *
from PianaGraphEdgeAttribute import *

# Module-level debug switch: when set to a true value, the methods in this module
# write diagnostic messages to sys.stderr.
verbose = 0

class PianaGraphBuilder(GraphBuilder):
    """
    Class that implements graph building methods specific to Piana

    Node ids handled by this builder are proteinPiana identifiers. Partners and interactions are
    retrieved through the piana access objects handed to __init__.
    """

    def __init__(self, piana_access_object, depth=0,  hub_threshold= 0, use_self_ints='yes', list_protein_codes=None, code_type_name = None,
                 tax_id_value=0, list_source_dbs="all",  inverse_dbs="no", list_source_methods="all", inverse_methods="no",
                 use_secondary_db="no", piana_secondary_access_object=None, force_secondary_db="no"):

        """
        Initialisation operations specific to Piana: mainly finding out the internal id (ie proteinPiana) for
        the proteins given as argument, and then calling the general __init__ of superclass GraphBuilder

        "piana_access_object" is the object used to query the primary piana database

        "depth" valid values are described in superclass GraphBuilder. It refers to the depth the graph should have.

        "hub_threshold" is forwarded to GraphBuilder.__init__ and is passed as "threshold" to get_all_partners
        when retrieving the partners of a node

        "use_self_ints" is used to avoid adding self interactions in the network
                      -> 'yes' adds all interactions
                      -> 'no' doesn't add interactions between a protein and itself

        "list_protein_codes" are protein identifiers of type "code_type_name". These protein codes will be used to build the graph.
        All codes must be of the same type (e.g swissprot). None is treated as an empty list

        "code_type_name" is an easy-to-remember protein type name: establishes the type of protein identifiers that are being passed as argument.

           It is used to retrieve the internal proteinPiana from the database, which is as well the identifier used for node_ids of the graph.
           Valid types are those defined in PianaGlobals.py. There is a direct correspondence between these types and the
           column names in the database

           valid "code_type_name" values are those in PianaGlobals.valid_protein_types

        "tax_id_value" can be used to fix the species we want to work on: only proteinPianas from that species will be used to build the graph
        (this doesn't mean that 'a posteriori' added proteins will be limited as well to that species... it only concerns proteins in list_protein_codes)
        If the database has an interaction between a protein of this species and a protein of another species, both proteins will appear in the network
        regardless of this parameter: to control which proteins appear in your output you should use output-species-name in your piana configuration file

             --> valid values are 0 or a taxonomy id as defined by NCBI taxonomy
                    - 0: retrieves all proteinPianas for protein_codes, regardless of species

        "list_source_dbs" can be used to set which interaction databases will be used to build the network: only interactions from databases in the list
        will be added to the network

          -> "list_source_dbs" can be "all" or a list of interaction databases

           - "all": uses all interactions in the piana database
           - [db1, db2, ...]: uses only interactions from databases listed (for example, [dip, string, posas])
              -> valid database names are those defined in PianaGlobals.interaction_databases

        "inverse_dbs" can be:
                       no (databases in list_source_dbs will be used to build the network)
                       yes (all databases except those in list_source_dbs will be used to build the network)

        "list_source_methods" can be used to set which interaction detection methods will be used to build the network: only interactions
        that were detected through methods in the list will be added to the network

          -> "list_source_methods" can be "all" or a list of interaction methods

           - "all": uses all interaction detection methods in the piana database
           - [method1, method2, ...]: uses only interactions from methods listed (for example, [y2h, immuno])
              -> valid method names are those defined in PianaGlobals.method_names

        "inverse_methods" can be:
                       no (methods in list_source_methods will be used to build the network)
                       yes (all methods except those in list_source_methods will be used to build the network)

        "use_secondary_db" can be:
                       no (only the primary database is queried for partners)
                       yes (if the primary database returns no partners for a node, the secondary database is queried)

        "piana_secondary_access_object" is the object used to query the secondary piana database. It is required
        when use_secondary_db or force_secondary_db end up needing it (see get_link_details)

        "force_secondary_db" can be:
                       no (behaviour is controlled by use_secondary_db)
                       yes (partners are retrieved only from the secondary database)

        raises ValueError if code_type_name is not a known type or if no proteinPiana is found for the codes given
        """

        self.piana_access = piana_access_object
        self.use_secondary_db = use_secondary_db
        self.piana_secondary_access = piana_secondary_access_object
        self.force_secondary_db = force_secondary_db
        self.hub_threshold = hub_threshold
        self.list_source_dbs = list_source_dbs
        self.inverse_dbs = inverse_dbs
        self.list_source_methods = list_source_methods
        self.inverse_methods = inverse_methods
        self.use_self_ints = use_self_ints

        # get internal piana type denomination for the easy-to-remember code_type_name
        # NOTE(review): "utilities" is not imported explicitly in this module; presumably it arrives
        # through one of the star imports at the top of the file -- confirm
        code_type = utilities.get_code_column(code_type_name= code_type_name)

        if code_type is None:
            raise ValueError("Using unknown type %s. Valid protein code types are listed with piana.py --help\n" %(code_type_name))

        # The signature default is None: treat it as "no codes" so that the caller gets the
        # explicit "No protein found" error below instead of a TypeError from iterating None
        if list_protein_codes is None:
            list_protein_codes = []

        # populate the list with node_ids that will be used to build the graph. node_ids are proteinPianas.
        #  since a protein_code can have several proteinPianas, the list of node_ids can be longer than the list of protein_codes
        #
        # duplicates are dropped while keeping the order of first appearance, and the result is a real list
        list_node_id = []
        seen_node_ids = set()

        for protein_code in list_protein_codes:
            # Translating the identifier passed as argument into the piana internal identifier for a node
            # The identifier passed as argument must be translated because the id for graph nodes is proteinPiana
            for node_id in self.piana_access.get_list_protein_piana(proteinCode_value= protein_code,
                                                                    proteinCodeType_value= code_type,
                                                                    tax_id_value = tax_id_value,
                                                                    source_db_info="no"):
                if node_id not in seen_node_ids:
                    seen_node_ids.add(node_id)
                    list_node_id.append(node_id)
        # END OF for protein_code in list_protein_codes:

        if not list_node_id:
            sys.stderr.write("No protein found for your protein codes. Network could no be built\n")
            raise ValueError("No protein found for your protein codes. Network could no be built\n")

        # Call the superclass init method with pianaID identifiers for the nodes
        GraphBuilder.__init__(self, depth= depth, list_node_id = list_node_id, hub_threshold=hub_threshold)


    def create_node_attribute(self, node_id):
        """
        Overrides method from GraphBuilder

        returns a PianaGraphNodeAttribute for node_id, bound to the primary piana access object
        """

        return PianaGraphNodeAttribute(proteinPiana_value= node_id,  piana_access = self.piana_access, mem_mode = "inMemory")


    def _query_partners(self, piana_access, node_id):
        """
        Returns whatever piana_access.get_all_partners returns for node_id, applying the restrictions
        (self interactions, source dbs, source methods, hub threshold) stored in this builder
        """
        return piana_access.get_all_partners(proteinPiana_value= node_id,
                                             use_self_ints= self.use_self_ints,
                                             list_source_dbs= self.list_source_dbs,
                                             inverse_dbs= self.inverse_dbs,
                                             list_source_methods= self.list_source_methods,
                                             inverse_methods= self.inverse_methods,
                                             threshold= self.hub_threshold)


    def get_link_details(self, node_id):
        """
        Overrides method from GraphBuilder: this method specifies how specific nodes and edges look like in PianaGraph

        Before calling this method we are only dealing with node_id. get_link_details will create the node attribute objects
        from node_id and return LinkDetail objects (container for two node_id, one edge and attributes of all of them). These
        LinkDetail objects are specific to PianaGraph, but can be used by GraphBuilder without knowing their internal structure.

        This method uses self.list_source_dbs to limit the interactions found to those that were retrieved from databases that appear
        in the list of databases given by the user when initializing the builder

        This method uses self.list_source_methods to limit the interactions found to those that were found with methods that appear
        in the list of methods given by the user when initializing the builder

        returns a list of LinkDetail objects, one per partner of node_id. The list is empty if no partners are returned
        (self.hub_threshold is handed to get_all_partners, which is expected to return nothing for hubs above the threshold)

        raises ValueError if the secondary database has to be used but no secondary access object was given
        """

        list_link_detail_objects = []

        if verbose:
            sys.stderr.write("in PianaGraphBuilder: list source dbs is %s and list source methods is %s\n" %(self.list_source_dbs, self.list_source_methods))

        # Getting all partners of node_id (respecting restrictions... ignore_unreliable is controlled by
        #                                  piana.py, which will place on the list_source_dbs the appropriate source dbs)
        #
        # The behavior for retrieving the partners is the following:
        #
        #    -> if force_secondary_db is yes, then use the secondary database (and that's all... nothing else to check)
        #           -> it means the user only wants to use the secondary database to retrieve interactions
        #    -> if force_secondary_db is no, then there are several possibilities:
        #         1. the user has set use_secondary_db to no, then get partners from primary
        #           -> he doesn't want to use a secondary DB even in cases where no interactions are found in the primary DB
        #         2. the user has set use_secondary_db to yes, then get partners from primary
        #                2.1 if partners are found, keep those and forget about the secondary database
        #                2.2 if no partners are found, look for partners in the secondary database
        if self.force_secondary_db == "yes":

            if not self.piana_secondary_access:
                raise ValueError("You are forcing to use a secondary PIANA database, but you didn't define one in piana_configuration_parameters.py")

            piana_access_to_use = self.piana_secondary_access
            list_partners = self._query_partners(piana_access_to_use, node_id)

        else:
            # the primary database is the one to use unless the fallback below kicks in
            piana_access_to_use = self.piana_access
            list_partners = self._query_partners(piana_access_to_use, node_id)

            if not list_partners and self.use_secondary_db == "yes":
                # no partners in the primary DB and the user asked for a secondary DB: get the partners from it
                # and set the secondary database as the piana_access to use for those interactions
                if not self.piana_secondary_access:
                    raise ValueError("You are asking to use a secondary PIANA database, but you didn't define one in piana_configuration_parameters.py")

                piana_access_to_use = self.piana_secondary_access
                list_partners = self._query_partners(piana_access_to_use, node_id)
        # END OF else: (if self.force_secondary_db == "yes":)

        if verbose:
            sys.stderr.write("Number of interactions for proteinPiana %s found in database is: %s\n" %(node_id, len(list_partners)) )

        # For each partner proteinPiana
        #  - find interactionPiana for that interaction
        #  - get attribute for the partner
        #  - get attribute for node_id (done only once before starting the loop: it doesn't change)
        #  - get attributes for interaction_id
        #  - create a LinkDetail object
        #  - add all attributes previously obtained to LinkDetail
        #  - add the LinkDetail object to the list of LinkDetail objects

        # NOTE(review): this attribute is bound to piana_access_to_use, whereas the partner attribute below is
        # always bound to the primary access. Kept as it was; confirm whether the difference is intended
        attribute_node_id = PianaGraphNodeAttribute(proteinPiana_value= node_id, piana_access = piana_access_to_use, mem_mode = "inMemory")

        # for each partner of node_id, append one LinkDetail to the list of LinkDetail objects
        for partner_id in list_partners:

            # no need to place restrictions here because they were placed before, when retrieving the partners
            interaction_id = piana_access_to_use.get_interactionPiana(proteinPianaA_value= partner_id,
                                                                      proteinPianaB_value= node_id,
                                                                      list_source_dbs= "all", inverse_dbs="no",
                                                                      list_source_methods= "all", inverse_methods="no")

            if verbose:
                sys.stderr.write( "----------------------------------------------------------------------\n")
                sys.stderr.write( "For interaction_id = %s the partner is %s\n" %(interaction_id, partner_id))
                sys.stderr.write( "----------------------------------------------------------------------\n")

            # the piana_access for the partner node is always the primary one. The DBs are synchronized
            # so... no need to have secondary access for the nodes
            attribute_partner_id = PianaGraphNodeAttribute(partner_id, piana_access= self.piana_access, mem_mode = "inMemory")

            # the piana_access that will be used is the one that was used to get the interaction, and the edge
            # will know where it comes from so it can query the appropriate DB
            attribute_interaction_id = PianaGraphEdgeAttribute(interaction_id, piana_access= piana_access_to_use, mem_mode = "inMemory")

            # link_detail_object is used to pass the attributes of two nodes and the edge connecting them
            link_detail_object = GraphBuilder.LinkDetail(node1_id= node_id,
                                                         node2_id= partner_id,
                                                         edge_id= interaction_id )

            # adding attribute of first node
            link_detail_object.set_attribute(attribute_object= attribute_node_id, node_id= node_id)

            # adding attribute of second node
            link_detail_object.set_attribute(attribute_object= attribute_partner_id,  node_id= partner_id)

            # adding attribute of edge
            link_detail_object.set_attribute(attribute_object= attribute_interaction_id)

            list_link_detail_objects.append(link_detail_object)

        # END OF for partner_id in list_partners:

        if verbose:
            sys.stderr.write( "returning the list of link detail objects\n")

        return list_link_detail_objects


