"""
File        : complete_piana.py
Author      : Ramon Aragues
Creation    : 13.12.2004
Contents    : program that populates piana with information inferred from itself
Called from : 

=======================================================================================================

This file implements a program that populates piana with information inferred from itself. Specific actions that this program does:

- completes protein codes, by assigning existing proteinPianas to external codes that were linked to an equivalent proteinPiana
- fills up table proteinSimilarity with pairs of proteinPianas that seem to be the same (have common external codes)


"""

# complete_piana.py: program that populates piana with information inferred from itself
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt

import re
import readline
import MySQLdb

from sets import *


from PianaDBaccess import *
import PianaGlobals

# Verbosity switches for the progress messages written to stderr (0 = off).
# "verbose" is set to 1 by the --verbose command-line option in parseArguments().
# The two more detailed levels are not touched by parseArguments(); within the code
# visible in this file they can only be enabled by editing these values.
verbose = 0
verbose_detailed= 0
verbose_very_detailed= 0

# ----------------------
# Function usage()
# ----------------------
def usage():
    """Write the command-line help text of complete_piana.py to standard output."""
    help_lines = (
        "\n--------------------------------------------------------------------------------------------------------------",
        "This program populates piana with information inferred from itself\n",
        "Usage: python complete_piana.py  --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass",
        "              --insert-similarity --insert-associated-codes [--help] [--verbose]",
        "\nwhere:",
        "     piana_dbname : name of database piana to be used (required)",
        "     piana_dbhost : name of host where database piana to be used is placed (required)",
        "     piana_dbuser : username accessing the database (not required in most systems)",
        "     piana_dbpass : password of username accessing the database (not required in most systems)",
        "     --insert-similarity       : fills table proteinSimilarity with pairs of proteins that have a common external code (uniacc, unientry or gi)",
        "     --insert-associated-codes : inserts associations (ext_code, proteinPiana) found by transitive relations",
        "     --help       : prints this message and exits",
        "     --verbose    : prints process info to stderr",
        "--------------------------------------------------------------------------------------------------------------",
    )

    for help_line in help_lines:
        # every entry is emitted as its own newline-terminated line
        sys.stdout.write(help_line + "\n")
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """Parse the command line into the module-level configuration globals.

    Reads sys.argv[1:] with getopt (long options only) and sets:

      piana_dbname, piana_dbhost, piana_dbuser, piana_dbpass
          the value given to the matching --piana-db*= option
      insert_similarity, insert_associated_codes
          1 when the corresponding flag is present
      verbose
          1 when --verbose is present

    A global whose option is absent from the command line is left untouched,
    so the caller has to give each of them a default before calling this
    function. Positional arguments are accepted by getopt and ignored.

    Exits the program with status 2, after printing the usage text, when an
    option is unknown or malformed, and also when --help is given.
    """

    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass

    global insert_similarity
    global insert_associated_codes

    global verbose

    long_options = ["verbose", "help", "insert-similarity", "insert-associated-codes",
                    "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass="]

    try:
        opts, args = getopt.getopt(sys.argv[1:], "", long_options)
    except getopt.GetoptError:
        # The exception is fetched through sys.exc_info() so that the handler does not
        # depend on an interpreter-version-specific "except ... <name>" binding syntax.
        bad_opt = sys.exc_info()[1]

        # print the error (newline-terminated, so the stderr output ends on a
        # complete line), then the help information, and exit
        sys.stderr.write(str(bad_opt) + "\n")
        usage()
        sys.exit(2)

    for option, value in opts:

        if option == "--piana-dbname":
            piana_dbname = value

        elif option == "--piana-dbhost":
            piana_dbhost = value

        elif option == "--piana-dbuser":
            piana_dbuser = value

        elif option == "--piana-dbpass":
            piana_dbpass = value

        elif option == "--insert-similarity":
            insert_similarity = 1

        elif option == "--insert-associated-codes":
            insert_associated_codes = 1

        elif option == "--verbose":
            verbose = 1

        elif option == "--help":
            # print help information and exit
            # NOTE(review): exit status 2 is kept for backward compatibility, although a
            # successful --help conventionally exits with 0 -- confirm before changing.
            usage()
            sys.exit(2)
            
# --------
# --------
#  Main()               
# --------                               
# --------

# Defaults for the settings that parseArguments() overwrites from the command line
# (parseArguments() leaves a setting untouched when its option is not given).
piana_dbname = None
piana_dbuser = None
piana_dbhost = None
piana_dbpass = None

insert_similarity = 0
insert_associated_codes = 0

# parsing arguments from the command line
parseArguments()

# Initialising connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost=piana_dbhost, dbuser=piana_dbuser, dbpassword= piana_dbpass)

# Counters for the final report. They are only incremented, and only printed, in verbose mode.
number_of_new_correspondences_inserted = 0
number_new_proteinPianas = 0

if verbose:
    sys.stderr.write("Obtaining all proteinPianas in database...\n")

# get all proteinPianas in the database
list_proteinPiana = piana_access.get_all_proteinPiana()

number_of_proteins = len(list_proteinPiana)
i = 0   # progress index shown in the verbose output (only advanced in verbose mode)

# The only external code types used to relate proteinPianas to each other. All other types are
# skipped: codes that will never have more than one proteinPiana (ie. md5, sequence, proteinPiana),
# those that are not a unique ID (ie. interpro) and those that are not reliable (the rest)
completion_code_types = ("uniacc", "unientry", "gi")


for proteinPiana in list_proteinPiana:
    # for each protein in the database, do:
    #    - find all external codes (gi, uniprot entry, uniprot accession) associated to it
    #    - find all proteinPianas associated to these external codes
    #    - insert new info found to pianaDB

    if verbose:
        sys.stderr.write("%s:%s of %s=" %(proteinPiana, i, number_of_proteins))
        i += 1

    # will be used later to determine whether completion must be done or not (only completing between proteins of same species)
    proteinPiana_tax_id = Set(piana_access.get_protein_taxonomy_ids(proteinPiana_value = proteinPiana) )

    # ---
    # Step 1: collect the external codes of this protein
    # ---
    list_pairs_type_extcodes  = []  # list of tuples ( (table, column), extcode )

    for valid_type in PianaGlobals.valid_protein_types:

        if valid_type not in completion_code_types:
            continue

        # here, only working with types: unientry, uniacc, gi

        ext_codes = piana_access.get_list_protein_external_codes(proteinPiana= proteinPiana,
                                                                 protein_type_name=valid_type,
                                                                 alternative_type_names=[],
                                                                 answer_mode= "list",
                                                                 source_db_info="yes"   )

        # I have to retrieve both table and column, because some methods need the table, and others the column
        pair_table_column = utilities.get_code_table_column(code_type_name= valid_type)

        if verbose_detailed:
            sys.stderr.write("for type %s there are %s extcodes (%s) associated to pp %s\n" %(valid_type, len(ext_codes),ext_codes, proteinPiana))

        dic_codes_added = {}   # codes already collected for this code type (keys only, values unused)
        for ext_code in ext_codes:
            # ext_code is a tuple (ext_code, sourceDBID)
            code_value = ext_code[0]
            code_source = ext_code[1]

            if code_source == "completion":
                # do not complete codes that have been completed
                continue

            if code_value in dic_codes_added:
                # prevent adding codes that have already been added for this protein code type
                continue

            dic_codes_added[code_value] = None

            list_pairs_type_extcodes.append( (pair_table_column, code_value) )

    # END OF for valid_type in PianaGlobals.valid_protein_types:

    # ---
    # Step 2: collect the proteinPianas that share any of those external codes
    # ---
    set_associated_proteinPianas = Set([])

    # Bound here so that the name always exists: when this protein has no usable external code the
    # loop below never runs, and the "del ext_code_proteinPianas" at the end of the iteration
    # would otherwise raise a NameError.
    ext_code_proteinPianas = []

    for pair_type_extcode in list_pairs_type_extcodes:
        # pair_type_extcode[0][1] is the column of the protein type
        # pair_type_extcode[0][0] is the table of the protein type
        # pair_type_extcode[1] is the code
        code_column = pair_type_extcode[0][1]
        code_value = pair_type_extcode[1]

        ext_code_proteinPianas = piana_access.get_list_protein_piana(proteinCode_value= code_value,
                                                                     proteinCodeType_value= code_column,
                                                                     tax_id_value= 0,
                                                                     source_db_info="yes" )

        if verbose_detailed:
            sys.stderr.write("for code %s (%s) there are %s proteinPianas (%s) associated to ext_code %s\n" %(code_value,
                                                                                                              code_column,
                                                                                                              len(ext_code_proteinPianas),
                                                                                                              ext_code_proteinPianas,
                                                                                                              code_value        ))
        for ext_code_proteinPiana in ext_code_proteinPianas:
            # ext_code_proteinPiana[0] is proteinPiana
            # ext_code_proteinPiana[1] is source_db

            if ext_code_proteinPiana[1] != "completion":
                # do not complete codes that come from a completion
                set_associated_proteinPianas.add(ext_code_proteinPiana[0])
    # END OF for pair_type_extcode in list_pairs_type_extcodes:

    if verbose_very_detailed:
        sys.stderr.write("pps associated are: %s\n" %(set_associated_proteinPianas))

    if verbose_detailed:
        sys.stderr.write("----%s ProteinPianas associated to pp %s----\n" %(len(set_associated_proteinPianas),
                                                                            proteinPiana))

    if verbose:
        sys.stderr.write("%s--" %(len(set_associated_proteinPianas)))

    # ---
    # Step 3: insert the similarities and/or code associations that were requested
    # ---
    for associated_proteinPiana in set_associated_proteinPianas:

        if associated_proteinPiana == proteinPiana:
            # do not need to insert the current proteinPiana
            continue

        if verbose:
            # NOTE(review): counted before the taxonomy filter below, so proteins that are
            # then skipped are included in this number -- confirm this is intended.
            number_new_proteinPianas += 1

        associated_proteinPiana_tax_id = Set(piana_access.get_protein_taxonomy_ids(proteinPiana_value = associated_proteinPiana) )

        if not proteinPiana_tax_id.intersection(associated_proteinPiana_tax_id):
            # the two proteins have no taxonomy id in common: nothing is inserted for this pair
            continue

        if insert_similarity:
            piana_access.insert_protein_similarity(proteinPiana_a_value=proteinPiana, proteinPiana_b_value=associated_proteinPiana)

        if insert_associated_codes:
            for pair_type_extcode in list_pairs_type_extcodes:
                # pair_type_extcode[0][0] is the table of the protein type
                # pair_type_extcode[0][1] is the column of the protein type
                # pair_type_extcode[1] is the code
                code_table = pair_type_extcode[0][0]
                code_column = pair_type_extcode[0][1]
                code_value = pair_type_extcode[1]

                if verbose_detailed:
                    sys.stderr.write("Inserting proteinPiana %s for code %s\n" %(associated_proteinPiana , code_value))

                if associated_proteinPiana not in piana_access.get_list_protein_piana( proteinCode_value= code_value,
                                                                                       proteinCodeType_value= code_column,
                                                                                       tax_id_value= 0, source_db_info= "no"):
                    # if this proteinPiana is not already associated to that external code, insert it
                    if verbose:
                        number_of_new_correspondences_inserted += 1

                    piana_access.insert_protein_code( code_table=code_table,
                                                      proteinPiana=associated_proteinPiana,
                                                      code_value=code_value,
                                                      sourceDBID= "completion")

            # END OF for pair_type_extcode in list_pairs_type_extcodes:
        # END OF if insert_associated_codes:
    # END OF for associated_proteinPiana in set_associated_proteinPianas:

    # drop the per-protein containers before moving on to the next protein
    del set_associated_proteinPianas
    del list_pairs_type_extcodes
    del ext_code_proteinPianas
# END OF for proteinPiana in list_proteinPiana:


if verbose:
    sys.stderr.write("Number of proteinPianas transferred to other external codes: %s\n" %(number_new_proteinPianas))
    sys.stderr.write("Number of new correspondences established: %s\n" %(number_of_new_correspondences_inserted))
