"""
File        : string2piana.py
Author      : Ramon Aragues
Creation    : 21.01.2005
Contents    : script that fills up tables in database piana related to external DB "string"
Called from : 

=======================================================================================================

This program reads database STRING. Then, inserts relevant information into database PIANA 


option --start-at=interaction_number permits the partitioning of the piana insertions. This has been done to
allow the user to update piana in several steps, since it takes so long to do all of them. 


Command line option '--help' describes usage of this program


"""

# string2piana.py: script that fills up tables in database piana related to external DB "string"
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues


import sys
import getopt
import re
import readline

import MySQLdb

from PianaDBaccess import *
import utilities

# Verbosity switches (0 = off, 1 = on). Only "verbose" is reachable from the command line (--verbose);
# the other two must be edited here
verbose = verbose_detailed = verbose_cog = 0

# ---------------------------------------------------------------
# Default values for command line arguments
# ---------------------------------------------------------------
#
# Values used by the program when the corresponding option does not appear on the command line
#

# --start-at: number of the interaction from which the program will start piana insertions
interaction_number = 0

# --insert-cogs / --insert-ints: neither cogs nor interactions are inserted unless explicitly requested
insert_cogs, insert_ints = 0, 0

# description of the method used for finding the interactions (passed to piana with each interaction inserted)
methodDescription_value = "predicted"

# ----------------------
# Function usage()
# ----------------------
def usage():
    """
    Prints to stdout the command line help of this program

    The default shown for --start-at is read from module-level variable "interaction_number"

    The text is written with sys.stdout.write (one single call) instead of print statements: the output
    is the same, and the function does not depend on the print syntax of a particular python version
    """
    help_lines = [
        "--------------------------------------------------------------------------------------------------------------",
        "This program fills up tables in database piana related to external DB 'string' \n",
        "Usage: python string2piana.py --start-at=interaction_number  --string-dbname=string_dbname --string-dbhost=string_dbhost ",
        "                              --string-dbuser=string_dbuser --string-dbpass=string_dbpass",
        "                              --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass",
        "                              --insert-cogs --insert-ints --score-thres=score_thres --number-seqs-threshold=number_seqs_threshold",
        "                              [--help] [--verbose]",
        "\nwhere:",
        "     piana_dbname   : name of database piana to be used (required)",
        "     piana_dbhost   : name of host where database piana to be used is placed (required)",
        "     piana_dbuser   : username accessing the database (not required in most systems)",
        "     piana_dbpass   : password of username accessing the database (not required in most systems)",
        "     interaction_number : number of interaction from which the program will start piana insertions (default is %s)" %(interaction_number),
        "     string_dbname   : name of the string mysql database (required)",
        "     string_dbhost   : name of the machine with string mysql server (required)",
        "     string_dbuser   : name of the mysql string username (not required in most systems)",
        # this line used to describe the password as "name of the mysql string username"
        "     string_dbpass   : password of the mysql string username (not required in most systems)",
        "     score_thres     : minimum STRING combined score required for inserting interaction into piana_dbname",
        "     number_seqs_threshold : avoid introducing noise to PIANA by setting a threshold on how many sequences can be associated to a STRING code",
        "     --insert-cogs  : inserts cogs (and kogs) for proteins in string",
        "     --insert-ints  : inserts string interactions",
        "     --help         : prints this message and exits",
        "     --verbose      : prints process info to stdout",
        "--------------------------------------------------------------------------------------------------------------",
    ]

    sys.stdout.write("\n".join(help_lines) + "\n")

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """
    Reads the command line (sys.argv) and stores the values found in module-level variables of this script

    Module-level variables that can be set (only those whose option appears on the command line are touched):

       interaction_number, score_thres, number_seqs_threshold      (converted to int)
       string_dbname, string_dbhost, string_dbuser, string_dbpass  (kept as strings)
       piana_dbname, piana_dbhost, piana_dbuser, piana_dbpass      (kept as strings)
       insert_cogs, insert_ints, verbose                           (set to 1)

    Exits with status 2 if:
       - the command line contains an unknown option or an option is missing its value
       - an option that requires an integer receives something else (this used to end in a ValueError traceback)
       - --help is given (after printing the usage message)
    """
    # options that take an integer value: option --> name of the module-level variable to set
    int_options = {"--start-at": "interaction_number",
                   "--score-thres": "score_thres",
                   "--number-seqs-threshold": "number_seqs_threshold"}

    # options that take a string value: option --> name of the module-level variable to set
    string_options = {"--string-dbname": "string_dbname",
                      "--string-dbhost": "string_dbhost",
                      "--string-dbuser": "string_dbuser",
                      "--string-dbpass": "string_dbpass",
                      "--piana-dbname": "piana_dbname",
                      "--piana-dbhost": "piana_dbhost",
                      "--piana-dbuser": "piana_dbuser",
                      "--piana-dbpass": "piana_dbpass"}

    # options without value: option --> name of the module-level variable that is set to 1
    flag_options = {"--insert-cogs": "insert_cogs",
                    "--insert-ints": "insert_ints",
                    "--verbose": "verbose"}

    try:
        opts, args = getopt.getopt(sys.argv[1:], "", ["verbose","help","start-at=",
                                                      "string-dbname=","string-dbuser=","string-dbhost=","string-dbpass=","score-thres=",
                                                      "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass=",
                                                      "insert-cogs", "insert-ints", "number-seqs-threshold="])

    except getopt.GetoptError:
        # the exception is taken from sys.exc_info() so this clause does not depend on the
        # "except X, name" / "except X as name" syntax of a particular python version
        msg = sys.exc_info()[1]
        sys.stderr.write( "\n\n--\ncommand line arguments are not correct: %s\n--\n\n" %(msg))
        sys.exit(2)

    # writing to globals() is equivalent to declaring each variable "global" and assigning to it
    script_variables = globals()

    for option,value in opts:

        if option == "--help":
            # print help information and exit
            usage()
            sys.exit(2)

        elif option in flag_options:
            script_variables[flag_options[option]] = 1

        elif option in string_options:
            script_variables[string_options[option]] = value

        elif option in int_options:
            try:
                script_variables[int_options[option]] = int(value)
            except ValueError:
                sys.stderr.write( "\n\n--\ncommand line arguments are not correct: option %s requires an integer (received '%s')\n--\n\n" %(option, value))
                sys.exit(2)
    # END OF for option,value in opts:

def get_associated_proteinPianas(piana_access, string_cursor, return_source_db="no", string_protein_id=None, number_seqs_threshold=1000):
    """
    returns the list of proteinPianas associated to STRING protein "string_protein_id"

    "piana_access" is the object used to query piana (its method get_list_protein_piana is called)

    "string_cursor" is a cursor on the STRING database: it is only used when the code in "string_protein_id"
    does not lead to any proteinPiana

    "return_source_db" is passed as it is to get_list_protein_piana (argument source_db_info). The callers in this
    file use "yes" and then read each element of the returned list as a pair (proteinPiana, source_db)

    "string_protein_id" must look like "tax_id.protein_code" (eg. 198094.Q81LD0): anything that does not have
    exactly two fields separated by a dot (None and the empty string included) returns an empty list

    "number_seqs_threshold" is the maximum number of proteinPianas accepted for one STRING id: if more are
    found, an empty list is returned

    The type of code being used is 'guessed' using regexps (utilities.get_code_type): if no type can be inferred
    from the protein code, or nothing is found in piana for it, the identifiers that STRING table
    identifiers_proteins has for the protein are tried
    """
    if not string_protein_id:
        # nothing to search for (None is the default value of the argument)
        return []

    # get the code and the tax id
    fields_protein_id = string_protein_id.split(".")

    if len(fields_protein_id) != 2:
        return []

    string_code = fields_protein_id[1]
    tax_id = int(fields_protein_id[0])

    list_protein_piana = []

    # guess the type of identifier used in this id (a list is returned with potential types being used)
    for code_type in utilities.get_code_type(string_code) or []:

        if code_type == PianaGlobals.geneName_col:
            # string has _ instead of . for gene names
            # (the replaced code is kept in its own variable: it must not be used for the other code types)
            code_to_search = string_code.replace("_", ".")
        else:
            code_to_search = string_code

        list_protein_piana.extend(piana_access.get_list_protein_piana(proteinCode_value = code_to_search,
                                                                      proteinCodeType_value = code_type,
                                                                      tax_id_value= tax_id, source_db_info= return_source_db))
    # END OF for code_type in (list of code types)

    if not list_protein_piana:
        # if nothing found for the code given, search in STRING table identifiers_proteins

        # the id is passed as a query parameter, so the database driver is in charge of quoting it
        string_sqlquery = """SELECT identifier FROM identifiers_proteins where protein_id=%s """
        string_cursor.execute(string_sqlquery, (string_protein_id,))
        protein_identifiers = string_cursor.fetchall()

        for one_identifier in protein_identifiers:
            identifier_code = one_identifier[0]

            for code_type in utilities.get_code_type(identifier_code) or []:

                if code_type == PianaGlobals.geneName_col:
                    # if it is a geneName... ignore: they introduce too much noise...
                    continue

                list_protein_piana.extend( piana_access.get_list_protein_piana(proteinCode_value = identifier_code,
                                                                               proteinCodeType_value = code_type,
                                                                               tax_id_value= tax_id,
                                                                               source_db_info= return_source_db))
    # END OF if not list_protein_piana:

    # avoid returning entries which are too ambiguous... for example, 601.EFTS_SALTY corresponds to more than
    # 200 proteinPianas (there is an emblAccession associated to it (AL627266) which identifies a whole chromosome)
    # We avoid these cases in a not-so-clean way, but I cannot think of another way of doing it... we are just
    # skipping these cases by applying a threshold to the number of proteinPianas that a code can be associated to

    if len(list_protein_piana) > number_seqs_threshold:
        return []

    return list_protein_piana



# ---------------------------
# Function get_species()                                               
# --------------------------- 

# TO DO!!! Not using STRING species information!
# TO DO!!! There is a lot of information in STRING that could be used in PIANA...

def get_species(string_protein_id):
    """
    returns the species_id that STRING table proteins_species has for "string_protein_id" (None if there is no row for it)

    The query is run on module-level cursor "cursorString", so this function can only be called once the
    connection to STRING has been opened. It is not called from anywhere in this file.
    """
    # the id is passed as a query parameter, so the database driver is in charge of quoting it
    sqlquery = """select species_id from proteins_species where gene_id = %s;"""
    cursorString.execute(sqlquery, (string_protein_id,))
    auxspecies = cursorString.fetchall()

    if not auxspecies:
        return None

    # only the first row is used
    return auxspecies[0][0]
 
# ---------------------------
# Function insert_protein_cogs()
# --------------------------- 
   
def insert_protein_cogs(string_cursor):
    """
    insert COG codes from STRING into pianaDB

    "string_cursor" is a cursor on the STRING database: table proteins_orthgroups is read through it

    For each row (protein_id, orthgroup_id) of that table, the orthgroup id is inserted as cog of all the
    proteinPianas found for the protein (get_associated_proteinPianas). The source of the cog is "string_c"
    if the proteinPiana was found through a "completion" and "string" otherwise.

    Uses module-level variables piana_access, number_seqs_threshold and verbose_detailed.
    """

    string_sqlquery = """SELECT protein_id, orthgroup_id FROM proteins_orthgroups"""
    string_cursor.execute(string_sqlquery)

    # all rows are fetched before looping over them: get_associated_proteinPianas can run another query
    # on this same cursor (presumably replacing the pending results of this one)
    protein_codes_ary = string_cursor.fetchall()

    for protein_code_info_fields in protein_codes_ary:

        # the string protein id looks like this: 198094.Q81LD0 or  2214.GSA_METAC or ...
        string_protein_id = protein_code_info_fields[0]

        # the proteinPianas depend only on the protein: they are searched once per row
        list_protein_piana = get_associated_proteinPianas(piana_access= piana_access,
                                                          string_cursor= string_cursor,
                                                          return_source_db="yes",
                                                          string_protein_id=string_protein_id,
                                                          number_seqs_threshold=number_seqs_threshold)

        if not list_protein_piana:
            # nothing in piana for this protein: no cog can be inserted
            continue

        for cog_code in protein_code_info_fields[1:]:
            cog_code = cog_code.strip()

            for protein_piana in list_protein_piana:
                # protein_piana is a pair (proteinPiana, source_db)

                if verbose_detailed:
                    # (this message used to print a variable "code_type" that does not exist in this function)
                    sys.stderr.write( "inserting cog id %s for proteinPiana %s (STRING protein id is %s)\n" %(cog_code, protein_piana[0],
                                                                                                              string_protein_id))

                if protein_piana[1] == "completion":
                    cog_source = "string_c"
                else:
                    cog_source = "string"

                piana_access.insert_protein_cog( cog_id= cog_code,
                                                 proteinPiana_value= protein_piana[0],
                                                 proteinCogSource_value= cog_source)

            # END OF for protein_piana in list_protein_piana:
        # END OF for cog_code in protein_code_info_fields[1:]:
    # END OF for protein_code_info_fields in protein_codes_ary:

# --------
# --------
#  Main()               
# --------                               
# --------

# parameters of the connection to the STRING database (set through the --string-db* options)
string_dbname = string_dbuser = string_dbhost = string_dbpass = None

# score_thres limits the interactions inserted to piana: interactions with a combined_score below it are
# not inserted into piana_dbname (the default 0 means inserting all interactions)
score_thres = 0

# number_seqs_threshold limits the number of proteinPianas (ie. sequences) that can be associated to a STRING protein code
# STRING gives codes associated to whole chromosomes, and this introduces lots of noise in the database...
number_seqs_threshold = 100


# parameters of the connection to the PIANA database (set through the --piana-db* options)
piana_dbname = piana_dbuser = piana_dbhost = piana_dbpass = None

# counters printed at the end of the run (only in verbose mode)
number_of_interactions_no_id = 0      # interactions not inserted because no proteinPiana was found for one of the proteins
number_of_interactions_low_score = 0  # interactions not inserted because their combined score is below score_thres
number_of_interactions_added = 0      # interactions inserted into piana

# read the command line: the values found replace the defaults set above
parseArguments()

# Initialising connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost=piana_dbhost, dbuser=piana_dbuser, dbpassword= piana_dbpass)

# opening connection to MySQL STRING database and creating a cursor to work with the database
#   - user is only passed to MySQLdb if a user or a password was given on the command line
#   - passwd is only passed to MySQLdb if a password was given on the command line
string_connection_args = {"db": string_dbname, "host": string_dbhost}

if not (string_dbuser is None and string_dbpass is None):
    string_connection_args["user"] = string_dbuser

if string_dbpass is not None:
    string_connection_args["passwd"] = string_dbpass

stringdb = MySQLdb.connect(**string_connection_args)
cursorString = stringdb.cursor()


# cogs are only inserted on request (--insert-cogs)
if insert_cogs:
    if verbose:
        sys.stderr.write( "inserting cogs for all proteins in string\n")
    insert_protein_cogs(string_cursor= cursorString)
# END OF if insert_cogs:


# everything below this point deals with interactions: stop here unless they were requested (--insert-ints)
if not insert_ints:
    sys.exit()

# ------------------------------------------------------------------------------------------
# Insertion of STRING interactions into piana
#
# Table precomputed_protein_links is read in chunks of query_size rows. For each row (one interaction between
# two STRING proteins), the proteinPianas of both proteins are searched and one piana interaction (plus its
# STRING scores) is inserted for each pair (proteinPiana of protein A, proteinPiana of protein B)
# ------------------------------------------------------------------------------------------

def _unique_proteinPianas(list_proteinPiana_source):
    """
    returns a dictionary {proteinPiana: 'standard' or 'completion'} from a list of pairs (proteinPiana, source_db)

    Removes redundant proteinPianas (redundancies happen between different sourceDBs). A proteinPiana is labelled
    'completion' only if all its pairs have "completion" as source_db
    """
    dic_proteinPiana = {}
    for proteinPiana_source in list_proteinPiana_source:
        # proteinPiana_source is a tuple
        #    proteinPiana_source[0] --> proteinPiana
        #    proteinPiana_source[1] --> source_db

        if proteinPiana_source[1] != "completion":
            dic_proteinPiana[proteinPiana_source[0]] = 'standard'
        else:
            # never overwrite a 'standard' label with 'completion'
            dic_proteinPiana.setdefault(proteinPiana_source[0], 'completion')

    return dic_proteinPiana


if verbose:
    sys.stderr.write( "Obtaining the list of all STRING interactions\n")


# finding out how many queries will be needed to get all String interactions
#  --> if the query is done in one single step, the computer runs out of memory and gets blocked...
sqlquery= """select count(*) from precomputed_protein_links;"""
cursorString.execute(sqlquery)
number_of_interactions = cursorString.fetchall()[0][0]   # fetchall answer of a count comes in [0][0]

query_size = 50000                 # how many interactions will be retrieved in each query (needed because of memory limitations)

# offset (0-based) of the first row of the chunk being processed. Starting at interaction_number (option --start-at,
# default 0) permits re-starting the piana update at a given interaction when a previous run was halted.
# NOTE(review): the query below has no ORDER BY, so paging (and --start-at) assume that the server returns the
#               rows in the same order for every query - confirm
chunk_start = max(0, interaction_number)

while chunk_start < number_of_interactions:
    # Obtaining the list of interactions of this chunk from database STRING

    if verbose:
        sys.stderr.write("%sto%s-" %(chunk_start, chunk_start+query_size))
        sys.stderr.flush()

    sqlquery = """select protein_id_a, protein_id_b, equiv_nscore, equiv_nscore_transferred, equiv_fscore, equiv_pscore, equiv_hscore, array_score, array_score_transferred, experimental_score, experimental_score_transferred, database_score, database_score_transferred, textmining_score, textmining_score_transferred, combined_score  from precomputed_protein_links limit %s,%s;""" %(chunk_start, query_size)
    cursorString.execute(sqlquery)
    list_interactions = cursorString.fetchall()

    interactions_in_chunk = len(list_interactions)   # only used in the messages
    i=0                                              # position (inside the chunk) of the interaction being processed

    for gene_pair in list_interactions:

        # Note: variables are called gene_* because old versions of STRING used gene_id_a and gene_id_b instead of protein_id_a and protein_id_b
        #    --> if you are trying to parse an old version of STRING that still uses gene*, you have to change the following in this parser:
        #                 protein_id_a --> gene_id_a
        #                 protein_id_b --> gene_id_b
        #                 precomputed_protein_links --> precomputed_gene_links
        #                 proteins_orthgroups --> genes_orthgroups
        #                 protein_id --> gene_id
        #                 proteins_species --> genes_species
        #
        # gene_pair[0] = protein_id_a
        # gene_pair[1] = protein_id_b
        # gene_pair[2] = equiv_nscore
        # gene_pair[3] = equiv_nscore_transferred
        # gene_pair[4] = equiv_fscore
        # gene_pair[5] = equiv_pscore
        # gene_pair[6] = equiv_hscore
        # gene_pair[7] = array_score
        # gene_pair[8] = array_score_transferred
        # gene_pair[9] = experimental_score
        # gene_pair[10] = experimental_score_transferred
        # gene_pair[11] = database_score
        # gene_pair[12] = database_score_transferred
        # gene_pair[13] = textmining_score
        # gene_pair[14] = textmining_score_transferred
        # gene_pair[15] = combined_score

        # TO DO!!! work on piana to set confidence thresholds when retrieving interactions
        # so we can insert all interactions in STRING but only use those that are reliable

        if int(gene_pair[15]) < score_thres:
            # it takes too long to insert all interactions regardless of their combined score.
            # Therefore, only those interactions above a certain threshold (--score-thres) are inserted

            if verbose_detailed:
                sys.stderr.write( "%sto%s-Skipping  interaction number %s (score %s) of a total of %s\n" %(chunk_start,
                                                                                                           chunk_start+query_size,
                                                                                                           i, gene_pair[15], interactions_in_chunk))
            i = i+1
            number_of_interactions_low_score += 1
            continue
        # END OF if int(gene_pair[15]) < score_thres:

        if verbose_detailed:
            sys.stderr.write( "%sto%s-%s(p1=%s, p2=%sscore %s)of%s\n" %(chunk_start,
                                                                        chunk_start+query_size,
                                                                        i, gene_pair[0], gene_pair[1], gene_pair[15], interactions_in_chunk))
        i = i+1


        # retrieving the proteinPianas from the String protein ID
        # string_protein_id is an internal code of STRING that looks something like: 117.3MGH_RHOBA
        # the first term is the protein taxonomy id
        # the second term is the protein identifier (swissprot ID, swissprot Accession, ...)

        list_proteinPiana_a = get_associated_proteinPianas(piana_access=piana_access,
                                                           string_cursor=cursorString,
                                                           return_source_db="yes",
                                                           string_protein_id=gene_pair[0],
                                                           number_seqs_threshold=number_seqs_threshold)

        # protein B is only searched if something was found for protein A: otherwise the
        # interaction cannot be inserted, whatever is found for B
        list_proteinPiana_b = []
        if list_proteinPiana_a:
            list_proteinPiana_b = get_associated_proteinPianas(piana_access=piana_access,
                                                               string_cursor=cursorString,
                                                               return_source_db="yes",
                                                               string_protein_id=gene_pair[1],
                                                               number_seqs_threshold=number_seqs_threshold)

        if not list_proteinPiana_a or not list_proteinPiana_b:
            if verbose_detailed:
                sys.stderr.write("%sto%s-proteinPiana not found for %s or %s \n" %(chunk_start,
                                                                                   chunk_start+query_size,
                                                                                   gene_pair[0], gene_pair[1]))
            number_of_interactions_no_id += 1
            continue

        # remove redundant codes (redundancies happen between different sourceDBs) and find out if each
        # proteinPiana comes from a completion (ie. it only comes from a completion)
        dic_proteinPiana_a = _unique_proteinPianas(list_proteinPiana_a)
        dic_proteinPiana_b = _unique_proteinPianas(list_proteinPiana_b)

        for proteinPiana_a in dic_proteinPiana_a:
            for proteinPiana_b in dic_proteinPiana_b:

                if verbose_detailed:
                    sys.stderr.write( "%sto%s-   - inserting interaction number %s: %s(%s) --> %s(%s)\n" %(chunk_start,
                                                                                                           chunk_start+query_size,
                                                                                                           i,
                                                                                                           gene_pair[0], proteinPiana_a,
                                                                                                           gene_pair[1], proteinPiana_b))

                # the interaction is labelled as coming from a completion if any of its two proteins does
                if dic_proteinPiana_a[proteinPiana_a] == "completion" or dic_proteinPiana_b[proteinPiana_b] == "completion":
                    source_db = "string_c"
                else:
                    source_db = "string"

                # TO DO! piana_combined_score must somehow be calculated internally for each interaction

                last_interaction_id = piana_access.insert_interaction(proteinPianaA_value =proteinPiana_a,
                                                                      isSourceA_value =1,
                                                                      proteinPianaB_value =proteinPiana_b,
                                                                      isSourceB_value =1,
                                                                      interactionConfidence_value = 1,
                                                                      methodDescription_value = methodDescription_value,
                                                                      sourceDBDescription_value= source_db,
                                                                      confidenceAssignedSourceDB_value = gene_pair[15])

                # as we have kept the interactionPiana, we now insert specific scores for it
                piana_access.insert_interaction_scores(interactionPiana_value = last_interaction_id,
                                                       sourceDBDescription_value= source_db ,
                                                       equiv_nscore_value = gene_pair[2],
                                                       equiv_nscore_transferred_value = gene_pair[3],
                                                       equiv_fscore_value = gene_pair[4],
                                                       equiv_pscore_value = gene_pair[5],
                                                       equiv_hscore_value = gene_pair[6],
                                                       array_score_value = gene_pair[7],
                                                       array_score_transferred_value = gene_pair[8],
                                                       experimental_score_value = gene_pair[9],
                                                       experimental_score_transferred_value = gene_pair[10],
                                                       database_score_value = gene_pair[11],
                                                       database_score_transferred_value = gene_pair[12],
                                                       textmining_score_value = gene_pair[13],
                                                       textmining_score_transferred_value = gene_pair[14],
                                                       combined_score_value = gene_pair[15])
                number_of_interactions_added += 1
            # END OF for proteinPiana_b in dic_proteinPiana_b:
        # END OF for proteinPiana_a in dic_proteinPiana_a:

    # END OF for gene_pair in list_interactions:

    # the next chunk starts right after the last row of this one (the offset used to be advanced
    # by query_size + 1, which skipped one interaction between consecutive chunks)
    chunk_start = chunk_start + query_size

# END OF while chunk_start < number_of_interactions

if verbose:
    sys.stderr.write( "All done! Number of ints added: %s. Number of ints without proteinPiana: %s. Num of ints low score: %s\n\n" %(
        number_of_interactions_added,
        number_of_interactions_no_id,
        number_of_interactions_low_score
        ))
