"""
File        : psi_flat2piana.py
Author      : Ramon Aragues
Creation    : 20.10.2004
Contents    : script that fills up tables in database piana from a  "psi_flat" interactions file
Called from : 

=======================================================================================================

This program parses a text file with interactions in psi_flat format and inserts them into piana

format of the input file is described in README.psi_flat_format

Command line option '--help' describes usage of this program


Attention! If using this parser for a new database, you must make sure of the following:

- you have adapted the full name parsing to the new database full name (read comments below)
- your database appears in PianaGlobals under source_database and interaction_databases
- in PianaGlobals, there is a color associated to your database

"""

import sys
import getopt

import re
import readline

import MySQLdb

from PianaDBaccess import *

# Verbosity flag: when true, the script writes progress information to stderr.
# parseArguments() can only set it to 1 (options -v / --verbose).
# NOTE(review): since the default is already 1, passing --verbose currently
# changes nothing -- confirm whether the intended default is 0.
verbose = 1


# ----------------------
# Function usage()
# ----------------------
def usage():
    """
    Writes the command line help of this script to standard output,
    one help line at a time.
    """
    help_lines = (
        "--------------------------------------------------------------------------------------------------------------",
        "This program fills up tables in database piana related to external DB 'psi_flat' \n",
        "Usage: python psi_flat2piana.py  --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass",
        "                              --psi_flat-file=psi_flat_file --source-database=source_database [--help] [--verbose]",
        "\nwhere:",
        "     piana_dbname : name of database piana to be used (required)",
        "     piana_dbhost : name of host where database piana to be used is placed (required)",
        "     piana_dbuser : username accessing the database (not required in most systems)",
        "     piana_dbpass : password of username accessing the database (not required in most systems)",
        "     psi_flat_file  : the name of the psi_flat input file",
        "     source_database: the name of the database that you are parsing (eg. mips, hprd, ...)",
        "     --help         : prints this message and exits",
        "     --verbose      : prints process info to stdout",
        "--------------------------------------------------------------------------------------------------------------",
    )

    # each entry is followed by a newline, exactly as a print statement would do
    for help_line in help_lines:
        sys.stdout.write(help_line + "\n")
        
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """
    Parses the command line (sys.argv[1:]) and stores the option values
    in module-level globals.

    Globals that can be assigned here:
        psi_flat_file   (-f, --psi_flat-file)
        source_database (-s, --source-database)
        piana_dbname    (-n, --piana-dbname)
        piana_dbhost    (-o, --piana-dbhost)
        piana_dbuser    (-u, --piana-dbuser)
        piana_dbpass    (-w, --piana-dbpass)
        verbose         (-v, --verbose) -> set to 1

    A global is only assigned when its option appears on the command line,
    so psi_flat_file and source_database must already exist (the script
    initialises them to None before calling this function) for the final
    validation to work when they are omitted.

    Exits with status 2 when getopt rejects the command line.
    Exits with status 0 after printing the usage when -h/--help is given
    (asking for help is not an error).

    Raises ValueError if no source database or no psi_flat file was set.
    """
    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass

    global psi_flat_file
    global source_database

    global verbose

    short_options = "vhf:s:n:o:u:w:"
    long_options = ["verbose", "help", "psi_flat-file=", "source-database=",
                    "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass="]

    try:
        opts, _positional = getopt.getopt(sys.argv[1:], short_options, long_options)
    except getopt.GetoptError:
        # the exception is fetched through sys.exc_info() so that this clause
        # parses both on old and new interpreters ("except X, e" is Python 2 only)
        msg = sys.exc_info()[1]
        sys.stderr.write( "\n\n--\ncommand line arguments are not correct: %s\n--\n\n" %(msg))
        sys.exit(2)

    for option, value in opts:

        if option in ("-f", "--psi_flat-file"):
            psi_flat_file = value

        elif option in ("-s", "--source-database"):
            source_database = value

        elif option in ("-n", "--piana-dbname"):
            piana_dbname = value

        elif option in ("-o", "--piana-dbhost"):
            piana_dbhost = value

        elif option in ("-u", "--piana-dbuser"):
            piana_dbuser = value

        elif option in ("-w", "--piana-dbpass"):
            piana_dbpass = value

        elif option in ("-v", "--verbose"):
            verbose = 1

        elif option in ("-h", "--help"):
            # print help information and exit successfully
            usage()
            sys.exit(0)

    # END OF for option,value in opts:

    if source_database is None:
        raise ValueError("You must set a source database (eg. mips, hprd, ...)\n")

    if psi_flat_file is None:
        raise ValueError("You must set a psi-flat-file to parse\n")


def get_protein_type_from_db_key(db_key = None):
    """
    Maps a source database key to the list of piana code-type columns
    that should be searched for a code coming from that database.

    A list is returned (instead of a single column) because knowing the
    source database is not enough to know the exact type of code
    (eg. a SP code could be a unientry or a uniacc)

    Keys currently handled (those found in MIPS files):
          - SP (SwissProt) and TREMBL (TrEMBL): swissProt ID and swiss accession columns
          - GB (GenBank): embl accession and embl PID columns

    Any other key (including None) produces an empty list. Keys are case-sensitive.
    """
    if db_key in ("SP", "TREMBL"):
        return [PianaGlobals.swissProtID_col, PianaGlobals.swissAccessionID_col]

    if db_key == "GB":
        return [PianaGlobals.emblAccessionID_col, PianaGlobals.emblPID_col]

    # unknown source database: no column to search in
    return []
        

def get_proteinPianas(primary_db = None, primary_code = None, secondary_db = None, secondary_code = None, full_name = None):
    """
    Returns a list (without repetitions, in order of discovery) of the
    proteinPianas found in piana for the protein described in arguments.

    Each element is whatever piana_access.get_list_protein_piana returns
    when called with tax_id_value=0 and source_db_info="yes".

    Search strategy:

      - primary_code different from "": the code is searched under every
        column given by get_protein_type_from_db_key(primary_db). Then, if
        secondary_code is different from "", it is searched in the same way
        using secondary_db. The full name is never used in this case (too
        complicated and error prone).

      - primary_code equal to "": the secondary code is ignored and the
        full name is used. ATTENTION!!! Full name parsing highly depends on
        the database being parsed: the code below is for MIPS full names.
    """
    found = []

    def collect(code, code_type):
        # appends to found the proteinPianas of this code that are not there yet
        for candidate in piana_access.get_list_protein_piana(proteinCode_value= code,
                                                             proteinCodeType_value= code_type,
                                                             tax_id_value= 0, source_db_info= "yes"):
            if candidate not in found:
                found.append(candidate)

    if primary_code != "":

        # columns to search are determined by the db of the primary identifier
        for code_type in get_protein_type_from_db_key(db_key = primary_db):
            collect(primary_code, code_type)

        if secondary_code != "":

            # columns to search are determined by the db of the secondary identifier
            for code_type in get_protein_type_from_db_key(db_key = secondary_db):
                collect(secondary_code, code_type)

        return found

    # there is no primary code... try to make some sense of the full name
    #
    # A MIPS full name is a semicolon-separated list where the last element is
    # the description and the previous ones are gene names. It looks like this:
    #                   ARG;ABL2;ABLL; nonreceptor protein-tyrosine kinase Arg
    #                   TRP3; calcium influx channel protein
    #                   MIP-2A; MBP-1 interacting protein-2A (MIP-2A)
    name_atoms = full_name.split(";")

    # every atom except the last one (the description) is searched as a gene name
    for gene_name in name_atoms[:-1]:
        collect(gene_name.strip(), PianaGlobals.geneName_col)

    return found
        

# --------
# --------
#  Main()               
# --------                               
# --------

# Defaults for the globals filled in by parseArguments()
piana_dbname = None
piana_dbuser = None
piana_dbhost = None
piana_dbpass = None

psi_flat_file = None
source_database = None

# Counters reported at the end of the run
number_of_interactions_added = 0
number_of_interactions_no_id = 0
number_of_lines = 0

# parsing arguments from the command line
parseArguments()


# Initialising connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost=piana_dbhost, dbuser=piana_dbuser, dbpassword= piana_dbpass)

psi_flat_fd = open(psi_flat_file,"r")

#
# Reading external DB "psi_flat" and inserting its data into piana DB
#

if verbose:
    sys.stderr.write("Reading data from psi_flat file...\n")

# try/finally guarantees the input file is closed even if an insertion fails
try:
    for line in psi_flat_fd:

        number_of_lines += 1

        if number_of_lines == 1:
            # skip header line
            continue

        # Retrieving different fields in line
        line_fields = line.split("||")  # line_fields[0] is method used to detect interaction
                                        # line_fields[1] is full name for protein interactor A
                                        # line_fields[2] is db of primary Ref identifier of protein A
                                        # line_fields[3] is id of primary Ref of protein A
                                        # line_fields[4] is db of secondary Ref identifier of protein A
                                        # line_fields[5] is id of secondary Ref of protein A
                                        # line_fields[6] is full name for protein interactor B
                                        # line_fields[7] is db of primary Ref identifier of protein B
                                        # line_fields[8] is id of primary Ref of protein B
                                        # line_fields[9] is db of secondary Ref identifier of protein B
                                        # line_fields[10] is id of secondary Ref of protein B

        if len(line_fields) != 11:
            # lines that do not have exactly 11 fields are ignored
            continue

        (method,
         full_name_protein_a, db_primary_protein_a, id_primary_protein_a,
         db_secondary_protein_a, id_secondary_protein_a,
         full_name_protein_b, db_primary_protein_b, id_primary_protein_b,
         db_secondary_protein_b, id_secondary_protein_b) = [line_field.strip() for line_field in line_fields]

        # ATTENTION!!! If using this parser for a new file (now it works for MIPS) you have to make sure of the following:
        #
        #  1. all methods in your file appear in PianaGlobals.method_names
        #  2. all db_primary_protein_x are considered in function get_protein_type_from_db_key
        #  3. full name processing varies depending on the kind of full name used by your file... modify code accordingly

        if verbose:
            sys.stderr.write("-------------------------------------------------------------------------\n")
            sys.stderr.write( "interaction read is p<%s> --> p<%s> with method %s\n" %(id_primary_protein_a, id_primary_protein_b, method ))
            sys.stderr.write( "interaction read is s<%s> --> s<%s> with method %s\n" %(id_secondary_protein_a, id_secondary_protein_b, method ))
            sys.stderr.write( "interaction read is f<%s> --> f<%s> with method %s\n" %(full_name_protein_a, full_name_protein_b, method ))

        list_proteinPiana_a = get_proteinPianas( primary_db = db_primary_protein_a, primary_code=id_primary_protein_a ,
                                                 secondary_db = db_secondary_protein_a, secondary_code = id_secondary_protein_a,
                                                 full_name = full_name_protein_a)

        list_proteinPiana_b = get_proteinPianas( primary_db = db_primary_protein_b, primary_code=id_primary_protein_b ,
                                                 secondary_db = db_secondary_protein_b, secondary_code = id_secondary_protein_b,
                                                 full_name = full_name_protein_b)

        if verbose:
            sys.stderr.write( "inserting interaction: %s --> %s with method %s\n" %(list_proteinPiana_a, list_proteinPiana_b ,
                                                                                    piana_access.get_methodID(method) ))

        # an interaction is inserted for every combination of proteinPianas found for A and B
        for proteinPiana_a in list_proteinPiana_a:
            for proteinPiana_b in list_proteinPiana_b:
                # proteinPiana_a is a pair (proteinPiana, source_db)
                # proteinPiana_b is a pair (proteinPiana, source_db)

                # when any of the two proteinPianas has source_db "completion",
                # the interaction is labelled with the "_c" variant of the source database
                if proteinPiana_a[1] == "completion" or proteinPiana_b[1] == "completion":
                    source_db_description = source_database + "_c"
                else:
                    source_db_description = source_database

                piana_access.insert_interaction(proteinPianaA_value = proteinPiana_a[0],
                                                isSourceA_value = 1,
                                                proteinPianaB_value = proteinPiana_b[0],
                                                isSourceB_value = 1,
                                                interactionConfidence_value = 1,
                                                methodDescription_value = method,
                                                sourceDBDescription_value = source_db_description,
                                                confidenceAssignedSourceDB_value=1)

                number_of_interactions_added += 1
            # END OF for proteinPiana_b in list_proteinPiana_b:
        # END OF for proteinPiana_a in list_proteinPiana_a:

        if not list_proteinPiana_a or not list_proteinPiana_b:
            # counted regardless of verbosity, so the counter does not depend on logging being enabled
            number_of_interactions_no_id += 1

            if verbose:
                sys.stderr.write("proteinPiana not found for %s (%s) or %s (%s) \n" %(id_primary_protein_a, list_proteinPiana_a,
                                                                                      id_primary_protein_b, list_proteinPiana_b))
finally:
    psi_flat_fd.close()

if verbose:
    sys.stderr.write("All done! Number of interactions added: %s . Number of interactions without proteinPiana: %s\n\n" %(number_of_interactions_added,
                                                                                                                          number_of_interactions_no_id))
