"""
File        : piana_text_int2piana.py
Author      : Ramon Aragues
Creation    : 12.11.2004
Contents    : script that fills up tables in database piana from a piana text file describing interactions
Called from :  build_database.py

=======================================================================================================

This program parses a piana text file with interactions and inserts them into pianaDB

The format for files accepted as input files for this parser is described in README.piana_interaction_data_format


Command line option '--help' describes usage of this program

"""

# piana_text_int2piana.py: script that fills up tables in database piana from a piana text file describing interactions
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt

import re
import readline

import MySQLdb

from PianaDBaccess import *
import utilities

verbose = 0

# ---------------------------------------------------------------
# Set here the default values for command line arguments
# ---------------------------------------------------------------



# ----------------------
# Function usage()
# ----------------------
def usage():
    """
    Prints the command line help of this script to stdout.

    Takes no arguments and returns None: the caller decides whether to exit afterwards.

    The help text marks --tax-id and --no-species as optional (both have defaults in
    this script) and states that --verbose output goes to stderr, which is where
    every verbose message of this script is written.
    """
    separator = "--------------------------------------------------------------------------------------------------------------"

    help_lines = (
        separator,
        "This program fills up tables in database piana with interactions from piana text file with interactions \n",
        "Usage: python piana_text_int2piana.py  --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost ",
        "                                       --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass ",
        "                                       --input-file=input_file --input-proteins-type=input_proteins_type ",
        "                                       [--tax-id=tax_id] [--no-species] [--help] [--verbose]",
        "\nwhere:",
        "     piana_dbname : name of database piana to be used (required)",
        "     piana_dbhost : name of host where database piana to be used is placed (required)",
        "     piana_dbuser : username accessing the database (not required in most systems)",
        "     piana_dbpass : password of username accessing the database (not required in most systems)",
        "     input_file   : the file with interactions formatted to piana interaction data format (README.piana_interaction_data_format)",
        "     input_proteins_type: the type of code used for proteins in input_file",
        "                           -> valid protein types can be obtained by doing $piana/code/execs/> python piana.py --help",
        "     tax_id : taxonomy id for the proteins in your file (optional, default is 0)",
        "            --> valid values are 0 (unknown or not important) and NCBI taxon ids",
        "            --> if your file uses gene names to identify proteins, setting the tax id is required (gene names are ambiguous)",
        "     --no-species   : inserts all interactions, regardless of the species of proteins",
        "     --help         : prints this message and exits",
        "     --verbose      : prints process info to stderr",
        separator,
    )

    # print(x) with a single argument behaves the same under python 2 and python 3
    for help_line in help_lines:
        print(help_line)
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """
    Parses the command line (sys.argv[1:]) and stores the values found in module-level globals.

    Globals that can be written: input_file, input_proteins_type, piana_dbname,
    piana_dbhost, piana_dbuser, piana_dbpass, tax_id, no_species and verbose.
    A global is only assigned when its option appears on the command line, so
    the defaults must have been set by the caller before calling this function.

    Exits with status 2 (after printing the usage message) when an unknown
    option is given or when --help is requested.

    Raises ValueError when --input-file or --input-proteins-type were not given,
    or when the value of --tax-id is not an integer.
    """
    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass

    global input_file
    global input_proteins_type
    global no_species
    global tax_id

    global verbose

    try:
        opts, _unused_args = getopt.getopt(sys.argv[1:], "", ["verbose", "help", "input-file=", "input-proteins-type=",
                                                              "no-species",
                                                              "piana-dbname=", "piana-dbhost=",
                                                              "piana-dbuser=", "piana-dbpass=",
                                                              "tax-id="])
    except getopt.GetoptError:
        # sys.exc_info() is used instead of "except X, e" / "except X as e" so that
        # this block parses under every python version
        bad_opt = sys.exc_info()[1]

        # print the error (newline-terminated so it does not run into the help text), the help and exit
        sys.stderr.write("%s\n" % bad_opt)
        usage()
        sys.exit(2)

    for option, value in opts:

        if option == "--input-file":
            input_file = value

        elif option == "--input-proteins-type":
            input_proteins_type = value

        elif option == "--piana-dbname":
            piana_dbname = value

        elif option == "--piana-dbhost":
            piana_dbhost = value

        elif option == "--piana-dbuser":
            piana_dbuser = value

        elif option == "--piana-dbpass":
            piana_dbpass = value

        elif option == "--tax-id":
            try:
                tax_id = int(value)
            except ValueError:
                # same exception type as before, but telling the user which option was wrong
                raise ValueError("Value given to --tax-id must be an integer (0 or a NCBI taxon id), but it was '%s'" % value)

        elif option == "--no-species":
            no_species = 1

        elif option == "--verbose":
            verbose = 1

        elif option == "--help":
            # print help information and exit (exit status 2 is kept as it has always been for this script)
            usage()
            sys.exit(2)

    # END OF for option,value in opts:

    if input_file is None:
        raise ValueError("Trying to parse a piana text interaction file without giving a file name")

    if input_proteins_type is None:
        raise ValueError("Trying to parse a piana text interaction file without giving the type of protein code used in the input file")
        
# --------
# --------
#  Main()               
# --------                               
# --------

# Default values for the globals that parseArguments() overwrites: they must exist
# before parseArguments() is called, since it only assigns the options that were given
piana_dbname = None
piana_dbuser = None
piana_dbhost = None
piana_dbpass = None

input_file = None
input_proteins_type = None

tax_id = 0

no_species = 0 # by default 0, will not insert interactions between proteins of different species

# counters reported in the summary written at the end (only when --verbose is set)
number_of_interactions_added = 0
number_of_interactions_dif_species = 0
number_of_interactions_no_id = 0

# parsing arguments from the command line
parseArguments()

# Initialising connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost=piana_dbhost, dbuser=piana_dbuser, dbpassword=piana_dbpass)

# open() is used instead of the deprecated file() builtin; try/finally makes sure the
# file is closed even if something fails while inserting
input_file_fd = open(input_file, "r")

try:
    #
    # Reading "piana_text_int" and inserting its data into piana DB
    #

    if verbose:
        sys.stderr.write("Reading data from input file %s...\n" %input_file)

    code_type = utilities.get_code_column(input_proteins_type)

    for line in input_file_fd:

        # Retrieving different fields in line (split() already removes surrounding whitespace)
        line_fields = line.split()

        if len(line_fields) != 5:
            if verbose:
                sys.stderr.write("Line was not well formatted! Check format used for file! README.piana_interaction_data_format\n")
                sys.stderr.write("(this error might be due to empty lines in your input file: don't worry if it is the case, the data is being inserted anyway)\n")
            continue

        # fields of a well formatted line are: protein_a protein_b source_db method confidence
        protein_a, protein_b, source_db, method = line_fields[:4]
        confidence = int(line_fields[4])

        if verbose:
            sys.stderr.write( "interaction read is %s --> %s with method %s\n" %(protein_a, protein_b, method ))

        list_proteinPiana_a = piana_access.get_list_protein_piana(proteinCode_value= protein_a,
                                                                   proteinCodeType_value= code_type,
                                                                   tax_id_value= tax_id, source_db_info= "no")

        list_proteinPiana_b = piana_access.get_list_protein_piana(proteinCode_value= protein_b,
                                                                  proteinCodeType_value= code_type,
                                                                  tax_id_value= tax_id, source_db_info= "no")

        if not list_proteinPiana_a or not list_proteinPiana_b:
            # at least one of the proteins is unknown to piana: nothing can be inserted for this line.
            # The line is counted regardless of verbose; only the message depends on it
            number_of_interactions_no_id += 1

            if verbose:
                sys.stderr.write("proteinPiana not found for %s (%s) or %s (%s) \n" %(protein_a, list_proteinPiana_a,
                                                                                      protein_b, list_proteinPiana_b))
            continue

        for proteinPiana_a in list_proteinPiana_a:

            # the species of protein a do not depend on protein b: they are retrieved once per protein a
            list_species_a = piana_access.get_protein_taxonomy_ids(proteinPiana_value = proteinPiana_a)

            for proteinPiana_b in list_proteinPiana_b:

                list_species_b = piana_access.get_protein_taxonomy_ids(proteinPiana_value = proteinPiana_b)

                # check if proteins are of the same species. If not, interaction not inserted (unless --no-species was given)
                same_species = 0

                if list_species_a and list_species_b:
                    for species_a in list_species_a:
                        if species_a in list_species_b:
                            same_species = 1
                            break
                else:
                    sys.stderr.write( "Warning! No taxonomy id found for proteinPiana %s (%s) or proteinPiana %s (%s)\n" %(proteinPiana_a,
                                                                                                                           list_species_a,
                                                                                                                           proteinPiana_b,
                                                                                                                           list_species_b))
                    # when no taxonomy is found, we assume it is a real interaction by setting same_species to 1
                    same_species = 1
                # END OF else: (if list_species_a and list_species_b:)

                if same_species or no_species:
                    if verbose:
                        sys.stderr.write( "inserting interaction: %s --> %s with method %s\n" %(proteinPiana_a, proteinPiana_b ,
                                                                                                piana_access.get_methodID(method) ))

                    piana_access.insert_interaction(proteinPianaA_value = proteinPiana_a,
                                                    isSourceA_value = 1,
                                                    proteinPianaB_value = proteinPiana_b,
                                                    isSourceB_value = 1,
                                                    interactionConfidence_value = confidence,
                                                    methodDescription_value = method,
                                                    sourceDBDescription_value = source_db,
                                                    confidenceAssignedSourceDB_value=1)
                    number_of_interactions_added += 1

                else:
                    number_of_interactions_dif_species += 1
                    if verbose:
                        sys.stderr.write( "proteinA %s (species = %s) and proteinB %s (species =%s) are of different species\n" %(proteinPiana_a,
                                                                                                                                  list_species_a,
                                                                                                                                  proteinPiana_b,
                                                                                                                                  list_species_b) )
                # END OF else: (if same_species or no_species:)

            # END OF for proteinPiana_b in list_proteinPiana_b:
        # END OF for proteinPiana_a in list_proteinPiana_a:

    # END OF for line in input_file_fd

finally:
    input_file_fd.close()


if verbose:
    sys.stderr.write( "\n------------------------------------------------------------------------\n")
    sys.stderr.write( "All done! Number of ints added: %s.\n" %(number_of_interactions_added))
    sys.stderr.write( "Number of interactions not inserted: between different species: %s\n" %(number_of_interactions_dif_species))
    sys.stderr.write( "Number of interactions with proteins unknown to PIANA: %s\n" %(number_of_interactions_no_id))
    sys.stderr.write( "------------------------------------------------------------------------\n")
