"""
File        : get_num_of_ints_for_proteinPiana_list.py
Author      : Ramon Aragues
Creation    : 11.01.2005
Contents    : script that finds the number of interactions
              in the database for each protein in the input
              file
Called from : 

=======================================================================================================


Command line option '--help' describes usage of this program


"""

# get_num_of_ints_for_proteinPiana_list.py: script that finds the
#     number of interactions in the database for each protein in 
#     the input file
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt

import re
import readline

import MySQLdb

import utilities

from PianaDBaccess import *

# Verbosity flag: when true, the main section writes progress messages to stderr.
# NOTE(review): the default is already 1 and the --verbose option also sets it to 1,
# so the command line option currently has no visible effect.
verbose = 1

# Interaction-method filter, passed unchanged to piana_access.get_all_partners()
# as its list_source_methods / inverse_methods arguments.
# Presumably "all" together with inverse "no" means "do not filter by method"
# -- confirm against PianaDBaccess.get_all_partners().
list_source_methods="all"
inverse_methods="no"

# Alternative hard-coded filter, kept disabled for reference
# (uncomment to replace the two assignments above):
#list_source_methods=["tandaffin"]
#inverse_methods="yes"

# ---------------------------------------------------------------
# Default values for command line arguments
# ---------------------------------------------------------------
#
# The default values are not set here: they are assigned in the main section at the
# bottom of this file, just before parseArguments() is called.
# These will be the values taken by the program when called directly from build_database.py

# ----------------------
# Function usage()
# ----------------------
def usage():
    """
    Write the command line help of this script to standard output.

    Takes no arguments and returns nothing; every entry of the table below is
    written as one output line (entries that already end in a newline therefore
    produce an extra blank line).
    """
    ruler = "--------------------------------------------------------------------------------------------------------------"

    help_text = (
        ruler,
        "This script finds the number of interactions in the database for each protein in the input file\n",
        "Usage: python get_num_of_ints_for_proteinPiana_list.py --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass ",
        "                 --input-file=input_file --input-proteins-type=proteinPiana --output-file=output_file [--help] [--verbose]",
        "\nwhere:",
        "     piana_dbname   : name of database piana to be used (required)",
        "     piana_dbhost   : name of host where database piana to be used is placed (required)",
        "     piana_dbuser   : username accessing the database (not required in most systems)",
        "     piana_dbpass   : password of username accessing the database (not required in most systems)",
        "     input_file     : file with protein names to be retrieved (all if all proteins in piana required)",
        "     input_proteins_type : attention!!!! Must be proteinPiana",
        "     output_file    : file where proteins with num of interactions",
        "     --help         : prints this message and exits",
        "     --verbose      : prints process info to stdout",
        ruler,
    )

    # one write per help line, each terminated by a newline
    for help_line in help_text:
        sys.stdout.write(help_line + "\n")
        

# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """
    Parse sys.argv and store the option values in module-level globals.

    Globals written (only when the matching option is present):
        piana_dbname, piana_dbhost, piana_dbuser, piana_dbpass,
        input_file, input_proteins_type, output_file, verbose

    The function returns nothing. It terminates the program through sys.exit():
        - status 0 after printing the usage message for --help
        - status 2 (after an error message on stderr and the usage message) when
            * the command line cannot be parsed by getopt
            * no input file was given
            * input file is not "all" and the proteins type is not "proteinPiana"
            * no output file was given

    The input_file, input_proteins_type and output_file globals must already
    exist (set to None) before calling, because they are read during validation.
    """

    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass

    global input_file
    global input_proteins_type

    global output_file

    global verbose

    long_options = ["verbose", "help", "input-file=", "input-proteins-type=", "output-file=",
                    "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass="]

    try:
        opts, args = getopt.getopt(sys.argv[1:], "", long_options)
    except getopt.GetoptError:
        # sys.exc_info() is used to fetch the exception because it works on every
        # Python version (neither "except X, e" nor "except X as e" does)
        msg = sys.exc_info()[1]
        sys.stderr.write( "\n\n--\ncommand line arguments are not correct: %s\n--\n\n" %(msg))
        # print help information and exit:
        usage()
        sys.exit(2)

    for option,value in opts:

        if option == "--input-file":
            input_file = value

        elif option == "--output-file":
            output_file = value

        elif option == "--input-proteins-type":
            input_proteins_type = value

        elif option == "--piana-dbname":
            piana_dbname = value

        elif option == "--piana-dbhost":
            piana_dbhost = value

        elif option == "--piana-dbuser":
            piana_dbuser = value

        elif option == "--piana-dbpass":
            piana_dbpass = value

        elif option == "--verbose":
            verbose = 1

        elif option == "--help":
            # asking for help is not an error: print usage and exit successfully
            usage()
            sys.exit(0)

    # END OF for option,value in opts:

    if input_file is None:
        sys.stderr.write("You didn't set an input file\n")
        usage()
        sys.exit(2)

    # the proteins type only matters when codes are read from a file
    if input_file != "all" and input_proteins_type != "proteinPiana":
        sys.stderr.write("You didn't set a valid input proteins type (i.e. it is different from 'proteinPiana')\n")
        usage()
        sys.exit(2)

    if output_file is None:
        sys.stderr.write("You didn't set an output file\n")
        usage()
        sys.exit(2)

        
# --------
# --------
#  Main()               
# --------                               
# --------
# Database connection settings: they stay None unless given on the command line
piana_dbname = piana_dbhost = piana_dbuser = piana_dbpass = None

# Input / output settings: parseArguments() reads these during validation, so
# they must exist before it is called
input_file, input_proteins_type, output_file = None, None, None

# replace the defaults above with the values found on the command line
parseArguments()

# Initialising connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost=piana_dbhost, dbuser=piana_dbuser, dbpassword= piana_dbpass)

if verbose:
    sys.stderr.write( "Reading data from piana...\n")

list_proteinPianas = []                 # proteinPianas whose number of interactions will be written
correspondence_proteinPiana_extcode= {} # keeps track of which ext code corresponds to each proteinPiana.
                                        # used for title when printing num_ints file
code_type = None                        # only needed (and only computed) when codes are read from a file


if input_file == "all":

    # input_proteins_type is allowed to be unset in this mode, so it is not used here
    list_proteinPianas = piana_access.get_all_proteinPiana()

else:

    code_type = utilities.get_code_column(input_proteins_type)

    input_fd = open(input_file,"r")
    try:
        # one protein code per line
        for line in input_fd:

            protein = line.strip()
            if not protein:
                # blank line (e.g. trailing newline at end of file): nothing to look up
                continue

            proteinPianas = piana_access.get_list_protein_piana(proteinCode_value= protein,
                                                                proteinCodeType_value= code_type,
                                                                tax_id_value= 0, source_db_info= "no")

            list_proteinPianas.extend( proteinPianas )

            for proteinPiana in proteinPianas:
                # fill in the dictionary of correspondences
                # (a proteinPiana found for several input codes keeps the last code read)
                correspondence_proteinPiana_extcode[proteinPiana] = protein
            # END OF for proteinPiana in proteinPianas:

        # END OF for line in input_fd
    finally:
        # close the input file even if a database call fails
        input_fd.close()
# END OF else: (if input_file == "all":)

if verbose:
    sys.stderr.write( "Creating files with number of interactions...\n")

num_ints_fd = open(output_file,"w")
try:
    for proteinPiana in list_proteinPianas:

        list_partners = piana_access.get_all_partners(proteinPiana_value= proteinPiana,
                                                      list_source_dbs= "all",
                                                      inverse_dbs="no",
                                                      list_source_methods=list_source_methods ,
                                                      inverse_methods=inverse_methods,
                                                      threshold = 0)
        num_partners = len(list_partners)

        # write in num_ints format (title has external code in case input_file != "all")
        #   input_file == "all" : proteinPiana<TAB>num_partners
        #   otherwise           : proteinPiana<TAB>external code<TAB>num_partners
        if input_file == "all":
            num_ints_fd.write("%s\t%s\n" %(proteinPiana, num_partners))
        else:
            num_ints_fd.write("%s\t%s\t%s\n" %(proteinPiana, correspondence_proteinPiana_extcode[proteinPiana], num_partners) )

    # END OF for proteinPiana in list_proteinPianas:
finally:
    # make sure that what was written so far is flushed and the file is closed
    num_ints_fd.close()
