"""
File        : check_network_proteins_not_in_array.py
Author      : Ramon Aragues
Creation    : 29.05.06 (in MSKCC, NY)
Contents    :  checks which network proteins do not appear in the array genes
Called from : command line

=======================================================================================================

This program checks which network proteins do not appear in the array genes.

Given an input list, builds the network for those proteins and then checks how many of the 
network proteins do not have a corresponding gene in the total list of array genes.


"""

# licence goes here

import sys
import getopt

import re
import readline
import MySQLdb

from PianaApi import *

verbose = 1
verbose_detailed = 0

# ----------------------
# Function usage()
# ----------------------
def usage():
    """
    Prints the command line help of this script to stdout.

    The synopsis lists the options accepted by parseArguments(), including
    the optional --tax-id. The array options are described in terms of the
    array genes file, not the network input file.
    """
    help_lines = (
        "\n--------------------------------------------------------------------------------------------------------------",
        "This program checks which network proteins do not appear in the array genes \n",
        "\nUsage: python check_network_proteins_not_in_array.py  --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass",
        "              --input-file=input_file --input-proteins-type=input_proteins_type --array-genes-file-name=array_genes_file_name",
        "              --array-proteins-type=array_proteins_type  [--tax-id=tax_id] [--help] [--verbose]",
        "\nwhere:",
        "     input_file : file with proteins that will be used to build network ",
        "     input_proteins_type: the type of code used for proteins in input_file",
        "                           -> valid protein types can be obtained by doing $piana/code/execs/> python piana.py --help",
        "     tax_id : taxonomy id for the proteins in your file",
        "            --> valid values are 0 (unknown or not important) and NCBI taxon ids",
        "            --> if your file uses gene names to identify proteins, setting the tax id is required (gene names are ambiguous)",
        "     piana_dbname : name of database piana to be used (required)",
        "     piana_dbhost : name of host where database piana to be used is placed (required)",
        "     piana_dbuser : username accessing the database (not required in most systems)",
        "     piana_dbpass : password of username accessing the database (not required in most systems)",
        "     array_genes_file_name : file that contains all genes that were placed in the array ",
        "     array_proteins_type: the type of code used for proteins in array_genes_file_name",
        "                           -> valid protein types can be obtained by doing $piana/code/execs/> python piana.py --help",
        "     --help       : prints this message and exits",
        "     --verbose    : prints process info to stdout",
        "--------------------------------------------------------------------------------------------------------------",
        " Attention: parameters to build network are hardcoded in the script itself",
        " Attention: if your network has to be built from several input files, you must hardcode that in this script",
        "--------------------------------------------------------------------------------------------------------------",
    )

    # print(<single string>) behaves the same as the print statement under Python 2
    for one_help_line in help_lines:
        print(one_help_line)
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """
    Reads the long options given in sys.argv and stores their values in the
    module-level configuration variables (input_file, input_proteins_type,
    piana_dbname, piana_dbhost, piana_dbuser, piana_dbpass, tax_id,
    array_genes_file_name, array_proteins_type and verbose).

    An unknown option writes the getopt error to stderr, prints the usage and
    exits with status 2. --help prints the usage and exits with status 2.

    Raises ValueError if, after parsing, any of array_genes_file_name,
    array_proteins_type, input_file or input_proteins_type is still None
    (a non-integer --tax-id also raises ValueError, from int()).
    """
    # long option -> name of the module-level variable that receives its raw string value
    string_option_targets = {
        "--input-file": "input_file",
        "--input-proteins-type": "input_proteins_type",
        "--piana-dbname": "piana_dbname",
        "--piana-dbhost": "piana_dbhost",
        "--piana-dbuser": "piana_dbuser",
        "--piana-dbpass": "piana_dbpass",
        "--array-genes-file-name": "array_genes_file_name",
        "--array-proteins-type": "array_proteins_type",
    }

    accepted_long_options = ["verbose", "help", "array-genes-file-name=", "input-file=", "input-proteins-type=",
                             "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass=", "array-proteins-type=",
                             "tax-id="]

    # the settings live at module level: write them through the module namespace
    module_vars = globals()

    try:
        parsed_options, _unused_args = getopt.getopt(sys.argv[1:], "", accepted_long_options)
    except getopt.GetoptError as bad_opt:
        # print help information and exit:
        sys.stderr.write(str(bad_opt))
        usage()
        sys.exit(2)

    for option_name, option_value in parsed_options:

        if option_name in string_option_targets:
            module_vars[string_option_targets[option_name]] = option_value

        elif option_name == "--tax-id":
            module_vars["tax_id"] = int(option_value)

        elif option_name == "--verbose":
            module_vars["verbose"] = 1

        elif option_name == "--help":
            # print help information and exit
            usage()
            sys.exit(2)

    # check arguments: (variable that must have been set, error raised when it was not)
    mandatory_settings = (
        ("array_genes_file_name", "you must give an input file with array genes"),
        ("array_proteins_type", "Trying to check prots without giving the type of protein code used in the array file"),
        ("input_file", "Trying to check prots without giving a file name"),
        ("input_proteins_type", "Trying to check prots without giving the type of protein code used in the input file"),
    )

    for variable_name, error_text in mandatory_settings:
        if module_vars[variable_name] is None:
            raise ValueError(error_text)

    
# --------
# --------
#  Main()               
# --------                               
# --------

# Defaults for every setting that parseArguments() may overwrite.
# None means "not given on the command line".
piana_dbname = piana_dbuser = piana_dbhost = piana_dbpass = None
array_genes_file_name = array_proteins_type = None
input_file = input_proteins_type = None

# 0 is documented in usage() as "unknown or not important"
tax_id = 0

# fill the settings above from the command line
parseArguments()

# 0. Initialising connection to piana
if verbose:
    sys.stderr.write("Opening connection to PianaApi\n")

piana_api = PianaApi(piana_dbname=piana_dbname,
                     piana_dbhost=piana_dbhost,
                     piana_dbuser=piana_dbuser,
                     piana_dbpass=piana_dbpass,
                     use_secondary_db="yes",
                     network_name="dummy")



# 1. Get all proteinPianas associated to the genes of the array
#
# Every line of the array genes file is taken (stripped) as one gene code and
# looked up in piana. All proteinPianas returned are collected as keys of
# dic_array_proteinPianas, which is used later as a membership set.

if verbose:
    sys.stderr.write("Loading array genes from  %s\n" %(array_genes_file_name))

protein_column_type = utilities.get_code_column(array_proteins_type)

dic_array_proteinPianas = {}  # { proteinPiana: None, proteinPiana: None, ...}

# the with-block guarantees the file is closed, also when a lookup raises
with open(array_genes_file_name, "r") as array_genes_fd:

    for one_line in array_genes_fd:

        list_proteinPiana = piana_api.piana_access.get_list_protein_piana(proteinCode_value= one_line.strip(),
                                                                          proteinCodeType_value= protein_column_type,
                                                                          tax_id_value= tax_id,
                                                                          source_db_info="no")

        for one_proteinPiana in list_proteinPiana:
            dic_array_proteinPianas[one_proteinPiana] = None
    # END OF for one_line in array_genes_fd:


if verbose:
    sys.stderr.write("Number of genes associated proteinPianas is %s\n" %(len(dic_array_proteinPianas)))




# 2. Build the network for the input file
if verbose:
    sys.stderr.write("Building PPI for file %s\n" %(input_file))

# the with-block guarantees the input file is closed even if building the graph raises
with open(input_file, "r") as file_object:
    piana_api.add_file_proteins_to_piana_graph(file_object= file_object,
                                               protein_type_name= input_proteins_type,
                                               tax_id_value= tax_id,
                                               depth = 1,
                                               hub_threshold= 0,
                                               use_self_ints="yes",
                                               list_source_dbs= "all",
                                               inverse_dbs= "no",
                                               list_source_methods= "all",
                                               inverse_methods= "no")

#
# Add here any other PIANA orders for building a network
#

# for brain cancer:
# NOTE(review): this hardcoded protein is added using input_proteins_type as its
# code type, so it presumably only resolves when the input file uses the code
# type that "Q04760" belongs to -- confirm before reusing for another dataset
piana_api.add_protein_to_piana_graph(protein_code="Q04760",
                                     protein_type_name = input_proteins_type,
                                     tax_id_value= tax_id,
                                     depth=1,
                                     hub_threshold=0,
                                     use_self_ints="yes",
                                     list_source_dbs= "all",
                                     inverse_dbs= "no",
                                     list_source_methods= "all",
                                     inverse_methods= "no",
                                     force_secondary_db= "yes")

# for bone cancer:
#piana_api.add_file_proteins_to_piana_graph(file_object= file("/home/raragues/phd/piana/projects/cancer/data/20060501_bone/bone_proteins_with_few_exp_ints.txt", "r"),
#					   protein_type_name= input_proteins_type,
##					   tax_id_value= tax_id,
#					   depth = 1,
#					   hub_threshold= 0,
#					   use_self_ints="yes",
#					   list_source_dbs= "all",
#					   inverse_dbs= "no",
#					   list_source_methods= "all",
#					   inverse_methods= "no",
#					   force_secondary_db= "yes")

# for lung cancer: nothing to add

"""
Now, we have the network built and a dictionary with all proteinPianas that 'appear' in the array.

For each protein in the network, check if any of its proteinPianas is in that dictionary. If that is the case, we consider that the
network protein is 'covered' by the array. If not, we count that protein as 'not covered' by the array and we print it out.


"""


# 3. Report, for every protein of the network, whether it is covered by the array
#
# A network protein counts as covered when at least one of the proteinPianas
# associated to it is a key of dic_array_proteinPianas. One line per protein
# is written to stdout ("true: ..." or "false: ...").

# get the user protein names, to make sure we don't use a different name when outputting the missing proteins
user_protein_names = {}
with open(input_file, "r") as input_fd:
    # the with-block closes the file as soon as the names have been read
    for one_line in input_fd:
        user_protein_names[one_line.strip()] = None


if verbose:
    sys.stderr.write("Retrieving network proteins\n" )

# get proteins in the network (using identifier type used for user in his input file)
network_proteins = piana_api.piana_graph.get_network_proteins(output_proteins_type= input_proteins_type,
                                                              list_alternative_type_names= ["gi","geneName", "md5"],
                                                              tax_id_value= tax_id,
                                                              user_protein_names = user_protein_names )


input_column_type = utilities.get_code_column(input_proteins_type)
# now, for each protein check if any of its proteinPianas appears in the dic of proteins that appear in the array

if verbose:
    sys.stderr.write("Matching network proteins to array genes\n" )

if verbose_detailed:
    sys.stderr.write("(%s) network proteins are %s\n" %(len(network_proteins), network_proteins) )

for one_protein in network_proteins:

    associated_proteinPianas = piana_api.piana_access.get_list_protein_piana(proteinCode_value= one_protein,
                                                                             proteinCodeType_value=input_column_type,
                                                                             tax_id_value= tax_id,
                                                                             source_db_info="no")

    if verbose_detailed:
        sys.stderr.write("proteinPianas associated to %s are %s\n" %(one_protein, associated_proteinPianas) )

    # any() stops at the first proteinPiana found in the array, like the original break
    protein_matched = any(one_associated_proteinPiana in dic_array_proteinPianas
                          for one_associated_proteinPiana in associated_proteinPianas)

    if protein_matched:
        sys.stdout.write("true: protein %s is in array\n" %(one_protein))
    else:
        sys.stdout.write("false: protein %s is not in array\n" %(one_protein))

# END OF for one_protein in network_proteins:
	
# END OF for one_protein in network_proteins:

	    
		

