#
# READ the README file on how to create non-redundant, non-identical lists of proteins and follow those steps:
#    ---> README.creating_non_redundant_non_homologous_proteins_with_clusters_and_ints
#

import sys
import getopt

import re
import readline

from sets import *

import MySQLdb

from PianaDBaccess import *

# Verbosity flag (module global). It already defaults to 1 here and parseArguments() also
# sets it to 1 for -v/--verbose, so passing --verbose does not change its value.
verbose = 1

# TO DO, instead of creating these lists externally, they could be created here with calls to Piana...
#      For the time being, follow instructions on README.creating_non_redundant_non_homologous_proteins_with_clusters_and_ints to create the files
#

# set here the name of the file that lists the proteins with clusters assigned in the database
# (the main section of this script converts each line of the file to an integer)
file_with_clusters= "/home/raragues/phd/piana/code/execs/w_exp.list.all_prots_with_cluster"

# set here the name of the file that lists the proteins with interactions assigned in the database
# (the main section of this script converts each line of the file to an integer)
file_with_ints= "/home/raragues/phd/piana/code/execs/w_exp.list.all_prots_with_ints"

# ----------------------
# Function usage()
# ----------------------
def usage():
    """
    Prints to stdout the help text describing the command line options of this script
    """
    # one entry per output line, in the order in which they are printed
    help_lines = (
        "--------------------------------------------------------------------------------------------------------------",
        "Usage: python get_non_redundant_prots_with_clusters_and_ints.py ",
        "                      --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass",
        "                                 [--help] [--verbose]",
        "\nwhere:",
        "     piana_dbname : name of database piana to be used (required)",
        "     piana_dbhost : name of host where database piana to be used is placed (required)",
        "     piana_dbuser : username accessing the database (not required in most systems)",
        "     piana_dbpass : password of username accessing the database (not required in most systems)",
        "     --help         : prints this message and exits",
        "     --verbose      : prints process info to stdout",
        "--------------------------------------------------------------------------------------------------------------",
    )

    for help_line in help_lines:
        # single-argument print(...) emits exactly the same text as the print statement
        print(help_line)
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """
    Parses the command line (sys.argv[1:]) and stores the results in module globals

    Sets the globals piana_dbname, piana_dbhost, piana_dbuser and piana_dbpass from
    -n/--piana-dbname, -o/--piana-dbhost, -u/--piana-dbuser and -w/--piana-dbpass,
    and sets the global verbose to 1 when -v/--verbose is given. Globals whose
    option does not appear on the command line are left untouched.

    If an option is not recognised, the getopt error is written to stderr, the usage
    message is printed and the script terminates with sys.exit(2).
    If -h/--help is given, the usage message is printed and the script terminates
    with sys.exit(2) (exit status kept as it was for backward compatibility).
    """
    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass

    global verbose

    try:
        # "t:" and "c:" stay in the short option string so that command lines that
        # pass -t or -c are still accepted; their values are not used by this script
        opts, args = getopt.getopt(sys.argv[1:], "vht:c:n:o:u:w:", ["verbose", "help",
                                                                   "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass="])
    except getopt.GetoptError:
        # tell the user which option was rejected before showing the help text
        # (sys.exc_info() is used instead of "except ... as" so the syntax is valid on old and new interpreters)
        sys.stderr.write("Error parsing arguments: %s\n" % sys.exc_info()[1])
        usage()
        sys.exit(2)

    for option, value in opts:

        if option in ("-n", "--piana-dbname"):
            piana_dbname = value

        elif option in ("-o", "--piana-dbhost"):
            piana_dbhost = value

        elif option in ("-u", "--piana-dbuser"):
            piana_dbuser = value

        elif option in ("-w", "--piana-dbpass"):
            piana_dbpass = value

        elif option in ("-v", "--verbose"):
            verbose = 1

        elif option in ("-h", "--help"):
            # print help information and exit
            usage()
            sys.exit(2)
    # END OF for option, value in opts:
        
# --------
# --------
#  Main()               
# --------                               
# --------

# defaults for the connection parameters: parseArguments() overwrites those given on the command line
piana_dbname = None
piana_dbuser = None
piana_dbhost = None
piana_dbpass = None

# parsing arguments from the command line
parseArguments()

# Initialising connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost=piana_dbhost, dbuser=piana_dbuser, dbpassword= piana_dbpass)


def _read_protein_ids(file_name):
    """
    Returns a Set with the integers read from file_name (one integer per line)

    Blank lines are skipped instead of making int() fail, and the file is always
    closed, even when a line cannot be converted to an integer (ValueError is propagated)
    """
    protein_ids = Set()

    ids_fd = open(file_name, "r")
    try:
        for line in ids_fd:
            stripped_line = line.strip()

            if stripped_line:
                protein_ids.add(int(stripped_line))
    finally:
        ids_fd.close()

    return protein_ids


# READ the README file on how to create non redundant non identical lists of proteins and follow those steps

proteins_with_clusters = _read_protein_ids(file_with_clusters)
proteins_with_ints = _read_protein_ids(file_with_ints)

# only used by the (commented out) taxonomy statistics code
proteins_by_tax = {}

# proteins that appear in both input files
intersection = proteins_with_clusters.intersection(proteins_with_ints)



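## #Uncomment following code for taxonomy statistics and processing
## # (note: prot_file is not opened anywhere in this script: open it for writing before enabling this code)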

## for proteinPiana in intersection:
##     list_taxs = piana_access.get_protein_taxonomy_ids(proteinPiana_value=proteinPiana)

##     for tax in list_taxs:
##         prot_file.write("%s\t%s\n" %(proteinPiana, tax))

##         if proteins_by_tax.has_key(tax):
##             proteins_by_tax[tax].append(proteinPiana)
##         else:
##             proteins_by_tax[tax] = [proteinPiana]
## # END OF for proteinPiana in intersection:

## tax_file = file("number_of_prots_per_tax.txt", "w")

## for tax in proteins_by_tax:
##     species_name = piana_access.get_species_names_from_taxonomies(list_taxonomy_ids = [tax])
##     tax_file.write("%s proteins for tax %s (%s)\n" %(len(proteins_by_tax[tax]), tax, species_name))

## tax_file.close()



#
# Filter the  list of proteins to keep only non redundant proteins
# This has nothing to do with BLAST: it only checks for table proteinSimilarity
#

# counters reported on stderr at the end of the script
num_of_proteins_added = 0
num_of_proteins_redundant = 0
total_num_of_proteins = 0

# proteins accepted so far, in the order in which they were accepted
non_redundant_proteins = []

for new_protein in intersection:

    new_protein_is_redundant = 0

    # a protein is accepted only if it is not similar to any of the proteins accepted before it
    for already_added_protein in non_redundant_proteins:

        if new_protein == already_added_protein:
            new_protein_is_redundant = 1
            sys.stderr.write("protein %s was already added\n" %(new_protein))
            # one match is enough to discard the protein: stop comparing
            break

        if piana_access.check_proteins_similarity(proteinPiana_a_value= new_protein, proteinPiana_b_value= already_added_protein):
            new_protein_is_redundant = 1
            sys.stderr.write("protein %s is redundant with  protein %s\n" %(new_protein, already_added_protein))
            # one match is enough to discard the protein: stop here instead of
            # querying the database for every remaining accepted protein
            break
    # END OF for already_added_protein in non_redundant_proteins:

    if not new_protein_is_redundant:
        num_of_proteins_added += 1
        sys.stderr.write("adding a protein: %s\n" %(new_protein))
        non_redundant_proteins.append(new_protein)
    else:
        num_of_proteins_redundant += 1

    total_num_of_proteins += 1
# END OF for new_protein in intersection:


# the resulting list goes to stdout (one protein per line); progress and statistics go to stderr
for non_redundant_protein in non_redundant_proteins:

    sys.stdout.write("%s\n" %(non_redundant_protein))
# END OF for non_redundant_protein in non_redundant_proteins:

sys.stderr.write("Total num prots in intersection = %s\n" %total_num_of_proteins)
sys.stderr.write("Num of proteins added = %s\n" %num_of_proteins_added)
sys.stderr.write("Num of proteins redundant = %s\n" %num_of_proteins_redundant)


