
import sys
import getopt

import re
import readline

from sets import *

import MySQLdb

from PianaApi import *

# Verbosity flag; parseArguments() sets it to 1 when -v/--verbose is given.
# NOTE(review): it already defaults to 1, so the option currently changes
# nothing -- confirm whether the intended default is 0.
verbose = 1


# ----------------------
# Function usage()
# ----------------------
def usage():
    """Write the command-line help text of this script to standard output.

    The text is emitted with sys.stdout.write instead of the print
    statement: the output is byte-for-byte the same, but the function is
    valid syntax under both Python 2 and Python 3.

    Takes no arguments and returns None.
    """
    help_lines = (
        "--------------------------------------------------------------------------------------------------------------",
        "Usage: python test_something_piana.py --input-file=input_file ",
        "                          --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass",
        "                                 [--help] [--verbose]",
        # the leading newline yields the blank line that precedes "where:"
        "\nwhere:",
        "     input_file   : input file to use",
        "     piana_dbname : name of database piana to be used (required)",
        "     piana_dbhost : name of host where database piana to be used is placed (required)",
        "     piana_dbuser : username accessing the database (not required in most systems)",
        "     piana_dbpass : password of username accessing the database (not required in most systems)",
        "     --help         : prints this message and exits",
        "     --verbose      : prints process info to stdout",
        "--------------------------------------------------------------------------------------------------------------",
    )

    # one trailing newline per line, exactly as consecutive print statements do
    sys.stdout.write("\n".join(help_lines) + "\n")
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """Copy the command-line options found in sys.argv into module globals.

    Recognised options (short / long form) and the global each one sets:
        -t / --input-file    -> input_file
        -n / --piana-dbname  -> piana_dbname
        -o / --piana-dbhost  -> piana_dbhost
        -u / --piana-dbuser  -> piana_dbuser
        -w / --piana-dbpass  -> piana_dbpass
        -v / --verbose       -> verbose (set to 1)
        -h / --help          -> prints the usage text and exits with status 2

    The short option -c (with a value) is accepted by the parser but has no
    effect. An option rejected by getopt prints the usage text and exits
    with status 2. Globals whose option is not given are left untouched.
    """
    global verbose

    # value-carrying options -> name of the module-level variable they fill
    destinations = {
        "-t": "input_file", "--input-file": "input_file",
        "-n": "piana_dbname", "--piana-dbname": "piana_dbname",
        "-o": "piana_dbhost", "--piana-dbhost": "piana_dbhost",
        "-u": "piana_dbuser", "--piana-dbuser": "piana_dbuser",
        "-w": "piana_dbpass", "--piana-dbpass": "piana_dbpass",
    }
    module_scope = globals()

    short_flags = "vht:c:n:o:u:w:"
    long_flags = ["verbose", "help", "input-file=",
                  "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass="]

    try:
        parsed_options, _ = getopt.getopt(sys.argv[1:], short_flags, long_flags)
    except getopt.GetoptError:
        # malformed command line: show the help text and give up
        usage()
        sys.exit(2)

    # options are handled strictly in the order they were given
    for flag, argument in parsed_options:
        target = destinations.get(flag)
        if target is not None:
            module_scope[target] = argument
            continue

        if flag in ("-v", "--verbose"):
            verbose = 1
            continue

        if flag in ("-h", "--help"):
            # show the help text and stop
            usage()
            sys.exit(2)
        
# --------
# --------
#  Main()               
# --------                               
# --------

# Defaults for the command-line settings: parseArguments() overwrites the
# ones that are actually given on the command line
input_file = None
piana_dbname = piana_dbhost = piana_dbuser = piana_dbpass = None

# read the command line into the globals above
parseArguments()

# open the connection to the piana database
piana_api = PianaApi(
    piana_dbname=piana_dbname,
    piana_dbhost=piana_dbhost,
    piana_dbuser=piana_dbuser,
    piana_dbpass=piana_dbpass,
    use_secondary_db="no",
    network_name="hello",
)




"""
CIR_EVAL

This piece of code is used to find out if there is an interaction in
the PDB (ie. appear in the same PDB) between a given protein
(proteinPiana_target) and a given fold. It is used for those cases in
which we are interested in seeing how a particular protein interacts
with other proteins that have a certain domain.

For every protein that has fold input_cf, one line is printed if it
shares at least one pdbID with the target protein.
"""
proteinPiana_target =  166505  # the proteinPiana for the protein that you would like to find interacting with domain input_cf
input_cf = 55485               # the domain of your common partner

# 1. collect the PDB identifiers in which the target protein appears
sqlquery= "select pdbID from pdb where proteinPiana=%s" %(proteinPiana_target)
piana_api.piana_access.db.cursor.execute(sqlquery)
target_pdbs = piana_api.piana_access.db.cursor.fetchall()

# each row holds a single column: the pdbID
set_target_pdbs = Set([one_target[0] for one_target in target_pdbs])

# 2. get all the proteins that have the fold of interest
sqlquery= "select proteinPiana from proteinScop where cf=%s" %input_cf
piana_api.piana_access.db.cursor.execute(sqlquery)
proteins_with_cf = piana_api.piana_access.db.cursor.fetchall()

# 3. report those proteins that share a PDB with the target
for one_protein_row in proteins_with_cf:
    # one_protein_row[0] is the proteinPiana of a protein that has cf=input_cf.
    # The value is taken out of the row explicitly: formatting the row itself
    # only worked in the query by accident and printed a tuple in the report.
    one_proteinPiana = one_protein_row[0]

    sqlquery= "select pdbID from pdb where proteinPiana=%s" %(one_proteinPiana)
    piana_api.piana_access.db.cursor.execute(sqlquery)
    one_proteinPiana_pdbs = piana_api.piana_access.db.cursor.fetchall()

    set_one_protein_pdbs = Set([one_protein_pdb[0] for one_protein_pdb in one_proteinPiana_pdbs])

    # computed once and reused for both the test and the message
    common_pdbs = set_one_protein_pdbs.intersection(set_target_pdbs)

    if common_pdbs:
        # parenthesised single argument: same output with the Python 2 print statement
        print("protein %s has same PDB as TARGET protein %s (pdb in common=%s)" %(one_proteinPiana,
                                                                                  proteinPiana_target,
                                                                                  common_pdbs))

# END OF for one_protein_row in proteins_with_cf:

    

"""
test the get_random method of piana api

#for i in range(10):
random_group = piana_api.get_random_protein_list(tax_id_value=0, list_size=10, force_ints="yes", use_self_ints="no",
                                                 list_source_dbs= "all", inverse_dbs="no", list_source_methods= "all", inverse_methods="no")

print "\n------------------\n%s\n------------------\n" %(random_group) 

"""

"""
from an input file with one unientry per line, generate all possible pairs protein1-protein2 and print out whether they
share a SCOP code or not

input_file_name= "/home/raragues/phd/piana/code/execs/dummy_files/input/ribosomal_proteins"

list_proteins = []

dic_prots_scops = {}  # keys are uniprot entries and values are sets of SCOPs family codes associated to each uniprot entry

for one_line in file(input_file_name, "r"):
    

    protein_unientry = one_line.strip()

    list_proteinPiana = piana_api.piana_access.get_list_protein_piana(proteinCode_value= protein_unientry,
							    proteinCodeType_value= PianaGlobals.swissProtID_col,
							    tax_id_value= 4932, source_db_info= "no")

    list_proteins.append(protein_unientry)

    dic_scop_fas = {} # using a dic to remove duplications
    
    for one_proteinPiana in list_proteinPiana:
	temp_scop_fas = piana_api.piana_access.get_protein_scop_fa(proteinPiana_value= one_proteinPiana)

	for one_scop_fa in temp_scop_fas:
	    dic_scop_fas[one_scop_fa] = None
    # END OF for one_proteinPiana in list_proteinPiana:

    dic_prots_scops[protein_unientry] = Set(dic_scop_fas.keys())
    
# END OF for one_line in file(input_file_name, "r"):

num_with_common_scop = 0
num_with_diff_scop = 0

for i in range(len(list_proteins)):
    for j in range(i+1, len(list_proteins)):
	
	protein_i = list_proteins[i]
	protein_j = list_proteins[j]

	
	if len(dic_prots_scops[protein_i])==0 and len(dic_prots_scops[protein_j])==0:
	    common_scop = "unk"
	else:

	    if dic_prots_scops[protein_i].intersection(dic_prots_scops[protein_j]):
		common_scop = "yes"
		num_with_common_scop += 1
	    else:
		common_scop = "no"
		num_with_diff_scop += 1

	sys.stdout.write("%s\t%s\t%s\n" %(protein_i, protein_j, common_scop))
    # END OF for j in range(i+1, len(list_proteins)):
# END OF for i in range(len(list_proteins)):

sys.stdout.write("Num pairs with common SCOP fa: %s\n" %(num_with_common_scop))
sys.stdout.write("Num pairs with diff SCOP fa: %s\n" %(num_with_diff_scop))
sys.stdout.write("Specificity: %s\n" %( 100 * num_with_common_scop / (num_with_common_scop + num_with_diff_scop) ))

"""

"""
from an input file with ranges of scores...
getting intersection of partners for two proteins, their detection methods and results using swissprot id
file_fd = file(input_file, "r")


# set here the parameters to use
output_protein_type_name = "unientry"

use_self_ints = "no"

list_source_dbs="all"
inverse_dbs = "no"

list_source_methods = "all"
inverse_methods = "no"

hub_threshold = 0

fasta_file_fd = file("%s.fasta" %(input_file), "w")
description_file_fd = file("%s.description" %(input_file), "w")
proteins_file_fd = file("%s.proteins" %(input_file), "w")

for line in file_fd:

    line_fields = line.split()   # line_fields is
                                 #   [0] -> range=91.100
                                 #   [1] -> protein1=3008979
                                 #   [2] -> protein2=3654756
                                 #   [3] -> num_ints1=33.0
                                 #   [4] -> num_ints2=34.0
                                 #   [5] -> num_ints_common=31.0
                                 #   [6] -> common_cir=0

    range = line_fields[0].split("=")[1]
    proteinPiana_1 = int(line_fields[1].split("=")[1])
    proteinPiana_2 = int(line_fields[2].split("=")[1])
    num_ints_1 = int(float(line_fields[3].split("=")[1]))
    num_ints_2 = int(float(line_fields[4].split("=")[1]))
    num_ints_common = int(float(line_fields[5].split("=")[1]))
    common_scop = int(line_fields[6].split("=")[1])
    

    uniprot_proteinPiana_1 = piana_api.piana_access.get_list_protein_external_codes(proteinPiana= proteinPiana_1,
                                                                           protein_type_name= output_protein_type_name,
                                                                           alternative_type_names= ["md5"],
                                                                           answer_mode= "single")
    uniprot_proteinPiana_2 = piana_api.piana_access.get_list_protein_external_codes(proteinPiana= proteinPiana_2,
                                                                           protein_type_name= output_protein_type_name ,
                                                                           alternative_type_names= ["md5"],
                                                                           answer_mode= "single")

    partners_1 = Set(piana_api.piana_access.get_all_partners(proteinPiana_value= proteinPiana_1,
                                                   use_self_ints=use_self_ints,
                                                   list_source_dbs= list_source_dbs,
                                                   inverse_dbs=inverse_dbs,
                                                   list_source_methods= list_source_methods,
                                                   inverse_methods=inverse_methods,
                                                   threshold=hub_threshold ))

    partners_2 = Set(piana_api.piana_access.get_all_partners(proteinPiana_value= proteinPiana_2,
                                                   use_self_ints=use_self_ints,
                                                   list_source_dbs= list_source_dbs,
                                                   inverse_dbs=inverse_dbs,
                                                   list_source_methods=list_source_methods ,
                                                   inverse_methods=inverse_methods,
                                                   threshold=hub_threshold ))

    partners_intersection = partners_1.intersection(partners_2)


    description_file_fd.write("============================================================\n")
    description_file_fd.write("min_per_score_range=%s || %s (pp=%s) and %s (pp=%s) have %s interactions in common and common_scop=%s\n" %(
        range,
        uniprot_proteinPiana_1,
        proteinPiana_1,
        uniprot_proteinPiana_2,
        proteinPiana_2,
        num_ints_common,
        common_scop ))
    description_file_fd.write("============================================================\n")
    
    p1_seq = piana_api.piana_access.get_list_protein_external_codes(proteinPiana= proteinPiana_1,
                                                                      protein_type_name= "sequence",
                                                                      alternative_type_names= ["md5"],
                                                                      answer_mode= "single")
    fasta_file_fd.write(">%s\n%s\n" %(uniprot_proteinPiana_1,p1_seq ))
    
    p2_seq = piana_api.piana_access.get_list_protein_external_codes(proteinPiana= proteinPiana_2,
                                                                      protein_type_name= "sequence",
                                                                      alternative_type_names= ["md5"],
                                                                      answer_mode= "single")
    fasta_file_fd.write(">%s\n%s\n" %(uniprot_proteinPiana_2, p2_seq ))

    proteins_file_fd.write("%s\t%s" %(uniprot_proteinPiana_1, uniprot_proteinPiana_2))
    
    already_printed_uniprots = {}
    for one_partner in partners_intersection:
        uniprot_one_partner = piana_api.piana_access.get_list_protein_external_codes(proteinPiana= one_partner,
                                                                           protein_type_name= output_protein_type_name ,
                                                                           alternative_type_names= ["md5"],
                                                                           answer_mode= "single")

        if already_printed_uniprots.has_key(uniprot_one_partner):
            continue
        
        proteins_file_fd.write("\t%s" %(uniprot_one_partner))

        
        already_printed_uniprots[uniprot_one_partner] = None
        
        p1_partner_interactionPiana = piana_api.piana_access.get_interactionPiana(proteinPianaA_value=one_partner, proteinPianaB_value=proteinPiana_1,
                                                                        list_source_dbs=list_source_dbs, inverse_dbs=inverse_dbs,
                                                                        list_source_methods= list_source_methods, inverse_methods=inverse_methods)

        list_methods_1 = piana_api.piana_access.get_interaction_methodID_list(interactionPiana_value= p1_partner_interactionPiana)
        list_dbs_1 = piana_api.piana_access.get_interaction_sourceDB_list(interactionPiana_value= p1_partner_interactionPiana)
        
        p2_partner_interactionPiana = piana_api.piana_access.get_interactionPiana(proteinPianaA_value=one_partner, proteinPianaB_value=proteinPiana_2,
                                                                        list_source_dbs= list_source_dbs, inverse_dbs=inverse_dbs,
                                                                        list_source_methods=list_source_methods , inverse_methods=inverse_methods)

        list_methods_2 = piana_api.piana_access.get_interaction_methodID_list(interactionPiana_value= p2_partner_interactionPiana)
        list_dbs_2 = piana_api.piana_access.get_interaction_sourceDB_list(interactionPiana_value= p2_partner_interactionPiana)
        
        description_file_fd.write("%s(pp=%s) (%s methods=%s dbs=%s) (%s methods=%s dbs=%s)\n" %(uniprot_one_partner,one_partner,
                                                                                             uniprot_proteinPiana_1, list_methods_1, list_dbs_1,
                                                                                             uniprot_proteinPiana_2, list_methods_2, list_dbs_2))
        partner_seq = piana_api.piana_access.get_list_protein_external_codes(proteinPiana=one_partner ,
                                                                          protein_type_name= "sequence",
                                                                          alternative_type_names= ["md5"],
                                                                          answer_mode= "single")
        fasta_file_fd.write(">%s\n%s\n" %(uniprot_one_partner, partner_seq ))
    # END OF for one_partner in partners_intersection:

    
    proteins_file_fd.write("\n")
    description_file_fd.write("\n")
# END OF for line in file_fd



fasta_file_fd.close()
description_file_fd.close()
proteins_file_fd.close()

"""



"""
testing what happens when inverse_methods is None

null_pp = 118753
finds_pp = 118777

partners_null_pp = piana_api.piana_access.get_all_partners(proteinPiana_value= null_pp,
                                                 use_self_ints="no",
                                                 list_source_dbs= "all",
                                                 inverse_dbs="no",
                                                 list_source_methods= ["tandaffin"],
                                                 inverse_methods=None,
                                                 threshold= 0 )

partners_finds_pp = piana_api.piana_access.get_all_partners(proteinPiana_value= finds_pp,
                                                 use_self_ints="no",
                                                 list_source_dbs= "all",
                                                 inverse_dbs="no",
                                                 list_source_methods= ["tandaffin"],
                                                 inverse_methods=None,
                                                 threshold= 0 )

print "num partners for %s is %s" %(null_pp, len(partners_null_pp))
print "num partners for %s is %s" %(finds_pp, len(partners_finds_pp))

"""


"""
testing get_all_g2_partners
one_pp = 410154

is_g2_pp = 114914

partners_g2 = piana_api.piana_access.get_all_g2_partners(proteinPiana_value= one_pp,
                                               use_self_ints="no",
                                               list_source_dbs= "all",
                                               inverse_dbs="no",
                                               list_source_methods= ["tandaffin"],
                                               inverse_methods="yes",
                                               threshold= 0 )

dic_partners = {}
for one_partner in partners_g2:
    dic_partners[one_partner] = None
    


print "G2 for proteinPiana %s are %s" %(one_pp, partners_g2)
print "%s is G2 of %s? --> %s" %(is_g2_pp, one_pp, dic_partners.has_key(is_g2_pp) )


"""
"""
testing go database


go_dbname = "goDB_200601"
go_dbhost = "sefarad"

godb = MySQLdb.connect(db=go_dbname, host=go_dbhost)

gocursor = godb.cursor()


sqlquery= "select term_id, gene_product_id from association where id<30"
gocursor.execute(sqlquery)
go_gene_product_id=gocursor.fetchall()

for pair in go_gene_product_id:
    # pair[0] -> term_id
    # pair[1] -> gene_product_id

    sqlquery="select symbol, species_id from gene_product where id=%s" %pair[1]
    gocursor.execute(sqlquery)
    gene_to_symbol =  gocursor.fetchall() 

    for symbol in gene_to_symbol:
        # symbol[0] -> symbol
        # symbol[1] -> tax_id
        gene_name = symbol[0].replace('"', " ").strip()
        tax_id = int(symbol[1])

        list_proteinPiana = piana_api.piana_access.get_list_protein_piana(proteinCode_value= gene_name,
                                                                proteinCodeType_value= PianaGlobals.geneName_col,
                                                                tax_id_value= tax_id , source_db_info= "no")

        print "proteinPianas for symbol %s tax %s are %s and have go term %s" %(gene_name, tax_id, list_proteinPiana, pair[1] )

# END OF for pair in go_gene_product_id:
"""

"""
getting proteinPianas from a swiss accession code

pps= piana_api.piana_access.get_list_protein_piana(proteinCode_value= "P29358",
                                         proteinCodeType_value= PianaGlobals.swissAccessionID_col,
                                         tax_id_value=9913,
                                         source_db_info= "yes")


print pps
"""
"""
checking the type of variable returned when tax ids are selected

list_tax_id = piana_api.piana_access.get_protein_taxonomy_ids(proteinPiana_value=2)

for tax_id in list_tax_id:
    print "%s of type %s" %(tax_id, type(tax_id))

"""

"""
get proteinPianas associated to a certain description

pps= piana_api.piana_access.get_list_protein_piana(proteinCode_value= "cytochrome b",
                                         proteinCodeType_value= PianaGlobals.proteinDescription_col,
                                         species_name_value= "all", source_db_info= "yes")

print pps

"""

"""
Find number of interactions per species (only when both proteins are of the same species)

dic_count_species = {} # keys are species and contents the number of ints for it

int_triplets= piana_api.piana_access.get_all_protein_protein_interactions(list_source_dbs= "all", inverse_dbs="no", list_source_methods= "all", inverse_methods="no")
# int_triplets is a list of tuples (proteinPianaA, proteinPianaB, interactionPiana)

for triplet in int_triplets:

    tax_ids_a = piana_api.piana_access.get_protein_taxonomy_ids(proteinPiana_value=triplet[0] )
    tax_ids_b = piana_api.piana_access.get_protein_taxonomy_ids(proteinPiana_value=triplet[1] )

    for tax_id_a in tax_ids_a:

        if tax_id_a in tax_ids_b:

            if tax_id_a == 41:
                print "for tax id 41, triplet is " + str(triplet)
            
            # only counting the interaction if both proteins have the same tax_id
            if dic_count_species.has_key(tax_id_a):
                dic_count_species[tax_id_a] += 1
            else:
                dic_count_species[tax_id_a] = 1

# END OF

for tax_id in dic_count_species:
    print "tax_id=%s\tnum_ints=%s" %(tax_id, dic_count_species[tax_id])


"""

"""
Find swissaccessions for a protein

list_proteinPianas= piana_api.piana_access.get_list_protein_piana(proteinCode_value= "P08670",
                                                        proteinCodeType_value= PianaGlobals.swissAccessionID_col,
                                                        species_name_value="all" )

print "proteinPianas of that code: %s" %(list_proteinPianas)
for proteinPiana_value in list_proteinPianas:
    print "check proteinPiana %s" %(proteinPiana_value)
    swissaccession = piana_api.piana_access.get_list_protein_external_codes(proteinPiana= proteinPiana_value,
                                                                  protein_type_name= "uniacc" ,
                                                                  alternative_type_names= ["md5"],
                                                                  answer_mode= "single")

    print "for pp %s the uniacc is %s" %(proteinPiana_value, swissaccession)

"""
                                                                  
"""
get go terms for a list of proteins

Attention: if you need to calculate distances for a given list of go terms, the input list of proteins to this code should be the complete list of proteins
in the network, not just those proteins that will be used to build the network. ok? :-)

You should also adjust the type of protein code used in the input list, as well as the GO term_type you want to obtain.
input_file_fd = file(input_file, "r")

go_terms = {}


for protein in input_file_fd:

    # get proteinPianas for this protein
    
    list_proteinPiana = piana_api.piana_access.get_list_protein_piana(proteinCode_value= protein.strip(),
                                                            proteinCodeType_value= "proteinPiana",
                                                            species_name_value= "all", source_db_info= "no")

    for proteinPiana in list_proteinPiana:
        list_go_term_id = piana_api.piana_access.get_protein_go_term_id(proteinPiana_value= proteinPiana, term_type_value="molecular_function")
        for go_term_id in list_go_term_id:
            go_terms[go_term_id] = None
# END OF for protein in input_file_fd:

for term_id in go_terms:
    print term_id

"""

"""
comparing ste20 exp ints with predicted ints

# 1. get proteinPianas for exp partners of ste20

ste20_exp_partners = Set([])
for line in file("/home/raragues/myWork/posas/data/interactions_ste20.txt", "r"):

    partners_proteinPiana = piana_api.piana_access.get_list_protein_piana(proteinCode_value= line.strip(),
                                                                proteinCodeType_value= PianaGlobals.geneName_col,
                                                                species_name_value= "all", source_db_info= "no")
    for partner in partners_proteinPiana:
        ste20_exp_partners.add(partner)

# 2. get proteinPianas for db partners of ste20

# 2.1 get ste20 proteinPianas
ste20_proteinPianas = piana_api.piana_access.get_list_protein_piana(proteinCode_value= "ste20",
                                                          proteinCodeType_value= PianaGlobals.geneName_col,
                                                          species_name_value= "all", source_db_info= "no")

# this is the dictionary that keeps track of which is the intermediate protein for partners at distance 2

intermediates = {}

# 2.2 get the partners
ste20_db_ints = []
for proteinPiana in ste20_proteinPianas:
    # get all ste20 tuples (partner, source_db, method_id)
    ste20_db_ints.extend(piana_api.piana_access.get_all_partners(proteinPiana_value= proteinPiana, list_source_dbs= "all", list_source_methods= "all",
                                                          threshold=0, get_source_db_info="yes", get_method_info="yes"))


    # search partners of partners (very inefficiently, duplication of call...)
    temp_list_partners = piana_api.piana_access.get_all_partners(proteinPiana_value= proteinPiana, list_source_dbs= "all", list_source_methods= "all",
                                                       threshold=0, get_source_db_info="no", get_method_info="no")

    for partner in temp_list_partners:
        partners_of_partners = piana_api.piana_access.get_all_partners(proteinPiana_value=partner , list_source_dbs= "all", list_source_methods= "all",
                                                           threshold=0, get_source_db_info="yes", get_method_info="yes")
        ste20_db_ints.extend(partners_of_partners)

            
        for partner_of_partner in partners_of_partners:

            print "ste20 (pp %s) interacts with %s, which in turn interacts with %s" %(proteinPiana, partner, partner_of_partner[0] )
            
            if not intermediates.has_key(partner_of_partner[0]):
                intermediates[partner_of_partner[0]] = []

            intermediates[partner_of_partner[0]].append(partner)

            
# END OF for proteinPiana in ste20_proteinPianas:

step20_db_partners = Set([])
for int in ste20_db_ints:
    step20_db_partners.add(int[0])


#print "\nproteinPianas for ste20: %s\n------------------\n" %(ste20_proteinPianas)
#print "Partners in exp list: %s\n------------------\n" %(ste20_exp_partners)
#print "Partners in db list: %s\n------------------\n" %(step20_db_partners)


# 3. find the intersection between the DB partners and the ones given in input file
print "\nPartners that appear in both lists:"

intersection_partners = step20_db_partners.intersection(ste20_exp_partners)

# 4. print information about the partners
for proteinPiana_partner in intersection_partners:

    
    partner_ext_codes = piana_api.piana_access.get_list_protein_external_codes(proteinPiana= proteinPiana_partner,
                                                                    protein_type_name= "geneName",
                                                                    alternative_type_names= ["proteinPiana"])
    
    sys.stdout.write("Partner proteinPiana=%s (via intemediate protein %s) (extcodes=" %(proteinPiana_partner, intermediates[proteinPiana_partner]))
    for ext_code in partner_ext_codes:
        sys.stdout.write("\t%s" %ext_code)
    sys.stdout.write(")\t from")
        
    for int in ste20_db_ints:
        # int[0] is the partner
        # int[1] is the source db
        # int[2] is the method id
        if int[0] == proteinPiana_partner:
            sys.stdout.write("\tsourcedb=%s;" %int[1] )
            sys.stdout.write("method_id=%s" %int[2] )

    sys.stdout.write("\n" )


print "\n\nthere were %s partners in the intersection between input file proteins and db partners" %(len(intersection_partners))

    
"""

    
    


"""
testing protein similarity

protein0= 287
protein1 = 378
protein2 = 1804845
protein3 = 1800000

similar_proteins = piana_api.piana_access.check_proteins_similarity(proteinPiana_a_value=protein1,
                                      proteinPiana_b_value=protein2)

print "proteins %s and %s are similar: %s" %(protein1, protein2, similar_proteins)

similar_proteins = piana_api.piana_access.check_proteins_similarity(proteinPiana_a_value=protein1,
                                      proteinPiana_b_value=protein3)

print "proteins %s and %s are similar: %s" %(protein1, protein3, similar_proteins)

similar_proteins_dic = piana_api.piana_access.get_similar_proteins_dic(proteinPiana_value=protein0)

print "similar proteins to %s are %s" %(protein0, similar_proteins_dic.keys())
"""

"""
getting proteins with specific characteristics

proteins_with_clusters = Set()
proteins_with_ints = Set()

with_clusters = file("all_proteinPianas_with_DBAliCluster.txt", "r")
with_ints = file("all_proteinPiana_with_ints.txt","r")

for line in with_clusters:
    proteins_with_clusters.add(int(line.strip()))

for line in with_ints:
    proteins_with_ints.add( int(line.strip()))

proteins_by_tax = {}

intersection = proteins_with_clusters.intersection(proteins_with_ints)

prot_file = file("all_proteinPiana_species_with_clusters_and_ints.txt", "w")

for proteinPiana in intersection:
    list_taxs = piana_api.piana_access.get_protein_taxonomy_ids(proteinPiana_value=proteinPiana)

    for tax in list_taxs:
        prot_file.write("%s\t%s\n" %(proteinPiana, tax))

        if proteins_by_tax.has_key(tax):
            proteins_by_tax[tax].append(proteinPiana)
        else:
            proteins_by_tax[tax] = [proteinPiana]
# END OF for proteinPiana in intersection:

tax_file = file("number_of_prots_per_tax.txt", "w")

for tax in proteins_by_tax:
    species_name = piana_api.piana_access.get_species_names_from_taxonomies(list_taxonomy_ids = [tax])
    tax_file.write("%s proteins for tax %s (%s)\n" %(len(proteins_by_tax[tax]), tax, species_name))

tax_file.close()
"""

"""
filtering a list of proteins to keep only non redundant proteins
"""
"""
input_file = file("human_proteins_with_clusters_and_ints", "r")

proteins_in_file = []

for line in input_file:
    proteins_in_file.append(int(line.split()[0]))

sys.stderr.write("There are %s proteins in input file\n" %(len(proteins_in_file)))

non_redundant_proteins = []

for new_protein in proteins_in_file:

    new_protein_is_redundant = 0
    
    for already_added_protein in non_redundant_proteins:

        if new_protein == already_added_protein:
            new_protein_is_redundant = 1
            continue
        
        if piana_api.piana_access.check_proteins_similarity(proteinPiana_a_value= new_protein, proteinPiana_b_value= already_added_protein):
            new_protein_is_redundant = 1
            continue
    # END OF for already_added_protein in non_redundant_proteins:

    if not new_protein_is_redundant:
        non_redundant_proteins.append(new_protein)
# END OF for new_protein in proteins_in_file:
        
non_redundant_file = file("non_redundant_human_proteins_with_clusters_and_ints.txt", "w")

for non_redundant_protein in non_redundant_proteins:
    
    non_redundant_file.write("%s\n" %(non_redundant_protein))
# END OF for non_redundant_protein in non_redundant_proteins:

sys.stderr.write("There are %s proteins in non redundant file\n" %(len(non_redundant_proteins)))
"""
