"""
File        : measure_go_overlap.py
Author      : Ramon Aragues
Creation    : 27.04.06 (in NY - MSKCC)
Contents    : measures the GO overlap for protein pairs in a text file
Called from : command line

=======================================================================================================

Given a protein list with pairs protein1<tab>protein2<tab>whatever<tab>whatever<tab>....

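it calculates the GO overlap for each pair and writes one tab-separated line per pair to standard output
(this script does not plot anything itself; its output can be used to plot the distribution of GO overlap vs Number of pairs).
Pairs for which no proteinPiana or no GO term is found are skipped, and summary counters are written to standard error.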


output looks like:

prot1   prot2 num_common  num1    num2     terms1          terms2
A2LP    ARVCF   1         1       2       Set([1127L])    Set([2694L, 1127L])
A2LP    TARBP2  0         1       2       Set([1127L])    Set([1170L, 16805L])



"""

# measure_go_overlap.py: measures the GO overlap for protein pairs
#
# Copyright (C) 2006  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt
import os
import time
import re
import readline
import MySQLdb

from sets import *

import copy

from Bio.Blast import NCBIStandalone

import GOApi
import utilities

from PianaDBaccess import *

verbose = 0
verbose_detailed = 0

# ----------------------
# Function usage()
# ----------------------
def usage():
    """
    Writes the command line help of measure_go_overlap.py to standard output.

    Takes no arguments and returns nothing. It does not exit: callers decide
    what to do after the help text has been shown.
    """
    help_lines = (
        "\n--------------------------------------------------------------------------------------------------------------",
        "   Measures the GO overlap for protein pairs\n",
        "\nUsage: python measure_go_overlap.py: --input-file=input_file --term-type=term_type",
        "      --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass",
        "       --input-proteins-type=input-proteins-type-value",
        "       --input-proteins-species=input-proteins-species-value [--help] [--verbose] \n",
        "\nwhere:",
        "     input_file   : name of the file with protein pairs (separated by tabulators)",
        "     term_type    : type of GO term you want to measure (all, molecular_function, biological_process or cellular_component)",
        "     input_proteins_type   : type of identifier used for proteins in input file (eg. geneName, uniacc, unientry, gi, ...)",
        "     input_proteins_species: species of proteins in input file (eg. human, yeast, ...)",
        "     piana_dbname   : name of database piana to be used (required)",
        "     piana_dbhost   : name of host where database piana to be used is placed (required)",
        "     piana_dbuser   : username accessing the database (not required in most systems)",
        "     piana_dbpass   : password of username accessing the database (not required in most systems)",
        "     --help       : prints this message and exits",
        "     --verbose    : prints process info to stdout",
        "--------------------------------------------------------------------------------------------------------------",
    )

    # every entry is one output line; each of them (including the last one) ends with a newline
    sys.stdout.write("\n".join(help_lines) + "\n")
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """
    Parses the command line (sys.argv[1:]) and stores the values found in the
    module-level variables used by the rest of the script:

        input_file, term_type,
        piana_dbname, piana_dbhost, piana_dbuser, piana_dbpass,
        input_proteins_type, input_proteins_species, verbose

    Options that are not given leave the corresponding global untouched.

    Exits with status 2 (after showing the usage) when an unknown option is
    given or when --help is requested.

    Raises ValueError when no input file or no term type was given.

    A term type of "all" is translated to None before returning (see the
    comment at the end of the function).
    """
    global input_file, term_type
    global piana_dbname, piana_dbhost, piana_dbuser, piana_dbpass
    global input_proteins_type, input_proteins_species
    global verbose

    long_options = ["verbose", "help", "input-file=", "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass=",
                    "term-type=", "input-proteins-type=", "input-proteins-species="]

    try:
        opts, args = getopt.getopt(sys.argv[1:], "", long_options)
    except getopt.GetoptError:
        # the exception is fetched through sys.exc_info() so that this
        # statement is valid syntax on every python version
        bad_opt = sys.exc_info()[1]

        # print the problem (on a line of its own), the help information, and exit
        sys.stderr.write("%s\n" % bad_opt)
        usage()
        sys.exit(2)

    for option, value in opts:

        if option == "--input-file":
            input_file = value

        elif option == "--term-type":
            term_type = value

        elif option == "--piana-dbname":
            piana_dbname = value

        elif option == "--piana-dbhost":
            piana_dbhost = value

        elif option == "--piana-dbuser":
            piana_dbuser = value

        elif option == "--piana-dbpass":
            piana_dbpass = value

        elif option == "--input-proteins-type":
            input_proteins_type = value

        elif option == "--input-proteins-species":
            input_proteins_species = value

        elif option == "--verbose":
            verbose = 1

        elif option == "--help":
            # print help information and exit
            usage()
            sys.exit(2)

    # check arguments
    if input_file is None:
        raise ValueError("trying to run the program without giving an input file")

    if term_type is None:
        raise ValueError("trying to run the program without giving a term_type")

    if term_type == "all":
        # looks weird, but None is the value term_type must have to get all GO terms regardless of their type.
        # The user is asked to write "all" to make sure he is setting something, and then "all" is changed
        # to None because that is the value piana_access.get_protein_go_term_id is given as term_type_value
        term_type = None


# --------
# --------
#  Main()               
# --------                               
# --------
# default values for the command line arguments (parseArguments() overwrites those given by the user)
piana_dbname = None
piana_dbuser = None
piana_dbhost = None
piana_dbpass = None

input_file = None
term_type = None

input_proteins_type = None
input_proteins_species = None

# parsing arguments from the command line
parseArguments()


#
# -------

# connection to GO database
# NOTE(review): user, database name and host of the GO database are hard-coded here and
#               cannot be changed from the command line
godb = MySQLdb.connect(user="raragues", db="goDB_200601", host="sefarad.imim.es" )
gocursor = godb.cursor()


# Initialisating connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost=piana_dbhost, dbuser=piana_dbuser, dbpassword= piana_dbpass)

number_of_pairs = 0
num_pairs_no_proteinPiana = 0
num_pairs_no_go = 0

input_id_column = utilities.get_code_column(input_proteins_type)

list_input_tax_ids = piana_access.get_taxonomies_from_species_name(species_name_value= input_proteins_species)
if not list_input_tax_ids:
    # fail with a clear message instead of an IndexError on the [0] below
    raise ValueError("no taxonomy found for species name %s" %(input_proteins_species))
input_tax_id = list_input_tax_ids[0]

dic_protein_gos = {}        # keeps GO terms for proteins that have already been processed
dic_protein_has_piana = {}  # keeps, for proteins already processed, whether a proteinPiana was found for them


def _get_protein_gos(protein):
    """
    Returns a tuple (has_proteinPiana, set_all_gos) for the protein code "protein"

       - has_proteinPiana is True if piana returned at least one proteinPiana for the protein
       - set_all_gos is a Set with the GO term ids of all the proteinPianas of the protein, together
         with whatever GOApi.get_all_parents returns for those term ids (presumably their ancestors
         in the GO hierarchy -- verify against GOApi)

    Results are kept in dic_protein_gos and dic_protein_has_piana, so each protein is
    looked up in the databases only once, no matter how many pairs it appears in.

    Uses the module-level piana_access, gocursor, input_id_column, input_tax_id and term_type
    """
    if protein in dic_protein_gos:
        return dic_protein_has_piana[protein], dic_protein_gos[protein]

    #  1. get all proteinPianas associated to the protein
    list_proteinPiana = piana_access.get_list_protein_piana(proteinCode_value= protein,
                                                            proteinCodeType_value= input_id_column,
                                                            tax_id_value= input_tax_id,
                                                            source_db_info= "no")

    #  2. get GO terms for all proteinPianas associated to the protein (put together all GOs in one set)
    set_gos = Set([])
    for one_proteinPiana in list_proteinPiana:
        set_gos.union_update(piana_access.get_protein_go_term_id(proteinPiana_value=one_proteinPiana,
                                                                 term_type_value=term_type))

    #  3. get all parents for these go term ids
    set_all_gos = Set(GOApi.get_all_parents(list_go_terms=list(set_gos), dic_gos_at_level={},
                                            gocursor=gocursor, piana_access=piana_access))

    #  4. add to the parents the GOs themselves...
    #     (union_update modifies the set in place; union() would return a new set and leave this one untouched)
    set_all_gos.union_update(set_gos)

    # save results for using them in case we process again this protein
    dic_protein_gos[protein] = set_all_gos
    dic_protein_has_piana[protein] = bool(list_proteinPiana)

    return dic_protein_has_piana[protein], set_all_gos


input_fd = open(input_file, "r")

try:
    for one_line in input_fd:

        line_fields = one_line.split()

        if len(line_fields) < 2:
            # protect against bad lines
            continue

        number_of_pairs += 1

        # line_fields[0] is protein code 1
        # line_fields[1] is protein code 2
        # (any other field in the line is ignored)
        protein_1 = line_fields[0]
        protein_2 = line_fields[1]

        # --
        # 1. and 2. get GO terms for both proteins of the pair
        # --
        has_proteinPiana_1, set_all_gos_1 = _get_protein_gos(protein_1)
        has_proteinPiana_2, set_all_gos_2 = _get_protein_gos(protein_2)

        # --
        # 3. skip pairs that cannot be measured
        # --
        if not has_proteinPiana_1 or not has_proteinPiana_2:
            # if there was no proteinPiana for any of the proteins in the pair... skip it
            num_pairs_no_proteinPiana += 1
            continue

        if not set_all_gos_1 or not set_all_gos_2:
            # if there was no GO term for any of the proteins in the pair... skip it
            num_pairs_no_go += 1
            continue

        # --
        # 4. measure the overlap
        # --
        intersection_gos = set_all_gos_1.intersection(set_all_gos_2)

        # 5. write to stdout: protein1 protein2 num_common num1 num2 terms1 terms2
        sys.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %(protein_1, protein_2, len(intersection_gos),  len(set_all_gos_1), len(set_all_gos_2), set_all_gos_1, set_all_gos_2 ) )

        # 6. analyze? how?

    # END OF for one_line in input_fd:

finally:
    # release the input file and the GO database connection, even if a pair made the script fail
    input_fd.close()
    gocursor.close()
    godb.close()

sys.stderr.write("Num pairs processed: %s\n" %(number_of_pairs))
sys.stderr.write("Num pairs skipped (no proteinPiana found): %s\n" %(num_pairs_no_proteinPiana))
sys.stderr.write("Num pairs skipped (no GO term): %s\n" %(num_pairs_no_go))
