"""
File        : plot_go_overlap.py
Author      : Ramon Aragues
Creation    : 03.05.06 (in NY - MSKCC)
Contents    : reads every file of an input directory (gene pairs and their
              number of common GO terms) and plots, for each file, the
              cumulative distribution of the GO terms overlap
Called from : command line

=======================================================================

input file must look like this (without headers):
gene1 gene2 num_common  num1    num2     gos 1                                  gos 2
A2M     VWF     7       21      10      Set([8887L,   11292L, 5023, ...])     Set([4801L, ....])
A2M     VWF     7       21      10      Set([8887L,   11292L, 5023])          Set([4801L])
A2M     VWF     7       21      10      Set([8887L,   11292L, 5023])          Set([4801L])
A2M     VWF     7       21      10      Set([8887L,   11292L, 5023])          Set([4801L])

Only the first three fields are mandatory (i.e. the remaining fields are not used for plotting the graph)




"""

# plot_go_overlap.py: takes an input file with gene pairs and num of go
#                     terms overlap and plots a graph
#
# Copyright (C) 2006  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt
import os
import time
import re
import readline
import MySQLdb
from plplot import *

from sets import *

import utilities


verbose = 1
verbose_detailed = 0

# ----------------------
# Function usage()
# ----------------------
def usage():
    """
    Print the command line help of plot_go_overlap.py to stdout.

    Each entry of help_lines is printed on its own line, in order.
    """
    help_lines = (
        "\n--------------------------------------------------------------------------------------------------------------",
        "\n",
        "\nUsage: python plot_go_overlap.py --input-dir=input_dir",
        "       [--help] [--verbose] \n",
        "\nwhere:",
        "     input_dir   : directory with the files with genes and how many GO terms in common they have",
        "     --help       : prints this message and exits",
        "     --verbose    : prints process info to stdout",
        "--------------------------------------------------------------------------------------------------------------",
    )

    for help_line in help_lines:
        print(help_line)
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """
    Parse the command line (sys.argv) and store the result in module globals.

    Recognised long options:
        --input-dir=DIR : DIR is stored in the global input_dir
        --verbose       : sets the global verbose to 1
        --help          : prints the usage message and exits with status 2

    An unrecognised option (or a missing option value) writes the getopt
    error message to stderr, prints the usage message and exits with status 2.

    Raises ValueError if no input directory has been set once all options
    have been processed. The global input_dir must have been initialised
    (to None) by the caller before this function is called.
    """
    global input_dir
    global verbose

    try:
        opts, args = getopt.getopt(sys.argv[1:], "", ["verbose", "help", "input-dir="])
    except getopt.GetoptError:
        # the exception is fetched through sys.exc_info() so that this handler
        # does not depend on the version-specific "except X, name" syntax
        bad_opt = sys.exc_info()[1]
        # newline-terminated, so the message does not run into whatever is
        # written to the same stream afterwards
        sys.stderr.write(str(bad_opt) + "\n")
        usage()
        sys.exit(2)

    for option, value in opts:

        if option == "--input-dir":
            input_dir = value

        elif option == "--verbose":
            verbose = 1

        elif option == "--help":
            # print help information and exit
            usage()
            sys.exit(2)

    # check arguments
    if input_dir is None:
        raise ValueError("trying to run the program without giving an input directory")


# --------
# --------
#  Main()               
# --------                               
# --------
input_dir = None


def _read_overlap_distribution(file_name):
    """
    Count, for one input file, how many gene pairs have each GO overlap.

    Only the third whitespace-separated field of each line (number of common
    GO terms) is used. Lines with fewer than three fields, or whose third
    field is not a non-negative integer, are skipped.

    Returns the tuple (dic_overlap_distribution, number_of_pairs, max_overlap)
    where dic_overlap_distribution follows the structure:
          { 0: number of pairs with 0 overlap,
            1: number of pairs with 1 overlap,
            ............
          }
    """
    dic_overlap_distribution = {}
    number_of_pairs = 0
    max_overlap = 0  # maximum overlap found (sets how far the x vector goes)

    input_fd = open(file_name, "r")
    try:
        for one_line in input_fd:
            line_fields = one_line.split()

            if len(line_fields) < 3:
                # protect against bad lines
                continue

            # line_fields[0] is gene name 1
            # line_fields[1] is gene name 2
            # line_fields[2] is number of common GO terms (go overlap)
            # remaining fields are not used
            try:
                go_overlap = int(line_fields[2])
            except ValueError:
                # e.g. a header line: not a gene pair, do not count it
                continue

            if go_overlap < 0:
                # a negative overlap would be counted but never plotted
                continue

            number_of_pairs += 1

            if go_overlap > max_overlap:
                max_overlap = go_overlap

            dic_overlap_distribution[go_overlap] = dic_overlap_distribution.get(go_overlap, 0) + 1
    finally:
        # make sure the file is closed even if reading fails
        input_fd.close()

    return dic_overlap_distribution, number_of_pairs, max_overlap


def _cumulative_percentages(dic_overlap_distribution, number_of_pairs, max_overlap):
    """
    Build the vectors to plot: x holds each overlap from 0 to max_overlap,
    y the percentage of pairs whose overlap is lower or equal to it.

    number_of_pairs must be greater than 0.
    """
    x = []
    y = []
    cummulative_num_of_pairs = 0  # number of pairs covered up to number_overlap

    for number_overlap in range(max_overlap + 1):
        cummulative_num_of_pairs += dic_overlap_distribution.get(number_overlap, 0)
        x.append(number_overlap)
        # 100.0 forces a float division: an integer division truncates the percentage
        y.append(cummulative_num_of_pairs * 100.0 / number_of_pairs)

    return x, y


# parsing arguments from the command line
parseArguments()

# --
# Init graphical output
# --
plsdev("psc")                   # set output mode (psc: colour postscript; jpeg; png)
plot_file_name = input_dir.replace("/", "_") + ".go_overlap.ps"
if verbose:
    sys.stderr.write("Creating plot file %s\n" % plot_file_name)
plsfnam(plot_file_name)         # set output file name
plscolbg(255, 255, 255)         # set background color (default: black)
plsdiori(1)                     # set orientation (portrait==1, landscape == 0)
plinit()
pladv(0)
plenv(0, 50, 0, 101, 0, 0)      # min x, max x, min y, max y, 0, 0
# a space separates the file name from the rest of the title
graph_title = plot_file_name + " GO overlap\n"

x_title = "Number Common GO terms"
y_title = "cummulative percentage of protein pairs"
pllab(x_title, y_title, graph_title)
plschr(0, 1)           # set character size (0-> don't change default, 0.5 -> sets half current size)

j = 0  # used to change colors for each line
labels_position = 25  # initial y position of the labels written in the graph (x is fixed)

# sorted: os.listdir() returns the names in arbitrary order, which would make
# the colour and label position of each file change between runs
for file_in_directory in sorted(os.listdir(input_dir)):

    if file_in_directory == "CVS":
        continue

    # os.path.join works whether or not input_dir ends with a path separator
    input_file_name = os.path.join(input_dir, file_in_directory)

    if not os.path.isfile(input_file_name):
        # subdirectories cannot be read as input files
        continue

    if verbose:
        sys.stderr.write("Populating dictionary dic_overlap_distribution for file %s\n" % file_in_directory)

    dic_overlap_distribution, number_of_pairs, max_overlap = _read_overlap_distribution(input_file_name)

    if number_of_pairs == 0:
        # nothing to plot (and the percentages would be a division by zero)
        sys.stderr.write("No valid gene pairs found in file %s: skipping it\n" % file_in_directory)
        continue

    # Now we know how many pairs have each particular GO overlap: plot the cummulative distribution
    if verbose:
        sys.stderr.write("Plotting graph\n")

    file_prefix = file_in_directory.split('/')[-1].split('.')[0]

    x, y = _cumulative_percentages(dic_overlap_distribution, number_of_pairs, max_overlap)

    plcol0(utilities.graphic_colours[j % utilities.number_of_colours])

    plwid(1)  # set width
    label_line = file_prefix
    plptex(5, labels_position, 0., 0., 0., label_line)
    j += 1
    labels_position -= 5
    plpsty(0)  # select area fill pattern
    plwid(4)  # set width

    plline(x, y)

    sys.stderr.write("Num pairs processed: %s\n" % (number_of_pairs))

# END OF for file_in_directory in sorted(os.listdir(input_dir)):


# -
# Close graphical output
# -
plcol0(1)
plend()
    

