"""
File        : parse_go_freqs_files.py
Author      : Ramon Aragues
Creation    : 1.06.06 (in NY - MSKCC)
Contents    : parses files *match-proteins-to-gos.type_*.expr_*.txt  produced by PIANA command match-proteins-to-gos
              and produces a matrix comparing the Go frequencies for all the 'cases' that have a file (eg. tissues)
              
Called from : command line

=======================================================================================================

"""

# parse_go_freqs_files.py: parses files *match-proteins-to-gos.type_*.expr_*.txt 
#
# license goes here

import sys
import getopt
import math
import copy
import glob
import readline
import cPickle


import utilities

from PianaDBaccess import *

format_mode = "html"

verbose = 0
verbose_detailed = 0


# ----------------------
# Function usage()
# ----------------------
def usage():
    print "\n--------------------------------------------------------------------------------------------------------------"
    print " parses files *match-proteins-to-gos.type_*.expr_*.txt  produced by PIANA command match-proteins-to-gos"
    print " and produces a matrix comparing the Go frequencies for all the 'cases' that have a file (eg. tissues) \n"
    print "\nUsage: parse_go_freqs_files.py: --input-dir=input_dir [--help] [--verbose] \n" 
    print "\nwhere:"
    print "     input_dir    : directory that contains the desc_files (do not write the ending slash of the directory!!!!)"
    print "     --help       : prints this message and exits"
    print "     --verbose    : prints process info to stdout"
    print "--------------------------------------------------------------------------------------------------------------"
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """
    Parses the command line (sys.argv[1:]) and stores the result in module-level globals.

    Globals written:
       input_dir  : set to the value given with --input-dir
       verbose    : set to 1 when --verbose is given

    The piana-db*, name-dic-desc-files, threshold, output-proteins-type and task long
    options are accepted by getopt, but their values are ignored by this function.

    Exits (SystemExit):
       status 2, after printing the error and the usage, for an option getopt rejects
       status 0, after printing the usage, for --help

    Raises ValueError when no input directory is set once all options are processed.
    """

    global input_dir

    global verbose

    try:
        opts, args = getopt.getopt(sys.argv[1:], "", ["verbose", "help", "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass=",
                                                      "input-dir=", "name-dic-desc-files=", "threshold=", "output-proteins-type=", "task="])
    except getopt.GetoptError:
        # the exception is fetched through sys.exc_info() because that spelling is
        # valid syntax for every python version
        bad_opt = sys.exc_info()[1]

        # print error (newline-terminated so it does not run into later output) and help, then exit
        sys.stderr.write("%s\n" %(bad_opt))
        usage()
        sys.exit(2)

    for option, value in opts:

        if option == "--input-dir":
            input_dir = value

        elif option == "--verbose":
            verbose = 1

        elif option == "--help":
            # print help information and exit: asking for help is not an error, so status is 0
            usage()
            sys.exit(0)
    # END OF for option,value in opts:

    # check arguments
    #   -> looked up through globals() so that a caller that never created the global
    #      gets the ValueError below instead of a NameError
    if globals().get("input_dir") is None:
        raise ValueError("trying to run the program without giving an input dir")

# --------
# --------
#  Main()               
# --------                               
# --------
# default value: parseArguments() only overwrites this global when --input-dir is given,
# and afterwards checks it against None to detect a missing input directory
input_dir = None

# parsing arguments from the command line
parseArguments()

dic_gos_cases = {'all':{}}     # dictionary that keeps the num of matches for each case being studied
                               #
                               #   format is:    { 'all': { go1: freq,
                               #                            go2: freq,
                               #                            ..........
                               #                          },
                               #                   case1: { go1: freq,
                               #                            go2: freq,
                               #                            ..........
                               #                          }
                               #                   case2: { go1: freq,
                               #                           .............
                               #                 }
                               #
                               #   key 'all' keeps total frequency of each GO
                               #
                               #   NOTE(review): a network literally named 'all' would share the
                               #                 totals entry -- presumably never happens; confirm


dic_cases_gos = {}    # keeps how many gos are there in each case network being studied
                      #   -> key is the case key, and content the number of gos (field num_go_terms)

num_file = 0          # only counted (and reported on stderr) in verbose mode

for file_name in glob.glob(input_dir + "/*match-proteins-to-gos.type_*.expr_*.txt"):

    if verbose:
        num_file += 1
        sys.stderr.write("file_%s." %(num_file))

    # open() with try/finally: the handle is closed even when a line cannot be parsed
    go_file = open(file_name, "r")
    try:
        for one_line in go_file:

            # Each file looks like this (tab separated):
            # network_name=tissue_vagina      num_prots_in_network=111        go=protein targeting    freq=1  percentage=0.21%        num_go_terms=473  force_expression=no

            # blank lines (eg. an empty line at the end of the file) carry no data
            if not one_line.strip():
                continue

            # remove the line terminator so that it cannot end up inside the last field
            line_fields = one_line.rstrip("\r\n").split("\t")

            # each field is name=value: splitting on the first "=" only keeps values that
            # contain "=" complete. Only the fields used below are parsed
            # (num_prots_in_network, percentage and force_expression are not needed).
            case_name = line_fields[0].split("=", 1)[1]
            go_name = line_fields[2].split("=", 1)[1]
            freq = int(line_fields[3].split("=", 1)[1])
            num_go_terms = int(line_fields[5].split("=", 1)[1])

            # insert the total number of gos in this case
            dic_cases_gos[case_name] = num_go_terms

            # insert the total freq for this go
            dic_gos_cases['all'][go_name] = dic_gos_cases['all'].get(go_name, 0) + freq

            # insert the freq of this go for this case
            #   NOTE(review): a repeated (case, go) pair overwrites the value here while the
            #                 'all' total above keeps adding -- confirm this is intended
            dic_gos_cases.setdefault(case_name, {})[go_name] = freq

        # END OF for one_line in go_file:
    finally:
        go_file.close()
# END OF for file_name in glob.glob(input_dir + "/*match-proteins-to-gos.type_*.expr_*.txt"):
	



# Write to stdout the matrix built from all the files: one column per case, one row per GO

# the key lists are taken once, so the header and every row walk the cases in the same order
cases_list = dic_cases_gos.keys()
gos_list = dic_gos_cases['all'].keys()


if format_mode == "txt":

    raise ValueError("txt mode not available")

elif format_mode == "html":

    write_out = sys.stdout.write

    # frequency cells above 10 are wrapped in a coloured font tag, the others are left plain
    #   -> key is the result of the test (freq > 10), content is (opening tag, closing tag)
    highlight_tags = {True: ("<font color=FF33FF>", "</font>"),
                      False: ("", "")}

    # header row: empty corner cell, title of the totals column, then one title per case
    write_out("<table border=1>\n   <tr>\n       <td> </td> <td align=center><b>total frequency of GOs</b></td> ")
    write_out("".join([" <td align=center><b>%s</b></td> " %(case_key.replace("_"," ")) for case_key in cases_list]))
    write_out("\n   </tr>\n")

    # second row: number of GOs of each case (the totals column stays empty)
    write_out("   <tr>\n      <td><b>number of GOs in case</b></td> <td> </td> ")
    write_out("".join([" <td align=center>%s</td> " %(dic_cases_gos[case_key]) for case_key in cases_list]))
    write_out("\n   </tr>\n")

    # one row per GO
    for go_term in gos_list:

        total_freq = dic_gos_cases['all'][go_term]

        # a GO only gets a row when its overall frequency is at least 2
        if total_freq >= 2:

            write_out("   <tr>\n      <td><b>%s</b></td> <td align=center>%s</td> " %(go_term, total_freq))

            for case_key in cases_list:

                # a GO that never appeared for this case counts as frequency 0
                case_freq = dic_gos_cases[case_key].get(go_term, 0)

                open_tag_freq, close_tag_freq = highlight_tags[case_freq > 10]

                write_out(" <td align=center>%s%s%s</td>" %(open_tag_freq, case_freq, close_tag_freq))
            # END OF for case_key in cases_list:

            write_out("\n   </tr>\n")
    # END OF for go_term in gos_list:

    write_out("</table>\n")


	
