"""
File        : parse_matched_pathway_files.py
Author      : Ramon Aragues
Creation    : 26.05.06 (in NY - MSKCC)
Contents    : parses files *match-proteins-to-pathways.expr_*.txt  produced by PIANA command match-proteins-to-pathways
              and produces a matrix comparing the affected pathways for all the 'cases' that have a file (eg. tissues)
              
Called from : command line

=======================================================================================================


"""

# parse_matched_pathway_files.py: parses files *match-proteins-to-pathways.expr_*.txt
#
# license goes here

import sys
import getopt
import math
import copy
import glob
import readline
import cPickle
import re
from sets import *


import utilities

from PianaDBaccess import *

verbose = 0
verbose_detailed = 0


# ----------------------
# Function usage()
# ----------------------
def usage():
    """Write the command-line help text of this script to standard output."""
    help_lines = (
        "\n--------------------------------------------------------------------------------------------------------------",
        " parses files *match-proteins-to-pathways.expr_*.txt  produced by PIANA command match-proteins-to-pathways \n",
        " and produces a matrix comparing the affected pathways for all the 'cases' that have a file in the input dir \n",
        "\nUsage: parse_matched_pathway_files.py: --input-dir=input_dir --global-file-name=global_file_name [--help] [--verbose] \n",
        "\nwhere:",
        "     input_dir      : directory that contains the *match-proteins-to-pathways.expr_*.txt files (do not write the ending slash of the directory!!!!)",
        "     global_file_name : set the name of the file that will hold global results from the parsing",
        "     --help       : prints this message and exits",
        "     --verbose    : prints process info to stdout",
        "--------------------------------------------------------------------------------------------------------------",
        " -> format-mode (html or txt) is hardcoded",
        " -> Attention! In order for the links to work in the HTML output, the results file must be placed in the directory",
        "           where thefiles being parsed are located",
        "--------------------------------------------------------------------------------------------------------------",
    )

    # every entry is one output line: the text followed by a newline
    for one_help_line in help_lines:
        sys.stdout.write(one_help_line + "\n")
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """Parse sys.argv and store the recognised options in module-level globals.

    Options handled:
        --input-dir=DIR           stored in the global input_dir
        --global-file-name=NAME   stored in the global global_file_name
        --verbose                 sets the global verbose to 1
        --help                    prints the usage text and exits with status 0

    The --piana-dbname/--piana-dbhost/--piana-dbuser/--piana-dbpass options are
    accepted by the parser, but their values are not used by this function.

    An option that getopt does not recognise is reported on stderr, followed by
    the usage text, and the program exits with status 2.

    Raises:
        ValueError: if input_dir or global_file_name is still None after parsing
    """

    global input_dir
    global global_file_name

    global verbose

    accepted_long_options = ["verbose", "help",
                             "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass=",
                             "input-dir=", "global-file-name="]

    try:
        opts, _unused_args = getopt.getopt(sys.argv[1:], "", accepted_long_options)
    except getopt.GetoptError:
        # sys.exc_info() is used to get the exception so that this code does not depend
        # on a version-specific "except ... , name" / "except ... as name" syntax
        bad_opt = sys.exc_info()[1]

        # print the error (terminated, so it does not run into later output), the help text and exit
        sys.stderr.write("%s\n" % (bad_opt,))
        usage()
        sys.exit(2)

    for option, value in opts:

        if option == "--input-dir":
            input_dir = value

        elif option == "--global-file-name":
            global_file_name = value

        elif option == "--verbose":
            verbose = 1

        elif option == "--help":
            # help was explicitly requested: this is not an error, so exit with status 0
            usage()
            sys.exit(0)
    # END OF for option, value in opts:

    # check arguments
    if input_dir is None:
        raise ValueError("trying to run the program without giving an input dir")

    if global_file_name is None:
        raise ValueError("trying to run the program without giving a global file name")


# --------
# --------
#  Main()               
# --------                               
# --------
# Command line settings: both start as None and are overwritten by parseArguments()
input_dir = None
global_file_name = None

# read the settings given by the user on the command line
parseArguments()

# Number of matches for each case being studied, and the file that has those proteins and info about them
#
#   format is:    { 'all': { pathway1: [num_pathway_proteins, None],
#                            pathway2: [num_pathway_proteins, None],
#                            .....................
#                          },
#                   case1: { pathway1: [num_matches, file_name],
#                            pathway2: [num_matches, file_name],
#                            .....................
#                          },
#                   case2: { pathway1: [num_matches, file_name],
#                            .............
#                          }
#                 }
#
#   key 'all' keeps the total number of proteins appearing in each pathway
dic_pathways_cases = {'all': {}}

# key is the pathway name, content is the name of the file with the pathway proteins
dic_pathway_files = {}

# key is the case key, content is the number of proteins in that case network
dic_cases_prots = {}

# key is the case key, content is the name of the file with the relevant proteins of that case
dic_cases_relevant_files = {}

# counter of matched-pathways files processed so far
num_file = 0

# names of the files that will hold the results: the global one is fixed by the user,
# the other ones are derived from it by appending a default suffix
matched_pathways_matrix_report_file_name = "%s.matched_pathways.html" % (global_file_name,)
cases_overlap_matrix_report_file_name = "%s.cases_overlap.html" % (global_file_name,)
cases_relevant_proteins_report_file_name = "%s.cases_relevant_proteins.html" % (global_file_name,)
expression_stats_report_file_name = "%s.expression_stats.html" % (global_file_name,)
expression_pathway_report_file_name = "%s.expression_pathway.html" % (global_file_name,)

# create (or truncate) the results files
global_report_fd = open(global_file_name, "w")
matched_pathways_matrix_report_fd = open(matched_pathways_matrix_report_file_name, "w")
cases_overlap_matrix_report_fd = open(cases_overlap_matrix_report_file_name, "w")
cases_relevant_proteins_report_fd = open(cases_relevant_proteins_report_file_name, "w")
expression_stats_report_fd = open(expression_stats_report_file_name, "w")
expression_pathway_report_fd = open(expression_pathway_report_file_name, "w")

# ----
# Fill the global report: a title box, a short introduction and
# one bullet per results file, each bullet linking to that file
# ----
global_report_fd.write("""<center><table border=1 style="border-collapse: collapse" bordercolor="#808080"><tr><td align=center><big>FILE %s: GLOBAL REPORT FOR %s</big></td></tr></table></center><br><br>""" % (global_file_name, input_dir))

global_report_fd.write("""From this page you can access all results generated by PIANA for your biological problem of interest. By clicking in the following links you""")
global_report_fd.write("""will have access to tables that describe interesting proteins, cases similatities and pathway matches.<br><br>""")

# each entry is (file being linked, text of the link, explanation shown under the link)
global_report_links = (
    # MATCHED PATHWAYS
    (matched_pathways_matrix_report_file_name,
     "pathways matches report",
     "This link takes you to a matrix where the overlap between the cases relevant proteins and known pathways is shown."),
    # CASES OVERLAP
    (cases_overlap_matrix_report_file_name,
     "cases overlap report",
     "This link takes you to a matrix where the overlap between each case relevant proteins is shown."),
    # CASES RELEVANT PROTEINS
    (cases_relevant_proteins_report_file_name,
     "cases relevant proteins report",
     "This link takes you to a table where each protein is described in terms of 1) in how many cases it appears, 2) expression information."),
    # PROTEINS EXPRESSION STATS
    (expression_stats_report_file_name,
     "proteins expression stats report",
     "This link takes you to a table where expression information is given by type of protein (cancer genes, linker, partner, others)"),
    # PATHWAY  EXPRESSION STATS
    (expression_pathway_report_file_name,
     "pathway expression stats report",
     "This link takes you to a table where expression information is given for each pathway"),
)

global_report_fd.write("""<ul>""")

for linked_file_name, link_text, link_explanation in global_report_links:
    global_report_fd.write("""<li>Click here to go to <a href="%s">%s</a><br>""" % (linked_file_name, link_text))
    global_report_fd.write("""<br><i>%s</i>""" % (link_explanation,))
    global_report_fd.write("""</li><br><br>""")
# END OF for linked_file_name, link_text, link_explanation in global_report_links:

global_report_fd.write("""</ul>""")


# ----
# 1. Create expression statistics report for pathways
# ----

dic_expression_pathway = {}   # is a dictionary that follows the format:
                              #
                              #     { pathway_name: ( num_prots_in_pathway, { case_name: (num_expressed, num_over_expressed, num_under_expressed),
                              #                                               case_name: (num_expressed, num_over_expressed, num_under_expressed),
                              #                                               ...........
                              #                                             }
                              #                     ),
                              #
                              #       pathway_name: (..., ... ),
                              #       .........
                              #     }

for file_name in glob.glob(input_dir + "/*expression_on_pathways.txt"):
    # for each file with expression on pathways, get information
    #  -> expression for pathways files are created in point 8 of PianaGraph.match_pathways
    #  -> first line is case=case_name
    #     subsequent lines are pathway=pathway_name<TAB>expression=total_number_proteins_pathway,expressed_number,over_expressed_number,under_expressed_number

    case_name = None   # stays None until the first line (case=case_name) has been read

    expression_file_fd = open(file_name, "r")
    try:
        for one_line in expression_file_fd:

            if case_name is None:
                # rstrip() removes the end of line, which otherwise becomes part of the case name
                case_name = one_line.split("=")[1].rstrip()
                continue

            one_line_fields = one_line.split()
            if not one_line_fields:
                # blank line (eg. at the end of the file): nothing to parse
                continue

            pathway_name = one_line_fields[0].split("=")[1]

            expression_counts = one_line_fields[1].split("=")[1].split(",")
            number_of_pathway_proteins = int(expression_counts[0])
            number_of_expressed_proteins = int(expression_counts[1])
            number_of_over_expressed_proteins = int(expression_counts[2])
            number_of_under_expressed_proteins = int(expression_counts[3])

            # the first time a pathway is seen, its entry is created with the pathway size found on that line
            pathway_entry = dic_expression_pathway.setdefault(pathway_name, (number_of_pathway_proteins, {}))
            pathway_entry[1][case_name] = (number_of_expressed_proteins,
                                           number_of_over_expressed_proteins,
                                           number_of_under_expressed_proteins)
        # END OF for one_line in expression_file_fd:
    finally:
        expression_file_fd.close()

# END OF for file_name in glob.glob(input_dir + "/*expression_on_pathways.txt"):

# the columns of the table are all the cases found in any pathway (sorted, so that the order is always the same)
dic_all_case_names = {}
for one_pathway in dic_expression_pathway:
    for one_case_name in dic_expression_pathway[one_pathway][1]:
        dic_all_case_names[one_case_name] = None

case_names = sorted(dic_all_case_names)

# print headers for the table (pathways are rows and cases are columns)
expression_pathway_report_fd.write("""<table border=1 style="border-collapse: collapse" bordercolor="#808080">\n""")
expression_pathway_report_fd.write("<tr> <td align=center><b>Pathway name</b></td> <td align=center><b>Number of proteins</b></td>")

for one_case_name in case_names:
    expression_pathway_report_fd.write("<td align=center><b>%s</b></td>" %(one_case_name))
expression_pathway_report_fd.write("</tr>\n")

for one_pathway in dic_expression_pathway:
    num_prots_in_pathway = dic_expression_pathway[one_pathway][0]
    dic_cases_expression = dic_expression_pathway[one_pathway][1]

    expression_pathway_report_fd.write("<tr><td>%s</td><td>%s</td>" %(one_pathway, num_prots_in_pathway)) # fields pathway name and size of pathway

    for one_case_name in case_names:

        if one_case_name not in dic_cases_expression:
            # the file of this case did not have a line for this pathway
            expression_pathway_report_fd.write("<td align=center>n/a</td>")
            continue

        num_expressed = dic_cases_expression[one_case_name][0]

        try:
            # 100.0 forces a float division: the percentage is printed with two decimals
            percentage_expressed = 100.0*num_expressed/num_prots_in_pathway
        except ZeroDivisionError:
            percentage_expressed = 0

        expression_pathway_report_fd.write("<td align=center><b>%s<br>(%.2f%%)</b></td>" %(num_expressed, percentage_expressed))
    # END OF for one_case_name in case_names:
    expression_pathway_report_fd.write("</tr>\n")

# END OF for one_pathway in dic_expression_pathway:

expression_pathway_report_fd.write("\n</table>")
expression_pathway_report_fd.close()



# ----
# 2. Create expression statistics report for proteins (classified by root, linker, partner, ...)
# ----
# ATTENTION:  In fact, right now, it is advised not to look at these numbers but rather at those obtained
#             from running run_undexpressed_expression_files.tcsh and run_overexpressed_expression_files.tcsh
#             (in directory piana/code/analysis/genescancer)
# ----

expression_stats_report_fd.write("""<table border=1 style="border-collapse: collapse" bordercolor="#808080">\n""")

expression_stats_report_fd.write("<tr> <td align=center><b>Tissue</b></td> <td align=center><b>Cancer_Gene</b></td> <td align=center><b>Linkers</b></td> <td align=center><b>Cancer_Gene <br>Partners</b></td> <td align=center><b>Others</b></td></tr>\n")

# one table row is the case name followed by four cells (roots, linkers, partners, others) with the same layout
expression_stats_cell_template = "<td align=center>total=%s<br>over=%s<br>under=%s<br>expressed=%s (%s%%)</td>"
expression_stats_row_template = "<tr> <td align=center>%s</td> " + " ".join([expression_stats_cell_template]*4) + "</tr>\n"

for file_name in glob.glob(input_dir + "/*expression_statistics.txt"):
    # for each file with expression statistics, get information
    #  -> expression statistics files are created in point 4 of PianaGraph.match_pathways
    #  -> each line looks like this:
    #        case=case_name<TAB>roots=X,X,X,X<TAB>linkers=X,X,X,X<TAB>partners=X,X,X,X<TAB>others=X,X,X,X<NEWLINE>
    #     where X,X,X,X is total_number,expressed_number,over_expressed_number,under_expressed_number

    statistics_file_fd = open(file_name, "r")
    try:
        for one_line in statistics_file_fd:
            one_line_fields = one_line.split()
            # [0] is case=case_name
            # [1] roots=total_number,expressed_number,over_expressed_number,under_expressed_number
            # [2] linkers=total_number,expressed_number,over_expressed_number,under_expressed_number
            # [3] partners=total_number,expressed_number,over_expressed_number,under_expressed_number
            # [4] others=total_number,expressed_number,over_expressed_number,under_expressed_number

            if not one_line_fields:
                # blank line (eg. at the end of the file): nothing to report
                continue

            if len(one_line_fields) < 5:
                raise ValueError("file %s: expected fields case, roots, linkers, partners and others but found: %s" %(file_name, one_line.rstrip()))

            case_name = one_line_fields[0].split("=")[1]

            row_values = [case_name]

            for one_type_field in one_line_fields[1:5]:
                type_counts = one_type_field.split("=")[1].split(",")

                number_of_proteins = int(type_counts[0])
                expressed_proteins = int(type_counts[1])
                over_expressed_proteins = int(type_counts[2])
                under_expressed_proteins = int(type_counts[3])

                try:
                    # integer percentage (floor division), printed without decimals
                    percentage_expressed = 100*expressed_proteins//number_of_proteins
                except ZeroDivisionError:
                    percentage_expressed = 0

                # same order as the placeholders of expression_stats_cell_template
                row_values.extend([number_of_proteins, over_expressed_proteins, under_expressed_proteins,
                                   expressed_proteins, percentage_expressed])
            # END OF for one_type_field in one_line_fields[1:5]:

            expression_stats_report_fd.write(expression_stats_row_template %tuple(row_values))
        # END OF for one_line in statistics_file_fd:
    finally:
        statistics_file_fd.close()
# END OF for file_name in glob.glob(input_dir + "/*expression_statistics.txt"):
expression_stats_report_fd.write("\n</table>")
expression_stats_report_fd.close()





# ----
# 3. Now, parse all results files to create the rest of the report
# ----
#
# Note: each case refers to a different run of PIANA (either with different input file, or maybe
#                                        just with different expression files)
#       -> For example, in genescancer, each case is a piana run with the same root proteins
#          for all cases, but different oncomine expression files depending on the type of cancer
#
# Note: a relevant protein is one that appears in the case ppi network and is over/under expressed
#
# 3.1. get data for matched pathways results files
# 3.2  retrieve all relevant proteins from the cases


# 3.1 read every matched pathways results file and store its contents in the dictionaries
for file_name in glob.glob(input_dir + "/*match-proteins-to-pathways.expr_*.txt"):

    if verbose:
        num_file = num_file + 1
        sys.stderr.write("file_%s." % (num_file,))

    is_first_line = 1

    for one_line in open(file_name, "r"):

        if is_first_line:
            # the first line contains the name of the file that has all relevant proteins
            is_first_line = 0
            relevant_file = one_line.split("=")[1].rstrip()
            continue

        # Every other line is a list of TAB separated tokens key=value. In order, the tokens are:
        #
        # [0] network_name=tissue_pituitary
        # [1] num_prots_in_network=15
        # [2] pathway=DNA_Repair
        # [3] num_matched=12
        # [4] percentage_matched=13
        # [5] num_prots_in_pathway=92
        # [6] force_expression=no
        # [7] pathway_file=pathway_prots.txt
        # [8] matched_file=matched_prots.txt

        line_fields = one_line.split()

        # returns the value of the token key=value placed in a given position of the line
        value_of_token = lambda token_position: line_fields[token_position].split("=")[1]

        case_name = value_of_token(0)
        num_prots_in_case = int(value_of_token(1))
        pathway_name = value_of_token(2)
        num_matched = int(value_of_token(3))
        num_pathway_prots = int(value_of_token(5))
        force_expression = value_of_token(6)
        pathway_file = value_of_token(7)
        matched_file = value_of_token(8)

        # TO DO!!! the same values are inserted again for every line of a file... it is not efficient,
        # but it does not matter because this is fast anyway

        # file with the relevant proteins and number of proteins of the case
        dic_cases_relevant_files[case_name] = relevant_file
        dic_cases_prots[case_name] = num_prots_in_case

        # size of the pathway: there is no file that holds the matched proteins of all the cases, that is why None is used
        dic_pathways_cases['all'][pathway_name] = [num_pathway_prots, None]

        # number of matches (and file with them) for this pair case-pathway
        dic_pathways_cases.setdefault(case_name, {})[pathway_name] = [num_matched, matched_file]

        # file with the proteins of the pathway
        dic_pathway_files[pathway_name] = pathway_file

    # END OF for one_line in open(file_name, "r"):
# END OF for file_name in glob.glob(input_dir + "/*match-proteins-to-pathways.expr_*.txt"):
	


# 3.2  Now, retrieve all relevant proteins from the cases and populate a dictionary so we can control
#      which proteins appear only in one case and which other proteins appear in other cases as well
#      Find as well global frequencies for each protein

dic_protein_freqs = {} # keys are protein names and content the number of cases in which it appears as relevant

dic_cases_protein_sets = {}  # a dictionary with keys the case name and content a Set of relevant proteins

for one_case in dic_cases_relevant_files:

    # not very beautiful, but it is the only way I found to easily parse the proteins for each case... In PianaGraph
    # I am printing twice the proteins, once in HTML and another one in TXT. The HTML is used to link it from the report
    # files, and the TXT to retrieve the relevant proteins for each case.
    # And the name of the TXT file is the same as the HTML but changing the extension
    #
    # Attention: rstrip("html") removes any trailing h, t, m or l characters (not the literal suffix). It gives
    #            the expected name because the '.' that precedes the extension stops the stripping.
    relevant_txt_file_name = input_dir + "/" + dic_cases_relevant_files[one_case].rstrip("html") + "txt"

    case_proteins = Set([])
    dic_cases_protein_sets[one_case] = case_proteins

    file_fd = open(relevant_txt_file_name, "r")
    try:
        for one_line in file_fd:
            # the protein is the first TAB separated column: the end of line is removed first, so that it does not
            # become part of the protein name in lines that do not have a TAB
            protein = one_line.rstrip("\r\n").split("\t")[0]

            if not protein:
                # blank line: there is no protein to register
                continue

            if protein in case_proteins:
                # protein repeated in the file of this case: the frequency counts cases, not lines
                continue

            case_proteins.add(protein)
            dic_protein_freqs[protein] = dic_protein_freqs.get(protein, 0) + 1
        # END OF for one_line in file_fd:
    finally:
        file_fd.close()

# END OF for one_case in dic_cases_relevant_files:
	



# Now, print the matrix with the data retrieved from all files

# get the keys of the dics, we want to follow always the same order...
cases_list = dic_cases_prots.keys()
pathways_list = dic_pathways_cases['all'].keys()

# find out which relevant proteins are unique to each case
dic_cases_unique_proteins = {} # keys are case names and content the proteins that only appear as relevant in this case
dic_cases_unique_file = {} # keys are case names and content the file name that has the unique proteins info

all_cases_info_dics = {} # keeps the dic_proteins_info dictionary for each case
                         # keys are the case names and contents the dic_proteins_info (which is a pickled dictionary with proteins info)

for one_case_name in cases_list:

    # create a set with the relevant proteins of all the cases except for this one
    all_other_prots = Set([])

    for other_case_name in cases_list:
        if other_case_name != one_case_name:
            all_other_prots.union_update(dic_cases_protein_sets[other_case_name])
    # END OF for other_case_name in cases_list:

    dic_cases_unique_proteins[one_case_name] = dic_cases_protein_sets[one_case_name].difference(all_other_prots)

    # load the pickle file with info for each protein ( see utilities.print_proteins_to_file() )
    #  -> the name of the pickle file is the name of the HTML file with the relevant proteins, changing the extension
    #
    # NOTE(review): unpickling runs whatever the pickle file describes. Only use this script on an input_dir
    #               whose files were produced by a trusted PIANA run
    pickle_file_name = input_dir + "/" + dic_cases_relevant_files[one_case_name].rstrip("html") + "pickle"

    pickle_file_fd = open(pickle_file_name, "rb")
    try:
        all_cases_info_dics[one_case_name] = cPickle.load(pickle_file_fd)
    finally:
        pickle_file_fd.close()

    case_info_dic = all_cases_info_dics[one_case_name]

    # print the unique proteins to a file... so it can be linked from the main html tables
    unique_file_name = one_case_name + ".unique_proteins.html"
    dic_cases_unique_file[one_case_name] = unique_file_name

    unique_file_fd = open(unique_file_name, "w")
    try:
        unique_file_fd.write("<br><center><b>RELEVANT PROTEINS THAT ARE UNIQUE TO %s</b></center><br><br>" %(one_case_name))
        unique_file_fd.write(case_info_dic['header'])
        for one_protein in dic_cases_unique_proteins[one_case_name]:
            unique_file_fd.write(case_info_dic[one_protein])
        unique_file_fd.write(case_info_dic['footer'])
    finally:
        # close the file here, so that its contents are on disk before the reports link to it
        unique_file_fd.close()

# END OF for one_case_name in cases_list:



# Both matrix reports start the same way: title, (explanation), link back to the global report and
# a table whose first row has one column per case
back_to_global_report_link = """ <i>(Click here to go <a href="%s">back to the global report </a>)</i><br><br>""" % (global_file_name,)
matrix_table_start = """<table border=1 style="border-collapse: collapse" bordercolor="#808080">\n   <tr>\n       <td> </td> <td align=center><b>%s</b></td> """

# start of the pathway matches report
matched_pathways_matrix_report_fd.write("""<center><big>FILE %s: PATHWAY MATCHES</big></center><br><br>""" % (matched_pathways_matrix_report_file_name,))
matched_pathways_matrix_report_fd.write(back_to_global_report_link)
matched_pathways_matrix_report_fd.write(matrix_table_start % ("number of proteins in pathway",))

# start of the cases overlap report
cases_overlap_matrix_report_fd.write("""<center><big>FILE %s: CASES OVERLAPS</big></center><br><br>""" % (cases_overlap_matrix_report_file_name,))
cases_overlap_matrix_report_fd.write(""" This file shows the overlapping relevant proteins for each case being studied.<ul>""")
cases_overlap_matrix_report_fd.write(""" <li>Column 'unique to case' refers to proteins that only appear in that case</li>""")
cases_overlap_matrix_report_fd.write(""" <li>To see all relevant proteins for a certain case "i", click on the matrix position [case i][case i] </li>""")
cases_overlap_matrix_report_fd.write("""</ul><br>""")
cases_overlap_matrix_report_fd.write(back_to_global_report_link)
cases_overlap_matrix_report_fd.write(matrix_table_start % ("Unique to case",))

# one column header per case in both tables
for one_case in cases_list:
    case_header_cell = " <td align=center><b>%s</b></td> " % (one_case.replace("_", " "),)
    matched_pathways_matrix_report_fd.write(case_header_cell)
    cases_overlap_matrix_report_fd.write(case_header_cell)

matched_pathways_matrix_report_fd.write("\n   </tr>\n")
cases_overlap_matrix_report_fd.write("\n   </tr>\n")

# pathway matches report: row with the number of proteins of each case
# cases overlap report:   one row per case, with the overlap against every case
matched_pathways_matrix_report_fd.write("   <tr>\n      <td><b>number of proteins in case</b></td> <td> </td> ")

for row_case in cases_list:

    num_unique_proteins = len(dic_cases_unique_proteins[row_case])

    matched_pathways_matrix_report_fd.write(""" <td align=center><a href="%s">%s</a><br>(<a href="%s">%s</a> unique)</td>""" % (
        dic_cases_relevant_files[row_case],
        dic_cases_prots[row_case],
        dic_cases_unique_file[row_case],
        num_unique_proteins))

    cases_overlap_matrix_report_fd.write("""<tr><td>%s</td> <td align=center><a href="%s">%s</a></td> """ % (
        row_case, dic_cases_unique_file[row_case], num_unique_proteins))

    for column_case in cases_list:

        if column_case == row_case:
            # diagonal of the matrix: link to all the relevant proteins of the case
            cases_overlap_matrix_report_fd.write("""<td align=center><a href="%s">%s <br>(100%%)</a></td>""" % (
                dic_cases_relevant_files[row_case], dic_cases_prots[row_case]))
            continue

        # out of the diagonal: intersection between the relevant proteins of both cases
        common_file_name = "proteins_in_common.%s_vs_%s.html" % (row_case, column_case)
        row_proteins = dic_cases_protein_sets[row_case]
        column_proteins = dic_cases_protein_sets[column_case]
        proteins_in_common = row_proteins.intersection(column_proteins)

        # percentage with respect to the case with less relevant proteins (0 if that case has no proteins)
        smallest_size = min(len(row_proteins), len(column_proteins))
        if smallest_size:
            percentage_in_common = 100*len(proteins_in_common)//smallest_size   # integer percentage
        else:
            percentage_in_common = 0

        cases_overlap_matrix_report_fd.write("""<td align=center><a href="%s">%s <br>(%s%%)</a>""" % (
            common_file_name, len(proteins_in_common), percentage_in_common))
        cases_overlap_matrix_report_fd.write("</td>\n")

        # the proteins in common go to their own file, which is the one linked from the matrix position
        row_info_dic = all_cases_info_dics[row_case]
        column_info_dic = all_cases_info_dics[column_case]
        row_case_cell = "<tr><td>%s</td>" % (row_case)
        column_case_cell = "<tr><td>%s</td>" % (column_case)

        common_file_fd = open(common_file_name, "w")
        common_file_fd.write("<br><center><b>PROTEINS IN COMMON BETWEEN %s and %s</b></center><br><br>" % (row_case, column_case))
        common_file_fd.write("%s" % (row_info_dic['header'].replace("<tr>", "<tr><td align=center><b>Case<b></td>")))

        for one_protein_in_common in proteins_in_common:
            # info of the protein in each of the two cases, adding the case name as first column
            common_file_fd.write("<tr><td>")
            common_file_fd.write("%s" % (row_info_dic[one_protein_in_common].replace("<tr>", row_case_cell)))
            common_file_fd.write("%s" % (column_info_dic[one_protein_in_common].replace("<tr>", column_case_cell)))
            common_file_fd.write("</td></tr>")

        common_file_fd.write("%s" % (row_info_dic['footer']))
        common_file_fd.close()
    # END OF for column_case in cases_list:

    cases_overlap_matrix_report_fd.write("""</tr>\n""")

# END OF for row_case in cases_list:

matched_pathways_matrix_report_fd.write("\n   </tr>\n")
cases_overlap_matrix_report_fd.write("""</table>""")


# print the proteins frequency to the cases relevant proteins report

#  - first of all, place pairs (freq, protein) in a list so we can order it


list_freq_prot = [] # contains pairs (freq, protein)
for one_protein in dic_protein_freqs:
    list_freq_prot.append( (dic_protein_freqs[one_protein], one_protein) )

list_freq_prot.sort()   # sort() and reverse() do not return the list... that's why the operations must be done on different lines
list_freq_prot.reverse()

# the header of the inner tables is the same for all proteins: it is taken from one of the cases
# (empty when there is no case: then there are no proteins to print either)
if all_cases_info_dics:
    cases_description_header = all_cases_info_dics[all_cases_info_dics.keys()[0]]['header'].replace("<tr>", "<tr><td>Case</td>")
else:
    cases_description_header = ""

# pattern that splits the info line of a protein, so that the function and description can be left empty
# (compiled only once, it is the same for all proteins)
protein_info_pattern = re.compile('(<tr><td>.*?)(<td align=center>)(.*?)(<td>)(?P<func>.*?)(<td>)(?P<desc>.*?)(</tr>)')

cases_relevant_proteins_report_fd.write("""<br><br><table border=1 style="border-collapse: collapse" bordercolor="#808080"><tr><td align=center><b>Frequency</b></td><td align=center><b>Protein</b></td><td align=center><b>Cases description</b></td></tr>""")
for one_pair in list_freq_prot:
    # one_pair is (freq, protein)
    protein_freq = one_pair[0]
    protein_name = one_pair[1]

    cases_relevant_proteins_report_fd.write("<tr>")
    cases_relevant_proteins_report_fd.write("""<td>%s</td><td>%s</td><td><table border=1 style="border-collapse: collapse" bordercolor="#808080">%s""" %(protein_freq, protein_name, cases_description_header))

    first_case_printed = 0
    for case_with_info in all_cases_info_dics:

        if protein_name not in all_cases_info_dics[case_with_info]:
            continue

        info_to_print = all_cases_info_dics[case_with_info][protein_name]

        if first_case_printed:
            # if it is not the first case, avoid printing again the protein function/description: we do not need to see it more than once
            m = protein_info_pattern.match(info_to_print)
            if m is not None:
                info_to_print = m.group(1) + m.group(2) + m.group(3) + m.group(4) + "</td>" + m.group(6) + "</td>" + m.group(8)
            # when the info line does not follow the expected layout, it is printed complete (better repeated than lost)
        else:
            # if it is the first case to be printed, we leave description and function information.
            # Update the flag so that next cases do not print description/function
            first_case_printed = 1

        cases_relevant_proteins_report_fd.write("%s" %(info_to_print.replace("<tr>", "<tr><td>%s</td>" %(case_with_info)))) # adding case name to info line
    # END OF for case_with_info in all_cases_info_dics:

    cases_relevant_proteins_report_fd.write("</table></td></tr>")

cases_relevant_proteins_report_fd.write("</table>")




# print pathway lines: one row per pathway, one cell per case with the number of matches and
# the percentage they represent over the pathway (pw) and over the case (cs)

for one_pathway in pathways_list:

    num_prots_in_pathway = dic_pathways_cases['all'][one_pathway][0]

    matched_pathways_matrix_report_fd.write("""   <tr>\n      <td><b>%s</b></td> <td align=center><a href="%s">%s</a></td> """ %(one_pathway.replace("_"," "),
                                                                                                                               dic_pathway_files[one_pathway],
                                                                                                                               num_prots_in_pathway ))

    for one_case in cases_list:

        case_pathway_info = dic_pathways_cases.get(one_case, {}).get(one_pathway)

        if case_pathway_info is None:
            # the results file of this case did not have a line for this pathway: no matches and no file to link
            num_matched = 0
            matched_file = None
        else:
            num_matched = case_pathway_info[0]
            matched_file = case_pathway_info[1]

        try:
            per_matched_over_case = 100*num_matched / float(dic_cases_prots[one_case])
        except ZeroDivisionError:
            per_matched_over_case = 0

        try:
            per_matched_over_pathway = 100*num_matched / float(num_prots_in_pathway)
        except ZeroDivisionError:
            per_matched_over_pathway = 0

        # highlight the percentage over the pathway when at least half of the pathway is matched
        if per_matched_over_pathway >= 50:
            open_tag_path = "<b><font color=FF33FF>"
            close_tag_path= "</font></b>"
        else:
            open_tag_path = ""
            close_tag_path= ""

        # highlight the percentage over the case when more than 10% of the case proteins are matched
        if per_matched_over_case > 10:
            open_tag_case = "<b><font color=00FFFF>"
            close_tag_case= "</font></b>"
        else:
            open_tag_case = ""
            close_tag_case= ""

        if matched_file is None:
            num_matched_html = "%s" %(num_matched)
        else:
            num_matched_html = """<a href="%s">%s</a>""" %(matched_file, num_matched)

        matched_pathways_matrix_report_fd.write(""" <td align=center>%s<br>%s(%.0f%% pw)%s<br>%s(%.0f%% cs)%s </td>""" %( num_matched_html,
                                                                                                                         open_tag_path, per_matched_over_pathway, close_tag_path,
                                                                                                                         open_tag_case, per_matched_over_case, close_tag_case))
    # END OF for one_case in cases_list:

    matched_pathways_matrix_report_fd.write("\n   </tr>\n")
# END OF for one_pathway in pathways_list:


matched_pathways_matrix_report_fd.write("</table>\n")


matched_pathways_matrix_report_fd.close()
global_report_fd.close()
cases_overlap_matrix_report_fd.close()
cases_relevant_proteins_report_fd.close()
