"""
File        : parse_gds.py
Author      : Ramon Aragues
Creation    : 08.05.06 (in NY - MSKCC)
Contents    : parses a GDS SOFT file and produces pairs of highly correlated genes
Called from : command line

=======================================================================================================



"""

# parse_gds.py: parses a GDS SOFT file and produces pairs of highly correlated genes
#
# license goes here

import sys
import getopt
import stats
import math
import copy


# set to 1 by parseArguments() when --verbose is given on the command line
verbose = 0
# gates the detailed per-line / per-pair debug messages of the main script;
# no command-line option changes it, so it has to be edited here by hand
verbose_detailed = 0

# ----------------------
# Function usage()
# ----------------------
def usage():
    """Print the command-line help text of parse_gds.py to stdout.

    Takes no arguments and returns None. Every option is shown exactly as
    getopt expects it in parseArguments() (i.e. with its leading "--").
    """
    # Each message is emitted with a single-argument print call: that form
    # produces the same output whether print is a statement (Python 2) or a
    # function (Python 3).
    print("\n--------------------------------------------------------------------------------------------------------------")
    print(" Parses a GDS SOFT file and produces pairs of highly correlated pairs\n")
    print("\nUsage: parse_gds.py: --input-file=input_file --r-thres=r_thres --r-pvalue-thres=r_pvalue_thres")
    print("         --correferences-file=correferences_file --signal-pvalue-thres=signal_pvalue_thres [--help] [--verbose] \n")
    print("\nwhere:")
    print("     input_file         : name of the file in GDS SOFT format")
    print("     r_thres            : threshold that will be used to consider two genes as highly correlated (0-1) (absolute value will be used for anticorrelation)")
    print("                          -> pearson coefficients lower than this threshold will not be considered as an indication of correlation")
    print("     r_pvalue_thres  : if the r obtained for a pair has an associated pvalue higher than this threshold, correlation will be ignored regardless of the r value")
    print("     correferences_file : file that contains correferences between the IDs in the GDS file and the ID type you want to use for output")
    print("                            -> format for this file is X  ")
    print("                            -> if no file is set, ID from GDS SOFT file will be used ")
    print("     signal_pvalue_thres : maximum p_value accepted for considering a signal value as 'valid' ")
    print("     --help       : prints this message and exits")
    print("     --verbose    : prints process info to stdout")
    print("--------------------------------------------------------------------------------------------------------------")
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """Parse sys.argv[1:] and store the option values in module-level globals.

    Recognised long options: --input-file, --r-thres, --r-pvalue-thres,
    --signal-pvalue-thres, --correferences-file, --verbose and --help.
    The three thresholds are converted to float.

    Side effects:
        Assigns the globals input_file, correferences_file, r_thres,
        r_pvalue_thres, signal_pvalue_thres and verbose.  The first five are
        expected to be initialised (to None) by the caller before this
        function runs, because they are read back for validation.

    Exits:
        sys.exit(2) after printing usage() when an unknown option is given
        or when --help is requested.

    Raises:
        ValueError: if the input file or any of the three thresholds was not
            supplied (also raised by float() for a non-numeric threshold).
    """

    global input_file
    global correferences_file
    global r_thres
    global r_pvalue_thres
    global signal_pvalue_thres

    global verbose

    long_options = ["verbose", "help", "input-file=", "r-thres=", "r-pvalue-thres=",
                    "signal-pvalue-thres=", "correferences-file="]

    try:
        opts, args = getopt.getopt(sys.argv[1:], "", long_options)
    except getopt.GetoptError:
        # sys.exc_info() is used instead of "except X, e" / "except X as e"
        # so that this handler is valid on every Python version
        bad_opt = sys.exc_info()[1]
        # print the problem (on its own line), then help information, and exit
        sys.stderr.write("%s\n" % (bad_opt))
        usage()
        sys.exit(2)

    for option, value in opts:

        if option == "--input-file":
            input_file = value

        elif option == "--r-thres":
            r_thres = float(value)

        elif option == "--r-pvalue-thres":
            r_pvalue_thres = float(value)

        elif option == "--signal-pvalue-thres":
            signal_pvalue_thres = float(value)

        elif option == "--correferences-file":
            correferences_file = value

        elif option == "--verbose":
            verbose = 1

        elif option == "--help":
            # print help information and exit
            usage()
            sys.exit(2)

    # check arguments: correferences_file is the only optional value
    if input_file is None:
        raise ValueError("trying to run the program without giving an input file")

    if r_thres is None:
        raise ValueError("trying to run the program without giving a r_thres")

    if r_pvalue_thres is None:
        raise ValueError("trying to run the program without giving a r_pvalue_thres")

    if signal_pvalue_thres is None:
        raise ValueError("trying to run the program without giving a signal_pvalue_thres")


# ---------------------------
# Helpers used by Main()
# ---------------------------

# a Pearson coefficient (and its p-value) needs at least 3 paired points:
# with fewer the degrees of freedom (n - 2) are not positive
_MIN_SHARED_EXPERIMENTS = 3


def _parse_float(text):
    """Return text converted to float, or None if it is not a valid number."""
    try:
        return float(text)
    except ValueError:
        return None


def _valid_log_signal(signal, pvalue, pvalue_thres):
    """Return log(signal) if the measurement is usable, otherwise None.

    A measurement is usable when both numbers could be parsed, the p-value is
    lower than pvalue_thres and the signal is positive (the logarithm is not
    defined otherwise).  NaN values fail both comparisons and are rejected.
    """
    if signal is None or pvalue is None:
        return None
    if pvalue < pvalue_thres and signal > 0:
        return math.log(signal)
    return None


def _shared_valid_values(values_a, values_b):
    """Return the two vectors restricted to positions where neither is None."""
    kept_a = []
    kept_b = []
    for value_a, value_b in zip(values_a, values_b):
        if value_a is not None and value_b is not None:
            kept_a.append(value_a)
            kept_b.append(value_b)
    return kept_a, kept_b


# --------
# --------
#  Main()
# --------
# --------
input_file = None
correferences_file = None
r_thres = None
r_pvalue_thres = None
signal_pvalue_thres = None

# parsing arguments from the command line
parseArguments()

first_line = 1     # used to treat first line differently
num_bad_lines = 0
number_of_pairs = 0    # NOTE: despite its name (and the final message) this counts
                       # the gene lines accepted from the input file, not gene pairs

dic_gene_expression = {}   # dictionary that keeps the (log of) expression for all genes in the input file
                           #   format is:    { gene_1: [log(expression1), log(expression2), ...]
                           #                 { gene_2: [log(expression1), log(expression2), ...]
                           #
                           # the different positions in the list for each gene correspond to the
                           # expression under different experiments (tissues, samples, conditions, ...)
                           #   -> the headers for all positions will be retrieved from the file and kept
                           #      on list headers_vector
                           #   -> a position holds None when the signal of that experiment was rejected
                           #      (p-value too high, non-positive signal or unparsable field)


gds_handle = open(input_file, "r")
try:
    for one_line in gds_handle:

        # drop the line terminator so it does not end up inside the last field
        line_fields = one_line.rstrip("\r\n").split("\t")

        if first_line == 1:
            # this is the first line: get the headers for the experiments

            headers_vector = line_fields
            # -2 because last field is the description of the gene, not an experiment
            # and first field is empty (line starts with a TAB)
            # (floor division keeps this an integer usable by range())
            number_experiments = (len(headers_vector) - 2) // 3
            if verbose_detailed:
                print("number of experiments is %s\n" % (number_experiments))
            first_line = 0
            continue

        # protect against bad lines: a good line is an identifier followed by repetitions
        # of three fields: signal, tag, pvalue (and ending with a description field).
        # It must also be long enough to hold every experiment announced by the header.
        if (len(line_fields) - 2) % 3 != 0 or len(line_fields) < number_experiments * 3 + 1:
            num_bad_lines += 1
            if verbose_detailed:
                print("bad line: >>\n%s\n" % (one_line))
            continue

        number_of_pairs += 1

        # line_fields[0] is the probe id (or the gene name)
        # line_fields[1] is the signal value on experiment 1
        # line_fields[2] is the annotation for experiment 1
        # line_fields[3] is the p-value for experiment 1
        # line_fields[4] is the signal value on experiment 2
        # line_fields[5] is the annotation for experiment 2
        # line_fields[6] is the p-value for experiment 2
        # .................................................
        #
        current_id = line_fields[0]
        dic_gene_expression[current_id] = []

        for i in range(number_experiments):
            # one iteration for each experiment: in each iteration we will read
            # the three values associated to that experiment
            current_exp_signal = _parse_float(line_fields[i*3 + 1])
            current_exp_tag = line_fields[i*3 + 2]
            current_exp_pvalue = _parse_float(line_fields[i*3 + 3])

            if verbose_detailed:
                sys.stderr.write("gene %s has for experiment <%s> a signal of <%s> with tag <%s> and pvalue <%s>\n"
                                 % (current_id,
                                    headers_vector[i*3 + 1],
                                    current_exp_signal,
                                    current_exp_tag,
                                    current_exp_pvalue))

            # only accepting those signals whose pvalue is lower than the threshold set by user;
            # rejected signals are stored as None (not 0, which would be taken as a real
            # measurement) so that they can be left out of the pearson coefficient
            dic_gene_expression[current_id].append(
                _valid_log_signal(current_exp_signal, current_exp_pvalue, signal_pvalue_thres))

        # END OF for i in range(number_experiments):

    # END OF for one_line in gds_handle:
finally:
    gds_handle.close()


# now that we have the log(signal) for each gene under each experiment, calculate correlation for each pair of genes

list_genes = list(dic_gene_expression.keys())
number_genes = len(list_genes)

for i in range(number_genes):
    for j in range(i+1, number_genes):
        # calculate correlation for each pair, using only the experiments
        # in which both genes have a valid signal
        (values_i, values_j) = _shared_valid_values(dic_gene_expression[list_genes[i]],
                                                    dic_gene_expression[list_genes[j]])

        if len(values_i) < _MIN_SHARED_EXPERIMENTS:
            # not enough shared measurements to say anything about this pair
            continue

        try:
            # stats is an external module: presumably returns (r, two-tailed p-value) -- TODO confirm
            (correlation_r, correlation_pvalue) = stats.pearsonr(values_i, values_j)
        except (ArithmeticError, ValueError):
            # typically a division by zero because one of the vectors is constant:
            # no correlation can be computed, so the pair is not reported
            continue

        if verbose_detailed:
            sys.stderr.write("correlation for pair %s -- %s is %s\n" % (list_genes[i], list_genes[j], correlation_r))

        # absolute value: strongly anticorrelated pairs are reported as well (see usage())
        if abs(correlation_r) > r_thres and correlation_pvalue < r_pvalue_thres:
            sys.stdout.write("%s\t%s\t%s\t%s\n" % (list_genes[i], list_genes[j], correlation_r, correlation_pvalue))

# END OF for i in range(number_genes):

sys.stderr.write("Num pairs processed: %s\n" % (number_of_pairs))
sys.stderr.write("Num bad lines: %s\n" % (num_bad_lines))
