"""
File        : parse_cir_grouped_proteins.py
Author      : Ramon Aragues
Creation    : 6.06.06 (in NY - MSKCC)
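Contents    : parses files *thres_X.grouped_proteins produced by PIANA command cir
              and writes one tab-separated line per protein pair to stdout
              (range, external code of each protein, number of interactions of
              each protein, number of common interactions, common_cir as yes/no,
              description of each protein)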
              
Called from : command line

=======================================================================================================


"""

# parse_cir_grouped_proteins.py: parses files *thres_X.grouped_proteins
#
# license goes here

import sys
import getopt
import math
import copy
import glob
import readline
import cPickle

import utilities

from PianaDBaccess import *

# NOTE(review): format_mode is not referenced anywhere else in this script as
# shown; presumably a leftover output-format selector -- confirm before removing.
format_mode = "html"

# When non-zero, the main loop reports every parsed file on stderr.
# It is already 1 here, so passing --verbose does not change anything.
verbose = 1
# NOTE(review): verbose_detailed is not referenced anywhere else in this script as shown.
verbose_detailed = 0


# ----------------------
# Function usage()
# ----------------------
def usage():
    """Write the command-line help of this script to stdout.

    The text lists every long option accepted by parseArguments() (the
    original usage line only mentioned --input-dir) and describes the
    tab-separated output written by the main loop.

    Uses sys.stdout.write instead of the print statement so that the
    function behaves the same under Python 2 and Python 3.
    """
    separator = "--------------------------------------------------------------------------------------------------------------"

    help_lines = [
        "",
        separator,
        " parses files *thres_X.grouped_proteins produced by PIANA command cir",
        " and writes one tab-separated line per protein pair to stdout:",
        "   range, external code of protein 1 and 2, num_ints of protein 1 and 2,",
        "   num_ints_common, common_cir (yes/no), description of protein 1 and 2",
        "",
        "Usage: parse_cir_grouped_proteins.py --input-dir=input_dir --threshold=threshold --metric=metric",
        "           --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost",
        "           [--piana-dbuser=piana_dbuser] [--piana-dbpass=piana_dbpass] [--help] [--verbose]",
        "",
        "where:",
        "     input_dir    : directory that contains the txt files from the CIR command (do not write the ending slash of the directory!!!!)",
        "     threshold    : hub threshold that  you want to use (the 'X' in thres_X.grouped_proteins)",
        "     metric       : the metric for which you want to parse the files (min_per, max_per, combined_per or num_ints)",
        "     piana_dbname : name of database piana to be used (required)",
        "     piana_dbhost : name of host where database piana to be used is placed (required)",
        "     piana_dbuser : username accessing the database (not required in most systems)",
        "     piana_dbpass : password of username accessing the database (not required in most systems)",
        "     --help       : prints this message and exits",
        # progress messages are written with sys.stderr.write by the main loop
        "     --verbose    : prints process info to stderr",
        separator,
    ]

    sys.stdout.write("\n".join(help_lines) + "\n")
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments(argv=None):
    """Parse the command-line options into the module-level settings.

    Parameters:
        argv: list of argument strings to parse. Defaults to sys.argv[1:],
              which is what the original zero-argument call did.

    Side effects:
        Assigns the globals input_dir, threshold, metric, piana_dbname,
        piana_dbhost, piana_dbuser, piana_dbpass and verbose for every
        option that is present in argv. Globals whose option is absent are
        left untouched, so the caller must initialise input_dir, threshold
        and metric before calling (they are read by the final checks).

    Raises:
        ValueError: if --threshold is not an integer, or if input_dir,
                    threshold or metric is still None after parsing.
        SystemExit: with status 2 after printing the usage text, on an
                    unknown/malformed option or when --help is given.
    """
    global input_dir
    global threshold
    global metric

    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass

    global verbose

    if argv is None:
        argv = sys.argv[1:]

    # "output-proteins-type=" is accepted but ignored below; it is kept so
    # that existing command lines passing it keep working.
    long_options = ["verbose", "help",
                    "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass=",
                    "input-dir=", "threshold=", "metric=", "output-proteins-type="]

    try:
        # no short options are declared, so only "--long" names can come back
        opts, args = getopt.getopt(argv, "", long_options)
    except getopt.GetoptError as bad_opt:
        # print the problem and the help information, then exit
        sys.stderr.write(str(bad_opt) + "\n")
        usage()
        sys.exit(2)

    for option, value in opts:

        if option == "--input-dir":
            input_dir = value

        elif option == "--threshold":
            try:
                threshold = int(value)
            except ValueError:
                raise ValueError("--threshold must be an integer, got '%s'" % value)

        elif option == "--metric":
            metric = value

        elif option == "--piana-dbname":
            piana_dbname = value

        elif option == "--piana-dbhost":
            piana_dbhost = value

        elif option == "--piana-dbuser":
            piana_dbuser = value

        elif option == "--piana-dbpass":
            piana_dbpass = value

        elif option == "--verbose":
            verbose = 1

        elif option == "--help":
            # print help information and exit
            usage()
            sys.exit(2)
    # END OF for option, value in opts:

    # check arguments
    if input_dir is None:
        raise ValueError("trying to run the program without giving an input dir")

    # NOTE(review): the script initialises threshold to 0 before calling this
    # function, so in that flow this check does not fire -- confirm whether
    # --threshold is meant to be mandatory.
    if threshold is None:
        raise ValueError("trying to run the program without giving a threshold")

    if metric is None:
        raise ValueError("trying to run the program without giving a metric")

# --------
# --------
#  Main()               
# --------                               
# --------
# Defaults for the settings filled in by parseArguments().
input_dir = None
# NOTE(review): threshold defaults to 0 rather than None, so the "missing
# threshold" check in parseArguments() never fires; confirm whether
# --threshold is meant to be mandatory before changing this default.
threshold = 0
metric = None

piana_dbname = None
piana_dbuser = None
piana_dbhost = None
piana_dbpass = None


def _field_value(field):
    """Return the value part of a 'name=value' field of a grouped_proteins line."""
    return field.split("=")[1]


def _lookup_protein(piana_access, proteinPiana):
    """Return (unientry code, description) of a proteinPiana, querying piana_access.

    The description falls back to " " when it cannot be obtained; this is a
    deliberate best-effort so that one missing description does not stop the run.
    """
    unientry = piana_access.get_list_protein_external_codes(proteinPiana=proteinPiana,
                                                            protein_type_name="unientry",
                                                            alternative_type_names=["md5"],
                                                            answer_mode="single")
    try:
        description = piana_access.get_protein_description(proteinPiana_value=proteinPiana)[0]
    except Exception:
        # narrower than a bare except: Ctrl-C and sys.exit() still propagate
        description = " "

    return unientry, description


# parsing arguments from the command line
parseArguments()

# Initialising connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost=piana_dbhost, dbuser=piana_dbuser, dbpassword=piana_dbpass)

num_file = 0

# only the files for the requested metric and threshold are parsed
files_pattern = input_dir + "/*%s*.thres_%s.grouped_proteins" % (metric, threshold)

for file_name in glob.glob(files_pattern):

    if verbose:
        num_file += 1
        sys.stderr.write("Parsing file %s: %s\n" % (num_file, file_name))

    # "with" guarantees the file is closed even if a line fails to parse
    with open(file_name, "r") as grouped_proteins_file:

        for one_line in grouped_proteins_file:

            # Each line looks like this:
            #
            #  range=11.20     protein1=26631  protein2=91433  num_ints1=21.0  num_ints2=27.0  num_ints_common=3.0     common_cir=0

            line_fields = one_line.split()

            if not line_fields:
                # blank line (e.g. at the end of the file): nothing to report
                continue

            # named range_value so that the builtin range() is not shadowed
            range_value = _field_value(line_fields[0])
            proteinPiana_1 = int(_field_value(line_fields[1]))
            proteinPiana_2 = int(_field_value(line_fields[2]))
            # counts are written as floats ("21.0") in the file
            num_ints_1 = int(float(_field_value(line_fields[3])))
            num_ints_2 = int(float(_field_value(line_fields[4])))
            num_ints_common = int(float(_field_value(line_fields[5])))

            # the common_cir flag is reported as yes/no
            if _field_value(line_fields[6]) == "1":
                same_scop = "yes"
            else:
                same_scop = "no"

            p1_unientry, p1_description = _lookup_protein(piana_access, proteinPiana_1)
            p2_unientry, p2_description = _lookup_protein(piana_access, proteinPiana_2)

            sys.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (range_value, p1_unientry, p2_unientry,
                                                                         num_ints_1, num_ints_2, num_ints_common,
                                                                         same_scop, p1_description, p2_description))

        # END OF for one_line in grouped_proteins_file:
# END OF for file_name in glob.glob(files_pattern):
	

