"""
File        : parse_cosmic_csv.py
Author      : Ramon Aragues
Creation    : 08.05.06 (in NY - MSKCC)
Contents    : parses COSMIC CSV files and writes, for each primary tissue and each histology, a file listing the associated gene names
Called from : command line

=======================================================================================================



"""

# parse_cosmic_csv.py: parses a COSMIC CSV  file 
#
# license goes here

import sys
import getopt
import stats
import math
import copy
import glob


verbose = 0
verbose_detailed = 0
create_files = 1


# ----------------------
# Function usage()
# ----------------------
def usage():
    """Write the command-line help text to stdout.

    Each entry of help_lines is emitted followed by a newline, which is
    exactly what one "print" statement per entry produces. Using
    sys.stdout.write instead of the print statement keeps this function
    valid under both Python 2 and Python 3.
    """
    help_lines = (
        "\n--------------------------------------------------------------------------------------------------------------",
        " Parses a COSMIC CSV file \n",
        "\nUsage: parse_cosmic_csv.py: --input-dir=input_dir",
        "          [--help] [--verbose] \n",
        "\nwhere:",
        "     input_dir    : directory that contains the COSMIC CSV  format files (do not write the ending slash of the directory!!!!)",
        "     --help       : prints this message and exits",
        "     --verbose    : prints process info to stdout",
        "--------------------------------------------------------------------------------------------------------------",
    )

    for help_line in help_lines:
        sys.stdout.write(help_line + "\n")
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """Parse sys.argv and store the results in module-level globals.

    Recognised long options:
        --input-dir=DIR  stored in the global input_dir
        --verbose        sets the global verbose to 1
        --help           prints the usage text and exits

    Positional arguments returned by getopt are ignored.

    Raises:
        ValueError: if input_dir is still None after all options were read
                    (i.e. --input-dir was not given).
        SystemExit: with status 2 on an unknown/malformed option, and also
                    on --help.
                    NOTE(review): exiting with 2 on --help is kept as found;
                    0 would be the usual convention - confirm before changing.
    """
    global input_dir
    global verbose

    try:
        opts, args = getopt.getopt(sys.argv[1:], "", ["verbose", "help", "input-dir="])
    except getopt.GetoptError as bad_opt:
        # report the problem on its own line, then show the help text and exit
        sys.stderr.write("%s\n" % (bad_opt))
        usage()
        sys.exit(2)

    for option, value in opts:

        if option == "--input-dir":
            input_dir = value

        elif option == "--verbose":
            verbose = 1

        elif option == "--help":
            # print help information and exit
            usage()
            sys.exit(2)

    # check arguments
    if input_dir is None:
        raise ValueError("trying to run the program without giving an input directory (--input-dir)")



# --------
# --------
#  Main()               
# --------                               
# --------
# input_dir must exist (as None) before parseArguments() runs: the function
# declares it global, assigns it only when --input-dir is given, and then
# tests it against None.
input_dir = None
parseArguments()


# Features collected for every gene that has a CSV file in the input dir.
# Each list receives one entry per sample row of the gene's file, so the six
# lists of a gene have the same length and may contain repeated values.
#
#   { gene_1: { 'primary_tissue':     [tissue1, tissue2, ...],
#               'tissue_subtype_1':   [subtype 1, subtype 2, ...],
#               'tissue_subtype_2':   [subtype 1, subtype 2, ...],
#               'histology':          [histology1, histology2, ...],
#               'histology_subtype1': [subtype 1, subtype 2, ...],
#               'histology_subtype2': [subtype 1, subtype 2, ...] },
#     gene_2: { 'primary_tissue':     [tissue1, tissue2, ...],
#               ............. },
#   }
dic_gene_features = dict()

# Genes seen for each primary tissue. The inner dictionaries are used as
# sets: only their keys matter, every value is None.
#
#   { tissue_type1: {gene1: None, gene2: None, gene3: None, ...},
#     tissue_type2: {gene1: None, gene2: None, gene3: None, ...},
#     ..............
#   }
dic_tissue_genes = dict()

# Genes seen for each histology, same layout as dic_tissue_genes.
#
#   { histology1: {gene1: None, gene2: None, gene3: None, ...},
#     histology2: {gene1: None, gene2: None, gene3: None, ...},
#     ..............
#   }
dic_histology_genes = dict()



num_file = 0   # number of files processed so far; only counted (and reported) in verbose mode

for file_name in glob.glob(input_dir + "/*.csv"):

    if verbose:
        num_file += 1
        sys.stderr.write("file_%s." %(num_file))

    # Each file looks like this:
    # (numbers are there to indicate line number, they do not appear in the actual files)

    # 1
    # 2  Gene : AATK
    # 3  Primary Tissue : None Selected
    # 4  Tissue subtype 1 : None Selected
    # 5  Histology : None Selected
    # 6  Histology subtype 1 : None Selected
    # 7
    # 8  Sample Name     COSMIC Sample ID        Amino Acid      Nucleotide      Primary Tissue  Tissue subtype 1        Tissue subtype 2        Histology       Histology subtype 1     Histology subtype 2          Author  Journal Volume  Year    Pages   Pubmed ID   Mutation ID
    # 9  NCI-H2087       724834  p.L97V  c.289C>G        lung    NS      NS      carcinoma       adenocarcinoma  NS      Davies  Cancer_Res      65      2005    7591-7595       16140923        12781
    # 10 NCI-H2087       724834  p.L97V  c.289C>G        lung    NS      NS      carcinoma       adenocarcinoma  NS      Wooster Cancer_Res      65      2005    7591-7595       16140923        12781
    # 11.................

    # the "with" block closes the file even when a line fails to parse
    with open(file_name, "r") as csv_fd:

        for line_number, one_line in enumerate(csv_fd, 1):

            if line_number == 2:
                # get the gene name from line two
                line_fields = one_line.split(" ")  # split using spaces...
                current_gene_name = line_fields[2].strip()

                # (re)start the feature lists for this gene
                dic_gene_features[current_gene_name] = {'primary_tissue': [],
                                                        'tissue_subtype_1': [],
                                                        'tissue_subtype_2': [],
                                                        'histology': [],
                                                        'histology_subtype1': [],
                                                        'histology_subtype2': []  }

            elif line_number > 8:
                # this is a line with info for a sample and description of cancer related to it for this gene
                # (lines 1 and 3 to 8 carry nothing to be extracted and are ignored)

                if not one_line.strip():
                    # blank line (e.g. at the end of the file): nothing to extract
                    continue

                # drop the line terminator so it cannot end up inside the last field
                line_fields = one_line.rstrip("\r\n").split("\t")

                if len(line_fields) < 10:
                    # fields [4] to [9] are needed below; report and skip
                    # truncated rows instead of failing with an IndexError
                    sys.stderr.write("WARNING: %s line %s has %s tab-separated fields (at least 10 expected): skipped\n"
                                     %(file_name, line_number, len(line_fields)))
                    continue

                # fields are (* if extraction desired):
                #      [0] -->  Sample Name
                #      [1] -->  COSMIC Sample ID
                #      [2] -->  Amino Acid
                #      [3] -->  Nucleotide
                #      [4] -->  Primary Tissue      (*)
                #      [5] -->  Tissue subtype 1    (*)
                #      [6] -->  Tissue subtype 2    (*)
                #      [7] -->  Histology           (*)
                #      [8] -->  Histology subtype 1 (*)
                #      [9] -->  Histology subtype 2 (*)
                #      [10] --> Author
                #      [11] --> Journal
                #      [12] --> Volume
                #      [13] --> Year
                #      [14] --> Pages
                #      [15] --> Pubmed ID
                #      [16] --> Mutation ID
                primary_tissue = line_fields[4]
                histology = line_fields[7]

                # populate the gene features dictionary
                current_features = dic_gene_features[current_gene_name]
                current_features['primary_tissue'].append(primary_tissue)
                current_features['tissue_subtype_1'].append(line_fields[5])
                current_features['tissue_subtype_2'].append(line_fields[6])
                current_features['histology'].append(histology)
                current_features['histology_subtype1'].append(line_fields[8])
                current_features['histology_subtype2'].append(line_fields[9])

                # populate the tissue dictionary with gene lists
                # (inner dictionaries are used as sets: gene name -> None)
                dic_tissue_genes.setdefault(primary_tissue, {})[current_gene_name] = None

                # populate the histology dictionary with gene lists
                dic_histology_genes.setdefault(histology, {})[current_gene_name] = None

            # END OF elif line_number > 8:

        # END OF for line_number, one_line in enumerate(csv_fd, 1):

    # END OF with open(file_name, "r") as csv_fd:

# END OF for file_name in glob.glob(input_dir + "/*.csv"):

# Verbose report: what was collected, written to stdout.
if verbose:
    if verbose_detailed:
        # one section per gene, listing the six feature lists of THAT gene
        # (looked up by the loop variable, not by the last gene parsed)
        for gene_name in dic_gene_features:
            gene_features = dic_gene_features[gene_name]
            sys.stdout.write("============================================================================\n")
            sys.stdout.write("gene=%s\n" %(gene_name))
            # the printed labels are identical to the dictionary keys
            for feature_name in ('primary_tissue',
                                 'tissue_subtype_1',
                                 'tissue_subtype_2',
                                 'histology',
                                 'histology_subtype1',
                                 'histology_subtype2'):
                sys.stdout.write("%s=%s\n" %(feature_name, gene_features[feature_name]))
            sys.stdout.write("============================================================================\n")


    # list(...) keeps the gene names printed as a plain list
    for tissue in dic_tissue_genes:
        sys.stdout.write("tissue=%s\tnum_genes=%s\tgenes=%s\n" %(tissue, len(dic_tissue_genes[tissue]), list(dic_tissue_genes[tissue])))

    for histology in dic_histology_genes:
        sys.stdout.write("histology=%s\tnum_genes=%s\tgenes=%s\n" %(histology, len(dic_histology_genes[histology]), list(dic_histology_genes[histology])))


# Output: one text file per primary tissue and one per histology, created in
# the current working directory, with one gene name per line.
if create_files:

    # all tissue files are written first, then all histology files
    for file_prefix, dic_category_genes in (("tissue_", dic_tissue_genes),
                                            ("histology_", dic_histology_genes)):

        for category in dic_category_genes:
            # parentheses in the tissue/histology name are replaced by
            # underscores in the file name
            new_file_name = file_prefix + category.replace("(","_").replace(")","_") + ".gene_names.txt"

            # the "with" block closes the file even if a write fails
            with open(new_file_name, "w") as new_file_fd:
                for one_gene in dic_category_genes[category]:
                    new_file_fd.write("%s\n" %(one_gene))
