"""
 File       : parse_linkers.py
 Author     : R. Aragues
 Creation   : 12.07.2005
 Contents   : class to 
 Called from: 

=======================================================================================================

This script parses a .print-connect-prots-info file and outputs those proteins that are real linkers


Known issue (TODO: fix it at the source): the .print-connect-prots-info output is not completely correct.

Why?

- in that file, a protein is said to be a linker even when the two protein pianas it joins have the same external code

This program checks whether those codes are the same, and only prints the linker if the external codes are different


If desired, this script also outputs information about GO terms (biological process, molecular function and cellular component).
"""

import sys
import getopt

import re
import readline

import sets

import GOApi

from PianaApi import *



# Set here the list of keywords you want to use for detecting proteins that are related to your problem of interest
# (the list is hard-coded: there is no command line option for it, see usage())
# Example:
#KEYWORDS_TO_USE = ["tumor", "onco", "cancer", "apoptosis", "death", "proliferation", "carcinoma"]
KEYWORDS_TO_USE = []

# Two characters (a backslash followed by 'n'), NOT a real newline: in text mode it is inserted
# between the protein names and the "MF: ", "BP: " and "CC: " sections of a label
SEPARATOR = "\\n"

# default verbosity; parseArguments() sets it to 1 when --verbose is given
verbose = 0


# ----------------------
# Function usage()
# ----------------------
def usage():
    """
    Prints the command line help of this script to stdout.

    The text describes the long options accepted by parseArguments(). It is written
    with a single sys.stdout.write() call so that the function behaves the same under
    Python 2 and Python 3.
    """
    rule = "--------------------------------------------------------------------------------------------------------------"

    help_lines = [
        rule,
        "Usage: python parse_linkers.py --input-file=input_file --input-proteins-type=input_proteins_type",
        "                           --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass",
        "                           --go-dbname=go_dbname --go-dbhost=go_dbhost --go-dbuser=go_dbuser --go-dbpass=go_dbpass",
        "                           --print-go-info --go-level=go_level   --output-format=output_format ",
        "                           --results-prefix=results_prefix [--label-size=label_size] [--help] [--verbose]",
        "\nwhere:",
        "     input_file   : output file *.print-connect-prots-info",
        "     input_proteins_type   : type of code used in input_file",
        "     piana_dbname : name of database piana to be used (required)",
        "     piana_dbhost : name of host where database piana to be used is placed (required)",
        "     piana_dbuser : username accessing the database (not required in most systems)",
        "     piana_dbpass : password of username accessing the database (not required in most systems)",
        "     --print-go-info: prints go information related to linkers and root proteins ",
        "     go_level     : sets which level of go will be used for printing information ",
        "                     -> if go_level== -1, then prints directly the level that was assigned to the protein",
        "     go_dbname    : name of the go mysql database (required)",
        "     go_dbhost    : name of the machine with go mysql server (required)",
        "     go_dbuser    : name of the mysql go username (not required in most systems)",
        # this line used to repeat the go_dbuser description by mistake
        "     go_dbpass    : password of the mysql go username (not required in most systems)",
        "     results_prefix: prefix that will identify the results of this script",
        "     output_format: format desired for the output",
        "                       - 'html': output written to a table in html format (this prints html table, text info files, dot graph)",
        "                       - 'text': output written in text format (this prints just a short text description)",
        "     label_size   : in mode html, determines the size of the label in the dot file",
        "                       - 'all': prints everything (name, mf, bp, cc)",
        "                       - '0': prints only the name of the protein",
        "                       - 'mf': prints name and mf",
        "                       - 'bp': prints name and bp",
        "                       - 'cc': prints name and cc",
        "     --help       : prints this message and exits",
        "     --verbose    : prints process info to stdout",
        rule,
        "Attention! keywords are hard-coded in this script: you need to edit list KEYWORDS_TO_USE ",
        rule,
    ]

    sys.stdout.write("\n".join(help_lines) + "\n")
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """
    Parses sys.argv[1:] and stores the values found in module-level globals.

    Only long options are accepted (no short options are declared to getopt).
    Each "--name=value" option is stored as a string in the global with the
    matching name, except --go-level, which is converted with int(). The flags
    --print-go-info and --verbose set print_go_info and verbose to 1.

    Exits with status 2 (after printing usage()) when the command line cannot be
    parsed or when --help is given.

    Raises ValueError when, after parsing:
      - input_file or input_proteins_type is None
      - print_go_info is set and go_dbname, go_dbhost or go_level is None
      - output_format is None
      - results_prefix is None

    The validation reads the globals, so they must already exist (with their
    default values) before this function is called.
    """
    global input_file
    global input_proteins_type

    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass

    global go_dbname
    global go_dbhost
    global go_dbuser
    global go_dbpass

    global print_go_info
    global go_level

    global output_format
    global label_size

    global results_prefix

    global verbose

    try:
        opts, args = getopt.getopt(sys.argv[1:], "", ["verbose","help","print-go-info","go-level=","input-file=", "input-proteins-type=",
                                                      "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass=","go-dbname=",
                                                      "go-dbuser=","go-dbhost=","go-dbpass=" ,"output-format=","results-prefix=","label-size=" ])
    except getopt.GetoptError:
        # sys.exc_info() is used instead of "except X, e" / "except X as e" so that
        # this block is valid syntax for every Python version
        error = sys.exc_info()[1]
        sys.stderr.write("Command line incorrect: %s\n" % error)
        usage()
        sys.exit(2)

    for option, value in opts:

        if option == "--input-file":
            input_file = value

        elif option == "--input-proteins-type":
            input_proteins_type = value

        elif option == "--piana-dbname":
            piana_dbname = value

        elif option == "--piana-dbhost":
            piana_dbhost = value

        elif option == "--piana-dbuser":
            piana_dbuser = value

        elif option == "--piana-dbpass":
            piana_dbpass = value

        elif option == "--go-dbhost":
            go_dbhost = value

        elif option == "--go-dbname":
            go_dbname = value

        elif option == "--go-dbuser":
            go_dbuser = value

        elif option == "--go-dbpass":
            go_dbpass = value

        elif option == "--print-go-info":
            print_go_info = 1

        elif option == "--go-level":
            # int() raises ValueError if the level is not an integer
            go_level = int(value)

        elif option == "--output-format":
            output_format = value

        elif option == "--label-size":
            label_size = value

        elif option == "--results-prefix":
            results_prefix = value

        elif option == "--verbose":
            verbose = 1

        elif option == "--help":
            # print help information and exit
            usage()
            sys.exit(2)
    # END OF for option, value in opts:

    if input_file is None or input_proteins_type is None:
        raise ValueError("input_file and input_proteins_type cannot be None")

    if print_go_info:

        if go_dbname is None or go_dbhost is None or go_level is None:
            raise ValueError("if go info required, go_dbname, go_dbhost and go_level cannot be None")

    if output_format is None:
        raise ValueError("you should set an output format")

    # NOTE(review): this only fires if results_prefix is None when the function is
    # called; an empty string is accepted as a valid prefix
    if results_prefix is None:
        raise ValueError("you should give a results prefix")
        


def get_protein_go_name( protein_code, protein_code_type, go_cursor, term_type, go_level):
    """
    returns a list with go names of "term_type" at level "go_level" for a given protein "protein_code" of type "protein_code_type"

    "go_cursor" is handed to GOApi.get_parents_at_level_N (as its "gocursor" argument); it is only
    used when go_level is not -1

    if go_level is -1, then the names of the go terms directly associated to the protein are returned

    The list returned has no duplicates. It is empty when no go term (or no parent at the
    level requested) was found.

    Uses the global piana_api to access the piana database.
    """
    global piana_api

    piana_access = piana_api.piana_access

    # -
    # get proteinPianas for the protein of interest
    # -
    list_proteinPianas = piana_access.get_list_protein_piana(proteinCode_value= protein_code,
                                                             proteinCodeType_value= utilities.get_code_column(protein_code_type),
                                                             tax_id_value= 0 )

    # -
    # get go terms for this protein (for all its proteinPianas), removing duplicates using a dictionary
    # -
    dic_go_terms = {}
    for proteinPiana in list_proteinPianas:
        for go_term in piana_access.get_protein_go_term_id(proteinPiana_value=proteinPiana,
                                                           term_type_value=term_type):
            dic_go_terms[go_term] = None

    if go_level != -1:
        # -
        # get parents at level N for those go terms (duplicates removed using a dictionary)
        # -
        # the cursor used is the one received as argument: the previous version ignored the
        # parameter and read a module-level "gocursor" instead
        dic_go_parents = {}
        for go_parent in GOApi.get_parents_at_level_N( list_go_terms = list(dic_go_terms),
                                                       level_desired= go_level,
                                                       dic_gos_at_level = {},
                                                       gocursor= go_cursor,
                                                       piana_access= piana_access):
            dic_go_parents[go_parent] = None
        list_go_ids = list(dic_go_parents)
    # END OF if go_level != -1:
    else:
        # go_level is -1: use directly the go terms associated to the protein
        list_go_ids = list(dic_go_terms)

    # return the names of go, instead of term ids (different ids with the same name are merged)
    dic_go_names = {}
    for go_id in list_go_ids:
        dic_go_names[piana_access.get_protein_go_name(go_term_id_value= go_id)] = None

    return list(dic_go_names)


class Protein(object):
    """
    Plain record for a protein: its name plus the lists of molecular functions (mf),
    biological processes (bp), cellular components (cc) and keywords (kw) associated to it

    The constructor stores the arguments as they are received (no copies are made)
    """
    def __init__(self, protein_name, list_protein_mf, list_protein_bp, list_protein_cc, list_protein_kw):

        # name of the protein
        self.protein_name = protein_name

        # GO related lists: molecular functions, biological processes and cellular components
        (self.list_protein_mf,
         self.list_protein_bp,
         self.list_protein_cc) = (list_protein_mf, list_protein_bp, list_protein_cc)

        # keywords associated to the protein
        self.list_protein_kw = list_protein_kw

class ConnectedRoot(Protein):
    """
    A root protein connected by a linker (subclass of Protein that only adds a string representation)
    """
    def __str__(self):
        # one "label=value" field per attribute, separated by tabs
        labelled_fields = [("root_name", self.protein_name),
                           ("root_mf", self.list_protein_mf),
                           ("root_bp", self.list_protein_bp),
                           ("root_cc", self.list_protein_cc),
                           ("root_kw", self.list_protein_kw)]

        return "\t".join(["%s=%s" % (label, value) for label, value in labelled_fields])

class LinkerDescription(Protein):
    """
    A linker protein: its name, GO terms and keywords (inherited from Protein) plus the roots it connects
    """
    def __init__(self, protein_name, list_protein_mf, list_protein_bp, list_protein_cc, list_protein_kw, list_connected_root_objects):

        # name, mf, bp, cc and kw are handled by the parent class
        Protein.__init__(self, protein_name, list_protein_mf, list_protein_bp, list_protein_cc, list_protein_kw)

        # list of ConnectedRoot objects: the roots that this linker connects
        self.list_connected_root_objects = list_connected_root_objects

    def __str__(self):
        # one "label=value" field per attribute, separated by tabs (the connected roots are not included)
        labelled_fields = [("linker_name", self.protein_name),
                           ("linker_mf", self.list_protein_mf),
                           ("linker_bp", self.list_protein_bp),
                           ("linker_cc", self.list_protein_cc),
                           ("linker_kw", self.list_protein_kw)]

        return "\t".join(["%s=%s" % (label, value) for label, value in labelled_fields])

def print_table_headers(output_target= sys.stdout):
    """
    writes the header cells of the html table to output_target

    Four cells (name, molecular functions, biological processes, cellular components) are
    written for each of the max_number_of_roots roots (numbered from 1), followed by the
    same four cells for the linkers.

    NOTE(review): only <td> cells are written here, no enclosing <tr>
    """
    global max_number_of_roots

    cell_opening = "<td ALIGN=center>\n"
    column_kinds = ("name", "molecular functions", "biological processes", "cellular components")

    # headers for the columns of each root
    for root_number in range(1, max_number_of_roots + 1):
        for column_kind in column_kinds:
            output_target.write(cell_opening)
            output_target.write("<b>root %s %s</b></td>\n" % (root_number, column_kind))

    # headers for the linker columns
    for column_kind in column_kinds:
        output_target.write(cell_opening)
        output_target.write("<b>linkers %s</b></td>\n" % column_kind)


def generate_all_elements_list(list_of_strings, output_target, mode):
    """
    formats the elements of list_of_strings according to mode

     - html: writes a table cell to output_target, with "<br>--<br>" after each element
             (the cell contains "&nbsp;" if the list is empty). Nothing is returned
     - text: returns a string where each element is preceded by a space and followed by a
             comma (output_target is not used)

    for any other mode nothing is written and None is returned
    """
    as_html = (mode == "html")
    as_text = (mode == "text")

    if as_html:
        output_target.write("<td ALIGN=center>\n")

    text_pieces = []
    for item in list_of_strings:
        if as_html:
            output_target.write("%s<br>--<br>" % item)
        elif as_text:
            text_pieces.append(" %s," % item)

    if as_html:
        if not list_of_strings:
            # keep the cell visible when there is nothing to show
            output_target.write("&nbsp;")
        output_target.write("</td>\n")
        return None

    if as_text:
        return "".join(text_pieces)

    return None
    
    
def generate_root(connected_root= None, mode="html", output_target = sys.stdout, label_size="all"):
    """
    generates the description of connected_root (an object with protein_name, list_protein_mf,
    list_protein_bp, list_protein_cc and list_protein_kw)

    mode sets what is done with the root:

     - html: writes four table cells to output_target (name, mf, bp, cc). The name is written
             bold, red and underlined when the root has keywords. Nothing is returned
     - text: returns a string with the name (between square brackets when the root has keywords)
             followed, depending on label_size ('all', 'mf', 'bp', 'cc'), by SEPARATOR plus the
             "MF: ", "BP: " and "CC: " sections

    Side effects (in both modes): the name of the root is appended to the global list all_proteins
    and, when the root has keywords, to the global list root_proteins_with_keywords
    """
    global root_proteins_with_keywords
    global all_proteins

    as_html = (mode == "html")
    as_text = (mode == "text")

    # - name of root
    if as_html:
        output_target.write("<td ALIGN=center>\n")
    elif as_text:
        label_pieces = []

    root_name = connected_root.protein_name
    matched_keywords = connected_root.list_protein_kw

    if matched_keywords:
        if as_html:
            # if the protein matched keywords, make it bold and red
            output_target.write("<b><font color=CC0000><u>\n")
        root_proteins_with_keywords.append(root_name)

    all_proteins.append(root_name)

    if as_html:
        output_target.write("%s" % root_name)
        if matched_keywords:
            output_target.write("</u></font></b>\n")
        output_target.write("</td>\n")
    elif matched_keywords:
        label_pieces.append("[" + root_name + "]")
    else:
        label_pieces.append(root_name)

    # - molecular functions, biological processes and cellular components of root (in this order)
    sections = (("MF: ", "mf", "list_protein_mf"),
                ("BP: ", "bp", "list_protein_bp"),
                ("CC: ", "cc", "list_protein_cc"))

    for section_tag, size_name, attribute_name in sections:
        if as_html:
            generate_all_elements_list(list_of_strings= getattr(connected_root, attribute_name),
                                       output_target=output_target, mode=mode)
        elif as_text and label_size in ("all", size_name):
            # a section is only added to the label when label_size asks for it
            label_pieces.append(SEPARATOR)
            label_pieces.append(section_tag)
            label_pieces.append(generate_all_elements_list(list_of_strings= getattr(connected_root, attribute_name),
                                                           output_target=output_target, mode=mode))

    if as_text:
        return "".join(label_pieces)
        
def generate_freqs_list_html(list_of_strings=None, mode=None, output_target=sys.stdout):
    """
    counts how many times each element appears in list_of_strings and formats the result

    mode can be:

      - html: writes a table cell to output_target with one "frequency: element" entry per
              different element (the cell contains "&nbsp;" if there are no elements).
              Nothing is returned
      - text: returns a string with one " frequency: element," entry per different element
              (an empty string if there are no elements). output_target is not used

    for any other mode nothing is written and None is returned

    list_of_strings can be None (same as an empty list); its elements must be hashable
    """
    # calculate frequency for each element
    dic_of_freqs = {}
    for element in list_of_strings or ():
        dic_of_freqs[element] = dic_of_freqs.get(element, 0) + 1

    if mode == "html":
        output_target.write("<td ALIGN=center> \n")

        if not dic_of_freqs:
            # keep the cell visible when there is nothing to show
            output_target.write("&nbsp;")

        for element in dic_of_freqs:
            output_target.write("%s: %s<BR>--<BR>\n" % (dic_of_freqs[element], element))

        output_target.write("</td> \n")
        return None

    if mode == "text":
        return "".join([" %s: %s," % (dic_of_freqs[element], element) for element in dic_of_freqs])

    return None

def generate_list_linkers(dic_of_linker_objects, mode, output_target, label_size="all"):
    """
    generates the description of a group of linkers

    dic_of_linker_objects is a dictionary with linker names as keys and linker objects as
    contents (objects with protein_name, list_protein_mf, list_protein_bp, list_protein_cc
    and list_protein_kw)

    mode can be:

      - html: writes four table cells to output_target: linker names (bold, red and underlined
              for linkers with keywords) and the frequencies of the molecular functions,
              biological processes and cellular components of all the linkers together.
              Nothing is returned
      - text: returns a string with the linker names ("-name-", or "-[name]-" for linkers
              with keywords) followed, depending on label_size ('all', 'mf', 'bp', 'cc'), by
              SEPARATOR plus the "MF: ", "BP: " and "CC: " frequency sections

    Side effects (in any mode): each linker name is appended to the global list all_proteins
    and the protein_name of linkers with keywords is appended to the global list
    linker_proteins_with_keywords
    """
    global linker_proteins_with_keywords
    global all_proteins

    as_html = (mode == "html")
    as_text = (mode == "text")

    # --
    # linker names
    # --
    if as_html:
        output_target.write("<td ALIGN=center> \n")

    label_pieces = []

    list_of_molecular_functions= []
    list_of_biological_processes= []
    list_of_cellular_components= []

    for linker_name in dic_of_linker_objects:
        all_proteins.append(linker_name)
        linker_object = dic_of_linker_objects[linker_name]

        list_of_molecular_functions.extend(linker_object.list_protein_mf)
        list_of_biological_processes.extend(linker_object.list_protein_bp)
        list_of_cellular_components.extend(linker_object.list_protein_cc)

        has_keywords = bool(linker_object.list_protein_kw)

        if has_keywords:
            linker_proteins_with_keywords.append(linker_object.protein_name)

        if as_html:
            if has_keywords:
                # if the protein matched keywords, make it bold, red and underlined
                output_target.write("<b><font color=CC0000><u>\n")
                output_target.write("%s</u><br>--<br>" % linker_name)
                output_target.write("</font></b>\n")
            else:
                # no <u> was opened for this linker, so no </u> must be written
                output_target.write("%s<br>--<br>" % linker_name)
        elif as_text:
            if has_keywords:
                label_pieces.append("-[" + linker_name + "]-")
            else:
                label_pieces.append("-" + linker_name + "-")
    # END OF for linker_name in dic_of_linker_objects:

    if as_html:
        output_target.write("</td> \n")

    # --
    # frequencies of molecular functions, biological processes and cellular components (in this order)
    # --
    sections = (("MF: ", "mf", list_of_molecular_functions),
                ("BP: ", "bp", list_of_biological_processes),
                ("CC: ", "cc", list_of_cellular_components))

    for section_tag, size_name, list_of_terms in sections:
        if as_html:
            generate_freqs_list_html(list_of_strings= list_of_terms, mode=mode, output_target=output_target)
        elif as_text and label_size in ("all", size_name):
            # a section is only added to the label when label_size asks for it
            label_pieces.append(SEPARATOR)
            label_pieces.append(section_tag)
            label_pieces.append(generate_freqs_list_html(list_of_strings= list_of_terms, mode=mode, output_target=output_target))

    if as_text:
        return "".join(label_pieces)
    
def print_html_table(dic_of_linkers_by_root_key= None, output_target=sys.stdout):
    """
    writes to output_target an html table with one row per group of roots: the roots of the
    group (with their GO information) followed by the linkers that connect them

    dic_of_linkers_by_root_key is a dictionary where each key identifies a group of roots and
    the content is a dictionary {linker name: linker object} with the linkers connecting those
    roots. None is treated as an empty dictionary, and groups without linkers are skipped
    (their roots can only be known through one of their linkers)

    The table has room for max_number_of_roots roots: rows with fewer roots are padded with
    empty cells
    """
    global max_number_of_roots

    output_target.write( "<BR> - Proteins in bold-red are associated to at least one of these keywords: %s <BR>\n" % (KEYWORDS_TO_USE,))
    output_target.write( "<BR>\n" )
    output_target.write( "<BR>\n" )

    output_target.write("<table BORDER=4>\n")

    # print_table_headers only writes <td> cells: the row is opened and closed here
    output_target.write("<tr>\n")
    print_table_headers(output_target=output_target)
    output_target.write("</tr>\n")

    for root_key in dic_of_linkers_by_root_key or {}:

        dic_of_linkers = dic_of_linkers_by_root_key[root_key]

        if not dic_of_linkers:
            # no linker, therefore no information about the roots: nothing to print
            continue

        # all linkers within this key have the same roots, therefore any linker can be used to
        # retrieve info about the roots (list() makes this work with python 2 and python 3)
        random_linker = dic_of_linkers[list(dic_of_linkers)[0]]

        output_target.write("<tr>\n")

        for connected_root in random_linker.list_connected_root_objects:
            # for each root connected by these linkers, print the columns
            generate_root(connected_root=connected_root, mode="html", output_target=output_target)
        # END OF for connected_root in random_linker.list_connected_root_objects:

        # now, print cells that will be empty for this row because the number of roots was lower than the cells allocated
        number_of_empty_cells = max_number_of_roots - len(random_linker.list_connected_root_objects)
        for i in range(0, number_of_empty_cells):
            output_target.write("<td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td>")

        generate_list_linkers(dic_of_linker_objects= dic_of_linkers, mode= "html", output_target=output_target)

        output_target.write("</tr>\n")

    # END OF for root_key in dic_of_linkers_by_root_key:

    output_target.write("</table>\n")


def create_linkers_label(dic_of_linkers, label_size="all"):
    """
    returns the label that describes a group of linkers: the text mode output of
    generate_list_linkers for dic_of_linkers and label_size (no output target is used)
    """
    linkers_label = generate_list_linkers(dic_of_linkers, "text", None, label_size)
    return linkers_label

def create_root_label(root, label_size="all"):
    """
    returns the label that describes a root object: the text mode output of
    generate_root for root and label_size (no output target is used)
    """
    root_label = generate_root(root, "text", None, label_size)
    return root_label

def print_dot_edge(output_target, root, linkers_label, label_size="all"):
    """
    writes to output_target two dot lines: the node of the root (filled in yellow) and an
    edge between the root and the linkers

    "root" is a ConnectedRoot object (its label is created with create_root_label, using label_size)

    "linkers_label" is the label that identifies the node of the linkers
    """
    node_template = '"%s" [fillcolor = %s];\n'
    edge_template = ' "%s" -- "%s" [len=1, style="solid" ]; \n'

    label_of_root = create_root_label(root, label_size=label_size)

    # node declaration first, then the edge that uses it
    output_target.write(node_template % (label_of_root, "yellow"))
    output_target.write(edge_template % (label_of_root, linkers_label))

 

    
def print_dot(dic_of_linkers_by_root_key= None, output_target=sys.stdout, label_size="all"):
    """
    writes to output_target a dot graph showing roots, linkers and their go terms: for each
    group of roots, one edge between each root and the node that describes all the linkers
    of the group

    dic_of_linkers_by_root_key is a dictionary where each key identifies a group of roots and
    the content is a dictionary {linker name: linker object} with the linkers connecting those
    roots. None is treated as an empty dictionary, and groups without linkers are skipped
    (their roots can only be known through one of their linkers)

    label_size sets which information appears in the labels of the nodes ('all', '0', 'mf', 'bp', 'cc')
    """
    # graph and node attributes shared by the whole graph
    output_target.write("graph G { graph [orientation=portrait, pack=true, overlap=scale]")
    output_target.write(" node [shape=box,fontsize=14,height=0.5, width=0.5, style=filled,fillcolor=lightblue];\n")

    for root_key in dic_of_linkers_by_root_key or {}:

        dic_of_linkers = dic_of_linkers_by_root_key[root_key]

        if not dic_of_linkers:
            # no linker, therefore no information about the roots: nothing to print
            continue

        # all linkers within this key have the same roots, therefore any linker can be used to
        # retrieve info about the roots (list() makes this work with python 2 and python 3)
        random_linker = dic_of_linkers[list(dic_of_linkers)[0]]

        # create the label for these linkers
        linkers_label = create_linkers_label(dic_of_linkers= dic_of_linkers, label_size=label_size)

        for connected_root in random_linker.list_connected_root_objects:
            # for each root connected by these linkers, print an edge
            print_dot_edge(output_target=output_target, root=connected_root, linkers_label= linkers_label, label_size=label_size)
        # END OF for connected_root in random_linker.list_connected_root_objects:

    # END OF for root_key in dic_of_linkers_by_root_key:

    output_target.write("}\n")


    
def print_kw_info_proteins_html( header= None, list_proteins= None, proteins_type= None, output_target=sys.stdout ):
    """
    Writes to output_target, in HTML format, the descriptions, functions and keywords of the proteins in
    list "list_proteins", which are of type "proteins_type"

    The info is preceded by a line with "header". Each protein is printed only once, even if it
    appears several times in the list, and repeated descriptions, functions or keywords of a protein
    are printed only once. list_proteins can be None (same as an empty list)

    Uses the global piana_api to access the piana database
    """
    global piana_api

    output_target.write("<BR><hr><BR>%s<BR><hr><BR>\n" % (header,))

    dic_proteins_already_printed = {}

    for protein in list_proteins or []:

        # avoid printing the info for a protein that was already printed
        if protein in dic_proteins_already_printed:
            continue

        dic_proteins_already_printed[protein] = None

        output_target.write("<BR><u><b>Protein %s</b></u><BR>\n" % (protein,))

        list_proteinPiana = piana_api.piana_access.get_list_protein_piana(proteinCode_value= protein,
                                                                proteinCodeType_value= utilities.get_code_column(proteins_type),
                                                                tax_id_value=0  )

        # collect the info of all the proteinPianas, removing redundancies with dictionaries
        dic_descriptions = {}
        dic_functions = {}
        dic_keywords = {}
        for proteinPiana in list_proteinPiana:
            for description in piana_api.piana_access.get_protein_description(proteinPiana_value=proteinPiana):
                dic_descriptions[description] = None

            for function in piana_api.piana_access.get_protein_function(proteinPiana_value=proteinPiana):
                dic_functions[function] = None

            for keyword in piana_api.piana_access.get_protein_keyword(proteinPiana_value=proteinPiana):
                dic_keywords[keyword] = None
        # END OF for proteinPiana in list_proteinPiana:

        # one block of lines per kind of info, each block followed by an empty line
        for info_label, dic_values in (("Description", dic_descriptions),
                                       ("Function", dic_functions),
                                       ("Keyword", dic_keywords)):
            for uniq_value in dic_values:
                output_target.write("<BR>--> %s: %s\n" % (info_label, uniq_value))
            output_target.write("<BR>\n")

    # END OF for protein in list_proteins:

    output_target.write("<BR><BR>\n")
    
def print_all_info_proteins_html(header= None, list_proteins= None, proteins_type= None, output_target=sys.stdout ):
    """
    prints all information available for proteins in the list

    "header": accepted for interface compatibility, it is not used by this function

    "list_proteins": list of protein codes to print (None is treated as an empty list)

    "proteins_type": type of code used in list_proteins. It is used both as input and as output protein type

    "output_target": file object where the information will be written

    The printing is delegated to piana_api.print_list_proteins_information(), which is asked for
    output mode "compact" in "html" format, for all species, and receives KEYWORDS_TO_USE as list of keywords
    """
    global piana_api

    # a list used as default value would be shared by all calls to this function: use a fresh one for each call
    if list_proteins is None:
        list_proteins = []

    piana_api.print_list_proteins_information(protein_list=list_proteins, input_proteins_type=proteins_type, list_alternative_type_names=["uniacc","md5"],
                                              output_file_object= output_target, output_proteins_type=proteins_type, protein_species_name="all",
                                              output_mode="compact", format_mode="html", list_keywords= KEYWORDS_TO_USE,
                                              file_over_expressed= None, file_infra_expressed= None, expression_protein_type= None)

    
# --------
# --------
#  Main()               
# --------                               
# --------

# Default values for the command line options (see usage() for the meaning of each option).
# NOTE(review): parseArguments() is presumably the one overwriting these module-level variables - confirm

# input file and type of protein code it uses
input_file = input_proteins_type = None

# parameters for connecting to the piana database
piana_dbname = piana_dbuser = piana_dbhost = piana_dbpass = None

# parameters for connecting to the GO database
go_dbname = go_dbhost = go_dbuser = go_dbpass = None

# GO information is not printed by default
print_go_info = 0
go_level = None

# output options
output_format = None
results_prefix = ""
label_size = None

# parsing arguments from the command line
parseArguments()

max_number_of_roots = 0 # will be used to know how many roots must be printed in the table headers
linker_proteins_with_keywords = []
root_proteins_with_keywords = []
all_proteins = []

if print_go_info:
    # the connection to the GO database is only needed when GO information has to be printed
    #
    # Only those credentials that were given by the user are passed to MySQLdb: a missing user or password
    # must be left out of the call instead of being passed as None (before, giving a password without a
    # user ended up calling connect() with user=None)
    go_connection_args = {"db": go_dbname, "host": go_dbhost}

    if go_dbuser is not None:
        go_connection_args["user"] = go_dbuser

    if go_dbpass is not None:
        go_connection_args["passwd"] = go_dbpass

    godb = MySQLdb.connect(**go_connection_args)

    gocursor = godb.cursor()

# Initializing connection to piana
piana_api = PianaApi(piana_dbname=piana_dbname, piana_dbhost= piana_dbhost, piana_dbuser=piana_dbuser, piana_dbpass= piana_dbpass)
   

# ----------------------
# POPULATE DIC WITH DATA
# ---------------------- 
i = 0  # number of non-empty lines read so far from the input file (used in warning messages)
list_of_linker_descriptions = [] # used to keep all linkers with their roots and go terms
dic_of_all_mf = {}  # used to keep all molecular functions that appear in these proteins
dic_of_all_bp = {}  # used to keep all biological process that appear in these proteins
dic_of_all_cc = {}  # used to keep all cellular component that appear in these proteins

dic_of_linkers_by_root_key = {} # used to group linkers according to the roots they connect
                                #  it follows structure:
                                #       { root_key: {linker_name1: linker object 1, linker_name2: linker object 2, ...},
                                #         root_key: {.....}
                                #       }
                                #
                                #  root keys are the root names (in sorted order), each one followed by a dot
                                #  from each linker, information about the roots it connects can be obtained


def _get_go_terms_and_keywords(protein_code):
    """
    returns a tuple (mf, bp, cc, kw) with the information retrieved for protein "protein_code"
    (a code of type input_proteins_type)

       mf, bp, cc: values returned by get_protein_go_name() for term types "molecular_function",
                   "biological_process" and "cellular_component" at level go_level
       kw        : value returned by check_keywords_in_protein() for the keywords in KEYWORDS_TO_USE
    """
    protein_mf = get_protein_go_name( protein_code= protein_code, protein_code_type=input_proteins_type,
                                      go_cursor=gocursor, term_type= "molecular_function",
                                      go_level=go_level)
    protein_bp = get_protein_go_name( protein_code= protein_code, protein_code_type=input_proteins_type,
                                      go_cursor=gocursor, term_type= "biological_process",
                                      go_level=go_level)
    protein_cc = get_protein_go_name( protein_code= protein_code, protein_code_type=input_proteins_type,
                                      go_cursor=gocursor, term_type= "cellular_component",
                                      go_level=go_level)
    protein_kw = piana_api.piana_access.check_keywords_in_protein(list_proteinPiana= None,
                                                                  protein_code=protein_code, protein_code_type=input_proteins_type,
                                                                  keywords=KEYWORDS_TO_USE)

    return (protein_mf, protein_bp, protein_cc, protein_kw)


input_file_fd = file(input_file, "r")
try:
    for line in input_file_fd:

        if not line.strip():
            # skip empty lines
            continue

        i += 1

        # each line looks like this:
        # P62906 connects 2 root_proteins<TAB>['P25398', 'P25398']<TAB>databases ['string', 'string_c']<TAB>description ['60S ribosomal protein L10a (CSA-19).', '60S ribosomal protein L10a.']
        if verbose:
            sys.stderr.write("line is %s\n" %line)

        line_fields = line.split("\t")

        if len(line_fields) < 2 or not line_fields[0].strip():
            # without a linker name and a list of roots the line cannot be interpreted: warn the user and go on
            sys.stderr.write("Warning! Skipping malformed line (non-empty line number %s): %s\n" %(i, line.strip()))
            continue

        # get the linker name from line_fields[0]
        linker_name = line_fields[0].split()[0]

        # line_fields[1] is a list of root proteins that are connected in format ['P25398', 'P25398']
        root_proteins_string= line_fields[1].replace("["," ").replace("]"," ").replace("'"," ").replace(","," ")
        list_root_proteins = root_proteins_string.split()

        # a dictionary is used to remove repeated root codes
        list_unique_root_proteins = dict.fromkeys(list_root_proteins)

        if verbose:
            sys.stderr.write("linker name is %s and links root proteins %s\n" %(linker_name, list_unique_root_proteins.keys()))

        # now check that the ext code root proteins they join do not have the same proteinPiana
        root_pps = {}  # keeps for each root protein its proteinPianas in a Set

        if len(list_unique_root_proteins) <= 1:
            # a single external code: this protein cannot be a real linker
            continue

        for protein in list_unique_root_proteins:

            root_pps[protein] = sets.Set( piana_api.piana_access.get_list_protein_piana(proteinCode_value= protein,
                                                                              proteinCodeType_value= utilities.get_code_column(input_proteins_type),
                                                                              tax_id_value= 0 ) )

        not_repeated_pps_roots = []
        pps_already_added = sets.Set([])

        # dictionaries do not guarantee any order: roots are visited in sorted order so that the same set of
        # roots always produces the same list of not repeated roots (and therefore the same root_key)
        for root_protein in sorted(root_pps):
            if verbose:
                sys.stdout.write("pps for root" + str(root_protein) + "are: " + str( root_pps[root_protein]) + "\n")
            # root_pps[root_protein] is a Set of proteinPianas: the root is only kept if none of its
            # proteinPianas belongs to a root that was kept before
            if not pps_already_added.intersection(root_pps[root_protein]):
                not_repeated_pps_roots.append(root_protein)
                pps_already_added =  pps_already_added.union(root_pps[root_protein])
        # END OF for root_protein in sorted(root_pps):

        if len(not_repeated_pps_roots) <= 1:
            # all the external codes correspond to the same protein: not a real linker
            continue

        if not print_go_info:
            # simple output: linker name followed by the roots it links
            sys.stdout.write("%s\tlinks root proteins:\t" %(linker_name))
            for root_protein in not_repeated_pps_roots:
                sys.stdout.write("%s\t" %(root_protein))
            sys.stdout.write("\n")
            continue

        # from here on: keep complete information of linkers with go information

        # - linker info
        (linker_mf, linker_bp, linker_cc, linker_kw) = _get_go_terms_and_keywords(linker_name)

        # keeping track of all mf, bp, cc that appear in the linkers
        for mf in linker_mf:
            dic_of_all_mf[mf] = None

        for bp in linker_bp:
            dic_of_all_bp[bp] = None

        for cc in linker_cc:
            dic_of_all_cc[cc] = None

        if verbose:
            sys.stdout.write("go_level=%s" %(go_level))          # go level
            sys.stdout.write("\tlinker_name=%s" %(linker_name))  # linker name
            sys.stdout.write("\tlinker_mf=%s" %(linker_mf))      # linker protein go molecular function at level N
            sys.stdout.write("\tlinker_bp=%s" %(linker_bp))      # linker protein go biological process at level N
            sys.stdout.write("\tlinker_cc=%s" %(linker_cc))      # linker protein go cellular component at level N
            sys.stdout.write("\tlinker_kw=%s" %(linker_kw))    # keywords that were found for linker protein

        # - root proteins info
        number_of_roots = 1   # only used here to number the roots in the verbose output
        list_of_connected_roots = []
        root_key = ""    # used to group linkers by the root proteins they link: formed by a string of root names separated by dots
        for root_protein in not_repeated_pps_roots:

            (root_mf, root_bp, root_cc, root_kw) = _get_go_terms_and_keywords(root_protein)

            if verbose:
                sys.stdout.write("\troot_%s_name=%s" %(number_of_roots, root_protein))  # root name
                sys.stdout.write("\troot_%s_mf=%s" %(number_of_roots, root_mf ))    # root protein go molecular function at level N
                sys.stdout.write("\troot_%s_bp=%s" %(number_of_roots, root_bp ))    # root protein go biological process at level N
                sys.stdout.write("\troot_%s_cc=%s" %(number_of_roots, root_cc ))    # root protein go cellular component at level N
                sys.stdout.write("\troot_%s_kw=%s" %(number_of_roots, root_kw ))    # root protein go matched keywords
            number_of_roots += 1

            root_key += "%s." %(root_protein)

            connected_root = ConnectedRoot(protein_name= root_protein,
                                           list_protein_mf=root_mf, list_protein_bp=root_bp, list_protein_cc=root_cc, list_protein_kw=root_kw)

            list_of_connected_roots.append(connected_root)
        # END OF for root_protein in not_repeated_pps_roots:

        number_of_roots = len(list_of_connected_roots)
        if number_of_roots > max_number_of_roots:
            max_number_of_roots= number_of_roots

        linker_description = LinkerDescription( protein_name= linker_name, list_protein_mf=linker_mf, list_protein_bp=linker_bp, list_protein_cc=linker_cc,
                                                list_protein_kw=linker_kw, list_connected_root_objects= list_of_connected_roots)

        list_of_linker_descriptions.append(linker_description)

        # group the linker with the other linkers that connect the same roots
        dic_of_linkers_by_root_key.setdefault(root_key, {})[linker_description.protein_name] = linker_description

        if verbose:
            sys.stdout.write("\n")

    # END OF for line in input_file_fd:
finally:
    # the input file is closed even when the processing of a line fails
    input_file_fd.close()


# ---------------------
# PRINT OUTPUTS
# ---------------------


if output_format == "text":

    # one section per group of roots: first the roots, then all the linkers that connect them
    for root_key in dic_of_linkers_by_root_key:

        dic_linkers_of_this_key = dic_of_linkers_by_root_key[root_key]

        sys.stdout.write( "======================================ROOTS CONNECTED==============================================\n")
        random_linker_name = dic_linkers_of_this_key.keys()[0]   # all linkers within this key have the same roots, therefore any random linker can
                                                                 # be used to retrieve info about the roots
        random_linker = dic_linkers_of_this_key[random_linker_name]
        for connected_root in random_linker.list_connected_root_objects:
            sys.stdout.write( "\n")
            sys.stdout.write( "%s\n" %connected_root)

        sys.stdout.write( "*********************LINKERS**************************")
        for linker_name in dic_linkers_of_this_key:
            sys.stdout.write( "\n" )
            sys.stdout.write( "%s\n" %(dic_linkers_of_this_key[linker_name]) )
    # END OF for root_key in dic_of_linkers_by_root_key:
# END OF if output_format == "text":

elif output_format == "html":

    # table of linkers is written to file <results_prefix>.linkers_table.html
    output_target = file(results_prefix + ".linkers_table.html" , "w")
    try:
        print_html_table( dic_of_linkers_by_root_key= dic_of_linkers_by_root_key, output_target= output_target)
    finally:
        # close the file even if the table could not be completely printed
        output_target.close()

    # dot output is written to file <results_prefix>.dot
    dot_file_target = file(results_prefix + ".dot" , "w")
    try:
        print_dot(dic_of_linkers_by_root_key=dic_of_linkers_by_root_key , output_target=dot_file_target, label_size=label_size)
    finally:
        # close the file even if the dot output could not be completely printed
        dot_file_target.close()


else:
    # only "text" and "html" are accepted as output format
    raise ValueError("unknown output format chosen")

                

                
                

    
