"""
File        : go2piana.py
Author      : Pablo Boixeda & Ramon Aragues
Creation    : 06.2005
Contents    : program that fills up tables in database piana with information from GO
Called from : command line

=======================================================================================================

"""

# go2piana.py: program that fills up tables in database piana with information from GO
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt

import re
import readline
import MySQLdb

from PianaGraph import *
from PianaGraphBuilder import *
from PianaDBaccess import *

from PianaDBaccess import *
import PianaGlobals

# Module-level verbosity flags, read by the functions below.
#   verbose          : set to 1 by parseArguments() when --verbose is given
#   verbose_detailed : enables extra per-item messages; nothing in this file sets it from the command line
verbose = 0
verbose_detailed = 0

# Value returned by calculate_distance() when the search is cut off by the threshold;
# it is also the default value of the threshold itself.
INFINITE_DISTANCE = 100


# ----------------------
# Function usage()
# ----------------------
def usage():
   """Write the command-line help of go2piana.py to standard output.

   Each entry of help_lines is emitted followed by a newline, which is
   what the former print statements did.
   """
   help_lines = (
      "--------------------------------------------------------------------------------------------------------------",
      "This program fills up tables in database piana with information from GO \n",
      "Usage: python go2piana.py  --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass",
      "              --go-dbname=go_dbname --go-dbhost=go_dbhost --go-dbuser=go_dbuser --go-dbpass=go_dbpass ",
      "              --insert-protein-go --insert-go-info --insert-go-distance --threshold=threshold",
      "              --input-file=input_file --input-proteins-type=input_proteins_type  [--help] [--verbose]",
      "\nwhere:",
      "     piana_dbname : name of database piana to be used (required)",
      "     piana_dbhost : name of host where database piana to be used is placed (required)",
      "     piana_dbuser : username accessing the database (not required in most systems)",
      "     piana_dbpass : password of username accessing the database (not required in most systems)",
      "     go_dbname   : name of the go mysql database (required)",
      "     go_dbhost   : name of the machine with go mysql server (required)",
      "     go_dbuser   : name of the mysql go username (not required in most systems)",
      # fixed: this line used to repeat the go_dbuser description
      "     go_dbpass   : password of the mysql go username (not required in most systems)",
      "     --insert-protein-go    : if this flag is set, then insert relationship between proteins and go terms",
      "     --insert-go-info       : if this flag is set, then insert go information",
      "     --insert-go-distance   : if this flag is set, then calculate distance between go terms",
      # fixed: closing parenthesis was missing
      "     threshold    : this is the maximum distance that will be calculated (once reached, distance(term1, term2) will be set to INFINITE)",
      "    -> these two parameters only required if you don't want to insert info for all proteins: ",
      "       input_file   : calculates distance only for gos associated to proteins in the network (depth 1) built from the root proteins in the file",
      "       input_proteins_type    : type of the proteins inside the input_file",
      "                        --> Attention! Do not use geneName to identify these proteins: they are ambiguous and no species information is available",
      "     --help       : prints this message and exits",
      "     --verbose    : prints process info to stdout",
      "--------------------------------------------------------------------------------------------------------------",
   )

   for help_line in help_lines:
      sys.stdout.write(help_line + "\n")

# TO DO!!! Let the user set a tax_id so that any type of protein code (including geneName) can be used.
#          This tax_id would then be passed to PianaGraphBuilder as an argument.


# ---------------------------
# Function parseArguments()
# ---------------------------

def parseArguments():
   """Parse sys.argv and store the results in module-level variables.

   Variables written (only for the options actually present on the command line):
      go_dbname, go_dbhost, go_dbuser, go_dbpass,
      piana_dbname, piana_dbhost, piana_dbuser, piana_dbpass,
      input_file, input_proteins_type  -> the option value, as a string
      threshold                        -> the option value converted to int
      insert_protein_go, insert_go_info, insert_go_distance, verbose -> set to 1

   The process exits with status 2 (after printing the usage text) when:
      - getopt rejects the command line
      - --threshold is not an integer
      - --help is given
      - go_dbname or go_dbhost is still None after parsing
   """

   # option -> name of the module-level variable that receives the option value
   value_options = {"--go-dbhost": "go_dbhost",
                    "--go-dbname": "go_dbname",
                    "--go-dbuser": "go_dbuser",
                    "--go-dbpass": "go_dbpass",
                    "--piana-dbname": "piana_dbname",
                    "--piana-dbhost": "piana_dbhost",
                    "--piana-dbuser": "piana_dbuser",
                    "--piana-dbpass": "piana_dbpass",
                    "--input-file": "input_file",
                    "--input-proteins-type": "input_proteins_type"}

   # option -> name of the module-level flag that is set to 1 when the option is present
   flag_options = {"--insert-protein-go": "insert_protein_go",
                   "--insert-go-info": "insert_go_info",
                   "--insert-go-distance": "insert_go_distance",
                   "--verbose": "verbose"}

   try:
      opts, args = getopt.getopt(sys.argv[1:], "", ["verbose","help","go-dbname=","go-dbuser=","go-dbhost=","go-dbpass=",
                                                     "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass=",
                                                     "insert-protein-go", "insert-go-info", "insert-go-distance",
                                                     "input-file=", "input-proteins-type=", "threshold=" ]   )
   except getopt.GetoptError:
      # sys.exc_info() is used instead of "except X, e" / "except X as e" so that
      # the same source is accepted by every Python version
      bad_opt = sys.exc_info()[1]
      # print help information and exit (the newline keeps the message on its own line)
      sys.stderr.write( str(bad_opt) + "\n" )
      usage()
      sys.exit(2)

   # the options are stored as module-level variables, exactly as "global x; x = value" would do
   module_vars = globals()

   for option, value in opts:

      if option == "--help":
         # print help information and exit
         usage()
         sys.exit(2)

      elif option == "--threshold":
         try:
            module_vars["threshold"] = int(value)
         except ValueError:
            # a non numeric threshold used to end in an uncaught traceback
            sys.stderr.write("--threshold must be an integer (value given: %s)\n" %(value,))
            usage()
            sys.exit(2)

      elif option in value_options:
         module_vars[value_options[option]] = value

      elif option in flag_options:
         module_vars[flag_options[option]] = 1

   # check arguments
   if go_dbname is None or go_dbhost is None:
      # reported on stderr, like the other command line errors
      sys.stderr.write("trying to establish a connection to go database without giving a host or database name\n")
      usage()
      sys.exit(2)

def get_fathers(gocursor=None,term_id=None):
   """Return the list of term1_id values of table term2term whose term2_id is term_id.

   gocursor : cursor used to run the query (execute + fetchall)
   term_id  : id placed in the 'where term2_id=' clause

   The list is empty when the query returns no rows.
   """
   gocursor.execute("select term1_id from term2term where term2_id=%s" % term_id)

   # each fetched row holds a single column: the id of one father
   return [father_row[0] for father_row in gocursor.fetchall()]

def calculate_distance(distance_up_to_here=None, term_id1=None, term_id2=None, gocursor=None):
   """Return the length of the shortest path that joins two GO terms through their fathers.

   distance_up_to_here : number of steps already taken (0 on the initial call)
   term_id1, term_id2  : ids of the two terms being compared
   gocursor            : cursor handed to get_fathers() to read the fathers of a term

   The search replaces, one step at a time, either term by one of its fathers
   until both ids are equal, and keeps the smallest number of steps found.

   Returns INFINITE_DISTANCE when:
      - distance_up_to_here reaches the module-level threshold, or
      - the two terms are different and neither of them has fathers
        (they can never meet; this case used to return distance_up_to_here,
        ie. a finite distance -0 on a top level call- for unrelated terms)

   Raises ValueError when no gocursor is given.
   """

   if gocursor is None:
      raise ValueError("No gocursor")

   if distance_up_to_here >= threshold:
      # if distance found is higher than the threshold, stop the search and return INFINITE_DISTANCE
      return INFINITE_DISTANCE

   if term_id1 == term_id2:
      return distance_up_to_here

   fathers_term_id1 = get_fathers(gocursor, term_id1)
   fathers_term_id2 = get_fathers(gocursor, term_id2)

   next_distance = distance_up_to_here + 1
   results = []

   # move one step up from term 1...
   for father_term1 in fathers_term_id1:
      results.append(calculate_distance(distance_up_to_here=next_distance,
                                        term_id1=father_term1,
                                        term_id2=term_id2,
                                        gocursor=gocursor))
   # ... or one step up from term 2
   for father_term2 in fathers_term_id2:
      results.append(calculate_distance(distance_up_to_here=next_distance,
                                        term_id1=term_id1,
                                        term_id2=father_term2,
                                        gocursor=gocursor))

   if not results:
      # two different terms without fathers: there is no path between them
      return INFINITE_DISTANCE

   return min(results)
         

def insert_go_table(gocursor, piana_access, go_terms, depth, source_db):
   """Insert GO terms into piana, walking table term2term from fathers to sons.

   gocursor     : cursor on the GO database
   piana_access : object whose insert_go() method stores one term
   go_terms     : either a sequence whose first element is 1 (only that element is
                  processed, as the starting term) or the rows fetched from
                  'select term2_id from term2term ...' (one id per row)
   depth        : value stored as distance2root for the terms of this level
   source_db    : value passed as source_db to insert_go()

   For every non obsolete term found, the term is inserted and the function
   calls itself with the sons of the term and depth+1. Nothing is returned.
   """

   if not go_terms:
      return      # no more sons... end recursion

   sons_query = "select term2_id from term2term where term1_id=%s"

   if go_terms[0] == 1:
      # starting term: a single id, no loop and no verbose message
      gocursor.execute("select id, name, term_type, acc from term where is_obsolete=0 and id=%s" % go_terms[0])

      # row columns: [0] term id, [1] name, [2] term type, [3] go accession
      start_row = gocursor.fetchall()[0]

      piana_access.insert_go(go_id=start_row[0],
                             go_name=start_row[1],
                             term_type=start_row[2],
                             acc=start_row[3],
                             distance2root=depth,
                             source_db=source_db)

      gocursor.execute(sons_query % start_row[0])
      insert_go_table(gocursor, piana_access, gocursor.fetchall(), depth + 1, source_db)
      return

   for son in go_terms:
      # son is used directly as the formatting argument (fetched rows hold a single id)
      gocursor.execute("select id, name, term_type, acc from term where is_obsolete=0 and id= %s" % son)

      for term_row in gocursor.fetchall():

         if verbose:
            sys.stderr.write("Inserting go_id=%s whith depth=%s\n" % (term_row[0], depth))

         piana_access.insert_go(go_id=term_row[0],
                                go_name=term_row[1],
                                term_type=term_row[2],
                                acc=term_row[3],
                                distance2root=depth,
                                source_db=source_db)

         gocursor.execute(sons_query % term_row[0])
         insert_go_table(gocursor, piana_access, gocursor.fetchall(), depth + 1, source_db)
   

# values of gene_product.type_id (internal to GO) that tell which kind of code the symbol is
#   --> type_id == 20361 and 20368 are usually a gene name
#   --> type_id == 20366 is usually a uniprot accession or entry
#   --> type_id == 20365, 20367 are weird (they are ignored)
_GO_GENE_NAME_TYPE_IDS = (20361, 20368)
_GO_PROTEIN_TYPE_ID = 20366


def _connect_go_db(go_dbname, go_dbhost, go_dbuser, go_dbpass):
   """Open a MySQLdb connection to the GO database; user and password are passed only when given."""
   connect_args = {"db": go_dbname, "host": go_dbhost}

   if go_dbuser is not None:
      connect_args["user"] = go_dbuser

   if go_dbpass is not None:
      connect_args["passwd"] = go_dbpass

   return MySQLdb.connect(**connect_args)


def _insert_protein_go_associations(gocursor, piana_access, source_db):
   """Insert into piana the protein - go term associations of GO table association.

   Returns the number of associations for which no proteinPiana was found.
   """
   if verbose:
      sys.stderr.write("Retrieving protein go association\n")

   gocursor.execute("select term_id, gene_product_id from association")
   associations = gocursor.fetchall()

   if verbose:
      sys.stderr.write("%s associations found\n" %(len(associations),))

   number_without_proteinPiana = 0

   for pair in associations:
      go_term_id = pair[0]
      gene_product_id = pair[1]

      gocursor.execute("select symbol, species_id, type_id, dbxref_id from gene_product where id=%s" %gene_product_id)
      gene_products = gocursor.fetchall()

      list_proteinPiana = [] # list of proteinPianas to which this GO term will be associated

      # symbol and tax id of the last gene product processed: only used in messages.
      # They are reset for each association so that an association without gene
      # products neither fails nor reports the values of the previous association
      symbol = None
      tax_id = None

      for gene_product in gene_products:
         # gene_product[0] -> symbol (geneName)
         # gene_product[1] -> species id (internal to GO)
         # gene_product[2] -> type id (internal to GO, indicates the type of code)
         # gene_product[3] -> dbxref id (internal to GO, references id on table dbxref which contains links to other databases)
         symbol = gene_product[0]
         type_id = int(gene_product[2])

         gene_name = None
         uniacc = None

         if type_id in _GO_GENE_NAME_TYPE_IDS:
            gene_name = symbol.replace('"', " ").strip()

         if type_id == _GO_PROTEIN_TYPE_ID:
            # search for a UniProt accession for this symbol
            #  (it is easier than trying to guess if the symbol is an accession or an entry)
            gocursor.execute("""select xref_key from dbxref where id=%s and xref_dbname="UniProt" """ %(gene_product[3]))
            uniprot_rows = gocursor.fetchall()

            if uniprot_rows:
               uniacc = uniprot_rows[0][0]

         # tax id has to be retrieved from table species
         gocursor.execute("select ncbi_taxa_id from species where id=%s" %gene_product[1])
         tax_rows = gocursor.fetchall()

         if tax_rows:
            tax_id = tax_rows[0][0]
         else:
            tax_id = None

         # TO DO!!!
         # Attention! I am ignoring the association gene - term id when
         #            GO is not correctly assigning tax_id to the geneName
         #            For example, GO says that AAH1 is Candida Albicans (tax id=5476)
         #            when in fact, AAH1 does not exist in that species but in another
         #            one called Candida albicans SC5314  (tax id=237561)
         if gene_name:
            if tax_id is None:
               # the gene name lookup needs a tax id: without a species row the
               # lookup is skipped (this case used to end in an IndexError)
               if verbose:
                  sys.stderr.write("No tax id found for symbol %s (species id %s): gene name ignored\n" %(symbol, gene_product[1]))
            else:
               list_proteinPiana.extend(piana_access.get_list_protein_piana(proteinCode_value= gene_name,
                                                                            proteinCodeType_value= PianaGlobals.geneName_col,
                                                                            tax_id_value= tax_id , source_db_info= "no"))
         if uniacc:
            # when a uniprot accession is given, trust it (ie do not impose it to be of the species said by GO)
            list_proteinPiana.extend(piana_access.get_list_protein_piana(proteinCode_value= uniacc,
                                                                         proteinCodeType_value= PianaGlobals.swissAccessionID_col,
                                                                         tax_id_value= 0 , source_db_info= "no"))

         # TO DO!!! Take into account the source_db_info to change the goSourceDB to _c
      # END OF for gene_product in gene_products:

      if verbose:
         sys.stderr.write("Inserting go for proteinPianas %s (symbol %s, tax %s) and go_term_id %s\n" %(list_proteinPiana, symbol, tax_id,
                                                                                                        go_term_id))

      # Insert GO information for proteinPianas associated to symbol
      for proteinPiana in list_proteinPiana:
         piana_access.insert_protein_go(go_id= go_term_id, proteinPiana_value= proteinPiana, proteinGoSource_value=source_db)

      if not list_proteinPiana:
         # counted always (it used to be counted only when verbose_detailed was set,
         # so the final report was 0 in a normal run)
         number_without_proteinPiana += 1

         if verbose_detailed:
            sys.stderr.write("No proteinPiana found for symbol %s tax id %s\n" %(symbol, tax_id))
   # END OF for pair in associations:

   return number_without_proteinPiana


def _get_go_ids(gocursor, piana_access, term_type, input_file, input_proteins_type):
   """Return the list of go term ids of type term_type for which distances will be calculated.

   With no input_file, all the non obsolete terms of that type in the GO database.
   Otherwise, the terms piana associates to the proteins of the network (depth 1)
   built from the proteins listed in input_file (one protein code per line).
   """
   if input_file is None:
      # if no list provided, calculate distances for all gos
      gocursor.execute("""select distinct id from term where is_obsolete=0 and term_type="%s" """ %term_type)
      return [go_row[0] for go_row in gocursor.fetchall()]

   input_fd = open(input_file, "r")
   try:
      query_proteins = [line.strip() for line in input_fd]
   finally:
      input_fd.close()

   # building a network to then get the proteins in the network (and then get the go terms for these proteins)
   temp_piana_graph = PianaGraph("Temp Graph", piana_access_object= piana_access)
   temp_piana_builder = PianaGraphBuilder(piana_access_object=piana_access,
                                          depth=1,
                                          hub_threshold= 0,
                                          list_protein_codes= query_proteins,
                                          code_type_name= input_proteins_type,
                                          tax_id_value=0 ,
                                          list_source_dbs= "all",
                                          list_source_methods= "all")

   temp_piana_graph.build_graph(temp_piana_builder)

   proteinPiana_list = temp_piana_graph.get_node_ids_list()

   if verbose:
      sys.stderr.write("getting go terms for proteinPianas %s\n" %(proteinPiana_list,))

   go_list = []
   for proteinPiana_id in proteinPiana_list:
      go_list.extend(piana_access.get_protein_go_term_id(proteinPiana_value=proteinPiana_id,
                                                         term_type_value=term_type))

   if verbose:
      sys.stderr.write("Go list obtained is %s\n" %(go_list,))

   go_ids = []
   gos_already_appended = set()  # used to avoid duplication of go terms
   for go_term in go_list:
      if go_term not in gos_already_appended:
         gos_already_appended.add(go_term)
         go_ids.append(int(go_term))

   return go_ids


def _insert_go_distances(gocursor, piana_access, go_ids):
   """Calculate the distance between every pair of terms in go_ids and insert the finite ones into piana."""
   num_gos = len(go_ids)

   for i in range(0, num_gos):
      for j in range(i+1, num_gos):

         if verbose:
            sys.stderr.write("%s->i=%s j=%s=" %(num_gos, i, j))

         term2term_distance = calculate_distance(distance_up_to_here=0, term_id1=go_ids[i], term_id2=go_ids[j], gocursor=gocursor)

         if verbose:
            sys.stderr.write("%s--" %(term2term_distance))

         if term2term_distance != INFINITE_DISTANCE:
            # if distance is infinite, do not insert info...
            if verbose_detailed:
               sys.stderr.write("Inserting distance %s for go terms %s and %s\n" %(term2term_distance, go_ids[i], go_ids[j]))

            piana_access.insert_go_term2term_distance(term1_id= go_ids[i], term2_id=go_ids[j], distance=term2term_distance)
      # END OF for j in range(i+1, num_gos):
   # END OF for i in range(num_gos):


def go2piana(dbname, dbhost, dbuser, dbpassword,go_dbname,go_dbhost,go_dbuser,go_dbpass, input_file, input_proteins_type):
   """Fill tables of database piana with information read from a GO MySQL database.

   dbname, dbhost, dbuser, dbpassword         : connection data of database piana
                                                (they used to be ignored in favour of the module-level piana_* variables)
   go_dbname, go_dbhost, go_dbuser, go_dbpass : connection data of the GO database
   input_file          : file with one protein code per line, or None; only used by the distance step
   input_proteins_type : type of the codes in input_file

   Which steps are run is decided by the module-level flags:
      insert_protein_go  -> 1. associations between proteins and go terms
      insert_go_info     -> 2. go terms, starting from term id 1
      insert_go_distance -> 3. distances between go terms
   Messages are written to stderr when the module-level verbose flag is set.
   """

   # Initialisating connection to piana
   if verbose:
      sys.stderr.write("Opening connection to piana\n")

   piana_access = PianaDBaccess(dbname=dbname, dbhost=dbhost, dbuser=dbuser, dbpassword=dbpassword)
   SourceDB = "go"

   # opening connection to MySQL GO database and create a cursor to work with the database
   if verbose:
      sys.stderr.write("Opening connection to go\n")

   godb = _connect_go_db(go_dbname, go_dbhost, go_dbuser, go_dbpass)
   gocursor = godb.cursor()

   number_genes_without_proteinPiana = 0

   try:
      # --------
      # 1. inserting associations between proteins and go terms
      # --------
      if insert_protein_go:
         number_genes_without_proteinPiana = _insert_protein_go_associations(gocursor=gocursor,
                                                                             piana_access=piana_access,
                                                                             source_db=SourceDB)

      # --------
      # 2. inserting information into table go
      # --------
      if insert_go_info:

         if verbose:
            sys.stderr.write("calling insert_go_table\n")

         # filling go table into piana go_table
         insert_go_table(gocursor=gocursor, piana_access=piana_access, go_terms=[1], depth=0, source_db=SourceDB)

      # --------
      # 3. inserting distance between go terms
      # --------
      if insert_go_distance:

         #   term_types = ["biological_process",
         #                 "molecular_function",
         #                 "celullar_component"]

         term_types = ["molecular_function"]  # this is here only to speed up the process... normally, do parsing for all term types

         for term_type in term_types:
            go_ids = _get_go_ids(gocursor=gocursor, piana_access=piana_access, term_type=term_type,
                                 input_file=input_file, input_proteins_type=input_proteins_type)

            _insert_go_distances(gocursor=gocursor, piana_access=piana_access, go_ids=go_ids)
         # END OF for term_type in term_types:

   finally:
      # release the GO connection whatever happened in the steps above
      gocursor.close()
      godb.close()

   if verbose:
      sys.stderr.write("Number of genes for which a proteinPiana could not be found: %s\n" %(number_genes_without_proteinPiana))

# --------
# --------
#  Main()
# --------
# --------

# Default values of the module-level variables that parseArguments() overwrites
# with what is found on the command line

go_dbname = None
go_dbuser = None
go_dbhost = None
go_dbpass = None

piana_dbname = None
piana_dbuser = None
piana_dbhost = None
piana_dbpass = None

input_file = None
input_proteins_type = None

# if nothing is said on command line, the threshold is INFINITE_DISTANCE: the search is
# then only cut off at the distance that is reported as infinite anyway
threshold= INFINITE_DISTANCE

insert_protein_go = 0
insert_go_info = 0
insert_go_distance = 0

# The program is only run when the file is executed as a script: importing
# the module no longer parses sys.argv nor connects to the databases
if __name__ == "__main__":

   # parsing arguments from the command line
   parseArguments()

   go2piana(piana_dbname, piana_dbhost, piana_dbuser, piana_dbpass, go_dbname, go_dbhost, go_dbuser, go_dbpass, input_file, input_proteins_type)



