"""
File        : xyva_gb2piana.py
Author      : Ramon Aragues
Creation    : 18.10.2004
Contents    : fills up tables in database piana with information from files myva=gb or kyva=gb
Called from : command line

=======================================================================================================

This file implements a program that fills up tables in database piana with information from files myva=gb or kyva=gb

files myva=gb and kyva=gb can be downloaded from ftp://ftp.ncbi.nih.gov/pub/COG

These files contain correspondences between protein names and gi codes. We need to parse these files because
the COG and KOG listings (whog and kog) use protein names instead of gi codes

The format they follow is pretty simple:

protein_name<spaces>gi_code

Attention!!! In some cases, the protein_name is also a gi_code... avoid inserting that proteinPiana (in parser xog2piana.py we make sure to
check for gi_codes as well if nothing is found in table geneName)

Before running, genpept and nr have to be parsed, to fill pianaDB with correspondences between gi codes and proteinPianas

"""

# xyva_gb2piana.py: fills up tables in database piana with information from files myva=gb or kyva=gb
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt

import re
import readline
import MySQLdb

from PianaDBaccess import *


# ---------------------------------------------------------------
# Set here the default values for command line arguments
# ---------------------------------------------------------------
#

# These will be the values taken by the program when called directly from build_database.py:

input_file = None

verbose = 0


# ----------------------
# Function usage()
# ----------------------
def usage():
    """
    Prints the command line help of this program to standard output.

    Takes no arguments and returns nothing. It is called when the user asks
    for help or when the command line cannot be parsed.
    """
    # One entry per output line; each entry is printed followed by a newline.
    help_lines = (
        "--------------------------------------------------------------------------------------------------------------",
        "This program fills up tables in database piana with information from xyva_gb \n",
        "Usage: python xyva_gb2piana.py  --input-file=input_file_name  --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost",
        "                                --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass [--help] [--verbose]",
        "\nwhere:",
        "      input_file_name     : file name of input file containing xyva_gb data (kyva=gb or myva=gb)",
        "      piana_dbname       : name of database piana to be used (required)",
        "      piana_dbhost       : name of host where database piana to be used is placed (required)",
        "      piana_dbuser       : username accessing the database (not required in most systems)",
        "      piana_dbpass       : password of username accessing the database (not required in most systems)",
        "     --help         : prints this message and exits",
        "     --verbose      : prints process info to stdout",
        "--------------------------------------------------------------------------------------------------------------",
    )

    for help_line in help_lines:
        # Parenthesised single argument: same output with the print statement
        # and with the print function.
        print(help_line)
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """
    Parses the command line (sys.argv[1:]) and stores the values in module-level globals.

    Globals written (only when the corresponding option is present):
        input_file    <- -i / --input-file
        piana_dbname  <- -n / --piana-dbname
        piana_dbhost  <- -o / --piana-dbhost
        piana_dbuser  <- -u / --piana-dbuser
        piana_dbpass  <- -w / --piana-dbpass
        verbose       <- -v / --verbose  (set to 1)

    Exits (SystemExit, status 2) in two cases:
        - getopt rejects the command line: the getopt error message is written to
          stderr, followed by the usage text
        - -h / --help is given: the usage text is printed
    Positional arguments left over after the options are ignored.
    """
    global input_file
    global verbose

    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass

    short_options = "vhi:n:o:u:w:"
    long_options = ["verbose", "help", "input-file=", "piana-dbname=", "piana-dbhost=",
                    "piana-dbuser=", "piana-dbpass="]

    try:
        opts, _ = getopt.getopt(sys.argv[1:], short_options, long_options)
    except getopt.GetoptError as bad_opt:
        # print the reason and the help information, then exit.
        # The newline keeps the error message from running into whatever is printed next.
        sys.stderr.write(str(bad_opt) + "\n")
        usage()
        sys.exit(2)

    for option, value in opts:

        if option in ("-i", "--input-file"):
            input_file = value

        elif option in ("-n", "--piana-dbname"):
            piana_dbname = value

        elif option in ("-o", "--piana-dbhost"):
            piana_dbhost = value

        elif option in ("-u", "--piana-dbuser"):
            piana_dbuser = value

        elif option in ("-w", "--piana-dbpass"):
            piana_dbpass = value

        elif option in ("-v", "--verbose"):
            verbose = 1

        elif option in ("-h", "--help"):
            # print help information and exit (status 2 kept for backward compatibility)
            usage()
            sys.exit(2)

    # END OF for option,value in opts:
            
            
# --------
# --------
#  Main()               
# --------                               
# --------

# Defaults for the database connection parameters; parseArguments() overwrites
# the ones that are given on the command line
piana_dbname = None
piana_dbhost = None
piana_dbuser = None
piana_dbpass = None

# parsing arguments from the command line
parseArguments()

if verbose:
    print("Arguments read are: input-file= %s || piana_dbname= %s || piana_dbhost= %s" % (input_file, piana_dbname, piana_dbhost))

# Without an input file there is nothing to parse: fail with a readable message
# instead of a traceback when trying to open None
if input_file is None:
    sys.stderr.write("Error: no input file given (use --input-file)\n")
    usage()
    sys.exit(2)

# Initializing connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost=piana_dbhost, dbuser=piana_dbuser, dbpassword=piana_dbpass)

sourceDB = "cog"   # value stored as geneName source for every inserted correspondence
correspondences_not_inserted = 0   # lines whose gi_code has no proteinPiana in the database
correspondences_were_gi = 0        # lines whose protein_name was itself a known gi_code

input_file_fd = open(input_file, 'r')

if verbose:
    sys.stderr.write("Processing file\n")

try:
    # while there are lines in the file, get correspondence protein_name gi_code
    for line in input_file_fd:

        line_fields = line.split()

        if len(line_fields) < 2:
            # blank or incomplete line: there is no (protein_name, gi_code) pair to process
            continue

        # split() already removes surrounding whitespace from both fields
        protein_name = line_fields[0]
        gi_code = line_fields[1]

        # if the protein_name being used is a gi_code (it happens!!!) skip the entry: we do not want to insert gi_codes
        # in table geneNames
        proteinPiana_exists = piana_access.get_list_protein_piana(proteinCode_value=protein_name,
                                                                  proteinCodeType_value=PianaGlobals.giID_col,
                                                                  tax_id_value=0,
                                                                  source_db_info="no")

        if proteinPiana_exists:
            # counted regardless of verbosity; only the message depends on --verbose
            correspondences_were_gi += 1
            if verbose:
                sys.stderr.write("Entry skipt because protein name was a gi code\n")
            continue

        # get the proteinPianas associated with this gi code
        list_proteinPiana = piana_access.get_list_protein_piana(proteinCode_value=gi_code,
                                                                proteinCodeType_value=PianaGlobals.giID_col,
                                                                tax_id_value=0,
                                                                source_db_info="no")

        if not list_proteinPiana:
            # counted regardless of verbosity; only the message depends on --verbose
            correspondences_not_inserted += 1
            if verbose:
                sys.stderr.write("No protein found for this gi_code %s\n" % (gi_code))
            continue

        # the same protein_name is attached to every proteinPiana found for the gi_code
        for proteinPiana in list_proteinPiana:

            if verbose:
                sys.stderr.write("correpondence between proteinPiana %s and geneName %s\n" % (proteinPiana, protein_name))

            piana_access.insert_geneName_code(geneName_code_value=protein_name,
                                              proteinPiana_value=proteinPiana,
                                              geneName_source_value=sourceDB)
        # END OF for proteinPiana in list_proteinPiana:
    # END OF for line in input_file_fd:
finally:
    # release the file handle even if a database call fails halfway through the file
    input_file_fd.close()

if verbose:
    sys.stderr.write("Done! %s correspondences not inserted because proteinPiana was not found for gi_code\n" % (correspondences_not_inserted))
    sys.stderr.write("Done! %s correspondences not inserted because  protein name was actually a gi_code\n" % (correspondences_were_gi))
