"""
File        : xog2piana.py
Author      : Ramon Aragues
Creation    : 18.10.2004
Contents    : fills up tables in database piana with information from files whog or kog
Called from : command line

=======================================================================================================

This file implements a program that fills up tables in database piana with information from files whog or kog

files whog and kog can be downloaded from ftp://ftp.ncbi.nih.gov/pub/COG

These files contain COG and KOG identifiers for proteins and protein domains. 

The format they follow is:

[H] COG0001 Glutamate-1-semialdehyde aminotransferase
  Afu:  AF1241
  Hbs:  VNG2326G
  Mac:  MA0581
  Mth:  MTH228
  Mja:  MJ0603 MJ0604 MJ0605
  Mka:  MK0280
  Tac:  Ta0571
  Tvo:  TVN0635 TVN0634
 
_______
 
[E] COG0002 Acetylglutamate semialdehyde dehydrogenase
  Afu:  AF2071
  Mac:  MA3566
  Mth:  MTH846
  Mja:  MJ1096
  Mka:  MK1077
  Pho:  PH1720
 

Attention: in some cases the protein name used in the file is actually a gi code, so gi codes have to be checked as well when nothing is found in table geneName.

Before running, files myva=gb and kyva=gb have to be parsed, in order to fill pianaDB with the correspondences between gi codes and gene names


"""

# xog2piana.py: fills up tables in database piana with information from files whog or kog
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt

import re
import readline
import MySQLdb

from PianaDBaccess import *

# Translation table from the species abbreviations used in cog files to the
# taxonomy id of the species. The parser refuses to go on when it meets an
# abbreviation that is not listed here, so new abbreviations must be added to
# this table together with their taxonomy id.
# NOTE(review): the lower-case abbreviations at the end are presumably the ones
# used in kog files -- confirm against a kog file.
dic_species_eq = dict(
    Afu=2234, Hbs=64091, Mac=188937, Mth=145262,
    Mja=2190, Mka=190192, Tac=2303, Tvo=50339,
    Pho=53953, Pab=29292, Pya=13773, Sso=2287,
    Ape=56636, Sce=4932, Spo=4896, Ecu=6035,
    Aae=63363, Tma=2336, Syn=1148, Nos=103690,
    Fnu=190304, Dra=1299, Cgl=1718, Mtu=83332,
    MtC=83331, Mle=1769, Cac=1488, Lla=1360,
    Spy=160490, Spn=170187, Sau=158879, Lin=1642,
    Bsu=1423, Bha=86665, Eco=83333, EcZ=155864,
    Ecs=83334, Ype=632, Sty=99287, Buc=107806,
    Vch=666, Pae=287, Hin=71421, Pmu=747,
    Xfa=160492, Nme=122586, NmA=122587, Rso=305,
    Hpy=85962, jHp=85963, Cje=197, Atu=181661,
    Sme=382, Bme=29459, Mlo=381, Ccr=155892,
    Rpr=782, Rco=781, Ctr=813, Cpn=115713,
    Tpa=160, Bbu=139, Uur=2130, Mpu=2107,
    Mpn=2104, Mge=2097,
    ath=3702, cel=6239, dme=7227, hsa=9606,
    sce=4932, spo=4896, ecu=6035,
)

# ---------------------------------------------------------------
# Default values of the command line arguments
# ---------------------------------------------------------------
#
# Each value below stays in effect unless the corresponding option is given
# on the command line (this is the case, for example, when the program is
# called directly from build_database.py).

verbose = 0          # 0 means quiet; any true value prints process information

input_file = None    # name of the file to be parsed


# ----------------------
# Function usage()
# ----------------------
def usage():
    """
    Prints to stdout the help message of xog2piana.py.

    The message describes what the program does, how it is called and the
    meaning of each command line argument. Nothing is returned.
    """
    # single-argument print(...) is used so that each line prints identically
    # to the plain print statement under python 2
    print("--------------------------------------------------------------------------------------------------------------")
    # the program parses files whog and kog (not myva=gb and kyva=gb, which are handled by another parser)
    print("This program fills up tables in database piana with information from files whog or kog \n")
    print("Usage: python xog2piana.py  --input-file=input_file_name  --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost")
    print("                                --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass [--help] [--verbose]")
    print("\nwhere:")
    print("      input_file_name     : file name of input file containing COG or KOG data (file whog or kog)")
    print("      piana_dbname       : name of database piana to be used (required)")
    print("      piana_dbhost       : name of host where database piana to be used is placed (required)")
    print("      piana_dbuser       : username accessing the database (not required in most systems)")
    print("      piana_dbpass       : password of username accessing the database (not required in most systems)")
    print("     --help         : prints this message and exits")
    print("     --verbose      : prints process info to stdout")
    print("--------------------------------------------------------------------------------------------------------------")
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    

    global input_file
    global verbose
    
    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass

    try:
        opts, args = getopt.getopt(sys.argv[1:], "vhi:n:o:u:w:", ["verbose","help", "input-file=", "piana-dbname=", "piana-dbhost=",
                                                                "piana-dbuser=","piana-dbpass="])
    except getopt.GetoptError, bad_opt:
        # print help information and exit:
        sys.stderr.write( bad_opt.__str__() )
        usage()
        sys.exit(2)
     
    for option,value in opts:
        
        if option in ("-i", "--input-file"):
            input_file = value
            
        elif option in ("-n", "--piana-dbname"):
            piana_dbname = value
            
        elif option in ("-o", "--piana-dbhost"):
            piana_dbhost = value
            
        elif option in ("-u", "--piana-dbuser"):
            piana_dbuser = value
             
        elif option in ("-w", "--piana-dbpass"):
            piana_dbpass = value
            
        elif option in ("-v", "--verbose"):
            verbose = 1
            
        elif option in ("-h", "--help"):
            # print help information and exit
            usage()
            sys.exit(2)
            
    # END OF for option,value in opts:
            
            
# --------
# --------
#  Main()               
# --------                               
# --------

# Default values for the database connection parameters: they keep these
# values unless parseArguments() finds the corresponding option
piana_dbname= None
piana_dbhost= None
piana_dbuser= None
piana_dbpass= None


# parsing arguments from the command line
parseArguments()

if verbose:
    print("Arguments read are: input-file= %s || piana_dbname= %s || piana_dbhost= %s" %(input_file, piana_dbname, piana_dbhost))

# The input file is mandatory: stop here, before connecting to the database,
# instead of failing later with an obscure error when trying to open None
if input_file is None:
    sys.stderr.write("Error: no input file was given (use --input-file=input_file_name)\n")
    usage()
    sys.exit(2)

# Initializing connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost= piana_dbhost, dbuser=piana_dbuser, dbpassword= piana_dbpass )

# value passed as source database in the insertions of cogs and protein-cog relations
sourceDB = "cog"
correspondences_not_inserted = 0   # not used by the parsing code visible in this file

# open() is the documented way of opening a file (file() is only an alias of the type)
input_file_fd =  open(input_file, 'r')


if verbose:
    sys.stderr.write( "Processing file\n")

# number of protein names for which no proteinPiana was found (ie. relations cog-protein not inserted)
cogs_not_inserted = 0

# code of the cog entry being parsed (None when the parser is not inside an entry)
cog_code = None

# a xog file looks like this:
#
## [H] COG0001 Glutamate-1-semialdehyde aminotransferase
##   Afu:  AF1241
##   Hbs:  VNG2326G
##   Mac:  MA0581
##   Mth:  MTH228
##   Mja:  MJ0603 MJ0604 MJ0605
##   Mka:  MK0280
##   Tac:  Ta0571
##   Tvo:  TVN0635 TVN0634
#
# Therefore, each line of the file is one of the following:
#    - a header line    "[function letters] cog_code cog description"  --> starts a new entry
#    - a separator line "_______"                                      --> ends the current entry
#    - a member line    "species_abbreviation:  protein_name protein_name ..."
#    - a line without information (no ":" in it), which is ignored

# the file is closed in the finally clause, whether or not the parsing raises an error
try:
    for line in input_file_fd:

        if line.startswith("["):
            # it is a new COG or KOG entry

            # splitting only on the first "]" keeps intact any "]" that appears in the description
            line_atoms = line[1:].split("]", 1)  # line_atoms[0] contains the function letters (there can be more than one)
                                                 # line_atoms[1] contains the cog code and its description

            if len(line_atoms) < 2 or not line_atoms[1].split():
                raise ValueError("Malformed cog header line (expected '[function] cog_code description'): %s" %(line))

            cog_letter_function = line_atoms[0]

            line_fields = line_atoms[1].split()  # line_fields[0] contains the cog code
                                                 # line_fields[1:] contains the words of the cog description

            cog_code = line_fields[0]

            # the words of the description are joined with a space (otherwise, they would be glued into a single word)
            cog_description = " ".join(line_fields[1:])

            if verbose:
                sys.stderr.write("Inserting new cog with code %s and function %s and description %s\n" %(cog_code, cog_letter_function, cog_description))

            # insert information into table cog
            piana_access.insert_cog(cog_id=cog_code , cog_description=cog_description , cog_function=cog_letter_function , source_db= sourceDB)

        # END OF if line.startswith("["):

        elif line.startswith("_______"):
            # new entry sign: no member line is accepted until the next header line is found
            cog_code = None

        else:

            line_fields = line.split(":")  # line_fields[0] contains the species abbreviation
                                           # line_fields[1] contains space-separated protein names

            # check that it is a line with info (there are also empty lines)
            if len(line_fields) > 1:

                if cog_code is None:
                    raise ValueError("Cog code cannot be None... error parsing file")

                list_protein_names = line_fields[1].split()

                species_abb = line_fields[0].strip()

                if species_abb not in dic_species_eq:
                    raise ValueError("Species abbreviation %s unknown: add the abbreviation and tax id to dictionary dic_species_eq\n" %(species_abb) )

                tax_id_value= dic_species_eq[species_abb]

                for protein_name in list_protein_names:

                    list_proteinPiana= piana_access.get_list_protein_piana(proteinCode_value= protein_name,
                                                                           proteinCodeType_value= PianaGlobals.geneName_col,
                                                                           tax_id_value= tax_id_value,
                                                                           source_db_info="no")

                    if not list_proteinPiana:
                        # if nothing was found under geneName, try with gi_code (cog uses both kinds of code interchangeably)
                        list_proteinPiana= piana_access.get_list_protein_piana(proteinCode_value= protein_name,
                                                                               proteinCodeType_value= PianaGlobals.giID_col,
                                                                               tax_id_value= tax_id_value,
                                                                               source_db_info="no")
                    # END OF if not list_proteinPiana:

                    if list_proteinPiana:
                        for proteinPiana in list_proteinPiana:

                            if verbose:
                                sys.stderr.write("Inserting cog %s for proteinPiana %s\n" %(cog_code, proteinPiana))

                            piana_access.insert_protein_cog(cog_id= cog_code, proteinPiana_value= proteinPiana, proteinCogSource_value= sourceDB)

                        # END OF for proteinPiana in list_proteinPiana:

                    else:
                        # neither the gene name search nor the gi search returned a proteinPiana
                        if verbose:
                            sys.stderr.write("cog not inserted because no proteinPiana was found for protein_name %s\n" %protein_name)
                        cogs_not_inserted += 1
                    # END OF else: (if list_proteinPiana:)

                # END OF for protein_name in list_protein_names:

            # END OF if len(line_fields) > 1:

        # END OF else: (if line.startswith("["): ... elif line.startswith("_______"):)

    # END OF for line in input_file_fd:

finally:
    input_file_fd.close()

if verbose:
    sys.stderr.write("Done! %s relations cog-protein not inserted because proteinPiana was not found for protein name (or gi)\n" %(cogs_not_inserted))
