"""
File        : flat_file_taxonomy2piana.py
Author      : Ramon Aragues
Creation    : 22.11.2004
Contents    : script that fills up tables in database piana related to  external DB "flat_file_taxonomy"
Called from : 

=======================================================================================================

This file parses a flat_file_taxonomy text file and inserts information into PianaDb

Command line option '--help' describes usage of this program

"""

# flat_file_taxonomy2piana.py: script that fills up tables in database piana related to  external DB "flat_file_taxonomy"
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt

import re
import readline

import MySQLdb

from PianaDBaccess import *

# Progress reporting switch: any true value makes the script write progress
# messages to stderr. It is already on by default, so the --verbose option
# does not change anything unless this default is edited.
verbose = 1

#
# Default values, used when no command line option overrides them
# (e.g. when the program is called directly from build_database.py)

# Location of file containing external DB "flat_file_taxonomy"
input_file = "../../../data/externalDBs/taxonomyDB/flat_file/20041110_taxa.txt"

# Kingdoms accepted by this script. The dictionary is only used for key
# membership tests, so every value is just the placeholder None.
valid_kingdoms = dict.fromkeys(
    ("Eukaryota", "Bacteria", "Archaea", "Viruses", "other sequences"))


# ----------------------
# Function usage()
# ----------------------
def usage():
    """Write the command line help of this script to standard output.

    The help describes every long option accepted by the program and shows
    the current value of the module-level input_file as the default input
    file. Nothing is returned.
    """
    rule = "--------------------------------------------------------------------------------------------------------------"

    # One entry per output line; entries that embed "\n" deliberately
    # produce the blank lines that separate the sections of the help.
    help_lines = [
        rule,
        "This program fills up tables in database piana related to external DB 'flat_file_taxonomy' \n",
        "Usage: python flat_file_taxonomy2piana.py [--help] [--verbose] --input-file=input_file",
        "                              --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass",
        "\nwhere:",
        "     piana_dbname : name of database piana to be used (required)",
        "     piana_dbhost : name of host where database piana to be used is placed (required)",
        "     piana_dbuser : username accessing the database (not required in most systems)",
        "     piana_dbpass : password of username accessing the database (not required in most systems)",
        "     input_file   : the name of the flat_file_taxonomy input file (default is '%s')" %(input_file),
        "     --help         : prints this message and exits",
        "     --verbose      : prints process info to stdout",
        rule,
    ]

    sys.stdout.write("\n".join(help_lines) + "\n")
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """Read the command line (sys.argv) and store the options in module globals.

    Recognised options and the global each one sets:
        -t, --input-file    -> input_file
        -n, --piana-dbname  -> piana_dbname
        -o, --piana-dbhost  -> piana_dbhost
        -u, --piana-dbuser  -> piana_dbuser
        -w, --piana-dbpass  -> piana_dbpass
        -v, --verbose       -> verbose (set to 1)
        -h, --help          -> prints the usage text and exits with status 0

    Globals whose option is not given are left untouched. Positional
    arguments are ignored.

    An option that getopt rejects is reported on stderr, followed by the
    usage text, and the program exits with status 2.
    """

    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass

    global input_file

    global verbose

    # NOTE: the short option "-c" (with a value) is accepted here but no
    # branch below handles it, so it is silently ignored. It is kept so that
    # existing command lines passing it are not rejected.
    short_options = "vht:c:n:o:u:w:"
    long_options = ["verbose", "help", "input-file=",
                    "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass="]

    try:
        opts, args = getopt.getopt(sys.argv[1:], short_options, long_options)
    except getopt.GetoptError as err:
        # say which option was wrong before printing the help and exiting
        sys.stderr.write("Error: %s\n" % err)
        usage()
        sys.exit(2)

    for option, value in opts:

        if option in ("-t", "--input-file"):
            input_file = value

        elif option in ("-n", "--piana-dbname"):
            piana_dbname = value

        elif option in ("-o", "--piana-dbhost"):
            piana_dbhost = value

        elif option in ("-u", "--piana-dbuser"):
            piana_dbuser = value

        elif option in ("-w", "--piana-dbpass"):
            piana_dbpass = value

        elif option in ("-v", "--verbose"):
            verbose = 1

        elif option in ("-h", "--help"):
            # help was explicitly requested: this is not an error, so exit
            # with a success status
            usage()
            sys.exit(0)
        
# --------
# --------
#  Main()               
# --------                               
# --------

# Connection parameters; they stay None unless parseArguments() sets them
# from the command line.
piana_dbname = None
piana_dbuser = None
piana_dbhost = None
piana_dbpass = None

# value passed as source_db to every insert_species_kingdom call
source_db = "ncbi"

# parsing arguments from the command line
parseArguments()

# Initialising connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost=piana_dbhost, dbuser=piana_dbuser, dbpassword= piana_dbpass)

input_fd = open(input_file,"r")

#
# Reading external DB "flat_file_taxonomy" and inserting its data into piana DB
#

if verbose:
    sys.stderr.write( "Reading data from database dump file...\n")

number_species_inserted= 0
number_species_not_inserted= 0     # lines whose tax id is not an integer
number_incomplete_lines = 0        # lines with fewer than 3 tab-separated fields
number_incomplete_subcats = 0      # lines with fewer than 2 colon-separated sub-categories

# try/finally guarantees the input file is closed even if a database
# insertion raises in the middle of the file
try:
    for line in input_fd:

        # each line looks like:
        # 202302<TAB>Bacillus sp. V4.BE.28<TAB>cellular organisms:Bacteria:Firmicutes:Bacilli:Bacillales:Bacillaceae:Bacillus sp. V4.BE.28

        line_fields = line.split("\t")

        # line_fields is:
        #                line_fields[0] -> tax id
        #                line_fields[1] -> Species name
        #                line_fields[2] -> colon-separated sub-categories

        if len(line_fields) < 3:
            number_incomplete_lines += 1
            continue

        try:
            tax_id = int( line_fields[0].strip() )
        except ValueError:
            # empty or non-numeric tax id (e.g. a header line): skip this
            # line and count it instead of aborting the whole import
            number_species_not_inserted += 1
            continue

        tax_name= line_fields[1]
        tax_subcategories = line_fields[2].split(":")

        if len(tax_subcategories) < 2:
            number_incomplete_subcats += 1
            continue

        # the kingdom is the second sub-category (the first one in the
        # example above is "cellular organisms")
        tax_kingdom = tax_subcategories[1].strip()

        if tax_kingdom not in valid_kingdoms:
            # only accepting the kingdoms listed in valid_kingdoms; lines
            # rejected here are not added to any counter
            continue

        if verbose:
            sys.stderr.write( "tax_id is <%s> -- tax_name is <%s> -- tax kingdom is <%s>\n" %(tax_id, tax_name, tax_kingdom))

        # At this point tax_id is an integer and tax_kingdom is one of the
        # valid kingdoms, so no further emptiness check is needed.
        # insert_species_kingdom updates kingdom of tax id if existing in database. If tax id was not there, inserts all the information
        piana_access.insert_species_kingdom(tax_id= tax_id, tax_name=tax_name,  tax_kingdom= tax_kingdom, source_db= source_db)
        number_species_inserted += 1

    # END OF for line in input_fd:
finally:
    input_fd.close()


if verbose:
    sys.stderr.write("All done! %s species inserted - %s species not inserted - number_incomplete_lines=%s - number_incomplete_subcats=%s\n" %(
        number_species_inserted,
        number_species_not_inserted,
        number_incomplete_lines,
        number_incomplete_subcats))
    
