"""
File        : genpept2piana.py
Author      : Ramon Aragues
Creation    : 9.2004
Contents    : fills up tables in database piana with information from genpept
Called from : 

=======================================================================================================

This file implements a program that fills up tables in database piana with information from genpept using biopython
libraries and methods

genpept can be downloaded from ftp://ftp.ncbi.nih.gov/genbank/relXXX.fsa_aa.gz

Before running, taxonomy table of piana must be populated (use taxonomy2piana for that)

It is advisable to parse swissprot and trembl before genpept

"""

# genpept2piana.py: fills up tables in database piana with information from genpept
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt

import re
import readline
import MySQLdb

import utilities

from PianaDBaccess import *

from Bio import Fasta  # needed to read the genpept file (which is in fasta format)

# ---------------------------------------------------------------
# Set here the default values for command line arguments
# ---------------------------------------------------------------
#
# Name of the genpept file to load; parseArguments() overwrites it with the
# value given through --input-file
input_file = None

# Set to 1 by parseArguments() when --verbose is given: progress information is
# then written while the file is processed
verbose = 0


# All these tags will be considered to be pointing to id type emblAccession
# (the dictionary is used as a set: only its keys are looked at, through
#  has_key(), and every value is None)
accepted_coll_accessions = { "gb": None,
                             "dbj": None,
                             "emb": None}
                             

# ----------------------
# Function usage()
# ----------------------
def usage():
    """
    Writes the command line help of genpept2piana.py to standard output.

    Each entry of help_lines is one line of the help text; a newline is
    appended to every entry when it is written (entries that already contain
    a newline therefore produce an empty line in the output).
    """
    help_lines = (
        "--------------------------------------------------------------------------------------------------------------",
        "This program fills up tables in database piana with information from genPept \n",
        "Usage: python genpept2piana.py  --input-file=input_file_name  --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost",
        "                                --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass",
        "                                --tax-id-file=tax_id_file [--help] [--verbose]",
        "\nwhere:",
        "      input_file_name  : file name of input file containing genpept data (relxxx_fsa_aa)",
        "      piana_dbname     : name of database piana to be used (required)",
        "      piana_dbhost     : name of host where database piana to be used is placed (required)",
        "      piana_dbuser     : username accessing the database (not required in most systems)",
        "      piana_dbpass     : password of username accessing the database (not required in most systems)",
        "      tax_id_file      : a text file with two tab-separated columns: 1st is gi code and 2nd is taxonomy id",
        "     --help            : prints this message and exits",
        "     --verbose         : prints process info to stdout",
        "--------------------------------------------------------------------------------------------------------------",
    )

    for help_line in help_lines:
        sys.stdout.write(help_line + "\n")
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """
    Reads the command line (sys.argv[1:]) and stores the values found in the
    module-level variables input_file, verbose, piana_dbname, piana_dbhost,
    piana_dbuser, piana_dbpass and tax_id_file.

    Variables whose option does not appear in the command line are left
    untouched, so they keep the defaults set at module level.

    Exits with status 2 (after printing the usage message) when an unknown
    option is found or when --help is given.

    Raises ValueError when, after processing all options, tax_id_file is
    still None.
    """

    global input_file
    global verbose

    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass

    global tax_id_file

    try:
        opts, _unused_args = getopt.getopt(sys.argv[1:], "", ["verbose", "help", "input-file=", "piana-dbname=",
                                                              "piana-dbhost=", "piana-dbuser=", "piana-dbpass=",
                                                              "tax-id-file="])
    except getopt.GetoptError:
        # sys.exc_info() is used (instead of "except X, e" or "except X as e")
        # because it is accepted by every version of the interpreter
        bad_opt = sys.exc_info()[1]

        # print the error (newline-terminated, so that it does not run into the
        # following output), then help information, and exit
        sys.stderr.write(str(bad_opt) + "\n")
        usage()
        sys.exit(2)

    for option, value in opts:

        if option == "--input-file":
            input_file = value

        elif option == "--piana-dbname":
            piana_dbname = value

        elif option == "--piana-dbhost":
            piana_dbhost = value

        elif option == "--piana-dbuser":
            piana_dbuser = value

        elif option == "--piana-dbpass":
            piana_dbpass = value

        elif option == "--tax-id-file":
            tax_id_file = value

        elif option == "--verbose":
            verbose = 1

        elif option == "--help":
            # print help information and exit (the exit status has always been 2
            # here: kept as it is for scripts that may be checking it)
            usage()
            sys.exit(2)

    # END OF for option,value in opts:

    if tax_id_file is None:
        raise ValueError("taxonomy ids for gi codes needed. Please set a tax-id-file!\n")
            
            
# --------
# --------
#  Main()               
# --------                               
# --------

# Defaults for the database connection parameters and for the gi/taxonomy file:
# parseArguments() overwrites them with the values given in the command line
piana_dbname= None
piana_dbhost= None
piana_dbuser= None
piana_dbpass= None

tax_id_file = None

# parsing arguments from the command line
# (raises ValueError if no --tax-id-file was given)
parseArguments()

# number of records for which a single tax id could not be determined
# (it is incremented in the processing loop and reported at the end)
num_seqs_tax_not_known = 0

if verbose:
    print "Arguments read are: input-file= %s || piana_dbname= %s || piana_dbhost= %s" %(input_file, piana_dbname, piana_dbhost)

# Initializing connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost= piana_dbhost, dbuser=piana_dbuser, dbpassword= piana_dbpass )

# passed as the "source" argument of every insert done in the processing loop
sourceDB = "genpept"

# the genpept file is in fasta format: it is read record by record with the
# biopython Fasta iterator
genpept_parser = Fasta.RecordParser()
genpept_input_file =  utilities.open_file(input_file, 'r')
genpept_iterator = Fasta.Iterator(genpept_input_file, genpept_parser)

# first record of the file (it is read before the taxonomies are loaded)
# NOTE(review): the processing loop stops when next() returns None, ie. it
#               relies on Fasta.Iterator returning None at the end of the
#               file -- confirm for the biopython version being used
genpept_record = genpept_iterator.next()


if verbose:
    sys.stderr.write("Loading taxonomies\n")
# get dictionary for gis and tax ids
# (in the processing loop it is looked up using the gi code converted to int;
#  presumably a plain dictionary gi -> tax id: verify against
#  utilities.return_dic_gi_vs_tax)
dic_gi_tax = utilities.return_dic_gi_vs_tax(file_name=tax_id_file)


if verbose:
    sys.stderr.write( "Processing file\n")
# ---------------------------------------------------------------
# Parsing of genpept title lines
# ---------------------------------------------------------------
#
# A title looks like this: gi|gi_id|sourceDB|acc| protein name [species]
#                        - gi|4105707|gb|AAD02507.1| carbamate kinase [Trichomonas vaginalis]
#                        - gi|4105707|gb|AAD02507.1| carbamate kinase
#                        - gi|4105707|dbj|BBAS02507.1| carbamate kinase 4243
#
#    (character '>' from input file has been removed)
#
# The pattern below is applied to the last field (the free text):
#    group(1): optional token placed right after the '|' (ignored: I think the
#              new genpept version does not include it)
#    group(2): protein name
#    group(4): species name written between square brackets (optional)
#
# code to parse genpept titles provided by Wan Kyu Kim and Christoph Winter
# (the pattern is compiled only once, here, instead of once per record)
genpept_title_pattern = re.compile(r"(\S+)?\s+(.*?)(\[(.*?)\])?\s*$")


def _parse_genpept_title(title_line):
    """
    Splits a genpept title line into the fields that are inserted into piana.

    Returns a tuple (gi_id, coll_acc, protein_name, species_name) where:
       - gi_id        : gi code, as an integer
       - coll_acc     : accession (stripped) if the source tag of the title is
                        a key of accepted_coll_accessions, None otherwise
       - protein_name : protein name with the characters \\ " and ' replaced by
                        blanks (it can be an empty string)
       - species_name : text found between square brackets, None if there was none

    Returns None if the title has less than 5 '|' separated fields or if the gi
    field is not a number.
    """
    title_atoms = title_line.split('|')

    if len(title_atoms) < 5:
        return None

    try:
        gi_id = int(title_atoms[1])
    except ValueError:
        return None

    # coll_acc must be reset for each title: otherwise, a record with a source
    # tag that is not accepted would get the accession of a previous record
    coll_acc = None
    if title_atoms[2] in accepted_coll_accessions:
        # we consider accessions from sources in accepted_coll_accessions
        coll_acc = title_atoms[3].strip()

    temp_protein_name = title_atoms[4] # this atom is a bit more complex to process...

    title_match = genpept_title_pattern.match(temp_protein_name)

    if title_match is None:
        # the pattern needs at least one blank character: a free text without
        # blanks (or empty) is taken, as it is, as the protein name
        rest_of_protein_name = temp_protein_name
        species_name = None
    else:
        rest_of_protein_name = title_match.group(2)
        species_name = title_match.group(4)

    # remove \ " and ' characters
    protein_name = rest_of_protein_name.strip().replace("\\", " ").replace('"', " ").replace("'", " ")

    return (gi_id, coll_acc, protein_name, species_name)


# number of records that were not inserted (malformed title or empty sequence)
num_records_skipped = 0

# while record read is not None, parse the record and insert data into piana
while genpept_record is not None:

    if verbose:
        sys.stderr.write("===================================\n")
        sys.stderr.write("            NEW ENTRY\n")
        sys.stderr.write("===================================\n")

    protein_title_line = genpept_record.title
    protein_sequence = genpept_record.sequence.strip()

    if verbose:
        sys.stderr.write("title line is : %s\n" %(protein_title_line))

    parsed_title = _parse_genpept_title(protein_title_line)

    # A record cannot be loaded if its title cannot be understood or if it has
    # no sequence: without a sequence there is no proteinPiana to attach the
    # external codes to (and they must never be attached to the proteinPiana
    # of the previous record)
    skip_reason = None
    if parsed_title is None:
        skip_reason = "malformed title line"
    elif not protein_sequence:
        skip_reason = "empty sequence"

    if skip_reason is not None:
        num_records_skipped += 1
        sys.stderr.write("Skipping record (%s): %s\n" %(skip_reason, protein_title_line))

        # the iterator must be advanced before going to the next iteration
        genpept_record = genpept_iterator.next()
        continue

    gi_id, coll_acc, protein_name, species_name = parsed_title

    # --- taxonomy of the protein ---

    tax_id_from_file = gi_id in dic_gi_tax

    if tax_id_from_file:
        # we give priority to the dictionary with tax ids for gis
        list_tax_id_value = [dic_gi_tax[gi_id]]
    elif species_name:
        # if no tax_id was known for this gi, try to get it from the species name
        list_tax_id_value = piana_access.get_taxonomies_from_species_name(species_name_value = species_name)
    else:
        list_tax_id_value = []

    if len(list_tax_id_value) == 1:
        tax_id_value = list_tax_id_value[0]
    else:
        # no tax id, or more than one: the protein is inserted with tax id 0
        tax_id_value = 0
        num_seqs_tax_not_known += 1

    if verbose:
        sys.stderr.write("Title processed and fields obtained are:\n")
        sys.stderr.write("   - gi_id: %s\n" %(gi_id))
        sys.stderr.write("   - coll_acc: %s\n" %(coll_acc))
        sys.stderr.write("   - protein_name: %s\n" %(protein_name))
        sys.stderr.write("   - species_name: %s\n" %(species_name))
        sys.stderr.write("   - tax_id: %s (extracted from file:%s)\n" %(tax_id_value, tax_id_from_file))

    # Insert protein: insert_protein takes care of checking whether the (sequence, tax) was already there or not
    inserted_proteinPiana = piana_access.insert_protein(proteinSequence_value= protein_sequence,
                                                        tax_id_value= tax_id_value)

    # Now, we have a proteinPiana for the (sequence, tax_id), either the one already existing before or a new one.
    #
    # Just add protein information to database tables.
    #
    # TO DO!! this would be a good place to check for consistency between uniprot and genbank, using embl accessions

    # --- table gi ---

    # inserting gi ID  for current protein
    if gi_id:
        piana_access.insert_gi_code(gi_code_value= gi_id, proteinPiana_value= inserted_proteinPiana,
                                    gi_source_value = sourceDB)

    # --- table embl accession ---

    # inserting embl  Accession  for current protein
    if coll_acc:

        # I am not 100% sure how genpept works now... from what I understand, they have
        # unified embl accession, genbank accession and DNA dbj
        # For the time being, I consider this accession as an embl accession...
        # --> insert_emblAccession_code takes care of processing the version after the point (if it is there)
        piana_access.insert_emblAccession_code(emblAccession_code_value= coll_acc,
                                               proteinPiana_value= inserted_proteinPiana,
                                               emblAccession_source_value= sourceDB)

    # --- table description ---

    # inserting description (protein name) for current protein
    if protein_name != "":

        piana_access.insert_protein_description(description= protein_name,
                                                proteinPiana_value= inserted_proteinPiana,
                                                proteinDescriptionSource_value= sourceDB)

    # --- table species ---

    # inserting species for current protein
    if tax_id_value:
        # if a tax_id was already found for this protein, use it
        piana_access.insert_protein_species(tax_id = tax_id_value,
                                            proteinPiana_value = inserted_proteinPiana,
                                            proteinSpeciesSource_value = sourceDB)

    elif species_name:
        # in case tax_id is 0, it can mean several species were found for the protein...
        # insert them...

        for species_tax_id in list_tax_id_value:

            if verbose:
                sys.stderr.write( "adding protein_tax_id %s (species_name=%s) for proteinPiana %s (gi=%s)\n" %(
                    species_tax_id,
                    species_name,
                    inserted_proteinPiana,
                    gi_id))

            piana_access.insert_protein_species(tax_id = species_tax_id,
                                                proteinPiana_value = inserted_proteinPiana,
                                                proteinSpeciesSource_value = sourceDB)
        # END OF for species_tax_id in list_tax_id_value:

    # reading next record
    if verbose:
        sys.stderr.write( "reading next record\n")

    genpept_record = genpept_iterator.next()

# END OF while genpept_record is not None

if num_records_skipped:
    sys.stderr.write("Number of records skipped (malformed title line or empty sequence): %s\n" %(num_records_skipped))

if verbose:
    sys.stderr.write("All done! Number of sequences for which the tax_id was unknown (no tax id associated or more than one): %s\n" %(num_seqs_tax_not_known))
    
