"""
File        : swissprot2piana.py
Author      : Ramon Aragues
Creation    : 29.11.2004
Contents    : fills up tables in database piana with information from swissprot
Called from : 

=======================================================================================================

This file implements a program that fills up tables in database piana with information from ncbi swissprot (gi with sprots)

ncbi swissprot can be downloaded from ftp://ftp.ncbi.nih.gov/blast/db/FASTA/swissprot.gz

Before running, taxonomy table of piana must be populated (use taxonomy2piana for that)

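It is advisable to parse swissprot, trembl, genpept and nr before parsing ncbi swissprot (this parser):
for gi codes that do not appear in the tax-id-file, the taxonomy id is looked up from proteins that
were already inserted into piana for that gi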

"""

# swissprot2piana.py: fills up tables in database piana with information from ncbi swissprot
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues
import sys
import getopt

import re
import readline
import MySQLdb

import utilities

from PianaDBaccess import *

from Bio import Fasta  # needed to read the swissprot file (which is in fasta format)

# ---------------------------------------------------------------
# Set here the default values for command line arguments
# ---------------------------------------------------------------
#

# These will be the values taken by the program when called directly from build_database.py:

# input_file: name of the file with the ncbi swissprot data (overwritten by --input-file)
input_file = None

# verbose: set to 1 by --verbose to print process info
verbose = 0
# verbose_detailed: extra per-record debugging output; no command line option changes it,
#                   it can only be activated by editing this line
verbose_detailed = 0


# ----------------------
# Function usage()
# ----------------------
def usage():
    """
    Writes to standard output the help message of swissprot2piana.py: a description of the
    program, how to call it and the meaning of each command line option
    """
    help_lines = (
        "--------------------------------------------------------------------------------------------------------------",
        "This program fills up tables in database piana with information from swissprot \n",
        "Usage: python swissprot2piana.py  --input-file=input_file_name  --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost",
        "                           --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass",
        "                           --tax-id-file=tax_id_file  [--help] [--verbose]",
        "\nwhere:",
        "      input_file_name  : file name of input file containing swissprot data (swissprot)",
        "      piana_dbname     : name of database piana to be used (required)",
        "      piana_dbhost     : name of host where database piana to be used is placed (required)",
        "      piana_dbuser     : username accessing the database (not required in most systems)",
        "      piana_dbpass     : password of username accessing the database (not required in most systems)",
        "      tax_id_file      : a text file with two tab-separated columns: 1st is gi code and 2nd is taxonomy id",
        "     --help            : prints this message and exits",
        "     --verbose         : prints process info to stdout",
        "--------------------------------------------------------------------------------------------------------------",
    )

    # one write for the whole message: each line ends with a newline, as individual prints would do
    sys.stdout.write("\n".join(help_lines) + "\n")
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    

    global input_file
    global verbose
    
    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass
    
    global tax_id_file

    try:
        opts, args = getopt.getopt(sys.argv[1:], "", ["verbose","help", "input-file=", "piana-dbname=", "piana-dbhost=",
                                                      "piana-dbuser=","piana-dbpass=", "tax-id-file="])
    except getopt.GetoptError, bad_opt:
        # print help information and exit:
        sys.stderr.write( bad_opt.__str__() )
        usage()
        sys.exit(2)
     
    for option,value in opts:
        
        if option == "--input-file":
            input_file = value
            
        elif option == "--piana-dbname":
            piana_dbname = value
            
        elif option == "--piana-dbhost":
            piana_dbhost = value
            
        elif option == "--piana-dbuser":
            piana_dbuser = value
             
        elif option == "--piana-dbpass":
            piana_dbpass = value
             
        elif option == "--tax-id-file":
            tax_id_file = value
            
        elif option == "--verbose":
            verbose = 1
            
        elif option == "--help":
            # print help information and exit
            usage()
            sys.exit(2)
            
    # END OF for option,value in opts:
            

    if tax_id_file is None:
        raise ValueError("taxonomy ids for gi codes needed. Please set a tax-id-file!\n")
            
# --------
# --------
#  Main()               
# --------                               
# --------

piana_dbname= None
piana_dbhost= None
piana_dbuser= None
piana_dbpass= None

tax_id_file = None

# parsing arguments from the command line
parseArguments()

if verbose_detailed:
    print "Arguments read are: input-file= %s || piana_dbname= %s || piana_dbhost= %s" %(input_file, piana_dbname, piana_dbhost)

# Initializing connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost= piana_dbhost, dbuser=piana_dbuser, dbpassword= piana_dbpass )

sourceDB = "ncbi_sprot"

swissprot_parser = Fasta.RecordParser()
swissprot_input_file =  utilities.open_file(input_file, 'r')
swissprot_iterator = Fasta.Iterator(swissprot_input_file, swissprot_parser)

swissprot_record = swissprot_iterator.next()


if verbose:
    print "Processing file"

# get dictionary for gis and tax ids
dic_gi_tax = utilities.return_dic_gi_vs_tax(file_name=tax_id_file)

# while record read is not None, parse the record and insert data into piana
while swissprot_record is not None:

    if verbose_detailed:
        sys.stderr.write( "===================================\n")
        sys.stderr.write( "            NEW ENTRY\n")
        sys.stderr.write( "===================================\n")

    protein_title_line = swissprot_record.title
    protein_sequence = swissprot_record.sequence

    if verbose_detailed:
        sys.stderr.write("title line is : %s\n" %(protein_title_line)) 
        sys.stderr.write("sequence is : %s\n" %(protein_sequence))


    # title can look like this: 
    #                    gi|51315741|sp||P62809_1 [Segment 1 of 3] Cyclic phosphodiesterase (CPDase)
    #                    gi|50400705|sp|Q9ZES3|APAG_RHIET Protein apaG
    #                    gi|50401406|sp||P84036_1 [Segment 1 of 2] Plasminogen-activating proteinase (LV-PA)
    #                    gi|50401529|sp|Q7WQ22|TRMB_BORBR tRNA (guanine-N(7)-)-methyltransferase (tRNA(m7G46)-methyltransferase)$gi|50401527|sp
#|Q7WC23|TRMB_BORPA tRNA (guanine-N(7)-)-methyltransferase (tRNA(m7G46)-methyltransferase)

    #                           
    #    (character '>' from input file has been removed by biopython)
    #    (where character $ is actually octal character 001)
    #
    # in misc ncbi swissprot (as opposed to genpept) a title can have several gi identifiers for the same sequence


    # So, given the messy format of this file, first of all split by octal character 001 to see if there is more than one gi
    title_entries = protein_title_line.split("\001")

    # for each of the title entries (normally just one) split it, retrieve information and insert codes, using same sequence for all)
    for title_entrie in title_entries:

        
        title_atoms = title_entrie.split('|')

        # title_atom[1] (should) is always the gi code
        gi_id = int(title_atoms[1])

        # title_atom[3] is the swiss accession
        # if swiss accession doesn't appear on position 3, then position 4 is a swiss accession with segment identifier
        # if swiss accession appears on position 3, then position 4 is a swissprot id
        #

        if title_atoms[3] == "" or title_atoms[3] is None:
            # no swiss accession given, get it from position 4 (removing the segment part)
            swissaccession = title_atoms[4].split()[0][:-2]
            swissprot      = None
        else:
            swissaccession = title_atoms[3]
            swissprot      = title_atoms[4].split()[0]


        if verbose:
            sys.stderr.write("Inserting gi <%s>, uniacc <%s> and unientry <%s> for proteinPiana <%s>\n" %(gi_id,
                                                                                                          swissaccession,
                                                                                                          swissprot,
                                                                                                          inserted_proteinPiana))



        
        if dic_gi_tax.has_key(gi_id):
            # we give priority to the dictionary with tax ids for gis
            tax_id_value = dic_gi_tax[gi_id]
        else:
            # get tax_id for this gi (tax id was previously inserted by genpept or nr)
            # 1. get proteinPianas associated to the gi (using 0 for tax id will get all proteinPianas regardless of the species)
            #                                            a gi is normally associated to a single species... it should work ok)
            pps_for_this_gi = piana_access.get_list_protein_piana(proteinCode_value=gi_id,
                                                                  proteinCodeType_value= PianaGlobals.giID_col,
                                                                  tax_id_value = 0,
                                                                  source_db_info="no")
            # 2. get tax_ids associated to those proteinPianas
            dic_tax_id = {}
            for this_proteinPiana in pps_for_this_gi:
                list_tax_id = piana_access.get_protein_taxonomy_ids(proteinPiana_value=this_proteinPiana )

                for this_tax_id in list_tax_id:
                    if this_tax_id != 0:
                        dic_tax_id[this_tax_id] = None

            # END OF for this_proteinPiana in pps_for_this_gi:

            # 3. check if we can use this tax id (ie. there is only one, and it is a valid one)
            if len(dic_tax_id)== 1:
                tax_id_value = dic_tax_id.keys()[0]
            else:
                # if nothing found, use dummy value for this sequence
                tax_id_value = 0
        # END OF else: (if dic_gi_tax.has_key(gi_id):)

        if tax_id_value == 0:
            # for the time being, do not allow ncbi swissprot to insert new sequences is the tax id is unknown...
            #  (moreover, when the tax id is known, the (sequence, tax_id) will probably be already there...)
            # I prefer to avoid introducing noise...
            continue
        
        #----
        # first of all, get a proteinPiana for the (sequence, tax_id) that we are processing
        #----

        if protein_sequence != "":
            inserted_proteinPiana = piana_access.insert_protein(proteinSequence_value= protein_sequence,
                                                                tax_id_value= tax_id_value)
        else:
            raise ValueError("Trying to insert an empty sequence from pdbaa into Piana")


        
        """
        Now, we have a proteinPiana for the sequence, either the one already existing before or a new one
        We also have the protein swissprot info

        insert this info to database
        """

        # --- table gi ---

        # inserting gi ID  for current protein
        if gi_id != "" and gi_id is not None:
            piana_access.insert_gi_code(gi_code_value= gi_id, proteinPiana_value= inserted_proteinPiana,
                                        gi_source_value = sourceDB)


        # --- table swissaccession ---

        # inserting swissaccession  for current protein
        if swissaccession != "" and swissaccession is not None:
            piana_access.insert_swissAccession_code(swissAccession_code_value= swissaccession,
                                                    proteinPiana_value= inserted_proteinPiana,
                                                    swissAccession_source_value = sourceDB, isPrimary_value= 0)

        # --- table swissprot ---

        # inserting  swissprot for current protein
        if swissprot != "" and swissprot is not None:
            piana_access.insert_swissProt_code(swissProt_code_value= swissprot,
                                               proteinPiana_value= inserted_proteinPiana,
                                               swissProt_source_value = sourceDB)
          
        # --- table species ---

        # tax_id_value comes from the section where we checked if there was a species name in the title line
        if verbose_detailed:
            sys.stderr.write( "adding protein_tax_id %s (species_name=%s) for proteinPiana %s (gi=%s)\n" %(tax_id_value,
                                                                                                           species_name,
                                                                                                           inserted_proteinPiana,
                                                                                                           gi_id))
        piana_access.insert_protein_species(tax_id =  tax_id_value,
                                            proteinPiana_value = inserted_proteinPiana,
                                            proteinSpeciesSource_value = sourceDB)


    # END OF for title_entrie in title_entries

    
    # reading next record
    if verbose_detailed:
        print "reading next record"
        
    swissprot_record = swissprot_iterator.next()
    
# END OF while swissprot_record is not None

