"""
File        : nr2piana.py
Author      : Ramon Aragues
Creation    : 10.2004
Contents    : fills up tables in database piana with information from nr
Called from : build_database.py (or directly from the command line)

=======================================================================================================

This file implements a program that fills up tables in database piana with information from nr

nr can be downloaded from ftp://ftp.ncbi.nih.gov/blast/db/FASTA/nr.gz

Before running, taxonomy table of piana must be populated (use taxonomy2piana for that)

It is advisable to parse swissprot and trembl and genpept before parsing nr

"""

# nr2piana.py:  fills up tables in database piana with information from nr
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt

import re
import readline
import MySQLdb

import utilities

from PianaDBaccess import *

from Bio import Fasta  # needed to read the nr file (which is in fasta format)

# ---------------------------------------------------------------
# Set here the default values for command line arguments
# ---------------------------------------------------------------
#

# These will be the values taken by the program when called directly from build_database.py:

# name of the nr file to parse: overwritten by command line option --input-file
input_file = None

# verbose: set to 1 by command line option --verbose (prints progress information)
# verbose_all: extra debugging output written to stderr by the main loop. There is no
#              command line option for it: it can only be activated by editing the value here
verbose = 0
verbose_all = 0


# ----------------------
# Function usage()
# ----------------------
def usage():
    """
    Prints to standard output the help message of nr2piana.py: what the program does,
    how to call it and the meaning of each command line option
    """
    help_lines = (
        "--------------------------------------------------------------------------------------------------------------",
        "This program fills up tables in database piana with information from nr \n",
        "Usage: python nr2piana.py  --input-file=input_file_name  --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost",
        "                           --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass",
        "                           --tax-id-file=tax_id_file  [--help] [--verbose]",
        "\nwhere:",
        "      input_file_name     : file name of input file containing nr data (nr)",
        "      piana_dbname       : name of database piana to be used (required)",
        "      piana_dbhost       : name of host where database piana to be used is placed (required)",
        "      piana_dbuser       : username accessing the database (not required in most systems)",
        "      piana_dbpass       : password of username accessing the database (not required in most systems)",
        "      tax_id_file      : a text file with two tab-separated columns: 1st is gi code and 2nd is taxonomy id",
        "     --help         : prints this message and exits",
        "     --verbose      : prints process info to stdout",
        "--------------------------------------------------------------------------------------------------------------",
    )

    # one print per line: each entry is followed by a newline, exactly as separate print statements would do
    for help_line in help_lines:
        print(help_line)
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """
    Reads the command line options (sys.argv[1:]) into the module-level settings

    Each option that takes a value is stored in the global variable of the same name:
    --input-file   -> input_file
    --piana-dbname -> piana_dbname
    --piana-dbhost -> piana_dbhost
    --piana-dbuser -> piana_dbuser
    --piana-dbpass -> piana_dbpass
    --tax-id-file  -> tax_id_file

    Option --verbose sets global verbose to 1

    Globals that are not given on the command line keep whatever value they had before the call:
    tax_id_file must therefore already exist at module level (it is tested at the end)

    Exits with status 2, after printing the usage message, when an unknown option is found or
    when --help is given

    Raises ValueError if tax_id_file is still None once all options have been processed
    """

    global input_file
    global verbose

    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass

    global tax_id_file

    try:
        opts, args = getopt.getopt(sys.argv[1:], "", ["verbose", "help", "input-file=", "piana-dbname=", "piana-dbhost=",
                                                      "piana-dbuser=", "piana-dbpass=", "tax-id-file="])
    except getopt.GetoptError as bad_opt:
        # print the error (newline terminated, so that it does not run into the usage banner),
        # then the help information, and exit
        sys.stderr.write("%s\n" % (bad_opt,))
        usage()
        sys.exit(2)

    for option, value in opts:

        if option == "--input-file":
            input_file = value

        elif option == "--piana-dbname":
            piana_dbname = value

        elif option == "--piana-dbhost":
            piana_dbhost = value

        elif option == "--piana-dbuser":
            piana_dbuser = value

        elif option == "--piana-dbpass":
            piana_dbpass = value

        elif option == "--tax-id-file":
            tax_id_file = value

        elif option == "--verbose":
            verbose = 1

        elif option == "--help":
            # print help information and exit (exit status kept at 2, as it has always been)
            usage()
            sys.exit(2)

    # END OF for option,value in opts:

    # the tax id file is the only argument checked here: the main loop gives priority to it
    # when assigning a taxonomy to a gi code
    if tax_id_file is None:
        raise ValueError("taxonomy ids for gi codes needed. Please set a tax-id-file!\n")
            
# --------
# --------
#  Main()               
# --------                               
# --------

# Database connection settings and tax id file name: they must exist (as None) before
# parseArguments() is called, since that function only assigns the ones that appear on
# the command line and then tests tax_id_file
piana_dbname = piana_dbhost = piana_dbuser = piana_dbpass = None
tax_id_file = None

# read the settings from the command line
parseArguments()

if verbose:
    print("Arguments read are: input-file= %s || piana_dbname= %s || piana_dbhost= %s" % (input_file, piana_dbname, piana_dbhost))

# open the connection to database piana
piana_access = PianaDBaccess(dbname=piana_dbname,
                             dbhost=piana_dbhost,
                             dbuser=piana_dbuser,
                             dbpassword=piana_dbpass)

# label passed as "source" in every insertion done by this program
sourceDB = "ncbi_nr"

# the nr file is in fasta format: set up a fasta iterator on it and read the first record
nr_parser = Fasta.RecordParser()
nr_input_file = utilities.open_file(input_file, 'r')
nr_iterator = Fasta.Iterator(nr_input_file, nr_parser)

nr_record = nr_iterator.next()

if verbose:
    print("Processing file")

# dictionary used to look up the tax id of a gi code (built from the tax id file)
dic_gi_tax = utilities.return_dic_gi_vs_tax(file_name=tax_id_file)

# ---------------------------------------------------------------
# Helpers used by the main loop
# ---------------------------------------------------------------

def _extract_species_name(description):
    """
    Returns the species name written between square brackets in "description", or None
    if no such name is found

    "description" is the last '|'-separated atom of a title entry,
    e.g. ' maturase [Eucharis grandiflora]'

    Only the text between the first '[' and the first ']' that follows it is read, and only
    if no other '[' appears in between. Characters '"' and '(' are removed from the name,
    which is then stripped (the result can be an empty string, e.g. for '[]')
    """
    # this separates the species from whatever was before
    bracket_fields = description.split("[")

    if verbose_all:
        sys.stderr.write("split list of last atom is: %s\n" % (bracket_fields,))

    if len(bracket_fields) < 2:
        # no '[' at all
        return None

    # there is something after a '[': read it as species only if there is a ']' as well
    species_fields = bracket_fields[1].split("]")
    if len(species_fields) < 2:
        return None

    return species_fields[0].replace('"', '').replace("(", "").strip()


def _parse_title_entry(title_entry):
    """
    Splits one entry of a nr title line into the codes it contains

    Returns a tuple (gi_id, sp_acc_code, swissprot_code, species_name), or None when
    the entry has no integer gi code in its second '|'-separated field

    A title entry looks like this:

        gi|1346670|sp|P04175|NCPR_PIG NADPH--cytochrome P450 reductase (CPR) (P450R)
        gi|5832586|dbj|BAA84019.1| maturase [Eucharis grandiflora]
        gi|223154|prf||0601198A polymerase beta,RNA

    - field 1 is the gi code
    - field 2 is the database the next field refers to (as of 10.2004: gb, emb, pir, sp, dbj,
      prf, ref, pdb, tpg, tpe). Only "sp" (swissprot) is currently read
    - field 3 is the protein id in that database
    - the last field is free text, where the species may appear between []

    sp_acc_code, swissprot_code and species_name are None when they cannot be read
    """
    title_atoms = title_entry.split('|')

    try:
        gi_id = int(title_atoms[1])
    except (IndexError, ValueError):
        # no second field, or second field is not a number: not a usable entry
        return None

    sp_acc_code = None
    swissprot_code = None

    # read uniprot entries and uniprot accession numbers
    # (length checks protect against truncated entries)
    if len(title_atoms) > 2 and title_atoms[2] == "sp":
        if len(title_atoms) > 3 and title_atoms[3].strip() != "":
            sp_acc_code = title_atoms[3][0:6]  # [0:6] to avoid reading the subfragment section

        # if dealing with sp, the uniprot entry name is the first word of field 4
        if len(title_atoms) > 4:
            description_words = title_atoms[4].split()
            if description_words:
                swissprot_code = description_words[0].strip()

    # take last atom, with the hope that the species is there
    last_atom = title_atoms[-1]

    if verbose_all:
        sys.stderr.write("last atom of the entrie is: %s\n" % (last_atom,))

    species_name = _extract_species_name(last_atom)

    return (gi_id, sp_acc_code, swissprot_code, species_name)


def _get_unambiguous_tax_id_for_gi(piana_access, gi_id):
    """
    Returns the tax id of the proteinPianas already associated in piana to gi code "gi_id"
    (probably inserted when parsing genpept), or 0 if there is none or more than one

    "piana_access" is the PianaDBaccess object used to query the database
    """
    # 1. get proteinPianas associated to the gi (using 0 for tax id will get all proteinPianas
    #    regardless of the species: a gi is normally associated to a single species...)
    pps_for_this_gi = piana_access.get_list_protein_piana(proteinCode_value=gi_id,
                                                          proteinCodeType_value=PianaGlobals.giID_col,
                                                          tax_id_value=0,
                                                          source_db_info="no")

    # 2. get the (non dummy) tax ids associated to those proteinPianas: the dictionary is only
    #    used to keep each tax id once
    dic_tax_id = {}
    for this_proteinPiana in pps_for_this_gi:
        for this_tax_id in piana_access.get_protein_taxonomy_ids(proteinPiana_value=this_proteinPiana):
            if this_tax_id != 0:
                dic_tax_id[this_tax_id] = None

    # 3. the tax id can only be used if there is exactly one
    if len(dic_tax_id) == 1:
        return list(dic_tax_id)[0]

    # if nothing (or too much) found, use the dummy value
    return 0


# ---------------------------------------------------------------
# Main loop: while record read is not None, parse the record and insert data into piana
# (the loop ends when nr_iterator.next() returns None)
# ---------------------------------------------------------------
while nr_record is not None:

    if verbose:
        sys.stderr.write( "===================================\n")
        sys.stderr.write( "            NEW ENTRY\n")
        sys.stderr.write( "===================================\n")

    protein_title_line = nr_record.title
    protein_sequence = nr_record.sequence.strip()

    if verbose_all:
        sys.stderr.write("title line is : %s\n" %(protein_title_line))
        sys.stderr.write("sequence is : %s\n" %(protein_sequence))

    # The sequence read will be assigned to all the codes found in the title line
    #
    # title can look like this: gi|1346670|sp|P04175|NCPR_PIG NADPH--cytochrome P450 reductase (CPR) (P450R)
    #                           gi|5832586|dbj|BAA84019.1| maturase [Eucharis grandiflora]
    #                           gi|223154|prf||0601198A polymerase beta,RNA
    #                           gi|17538019|ref|NP_496216.1| surfeit 5 (18.1 kD) (2K591) [Caenorhabditis elegans]$gi|2497033|sp|Q23679|YWM3_CAEEL Hypothetical protein ZK970.3 in chromosome II$gi|3881897|emb|CAA88887.1| Hypothetical protein ZK970.3 [Caenorhabditis elegans]$gi|7511441|pir||T28127 hypothetical protein ZK970.3 - Caenorhabditis elegans
    #
    #    (character '>' from input file has been removed)
    #    (where character $ is actually octal character 001)
    #
    # in nr (as opposed to genpept) a title can have several gi identifiers for the same sequence
    # however, nr is a mess, so it is easier to get species and so on from genpept (that is why we parse it first)

    # for each of the title entries (normally just one; they are separated by octal character 001)
    # retrieve the codes and insert them, using the same sequence for all
    for title_entrie in protein_title_line.split("\001"):

        parsed_entry = _parse_title_entry(title_entrie)

        if parsed_entry is None:
            # a single malformed entry must not abort the whole import: report it and go on
            sys.stderr.write("Warning: skipping nr title entry without a valid gi code: %s\n" % (title_entrie,))
            continue

        (gi_id, sp_acc_code, swissprot_code, species_name) = parsed_entry

        # ----
        # find the tax id for this entry
        # ----
        if gi_id in dic_gi_tax:
            # we give priority to the dictionary with tax ids for gis
            list_tax_ids = [dic_gi_tax[gi_id]]
        elif species_name:
            # if no value found in dic, get the tax_id associated to that species name
            list_tax_ids = piana_access.get_taxonomies_from_species_name(species_name_value=species_name)
        else:
            list_tax_ids = []

        if len(list_tax_ids) == 1:
            # if there is just one tax_id, use it
            tax_id_value = list_tax_ids[0]
        else:
            # in other cases (no species name found, no tax id associated to species name or more than
            # one tax id associated to species name) try to get the tax_id through the gi
            tax_id_value = _get_unambiguous_tax_id_for_gi(piana_access, gi_id)

        if tax_id_value == 0:
            # for the time being, do not allow nr to insert new sequences if the tax id is unknown...
            #  (moreover, when the tax id is known, the (sequence, tax_id) will probably be already there...)
            # I prefer to avoid introducing noise...
            continue

        # ----
        # get a proteinPiana for the (sequence, tax_id) that we are processing
        # ----
        if not protein_sequence:
            raise ValueError("Trying to insert an empty sequence from nr into Piana")

        inserted_proteinPiana = piana_access.insert_protein(proteinSequence_value=protein_sequence,
                                                            tax_id_value=tax_id_value)

        if verbose:
            sys.stderr.write("   - proteinPiana: %s has gi_id %s and species name %s\n" %(inserted_proteinPiana, gi_id, species_name))

        # Now, we have a proteinPiana for the sequence and the protein codes:
        # just add protein information to database tables

        # --- table gi ---
        # (gi_id is always a valid integer at this point)
        if verbose_all:
            sys.stderr.write("inserting gi %s -- " %(gi_id))
        piana_access.insert_gi_code(gi_code_value=gi_id, proteinPiana_value=inserted_proteinPiana,
                                    gi_source_value=sourceDB)

        # --- table sp accession ---
        if sp_acc_code:
            if verbose_all:
                sys.stderr.write("inserting uniprot accession number %s -- " %(sp_acc_code))
            piana_access.insert_swissAccession_code(swissAccession_code_value=sp_acc_code, proteinPiana_value=inserted_proteinPiana,
                                                    swissAccession_source_value=sourceDB, isPrimary_value=0)

        # --- table swissprot ---
        if swissprot_code:
            if verbose_all:
                sys.stderr.write("inserting swissprot %s -- " %(swissprot_code))
            piana_access.insert_swissProt_code(swissProt_code_value=swissprot_code, proteinPiana_value=inserted_proteinPiana,
                                               swissProt_source_value=sourceDB)

        # --- table species ---

        # list_tax_ids comes from the section where we looked for the tax id through the gi dictionary
        # or the species name of the title line (it is empty when the tax id was found through the gi
        # already in piana)
        for species_tax_id in list_tax_ids:

            if verbose_all:
                sys.stderr.write( "adding protein_tax_id %s (species_name=%s) for proteinPiana %s (gi=%s)\n" %(species_tax_id,
                                                                                                               species_name,
                                                                                                               inserted_proteinPiana,
                                                                                                               gi_id))
            piana_access.insert_protein_species(tax_id=species_tax_id,
                                                proteinPiana_value=inserted_proteinPiana,
                                                proteinSpeciesSource_value=sourceDB)
        # END OF for species_tax_id in list_tax_ids:

    # END OF for title_entrie in title_entries

    # reading next record
    if verbose:
        sys.stderr.write( "\n---------------reading next record---------------\n")
    nr_record = nr_iterator.next()

# END OF while nr_record is not None

