"""
File        : DipContentHandler.py
Author      : Ramon Aragues
Creation    : 2003
Contents    : XML content handler for DIP xml xin files
Called from : parseDIP.py

=======================================================================================================

This file implements the content handler for DIP xml xin files. 

"""

# DipContentHandler.py:XML content handler for DIP xml xin files
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
from xml.sax import make_parser, SAXException
from xml.sax.handler import ContentHandler

import MySQLdb

from PianaDB import *




class DipContentHandler(ContentHandler):
    """SAX content handler for DIP XML (XIN) files.

    The handler keeps track of the <node> (protein) or <edge> (interaction)
    element currently being parsed, collects the values found in its child
    elements and, when the element is closed, reports the record according
    to ``output_mode``:

    * ``"text"``  : the record is written to standard error
    * ``"dipDB"`` : the record is inserted into the MySQL DIP database
                    (and also written to standard error if ``verbose``)

    All fields of the record are reset once it has been reported.

    NOTE(review): this class never calls commit() on the database
    connection; confirm that the caller commits, or that the target tables
    are non-transactional.
    """

    # Prefix of an external name -> attribute that stores its value.
    # In dip20051016 the possible prefixes are SWP (swissprot accession
    # number), PIR, RefSeq and GI.
    _EXT_NAME_FIELDS = {
        "SWP": "protein_sprot",
        "PIR": "protein_pir",
        "RefSeq": "protein_refseq",
        "GI": "protein_gi",
    }

    # Statements use driver-side parameter binding (DB-API "format" style)
    # so that quotes inside the XML values cannot break the SQL.
    _INSERT_NODE = (
        "INSERT IGNORE INTO nodes "
        "(node_id, node_uid, node_name, node_class, node_organism, node_taxonomy, node_description) "
        "VALUES (%s,%s,%s,%s,%s,%s,%s)"
    )
    _INSERT_NODE_EXT_NAMES = (
        "INSERT IGNORE INTO nodes_ext_names "
        "(node_ext_id, node_ext_uid, node_ext_name, node_ext_sprot, node_ext_pir, node_ext_refseq, node_ext_gi) "
        "VALUES (%s,%s,%s,%s,%s,%s,%s)"
    )
    _INSERT_EDGE = (
        "INSERT IGNORE INTO edges "
        "(edge_id, edge_uid, edge_from, edge_to, edge_class) "
        "VALUES (%s,%s,%s,%s,%s)"
    )
    _INSERT_EDGE_EXPERIMENT = (
        "INSERT IGNORE INTO edges_experiments "
        "(edge_exp_id, edge_exp_uid, edge_exp_source, edge_exp_name) "
        "VALUES (%s,%s,%s,%s)"
    )

    def __init__(self, output_mode,  verbose=0, dip_dbhost= None, dip_dbname= None, dip_dbuser= None, dip_dbpass= None):
        """Initialise the parsing state and, in "dipDB" mode, open the database.

        output_mode -- "text" or "dipDB" (see the class documentation)
        verbose     -- if true, records are written to standard error in any mode
        dip_dbhost  -- MySQL host (required in "dipDB" mode)
        dip_dbname  -- MySQL database name (required in "dipDB" mode)
        dip_dbuser  -- MySQL user (optional)
        dip_dbpass  -- MySQL password (optional)

        Raises ValueError in "dipDB" mode when host or database name is missing.
        """
        # explicit base-class call: works for old-style and new-style classes
        ContentHandler.__init__(self)

        self.output_mode = output_mode
        self.verbose = verbose

        self.buffer = ''
        self.currentEntity = None
        self.currentAtt = None

        self._clear_protein()

        self.link_uid = None
        self.link_id = None
        self.link_from = None
        self.link_to = None
        self.link_class = None
        self.link_submitter = None
        self.link_experiments = {}
        self.currentExperiment = None

        if self.output_mode == "dipDB":

            if dip_dbname is None or dip_dbhost is None:
                raise ValueError("trying to establish a connection to dip database without giving a host or database name")

            # open the connection to the MySQL DIP database, passing only
            # the credentials that were actually given
            connect_args = {"db": dip_dbname, "host": dip_dbhost}
            if dip_dbuser is not None:
                connect_args["user"] = dip_dbuser
            if dip_dbpass is not None:
                connect_args["passwd"] = dip_dbpass

            self.dipdb = MySQLdb.connect(**connect_args)
            self.cursor = self.dipdb.cursor()

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    def _clear_protein(self):
        """Set every field of the protein (node) record to its empty value."""
        self.protein_uid = None
        self.protein_id = None
        self.protein_name = None
        self.protein_class = None
        self.protein_atts = {'descr':None, 'organism':None, 'taxon':None}
        self.protein_ext_names = []
        self.protein_sprot = None
        self.protein_pir = None
        self.protein_refseq = None
        self.protein_gi = None
        self.protein_organism = None
        self.protein_taxon = None
        self.protein_descr = None

    def _after_colon(self, raw):
        """Return the value part of a composed identifier such as "G:234".

        The second colon-separated field is returned ("234"). A value
        without colon is returned unchanged and None is returned as None,
        instead of raising.
        """
        if raw is None:
            return None
        fields = raw.split(":")
        if len(fields) > 1:
            return fields[1]
        return raw

    def _unquote(self, value, quote_chars):
        """Strip each character of quote_chars, in turn, from both ends of value.

        None is returned unchanged.
        """
        if value is None:
            return None
        for quote_char in quote_chars:
            value = value.strip(quote_char)
        return value

    def _sql_text(self, value):
        """Return value as it must be stored in a text column.

        Missing values are stored as the literal text "None": this is what
        the string-formatted statements of previous versions wrote, so
        existing rows and readers keep working.
        """
        if value is None:
            return "None"
        return value

    def _write_err(self, text):
        """Write text to standard error, escaping what the stream cannot encode."""
        try:
            sys.stderr.write(text)
        except UnicodeError:
            # e.g. descriptions with characters outside the stream encoding
            sys.stderr.write(text.encode("ascii", "backslashreplace").decode("ascii"))

    # ------------------------------------------------------------------
    # SAX callbacks
    # ------------------------------------------------------------------

    def startElement(self, labelName, attrs):
        """Called when the parser enters an xml element: record which kind of
        element is being processed and read the attributes of interest.
        """
        if labelName == "attributes":
            # default values for attributes are not extracted (TO BE DONE)
            self.currentEntity = "attributes"

        elif labelName == "node":
            # when the content of an attribute is composed (e.g. "G:234")
            # only the value is kept
            self.currentEntity = "node"
            self.protein_uid = self._after_colon(attrs.get('uid'))
            self.protein_id = self._after_colon(attrs.get('id'))
            self.protein_name = attrs.get('name')
            self.protein_class = attrs.get('class')
            self.protein_ext_names = []
            self.currentAtt = None

        elif labelName == "edge":
            self.currentEntity = "edge"
            self.link_uid = self._after_colon(attrs.get('uid'))
            self.link_id = self._after_colon(attrs.get('id'))
            self.link_from = self._after_colon(attrs.get('from'))
            self.link_to = self._after_colon(attrs.get('to'))
            self.link_class = attrs.get('class')
            self.link_experiments = {}
            self.currentExperiment = None
            self.currentAtt = None

        elif self.currentEntity == "node":
            # entering a child element of the current node

            if labelName == "feature":
                # "feature" is not an attribute, but it is recorded in
                # self.currentAtt so that endElement knows the next <val>
                # is an external name of the protein
                self.currentAtt = "feature"

            elif labelName == "att":
                # remember the attribute name: endElement stores the <val>
                # content under it
                self.currentAtt = attrs.get('name')

            elif labelName == "xref" and self.currentAtt == "organism":
                # taxonomy is given as a cross-reference inside the
                # organism attribute
                if attrs.get('db') == "TXID":
                    self.protein_taxon = int(attrs.get('id'))

        elif self.currentEntity == "edge":
            # entering a child element of the current edge

            if labelName == "feature":
                # each feature of an edge describes one experiment: create
                # its record, indexed by the (unsplit) feature uid
                self.currentExperiment = {"source":None, "exp_name":None}
                self.link_experiments[attrs.get('uid')] = self.currentExperiment

                # recorded so that endElement stores the next <val> as the
                # experiment name
                self.currentAtt = "feature"

            elif labelName == "att":
                self.currentAtt = attrs.get('name')

        # a new element starts: its text will be accumulated from scratch
        self.buffer = ''

    def endElement(self,labelName):
        """Called when the parser exits an xml element: store the buffered
        text in the corresponding field and, when a node or an edge is
        closed, report the complete record and reset its fields.
        """
        text = self.buffer.strip('"')

        # ------------------------------
        # Assigning values from buffer
        # ------------------------------
        if self.currentEntity == "node":

            if labelName == "val":
                if self.currentAtt == "feature":
                    self.protein_ext_names.append(text)
                else:
                    # anything that is not a feature is a node attribute
                    self.protein_atts[self.currentAtt] = text

        elif self.currentEntity == "edge":

            if labelName == "src":
                # a <src> outside of a feature has no experiment to go to
                if self.currentExperiment is not None:
                    self.currentExperiment["source"] = text

            elif labelName == "val":

                if self.currentAtt == "name":
                    self.link_submitter = text

                elif self.currentAtt == "class":
                    self.link_class = text

                elif self.currentAtt == "feature":
                    if self.currentExperiment is not None:
                        self.currentExperiment["exp_name"] = text

        # ------------------------------
        # Processing all information read for a certain element
        # ------------------------------
        if labelName == "node":
            self._finish_node()

        elif labelName == "edge":
            self._finish_edge()

        # buffer contents were saved: empty the buffer after each element
        self.buffer = ''

    def characters(self, charsRead):
        """Called whenever the parser finds chars between two tags: the
        chars are appended to the buffer (they can arrive in several chunks).
        """
        self.buffer += charsRead

    # ------------------------------------------------------------------
    # record processing
    # ------------------------------------------------------------------

    def _finish_node(self):
        """Interpret, report and reset the protein record of a closed <node>."""

        for ext_name in self.protein_ext_names:

            fields = ext_name.split(":")
            name_type = fields[0]
            name_value = None
            if len(fields) > 1:
                name_value = fields[1]

            if name_value is not None and name_type in self._EXT_NAME_FIELDS:
                setattr(self, self._EXT_NAME_FIELDS[name_type], name_value.strip('"'))
            else:
                self._write_err("found protein with unknown ext_name %s of type %s and value %s\n" %(ext_name, name_type, name_value))

        self.protein_organism = self.protein_atts.get("organism")
        self.protein_descr = self.protein_atts.get("descr")

        # in the current dip version there are no taxon attributes (the
        # taxonomy comes from the xref inside att organism): an explicit
        # taxon attribute, if any, takes precedence
        taxon_text = self.protein_atts.get("taxon")
        if taxon_text:
            self.protein_taxon = int(taxon_text)

        if self.output_mode == "text" or self.verbose:
            self._write_err("==========================================\n")
            self._write_err("Node with the following information found:\n")
            self._write_err("protein_uid: <%s>\n" %self.protein_uid)
            self._write_err("protein_id: <%s>\n" %self.protein_id)
            self._write_err("protein_name: <%s>\n" %self.protein_name)
            self._write_err("protein_class: <%s>\n" %self.protein_class)
            self._write_err("protein_organism: <%s>\n" %self.protein_organism)
            self._write_err("protein_taxon: <%s>\n" %self.protein_taxon)
            self._write_err("protein_descr: <%s>\n" %self.protein_descr)
            self._write_err("protein_sprot: <%s>\n" %self.protein_sprot)
            self._write_err("protein_pir: <%s>\n" %self.protein_pir)
            self._write_err("protein_refseq: <%s>\n" %self.protein_refseq)
            self._write_err("protein_gi: <%s>\n" %self.protein_gi)
            self._write_err("==========================================\n")

        if self.output_mode == "dipDB":

            # node_taxonomy is numeric: it is passed as is, so a missing
            # taxonomy is stored as NULL
            self.cursor.execute(self._INSERT_NODE,
                                (self._sql_text(self.protein_id),
                                 self._sql_text(self.protein_uid),
                                 self._sql_text(self.protein_name),
                                 self._sql_text(self.protein_class),
                                 self._sql_text(self._unquote(self.protein_organism, '"')),
                                 self.protein_taxon,
                                 self._sql_text(self._unquote(self.protein_descr, '"\''))))

            self.cursor.execute(self._INSERT_NODE_EXT_NAMES,
                                (self._sql_text(self.protein_id),
                                 self._sql_text(self.protein_uid),
                                 self._sql_text(self.protein_name),
                                 self._sql_text(self.protein_sprot),
                                 self._sql_text(self.protein_pir),
                                 self._sql_text(self.protein_refseq),
                                 self._sql_text(self.protein_gi)))

        # the node has been processed: initialise all variables related to it
        self.currentEntity = None
        self.currentAtt = None
        self._clear_protein()

    def _finish_edge(self):
        """Report and reset the interaction record of a closed <edge>."""

        if self.output_mode == "text" or self.verbose:
            self._write_err("==========================================\n")
            self._write_err("Edge with the following information found:\n")
            self._write_err("link_uid: <%s>\n" %self.link_uid)
            self._write_err("link_id: <%s>\n" %self.link_id)
            self._write_err("link_from: <%s>\n" %self.link_from)
            self._write_err("link_to: <%s>\n" %self.link_to)
            self._write_err("link_class: <%s>\n" %self.link_class)

            for exp_key, exp_descr in self.link_experiments.items():
                for descr_key, descr_value in exp_descr.items():
                    self._write_err("link described in %s has %s %s\n" %(exp_key, descr_key, descr_value))

            self._write_err("==========================================\n")

        if self.output_mode == "dipDB":

            self.cursor.execute(self._INSERT_EDGE,
                                (self._sql_text(self.link_id),
                                 self._sql_text(self.link_uid),
                                 self._sql_text(self.link_from),
                                 self._sql_text(self.link_to),
                                 self._sql_text(self.link_class)))

            # one row per experiment describing the edge
            for exp_key, exp_descr in self.link_experiments.items():

                exp_uid = self._after_colon(exp_key)
                exp_source = exp_descr.get("source")
                exp_name = exp_descr.get("exp_name")

                if self.verbose:
                    self._write_err("Inserting edge info into database: %s %s %s %s\n" %(self.link_id, exp_uid, exp_source, exp_name))

                self.cursor.execute(self._INSERT_EDGE_EXPERIMENT,
                                    (self._sql_text(self.link_id),
                                     self._sql_text(exp_uid),
                                     self._sql_text(exp_source),
                                     self._sql_text(exp_name)))

        # the edge has been processed: initialise the variables related to it
        self.link_uid = None
        self.link_id = None
        self.link_from = None
        self.link_to = None
        self.link_class = None
        self.link_submitter = None
        self.currentEntity = None
            
