"""
File        : ori2piana.py
Author      : Ramon Aragues
Creation    : 25.02.2004
Contents    : script that fills up tables in database piana related to external DB "ori"
Called from : build_database.py

=======================================================================================================

This program parses a text file with interactions (ori format) and inserts them into piana

The input file to this parser can be found at piana/data/externalDBs/oriDB/interact.dat

format of the input file is:

Entry 1:
|Heteromer|1cpc_AB|3.7|DIP=0|STRING=0|Homol=0|
5H2C_HUMAN (P28335) 5-hydroxytryptamine 2C receptor (     6.7    1.2e+03 ! ?
FBN1_HUMAN (P35555) Fibrillin 1 precursor.                3.5      1e+04 ! ?
Entry 2:
|Heteromer|1cpc_AB|3.7|DIP=0|STRING=0|Homol=1|
PHA1_FREDI (P07122) C-phycocyanin-1 alpha chain.          4.4    5.7e+03   X
PHB1_FREDI (P07119) C-phycocyanin-1 beta chain.          12.6         20   X
Entry 3:
|Heteromer|1cpc_AB|3.7|DIP=0|STRING=0|Homol=1|
PHA1_FREDI (P07122) C-phycocyanin-1 alpha chain.          4.4    5.7e+03   X
PHB3_FREDI (P14877) C-phycocyanin-3 beta chain.          12.6         20   X
E
........................

Command line option '--help' describes usage of this program

"""

# ori2piana.py: script that fills up tables in database piana related to external DB "ori"
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt

import re
import readline

import MySQLdb

from PianaDBaccess import *

# Verbosity flag: 0 keeps the script quiet; parseArguments() sets it to 1
# when -v/--verbose is given on the command line
verbose = 0

# ---------------------------------------------------------------
# Set here the default values for command line arguments
# ---------------------------------------------------------------
#
# These will be the values taken by the program when called directly from build_database.py

# Location of file containing external DB "ori"
# (replaced by the value of -f/--ori-file when that option is given)
ori_file = "../../../data/externalDBs/oriDB/interact.dat"


# ----------------------
# Function usage()
# ----------------------
def usage():
    """
    Prints the command line help of this program to standard output.

    The help text lists every command line option and shows the current
    value of the module-level variable ori_file as the default input file.
    """
    help_lines = (
        "--------------------------------------------------------------------------------------------------------------",
        "This program fills up tables in database piana related to external DB 'ori' \n",
        "Usage: python ori2piana.py --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass",
        "                           --ori-file=ori_file [--help] [--verbose]",
        "\nwhere:",
        "     piana_dbname : name of database piana to be used (required)",
        "     piana_dbhost : name of host where database piana to be used is placed (required)",
        "     piana_dbuser : username accessing the database (not required in most systems)",
        "     piana_dbpass : password of username accessing the database (not required in most systems)",
        "     ori_file       : the name of the ori input file (default is '%s')" %(ori_file),
        "     --help         : prints this message and exits",
        "     --verbose      : prints process info to stdout",
        "--------------------------------------------------------------------------------------------------------------",
    )

    # one help line per print: each line is followed by a newline, as before
    for help_line in help_lines:
        print(help_line)
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """
    Reads the command line (sys.argv[1:]) and stores the option values in
    the module-level variables ori_file, piana_dbname, piana_dbhost,
    piana_dbuser, piana_dbpass and verbose.

    Options that do not appear on the command line leave their variable
    untouched. An unknown option, or -h/--help, prints the usage message and
    terminates the program with exit status 2.
    """
    global ori_file
    global piana_dbname, piana_dbhost, piana_dbuser, piana_dbpass
    global verbose

    short_options = "vhf:n:o:u:w:"
    long_options = ["verbose", "help", "ori-file=",
                    "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass="]

    try:
        parsed_options, _positional = getopt.getopt(sys.argv[1:], short_options, long_options)
    except getopt.GetoptError:
        # option not recognized: print help information and exit
        usage()
        sys.exit(2)

    for flag, flag_value in parsed_options:

        if flag == "-h" or flag == "--help":
            # print help information and exit
            usage()
            sys.exit(2)

        if flag == "-v" or flag == "--verbose":
            verbose = 1
        elif flag == "-f" or flag == "--ori-file":
            ori_file = flag_value
        elif flag == "-n" or flag == "--piana-dbname":
            piana_dbname = flag_value
        elif flag == "-o" or flag == "--piana-dbhost":
            piana_dbhost = flag_value
        elif flag == "-u" or flag == "--piana-dbuser":
            piana_dbuser = flag_value
        elif flag == "-w" or flag == "--piana-dbpass":
            piana_dbpass = flag_value
            
# --------
# --------
#  Main()               
# --------                               
# --------

piana_dbname = None
piana_dbuser = None
piana_dbhost = None
piana_dbpass = None

# parsing arguments from the command line
parseArguments()

# Initialising connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost=piana_dbhost, dbuser=piana_dbuser, dbpassword= piana_dbpass)

# counters reported at the end of the run (the report is only printed in verbose mode)
number_of_interactions_null_swissacc = 0
number_of_interactions_no_id = 0

# since there is no separator character in the input file, runs of spaces are collapsed
# into a single space so that split(" ") always partitions a line in the same way
# (compiled once here instead of on every line)
multiple_spaces = re.compile("[ ]+")


# -----------------------------------
# Function _unique_protein_pianas()
# -----------------------------------
def _unique_protein_pianas(list_protein_piana):
    """
    Removes redundant proteinPianas (redundancies happen between different sourceDBs)
    and finds out which ones come only from a completion.

    "list_protein_piana" is a list of tuples where
        tuple[0] --> proteinPiana
        tuple[1] --> source db

    Returns a dictionary {proteinPiana: 'standard' or 'completion'}: a proteinPiana is
    'completion' only if every tuple that mentions it has source db "completion"
    """
    dic_protein_piana = {}
    for protein_piana_info in list_protein_piana:
        protein_piana = protein_piana_info[0]

        if protein_piana_info[1] != "completion":
            # a standard source always wins over a completion
            dic_protein_piana[protein_piana] = 'standard'
        elif protein_piana not in dic_protein_piana:
            dic_protein_piana[protein_piana] = 'completion'

    return dic_protein_piana


# ------------------------------
# Function _extract_swissacc()
# ------------------------------
def _extract_swissacc(line_fields):
    """
    Returns the swissaccession found in the second field of a protein line
    (eg. "(P28335)" --> "P28335"), or None if the line has no second field
    (eg. a blank or truncated line), so the entry is counted as lacking a
    swissaccession instead of crashing the parser
    """
    if len(line_fields) < 2:
        return None

    return line_fields[1].strip("(").strip(")")


# --------------------------
# Function _insert_entry()
# --------------------------
def _insert_entry(swissacc_a, swissacc_b):
    """
    Inserts into piana the interaction between the proteins with swissaccessions
    "swissacc_a" and "swissacc_b": one interaction is inserted for each pair of
    proteinPianas associated to the two accessions.

    Updates the global counters:
        number_of_interactions_null_swissacc --> entry where an accession could not be read
        number_of_interactions_no_id         --> entry where an accession has no proteinPiana
    """
    global number_of_interactions_null_swissacc
    global number_of_interactions_no_id

    if swissacc_a is None or swissacc_b is None:
        # incomplete entry (eg. truncated file or malformed protein line)
        number_of_interactions_null_swissacc += 1
        return

    if verbose:
        print("Interaction found between %s and %s" %(swissacc_a, swissacc_b))

    list_protein_piana_a = piana_access.get_list_protein_piana(proteinCode_value=swissacc_a,
                                                               proteinCodeType_value=PianaGlobals.swissAccessionID_col,
                                                               tax_id_value= 0, source_db_info= "yes")

    list_protein_piana_b = piana_access.get_list_protein_piana(proteinCode_value=swissacc_b,
                                                               proteinCodeType_value=PianaGlobals.swissAccessionID_col,
                                                               tax_id_value= 0, source_db_info= "yes")

    dic_proteinPiana_a = _unique_protein_pianas(list_protein_piana_a)
    dic_proteinPiana_b = _unique_protein_pianas(list_protein_piana_b)

    for proteinPiana_a in dic_proteinPiana_a:
        for proteinPiana_b in dic_proteinPiana_b:

            if verbose:
                sys.stderr.write( "   - inserting interaction: %s --> %s\n" %(proteinPiana_a, proteinPiana_b))

            # the interaction is labelled as coming from a completion as soon as
            # one of its two proteinPianas comes only from a completion
            if dic_proteinPiana_a[proteinPiana_a] == "completion" or dic_proteinPiana_b[proteinPiana_b] == "completion":
                source_db = "ori_c"
            else:
                source_db = "ori"

            piana_access.insert_interaction(proteinPianaA_value = proteinPiana_a,
                                            isSourceA_value = 1,
                                            proteinPianaB_value = proteinPiana_b,
                                            isSourceB_value = 1,
                                            interactionConfidence_value = 1,
                                            methodDescription_value = "pred_struct",
                                            sourceDBDescription_value = source_db,
                                            confidenceAssignedSourceDB_value=1)
        # END OF for proteinPiana_b in dic_proteinPiana_b:
    # END OF for proteinPiana_a in dic_proteinPiana_a:

    if not list_protein_piana_a or not list_protein_piana_b:
        # counted regardless of verbosity, so the counter does not depend on --verbose
        number_of_interactions_no_id += 1
        if verbose:
            sys.stderr.write("proteinPiana not found for %s or %s \n" %(swissacc_a, swissacc_b))


# opening file descriptor to input file
ori_fd = open(ori_file,"r")

#
# Reading external DB "ori" and inserting its data into piana DB
#
# try/finally makes sure the input file is closed even if the parsing or the
# database insertion fails
try:
    if verbose:
        print("Reading data from ori file...")

    # new_entry_count is the position inside the current entry:
    #     0 --> no "Entry" line read yet
    #     1 --> "Entry" line read, next line is the description line (skipped)
    #     2 --> next line is the first protein of the interaction
    #     3 --> next line is the second protein of the interaction
    #     4 --> entry completely read
    new_entry_count = 0
    swissacc_a = None
    swissacc_b = None

    for line in ori_fd:

        # Retrieving fields from the line (separated by a single space)
        line_fields = multiple_spaces.sub(" ", line).split(" ")

        if line_fields[0] == "Entry":
            # new entry reached:
            #     1 - insert information about previous entry (there is no
            #         previous entry when this is the first "Entry" line)
            #     2 - initialize variables for the entry that starts here
            if new_entry_count > 0:
                _insert_entry(swissacc_a, swissacc_b)

            new_entry_count = 1
            swissacc_a = None
            swissacc_b = None

        elif new_entry_count == 1:
            # skipping second line of the entry (not relevant to parse)
            new_entry_count += 1

        elif new_entry_count == 2:
            # first protein of the interaction being parsed
            swissacc_a = _extract_swissacc(line_fields)
            new_entry_count += 1

        elif new_entry_count == 3:
            # second protein of the interaction being parsed
            swissacc_b = _extract_swissacc(line_fields)
            new_entry_count += 1

        # END OF if line_fields[0] == "Entry": elif new_entry_count == 1: .....

    # the last entry of the file is not followed by an "Entry" line: insert it here
    if new_entry_count > 0:
        _insert_entry(swissacc_a, swissacc_b)

finally:
    ori_fd.close()


if verbose:
    sys.stderr.write( "All done! There were %s interactions without swissaccession and %s without a proteinPiana\n\n" %(
        number_of_interactions_null_swissacc,
        number_of_interactions_no_id))
