"""
File        : scop2piana.py
Author      : Ramon Aragues
Creation    : 8.11.2004
Contents    : script that fills up tables in database piana from  "scop" 
Called from : 

=======================================================================================================

This program parses a text file with scop information and inserts it into piana

The specific files from scop that are parsed by this parser are: dir.cla.scop.txt_xxx


Command line option '--help' describes usage of this program


"""

# scop2piana.py: script that fills up tables in database piana from  "scop" 
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt

import re
import readline

from PianaDBaccess import *

verbose = 0


# ----------------------
# Function usage()
# ----------------------
def usage():
    """Print the command-line help of scop2piana.py to stdout.

    Every line is emitted with the single-argument, parenthesised form of
    print so that the output is identical under the Python 2 print statement
    and the Python 3 print function.
    """
    separator = "--------------------------------------------------------------------------------------------------------------"

    help_lines = (
        separator,
        "This program fills up tables in database piana related to external DB 'scop' \n",
        "Usage: python scop2piana.py  --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass",
        "                              --input-file=input_file_name [--help] [--verbose]",
        "\nwhere:",
        "     piana_dbname : name of database piana to be used (required)",
        "     piana_dbhost : name of host where database piana to be used is placed (required)",
        "     piana_dbuser : username accessing the database (not required in most systems)",
        "     piana_dbpass : password of username accessing the database (not required in most systems)",
        "     input_file_name    : the name of the scop input file (dir.cla.scop.txt_xxx)",
        "     --help         : prints this message and exits",
        "     --verbose      : prints process info to stdout",
        separator,
    )

    for help_line in help_lines:
        print(help_line)
        
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """Parse sys.argv and store the recognised options in module globals.

    The following module-level names are assigned when the matching option
    is present on the command line (otherwise they are left untouched):

        -f / --input-file    -> input_file_name
        -n / --piana-dbname  -> piana_dbname
        -o / --piana-dbhost  -> piana_dbhost
        -u / --piana-dbuser  -> piana_dbuser
        -w / --piana-dbpass  -> piana_dbpass
        -v / --verbose       -> verbose (set to 1)

    Positional arguments are accepted by getopt but ignored.

    Raises SystemExit with status 2 (after reporting the problem on stderr
    and printing the usage text) when getopt rejects the command line, and
    with status 0 (after printing the usage text) for -h / --help.
    """
    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass

    global input_file_name

    global verbose

    short_options = "vhf:n:o:u:w:"
    long_options = ["verbose", "help", "input-file=",
                    "piana-dbname=", "piana-dbhost=", "piana-dbuser=", "piana-dbpass="]

    try:
        opts, _positional = getopt.getopt(sys.argv[1:], short_options, long_options)
    except getopt.GetoptError as err:
        # tell the user which option was rejected before showing the help
        sys.stderr.write("Error: %s\n" % err)
        usage()
        sys.exit(2)

    for option, value in opts:

        if option in ("-f", "--input-file"):
            input_file_name = value

        elif option in ("-n", "--piana-dbname"):
            piana_dbname = value

        elif option in ("-o", "--piana-dbhost"):
            piana_dbhost = value

        elif option in ("-u", "--piana-dbuser"):
            piana_dbuser = value

        elif option in ("-w", "--piana-dbpass"):
            piana_dbpass = value

        elif option in ("-v", "--verbose"):
            verbose = 1

        elif option in ("-h", "--help"):
            # asking for help is not an error: print it and exit successfully
            usage()
            sys.exit(0)



# --------
# --------
#  Main()               
# --------                               
# --------

def _parse_scop_line(line):
    """Extract the pdb code.chain and the cf/sf/fa codes from one scop line.

    Line examples (whitespace separated, 6 fields):
      d1dlya_ 1dly    A:            a.1.1.1 14983   cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=46462,px=14983
      d1dlya_ 1dly    A:12-123      a.1.1.1 14983   cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=46462,px=14983

    Fields used:
      field 1 is the pdb code
      field 2 is the pdb chain followed by ':' and an optional XX-YY range
      field 5 is the comma-separated list of name=value codes
    Fields 3 and 4 are not used here (presumably the SCOP classification
    string and the domain identifier -- confirm against the SCOP docs).

    Returns a tuple (pdb_code_chain, cf_value, sf_value, fa_value), or None
    when the line does not have exactly 6 fields or lacks any of the cf, sf
    or fa codes.
    """
    line_fields = line.split()

    if len(line_fields) != 6:
        return None

    pdb_code = line_fields[1]

    pdb_chain_fields = line_fields[2].split(":")
    if len(pdb_chain_fields) == 2:
        # standard line, chain:something (where something can be nothing or xx-yy)
        pdb_chain = pdb_chain_fields[0]
    else:
        # if pdb chain format is incorrect, set to A
        pdb_chain = "A"

    if pdb_chain == "-":
        # in piana, there are no empty chains: set to A
        pdb_chain = "A"

    # collect the codes of THIS line only, so that a missing code can never
    # be silently replaced by the value read from a previous line
    codes = {}
    for code_value in line_fields[5].split(","):
        code_name, _separator, code = code_value.partition("=")
        codes[code_name] = code

    cf_value = codes.get("cf")
    sf_value = codes.get("sf")
    fa_value = codes.get("fa")

    if not (cf_value and sf_value and fa_value):
        return None

    return (pdb_code + "." + pdb_chain, cf_value, sf_value, fa_value)


piana_dbname = None
piana_dbuser = None
piana_dbhost = None
piana_dbpass = None

input_file_name = None

number_of_proteins_added = 0
number_of_proteins_no_id = 0
number_of_lines = 0
number_incomplete_lines = 0

# parsing arguments from the command line
parseArguments()

if input_file_name is None:
    raise ValueError("You must set a scop to parse\n")


# Initialising connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost=piana_dbhost, dbuser=piana_dbuser, dbpassword=piana_dbpass)

#
# Reading external DB "scop" and inserting its data into piana DB
#

# the with statement guarantees the input file is closed, even on error
with open(input_file_name, "r") as scop_fd:

    if verbose:
        sys.stderr.write("Reading data from scop file...\n")

    for line in scop_fd:

        number_of_lines += 1

        if line.startswith("#"):
            # skip comments
            continue

        parsed_line = _parse_scop_line(line)

        if parsed_line is None:
            # skip incomplete lines (wrong number of fields or missing codes)
            number_incomplete_lines += 1
            continue

        pdb_code_chain, cf_value, sf_value, fa_value = parsed_line

        # pdb.chain points directly to a species: no need to check tax id
        list_proteinPiana = piana_access.get_list_protein_piana(proteinCode_value=pdb_code_chain,
                                                                proteinCodeType_value=PianaGlobals.pdb_chain_col,
                                                                tax_id_value=0,
                                                                source_db_info="no")

        if not list_proteinPiana:
            # counted regardless of verbosity; only the message is optional
            number_of_proteins_no_id += 1
            if verbose:
                sys.stderr.write("proteinPiana not found for %s \n" %(pdb_code_chain))
            continue

        for proteinPiana in list_proteinPiana:

            if verbose:
                sys.stderr.write("Inserting cf=%s sf=%s fa=%s for proteinPiana %s of pdb_chain %s\n" %(cf_value, sf_value, fa_value,
                                                                                                       proteinPiana, pdb_code_chain) )

            piana_access.insert_protein_scop(cf=cf_value, sf=sf_value, fa=fa_value,
                                             proteinPiana_value=proteinPiana, proteinScopSource_value="scop")
            number_of_proteins_added += 1
        # END OF for proteinPiana in list_proteinPiana:

    # END OF for line in scop_fd:

if verbose:
    sys.stderr.write("All done! Number of proteins added: %s . Number of proteins without proteinPiana: %s\nnumber lines=%s\nnumber incomplete %s\n" %(
        number_of_proteins_added,
        number_of_proteins_no_id,
        number_of_lines,
        number_incomplete_lines))
