"""
File        : pdbsprotec2piana.py
Author      : Ramon Aragues
Creation    : 10.2004
Contents    : fills up tables in database piana with information from pdbsprotec
Called from : 

=======================================================================================================

This file implements a program that fills up tables in database piana with information from pdbsprotec, which contains
a mapping between pdb codes, swissprot and ec codes

pdbsprotec can be downloaded from http://www.bioinf.org.uk/pdbsprotec/

Before running, uniprot (swissprot and trembl) has to be in pianaDB

Files that can be parsed using pdbsprotec2piana.py look like this:


 pdbcode | chainid | res1 | res2 | sprot  |     ec
---------+---------+------+------+--------+------------
 101m    |         | 1    | 153  | P02185 | 0.0.0.0
 101m    |         | 1    | 153  | P02185 | 0.0.0.0
 102l    |         | 1    | 162  | P00720 | 3.2.1.17
 102m    |         | 1    | 153  | P02185 | 0.0.0.0 
 104l    | A       | 1    | 162  | P00720 | 3.2.1.17
 104l    | B       | 1    | 162  | P00720 | 3.2.1.17
 104m    |         | 1    | 153  | P02185 | 0.0.0.0

 

"""

# pdbsprotec2piana.py: fills up tables in database piana with information from pdbsprotec
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues

import sys
import getopt

import re
import readline
import MySQLdb

from PianaDBaccess import *


# ---------------------------------------------------------------
# Set here the default values for command line arguments
# ---------------------------------------------------------------
#

# These will be the values taken by the program when called directly from build_database.py:


# verbose: 0 keeps the program silent; parseArguments() sets it to 1 when
# -v/--verbose is given, which enables the progress and summary messages.
verbose = 0


# ----------------------
# Function usage()
# ----------------------
def usage():
    """Print the command-line help of pdbsprotec2piana.py to standard output."""
    banner = "--------------------------------------------------------------------------------------------------------------"

    # One entry per printed line; embedded "\n" characters produce the blank
    # separator lines of the help text.
    help_lines = (
        banner,
        "This program fills up tables in database piana with information from pdbsprotec \n",
        "Usage: python pdbsprotec2piana.py  --input-file=input_file_name  --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost",
        "                                --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass [--help] [--verbose]",
        "\nwhere:",
        "      input_file_name    : file name of input file containing pdbsprotec data ",
        "      piana_dbname       : name of database piana to be used (required)",
        "      piana_dbhost       : name of host where database piana to be used is placed (required)",
        "      piana_dbuser       : username accessing the database (not required in most systems)",
        "      piana_dbpass       : password of username accessing the database (not required in most systems)",
        "     --help         : prints this message and exits",
        "     --verbose      : prints process info to stdout",
        banner,
    )

    for help_line in help_lines:
        print(help_line)
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """Parse sys.argv[1:] and store the recognised options in module globals.

    Options handled (short / long form):
        -i / --input-file    -> global input_file
        -n / --piana-dbname  -> global piana_dbname
        -o / --piana-dbhost  -> global piana_dbhost
        -u / --piana-dbuser  -> global piana_dbuser
        -w / --piana-dbpass  -> global piana_dbpass
        -v / --verbose       -> global verbose is set to 1
        -h / --help          -> prints the usage text and exits

    Globals whose option does not appear on the command line are not
    assigned by this function.

    Exits with status 2, after writing the error and the usage text, when
    getopt rejects the command line (unknown option or missing value).
    Exits with status 0 after printing the usage text when help is requested.
    """
    global input_file
    global verbose

    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass

    try:
        opts, _ = getopt.getopt(sys.argv[1:], "vhi:n:o:u:w:", ["verbose", "help", "input-file=", "piana-dbname=", "piana-dbhost=",
                                                               "piana-dbuser=", "piana-dbpass="])
    except getopt.GetoptError:
        # The exception is fetched through sys.exc_info() because that form is
        # accepted by every Python version, unlike "except X, e" / "except X as e".
        bad_opt = sys.exc_info()[1]
        # Trailing newline so the message does not run into the usage banner.
        sys.stderr.write("%s\n" % (bad_opt,))
        usage()
        sys.exit(2)

    for option, value in opts:

        if option in ("-i", "--input-file"):
            input_file = value

        elif option in ("-n", "--piana-dbname"):
            piana_dbname = value

        elif option in ("-o", "--piana-dbhost"):
            piana_dbhost = value

        elif option in ("-u", "--piana-dbuser"):
            piana_dbuser = value

        elif option in ("-w", "--piana-dbpass"):
            piana_dbpass = value

        elif option in ("-v", "--verbose"):
            verbose = 1

        elif option in ("-h", "--help"):
            # Asking for help is not an error: print the usage and exit with success.
            usage()
            sys.exit(0)

    # END OF for option,value in opts:
            
            
# --------
# --------
#  Main()               
# --------                               
# --------

# Defaults for the command line arguments; parseArguments() overwrites the
# ones that are given on the command line.
input_file = None

piana_dbname = None
piana_dbhost = None
piana_dbuser = None
piana_dbpass = None

# parsing arguments from the command line
parseArguments()

if verbose:
    print("Arguments read are: input-file= %s || piana_dbname= %s || piana_dbhost= %s" % (input_file, piana_dbname, piana_dbhost))

# Without an input file there is nothing to parse: fail with a clear message
# before opening a database connection, instead of crashing on open(None).
if input_file is None:
    sys.stderr.write("No input file given: use --input-file=input_file_name\n")
    usage()
    sys.exit(2)

# Initializing connection to piana
piana_access = PianaDBaccess(dbname=piana_dbname, dbhost=piana_dbhost, dbuser=piana_dbuser, dbpassword=piana_dbpass)

# Label passed as the source of every pdb code and ec code inserted below
sourceDB = "pdbsprotec"

# Statistics reported at the end of the run (when verbose)
number_of_lines = 0
number_of_sprots_with_no_id = 0
number_of_pds_inserted = 0

if verbose:
    sys.stderr.write("Processing file\n")

pdbsprotec_input_file_fd = open(input_file, 'r')

# try/finally guarantees the input file is closed even if an insertion fails
try:
    # while there are lines, insert info into piana
    for line in pdbsprotec_input_file_fd:

        number_of_lines += 1

        if number_of_lines <= 2:
            # skip two first header lines
            continue

        line_fields = line.split("|")  # line_fields[0] is pdbcode
                                       # line_fields[1] is chainid
                                       # line_fields[2] is res1   (starting residue number)
                                       # line_fields[3] is res2   (ending residue number)
                                       # line_fields[4] is sprot
                                       # line_fields[5] is ec

        if len(line_fields) != 6:
            # not a data row (e.g. a blank line): ignore it
            continue

        pdbcode = line_fields[0].strip()
        chainid = line_fields[1].strip()
        sprot = line_fields[4].strip()
        ec = line_fields[5].strip()

        if verbose:
            sys.stderr.write("Entry processed and fields obtained are:\n")
            sys.stderr.write("   -pdbcode : %s\n" % (pdbcode))
            sys.stderr.write("   -chainid : %s\n" % (chainid))
            sys.stderr.write("   -sprot : %s\n" % (sprot))
            sys.stderr.write("   -ec : %s\n" % (ec))

        # check if the pdbsprotec protein sequence already exists in piana or not
        # NOTE(review): tax_id_value=0 presumably means "do not restrict by species" - confirm in PianaDBaccess
        list_sprot_proteinPiana = piana_access.get_list_protein_piana(proteinCode_value=sprot,
                                                                      proteinCodeType_value=PianaGlobals.swissAccessionID_col,
                                                                      tax_id_value=0)

        for sprot_proteinPiana in list_sprot_proteinPiana:
            # for each proteinPiana, insert a new pdb entry and a new ec code

            if pdbcode != "":

                if verbose:
                    sys.stderr.write("Inserting correspondence between proteinPiana <%s> and pdcode <%s>, chain <%s>\n" % (sprot_proteinPiana,
                                                                                                                          pdbcode,
                                                                                                                          chainid))

                # counted regardless of verbosity so the statistic does not depend on logging
                number_of_pds_inserted += 1
                piana_access.insert_pdb_code(pdb_code_value=pdbcode, proteinPiana_value=sprot_proteinPiana, chain_value=chainid,
                                             pdb_source_value=sourceDB)

            if ec != "" and ec != "0.0.0.0":
                # don't insert if ec doesn't carry content (empty field or the 0.0.0.0 placeholder)

                if verbose:
                    sys.stderr.write("Inserting correspondence between proteinPiana <%s> and ec <%s>\n" % (sprot_proteinPiana,
                                                                                                          ec))

                piana_access.insert_protein_ec(ec_id=ec, proteinPiana_value=sprot_proteinPiana, proteinECSource_value=sourceDB)

        # END OF for sprot_proteinPiana in list_sprot_proteinPiana:

        if not list_sprot_proteinPiana:
            # counted regardless of verbosity so the statistic does not depend on logging
            number_of_sprots_with_no_id += 1
            if verbose:
                sys.stderr.write("No proteinPiana found for sprot %s\n" % (sprot))

    # END OF for line in pdbsprotec_input_file_fd:
finally:
    pdbsprotec_input_file_fd.close()

if verbose:
    sys.stderr.write("Number of lines read: %s. Number of sprots without a proteinPiana: %s. Number of pdbs inserted: %s.\n" % (number_of_lines,
                                                                                                                                number_of_sprots_with_no_id,
                                                                                                                                number_of_pds_inserted))
