"""
File        : uniprot2piana.py
Author      : Ramon Aragues
Creation    : 16.3.2004
Contents    : fills up tables in database piana with information from swissprot
Called from : 

=======================================================================================================

This file implements a program that fills up tables in database piana with information of uniprot databases

This parser uses biopython libraries and methods

Command line option '--help' describes usage of this program

For more details on how to use it, read piana/README.populate_piana_db
"""

# uniprot2piana.py: fills up tables in database piana with information from swissprot
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues


import sys
import getopt
import re
import readline
import MySQLdb

from Bio.SwissProt import SProt
from Bio import File


from PianaDBaccess import *


# Verbosity flag: 0 keeps the run quiet. parseArguments() sets it to 1 when
# '--verbose' is given, which makes the main loop trace each record on stderr.
verbose = 0

# ---------------------------------------------------------------
# Set here the default values for command line arguments
# ---------------------------------------------------------------

# Tables that will be populated when the parser runs in mode 'tables'.
# In mode 'scratch' this list is ignored (every table handled by this script is filled).
tables_to_fill = [PianaGlobals.geneName_table]   # valid tables are those *_table in PianaGlobals

# ----------------------
# Function usage()
# ----------------------
def usage():
    """Write the command line help of this program to standard output.

    The help text lists every option understood by parseArguments(). It is
    assembled from a tuple of lines and written in one go, each line being
    terminated by a newline.
    """
    help_lines = (
        "--------------------------------------------------------------------------------------------------------------",
        "This program fills up tables in database piana with information from swissprot or TrEMBL (no different treatment required)\n",
        "Usage: python uniprot2piana.py  --input-file=input_file_name --piana-dbname=piana_dbname --piana-dbhost=piana_dbhost",
        "                                        --piana-dbuser=piana_dbuser --piana-dbpass=piana_dbpass --mode=mode [--help] [--verbose]",
        "\nwhere:",
        "      input_file_name     : file name of input file containing swissprot or TrEMBL data",
        "      piana_dbname       : name of database piana to be used (required )",
        "      piana_dbhost       : name of host where database piana to be used is placed (required)",
        "      piana_dbuser       : username accessing the database (not required in most systems)",
        "      piana_dbpass       : password of username accessing the database (not required in most systems)",
        "      mode               : sets mode to be used by parser. Valid modes are:",
        "                             - scratch: piana database is empty, create it from scratch",
        "                             - tables:  fill only tables indicated in tables_to_fill (see code)",
        "                                       (tables which must haven been previously emptied)",
        "     --help         : prints this message and exits",
        "     --verbose      : prints process info to stdout",
        "--------------------------------------------------------------------------------------------------------------",
    )

    sys.stdout.write("\n".join(help_lines) + "\n")
        

   
# ---------------------------
# Function parseArguments()                                               
# --------------------------- 

def parseArguments():
    """Parse the command line into the module-level configuration globals.

    Recognised long options are --input-file, --piana-dbname, --piana-dbhost,
    --piana-dbuser, --piana-dbpass, --mode, --verbose and --help. The value of
    each option is stored in the global of the corresponding name
    (input_file, piana_dbname, piana_dbhost, piana_dbuser, piana_dbpass,
    mode); --verbose sets the global 'verbose' to 1.

    The globals 'mode' and 'input_file' must already exist (set to None) when
    this function is called, since they are checked after the options have
    been processed.

    The program is terminated with exit status 2, after printing the usage
    text, when:
      - an unknown or malformed option is given
      - --help is requested
      - mode is missing or is neither "scratch" nor "tables"
      - no input file was given
    """
    global input_file
    global verbose

    global piana_dbname
    global piana_dbhost
    global piana_dbuser
    global piana_dbpass

    global mode

    try:
        opts, args = getopt.getopt(sys.argv[1:], "", ["verbose", "help", "input-file=", "piana-dbname=", "piana-dbhost=",
                                                      "piana-dbuser=", "piana-dbpass=", "mode="])
    except getopt.GetoptError:
        # sys.exc_info() is used to get hold of the exception so that this code
        # does not depend on a version-specific "except ... , name" syntax
        bad_opt = sys.exc_info()[1]

        # print the error (on its own line), then help information, and exit
        sys.stderr.write(str(bad_opt) + "\n")
        usage()
        sys.exit(2)

    for option, value in opts:

        if option == "--input-file":
            input_file = value

        elif option == "--piana-dbname":
            piana_dbname = value

        elif option == "--piana-dbhost":
            piana_dbhost = value

        elif option == "--piana-dbuser":
            piana_dbuser = value

        elif option == "--piana-dbpass":
            piana_dbpass = value

        elif option == "--mode":
            mode = value

        elif option == "--verbose":
            verbose = 1

        elif option == "--help":
            # print help information and exit
            usage()
            sys.exit(2)

    # END OF for option,value in opts:

    if mode not in ("scratch", "tables"):
        # also covers mode being None (option --mode not given)
        sys.stdout.write("You didn't set a parsing mode or mode is invalid\n")
        usage()
        sys.exit(2)

    if not input_file:
        # without this check the program would fail later on, with an obscure
        # error, when trying to open the input file
        sys.stdout.write("You didn't set an input file\n")
        usage()
        sys.exit(2)
            
            
# --------
# --------
#  Main()               
# --------                               
# --------
# Defaults for everything that parseArguments() may set from the command line.
# They have to exist before the call: parseArguments() only assigns a global
# when the matching option is present, and it reads 'mode' afterwards.
mode = None
input_file = None
piana_dbname = piana_dbhost = piana_dbuser = piana_dbpass = None

# read the command line into the globals above
parseArguments()

# the input file is opened before connecting to the database
input_file_fd = open(input_file, 'r')

# connection to the piana database
piana_access = PianaDBaccess(dbname=piana_dbname,
                             dbhost=piana_dbhost,
                             dbuser=piana_dbuser,
                             dbpassword=piana_dbpass)

# biopython swissprot parser: the iterator hands out one parsed record
# for each call to next()
swiss_parser = SProt.RecordParser()
swiss_iterator = SProt.Iterator(handle=input_file_fd, parser=swiss_parser)

# source database of the record being processed (set for each record in the main loop)
sourceDB = None

# first record of the input file
swiss_record = swiss_iterator.next()

# while record read is not None, parse the record and insert data into piana
while swiss_record is not None:

    if verbose:
        sys.stderr.write( "===================================\n")
        sys.stderr.write( "            NEW ENTRY\n")
        sys.stderr.write( "===================================\n")


    # first of all, we set the source of this protein entry from the data
    # class of the record: STANDARD is SwissProt, PRELIMINARY is TrEMBL
    if swiss_record.data_class == "STANDARD":
        sourceDB = "SwissProt"
    elif swiss_record.data_class == "PRELIMINARY":
        sourceDB = "TrEMBL"
    else:
        sourceDB = "Unknown"


    # since a protein is defined by its sequence and tax_id, first thing
    # to clarify is whether we have a tax_id for this protein or not
    #  Special cases:  - can we have a sequence assigned to two species?
    #                  -
    # If there is no taxonomy_id or there is more than one, we use a dummy
    # tax_id (0) for this protein... which means that it will correspond to
    # whichever tax_id

    if swiss_record.taxonomy_id and len(swiss_record.taxonomy_id) == 1 and swiss_record.taxonomy_id[0]:
        tax_id_value = int(swiss_record.taxonomy_id[0])
    else:
        tax_id_value = 0

    if verbose:
        sys.stderr.write("parsing mode is: %s \n" %mode )
        # the newline ends the message: no characters must follow it, otherwise
        # they show up at the beginning of the next message
        sys.stderr.write("sourceDB is: %s\n" %sourceDB )
        sys.stderr.write("protein sequence is: %s\n" %swiss_record.sequence )
        sys.stderr.write("tax id is: %s\n" %tax_id_value )

    # --- table protein ---
    if mode == "tables":
        # since we are only updating certain tables, get the protein piana that should already exist in the database for this sequence
        # TO DO!!! species_name_value set to 'all' is not logical....
        list_inserted_proteinPiana = piana_access.get_list_protein_piana(proteinCode_value=swiss_record.sequence,
                                                                         proteinCodeType_value= PianaGlobals.proteinSequence_col,
                                                                         tax_id_value = tax_id_value,
                                                                         source_db_info="no")


        num_of_pps = len(list_inserted_proteinPiana)
        
        # check that only one proteinPiana was found for the sequence (as it should be)
        if num_of_pps == 0:
            raise ValueError("trying to update tables for a (sequence, tax_id) that is not in the database")
        
        elif num_of_pps > 1:
            raise ValueError("trying to update tables for a (sequence, tax_id) that has multiple proteinPianas associated")
        else:
            # we will work with that proteinPiana from now on
            inserted_proteinPiana = list_inserted_proteinPiana[0]

    elif mode == "scratch":
        # inserting sequence  for current protein into piana. insert_protein fills all fields of table protein
        # if inserting a sequence that was already in the database, it will update fields without creating a new proteinPiana

        # TO CHECK!!! What's the use of updating fields that are automatically generated from the sequence... if the sequence
        # is the same, then the fields will be the same... then, why updating instead of "doing nothing"? This has to be changed
        # in PianaDBaccess.insert_protein if found to be a reasonable argument... maybe updating is good when we didn't calculate
        # correctly the MW, IP or MD5... 
        
        if swiss_record.sequence != "":
            inserted_proteinPiana = piana_access.insert_protein(proteinSequence_value= swiss_record.sequence,
                                                                tax_id_value = tax_id_value)


    # --- table swissProt ---
    # the entry name of the record is stored as the swissProt code of the protein
    if mode == "scratch" or (mode == "tables" and PianaGlobals.swissProt_table in tables_to_fill):

        entry_name = swiss_record.entry_name

        if verbose:
            sys.stderr.write( "swissprotID is: %s \n" %entry_name )

        if entry_name != "":
            piana_access.insert_swissProt_code(swissProt_code_value= entry_name,
                                               proteinPiana_value= inserted_proteinPiana,
                                               swissProt_source_value= sourceDB)

    # --- table swissAccession ---
    # every non-empty accession number is inserted; only the first one
    # inserted is flagged as the primary accession
    if mode == "scratch" or (mode == "tables" and PianaGlobals.swissAccession_table in tables_to_fill):

        if verbose:
            sys.stderr.write( "accession numbers are: %s \n"  %swiss_record.accessions)

        primary_flag = 1

        for accession in swiss_record.accessions:

            if accession == "":
                # nothing to insert for an empty accession
                continue

            piana_access.insert_swissAccession_code(swissAccession_code_value= accession,
                                                    proteinPiana_value= inserted_proteinPiana,
                                                    swissAccession_source_value= sourceDB,
                                                    isPrimary_value= primary_flag)

            # accessions following the first inserted one are secondary
            primary_flag = 0

    # --- table geneName ---
    # The gene name line is split into [field_name, field_value] pairs. The
    # value of field "Name" and each comma-separated value of fields
    # "Synonyms", "OrderedLocusNames" and "ORFNames" are inserted as gene names.
    if mode == "scratch" or (mode == "tables" and PianaGlobals.geneName_table in tables_to_fill):

        if verbose:
            sys.stderr.write( "swiss_record.gene_name is: %s \n"  %swiss_record.gene_name )

        # parse_string_field_value gets pairs [field_name, field_value] from a string where
        # format looks something like "field_name=field_value;field_name=field_value...."
        gene_name_fields = utilities.parse_string_field_value(input_string= swiss_record.gene_name,
                                                              separator_field_value="=",
                                                              global_separators=[';','\n','"'])

        # names made only of digits are not inserted
        only_digits = '^[0-9]+$'

        for gene_name_field in gene_name_fields or []:

            field_kind = gene_name_field[0]

            if field_kind == "Name":
                # the primary name is a single value
                raw_names = [gene_name_field[1]]

                if verbose:
                    sys.stderr.write( "individual gene name (primary) is: <%s>" %(gene_name_field[1].strip().strip("'").strip('"')) )

            elif field_kind in ("Synonyms", "OrderedLocusNames", "ORFNames"):
                # these fields hold a comma separated list of names
                raw_names = gene_name_field[1].split(",")

            else:
                # any other field does not describe a gene name
                raw_names = []

            for raw_name in raw_names:

                # remove surrounding blanks and quotes
                clean_name = raw_name.strip().strip("'").strip('"')

                # avoid inserting empty values or numbers
                if clean_name and not re.match(only_digits, clean_name):
                    piana_access.insert_geneName_code(geneName_code_value= clean_name,
                                                      proteinPiana_value= inserted_proteinPiana,
                                                      geneName_source_value= sourceDB)
            # END OF for raw_name in raw_names:

        # END OF for gene_name_field in gene_name_fields or []:

    # --- table species ---
    # every non-empty taxonomy id of the record is linked to the protein
    if mode == "scratch" or (mode == "tables" and PianaGlobals.species_table in tables_to_fill):

        if verbose:
            sys.stderr.write( "swiss record taxonomy: %s \n" %swiss_record.taxonomy_id)

        # an empty (or missing) list of taxonomy ids means there is nothing to insert
        for taxonomy_code in swiss_record.taxonomy_id or []:

            if taxonomy_code == "":
                continue

            piana_access.insert_protein_species(tax_id= taxonomy_code,
                                                proteinPiana_value= inserted_proteinPiana,
                                                proteinSpeciesSource_value= sourceDB)


    # --- table proteinDescription ---
    # The description field is split on ";" and each piece is inserted as a
    # separate description of the protein.
    if mode == "scratch" or (PianaGlobals.proteinDescription_table in tables_to_fill and mode =="tables"):

        if verbose:
            sys.stderr.write( "description is: %s \n"  %swiss_record.description)

        # example of field: Corticotropin-lipotropin precursor (Pro-opiomelanocortin) (POMC) Contains: NPP; Melanotropin gamma (Gamma-MSH);

        # inserting description of the protein
        if swiss_record.description != "":

            for one_description in swiss_record.description.split(";"):

                # do some preprocessing of the description: quotes, newlines and
                # backslashes are replaced by blanks
                clean_description = one_description.replace('"'," ").replace("\n", " ").replace("'"," ").replace('\\', " ").strip()

                # a description ending with ";" (as in the example above) leaves an
                # empty piece after the split: do not insert empty descriptions
                if clean_description:
                    piana_access.insert_protein_description(
                        description= clean_description,
                        proteinPiana_value= inserted_proteinPiana,
                        proteinDescriptionSource_value= sourceDB)

    # --- table proteinEC ---
    if mode == "scratch" or (PianaGlobals.proteinEC_table in tables_to_fill and mode =="tables"):

        # inserting EC code of the protein (EC code is found between parenthesis inside the description field, preceded by "EC")
        # example: synthase (EC 6.3.5.5); Aspartate carbamoyltransferase (EC 2.1.3.2)
        if swiss_record.description != "":

            # The capturing group makes findall return just the four-number code
            # (e.g. "6.3.5.5"), without "(EC " and ")". Only complete codes made
            # of four numbers are matched. A raw string is used so that the
            # backslashes reach the regular expression untouched.
            enzymes = re.findall(r"\(EC (\d+\.\d+\.\d+\.\d+)\)", swiss_record.description)

            for enzyme in enzymes:
                piana_access.insert_protein_ec(ec_id= enzyme,
                                               proteinPiana_value= inserted_proteinPiana,
                                               proteinECSource_value= sourceDB)
            
    # Each comment is a string "TOPIC: text". Only topics SUBCELLULAR LOCATION
    # and FUNCTION are inserted into piana.
    for comment in swiss_record.comments:

        # Split only on the first colon: the text of the comment can contain
        # colons itself, and it must not be cut at the first one of them
        comment_fields = comment.split(":", 1)

        if len(comment_fields) < 2:
            # no text after the topic: nothing to insert
            continue

        # comment_fields[0] can be of type:
        #
        # ALLERGEN  Information relevant to allergenic proteins
        # ALTERNATIVE PRODUCTS Description of the existence of related protein sequence(s) produced by alternative splicing of the same gene
        #                      or by the use of alternative initiation codons
        # BIOTECHNOLOGY Description of the use of a specific protein in a biotechnological process
        # CATALYTIC ACTIVITY Description of the reaction(s) catalyzed by an enzyme [1]
        # CAUTION Warning about possible errors and/or grounds for confusion
        # COFACTOR Description of an enzyme cofactor
        # DATABASE Description of a cross-reference to a network database/resource for a specific protein; see 3.11.2
        # DEVELOPMENTAL STAGE Description of the developmentally-specific expression of a protein
        # DISEASE Description of the disease(s) associated with a deficiency of a protein
        # DOMAIN Description of the domain structure of a protein
        # ENZYME REGULATION Description of an enzyme regulatory mechanism
        # FUNCTION General description of the function(s) of a protein
        # INDUCTION Description of the compound(s) or condition(s) that stimulate the synthesis of a protein
        # MASS SPECTROMETRY Reports the exact molecular weight of a protein or part of a protein as determined by mass spectrometric methods
        # MISCELLANEOUS Any comment which does not belong to any of the other defined topics
        # PATHWAY Description of the metabolic pathway(s) with which a protein is associated
        # PHARMACEUTICAL Description of the use of a protein as a pharmaceutical drug
        # POLYMORPHISM Description of polymorphism(s)
        # PTM Description of a posttranslational modification
        # RNA EDITING Description of any type of RNA editing that leads to one or more amino acid changes
        # SIMILARITY Description of the similaritie(s) (sequence or structural) of a protein with other proteins
        # SUBCELLULAR LOCATION Description of the subcellular location of the mature protein
        # SUBUNIT Description of the quaternary structure of a protein
        # TISSUE SPECIFICITY Description of the tissue specificity of a protein


        if comment_fields[0] == "SUBCELLULAR LOCATION":

            # --- table proteinsubcellularLocation ---
            if mode == "scratch" or (PianaGlobals.proteinSubcellularLocation_table in tables_to_fill and mode =="tables"):

                if verbose:
                    sys.stderr.write( "comment of type %s is: %s \n" %(comment_fields[0], comment_fields[1]))

                # double quotes and newlines are replaced by blanks before inserting
                piana_access.insert_protein_subcellularLocation(subcellularLocation = comment_fields[1].replace('"'," ").replace("\n", " ").strip(),
                                                                proteinPiana_value = inserted_proteinPiana,
                                                                proteinSubcellularLocationSource_value= sourceDB)

        elif comment_fields[0] == "FUNCTION":

            # --- table proteinFunction ---
            if mode == "scratch" or (PianaGlobals.proteinFunction_table in tables_to_fill and mode =="tables"):

                if verbose:
                    sys.stderr.write( "comment of type %s is: %s \n" %(comment_fields[0], comment_fields[1]))

                # double quotes and newlines are replaced by blanks before inserting
                piana_access.insert_protein_function(function= comment_fields[1].replace('"'," ").replace("\n", " ").strip(),
                                                     proteinPiana_value= inserted_proteinPiana,
                                                     proteinFunctionSource_value= sourceDB)

    # END OF for comment in swiss_record.comments:
    

    
    # Each cross reference is a sequence whose first element is the name of the
    # external database, followed by the identifiers in that database. Only
    # PIR, EMBL and InterPro references are inserted into piana.
    for cross_reference in swiss_record.cross_references:

        # cross_reference[0] can be of type:
        #
        # EMBL  Nucleotide sequence database of EMBL/EBI (see 3.12.6)  Nucleic Acids Res. 32:D27-D30(2004); PMID: 14681351
        # Aarhus/Ghent-2DPAGE Human keratinocyte 2D gel protein database from Aarhus and Ghent universities FEBS Lett. 430:64-72(1998); PMID: 9678596
        # ANU-2DPAGE Australian National University 2-DE database Proteomics 1:1149-1161(2001); PMID: 11990509
        # COMPLUYEAST-2DPAGE 2-DE database at Universidad Complutense de Madrid J. Chromatogr. B. Biomed. Sci. Appl. 787:129-148(1993); PMID: 12659738
        # DictyBase Dictyostelium discoideum online informatics resource Nucleic Acids Res. 32:D332-D333(2004); PMID: 14681427
        # ECO2DBASE Escherichia coli gene-protein database (2D gel spots) (ECO2DBASE) Electrophoresis 20:2149-2159(1999); PMID: 9298644
        # EcoGene Escherichia coli K12 genome database (EcoGene) Nucleic Acids Res. 28:60-64(2000); PMID: 10592181
        # FlyBase Drosophila genome database (FlyBase) Nucleic Acids Res. 32:D418-D420(2004); PMID: 14681446
        # GeneDB_SPombe Schizosaccharomyces pombe GeneDB Nucleic Acids Res. 32:D339-D343(2004); PMID: 14681429
        # Genew Human gene nomenclature database (Genew) Nucleic Acids Res. 32:D255-D257(2004); PMID: 14681406
        # GermOnline GermOnline database Nucleic Acids Res. 32:D560-D567(2004); PMID: 14681481
        # GK Genome Knowledgebase (GK) http://www.genomeknowledge.org/
        # GlycoSuiteDB Database of glycan structures (GlycoSuiteDB) Nucleic Acids Res. 31:511-513(2003); PMID: 12520065
        # GO Gene Ontology (GO) database Nucl. Acids. Res. 32:D258-D261(2004); PMID: 14681407
        # Gramene Comparative mapping resource for grains (Gramene) Plant Physiol. 130:1606-1613(2002); PMID: 12481044
        # HAMAP Database of microbial protein families (HAMAP) Comput. Biol. Chem. 27:49-58(2003)
        # HIV HIV sequence database Kuiken C.L. et al., In: Theoretical Biology and Biophysics Group, Los Alamos National Laboratory, Los Alamos, NM.
        # HSC-2DPAGE Harefield hospital 2D gel protein databases (HSC-2DPAGE) Electrophoresis 18:471-479(1997); PMID: 9150926
        # HSSP Homology-derived secondary structure of proteins database (HSSP) Nucleic Acids Res. 27:244-247(1999); PMID: 9847191
        # InterPro Integrated resource of protein families, domains and functional sites (InterPro) Nucleic Acids Res. 31:315-318(2003); PMID: 12520011
        # Leproma Mycobacterium leprae genome database (Leproma) Lepr. Rev. 72:470-477(2001); PMID: 11826483
        # ListiList Listeria innocua and Listeria monocytogenes genomes database Microbiology 141:261-268(1995); PMID: 7704253
        # MaizeDB Maize Genetics/Genomics Database (MaizeGDB) Nucleic Acids Res. 32:D393-D397(2004); PMID: 14681441
        # Maize-2DPAGE Maize genome 2D Electrophoresis database (Maize-2DPAGE) Theor. Appl. Genet. 93:997-1005(1996)
        # MEROPS Peptidase database (MEROPS) Nucleic Acids Res. 32:D160-D164(2004); PMID: 14681384
        # MGD Mouse genome database (MGD) Nucleic Acids Res. 32:D476-D481(2004); PMID: 14681461
        # MIM Mendelian Inheritance in Man Database (MIM) Nucleic Acids Res. 30:52-55(2002); PMID: 11752252
        # MypuList Mycoplasma pulmonis genome database (MypuList) http://genolist.pasteur.fr/MypuList/
        # PDB 3D-macromolecular structure Protein Data Bank (PDB) Nucleic Acids Res. 31:489-491(2003); PMID: 12520059
        # Pfam Pfam protein domain database Nucleic Acids Res. 32:D138-141(2004); PMID: 14681378
        # PHCI-2DPAGE Parasite host cell interaction 2D-PAGE database http://www.gram.au.dk/
        # PhosSite Phosphorylation Site Database for prokaryotic proteins (In) Leslie M. (ed.); NetWatch. Science 294:1623-1623(2001)
        # PhotoList Photorhabdus luminescens genome database Microbiology 141:261-268(1995); PMID: 7704253
        # PIR Protein sequence database of the Protein Information Resource (PIR) Nucleic Acids Res. 30:35-37(2002); PMID: 11752247
        # PIRSF PIR SuperFamilies of iProClass Nucleic Acids Res. 32:D112-D124(2004); PMID: 14681371
        # PMMA-2DPAGE Purkyne Military Medical Academy 2D-PAGE database http://www.pmma.pmfhk.cz/
        # PRINTS Protein Fingerprint database (PRINTS) Nucleic Acids Res. 30:239-241(2002); PMID: 11752304
        # ProDom ProDom protein domain database Brief. Bioinform. 3:246-251(2002); PMID: 12230033
        # PROSITE PROSITE protein domain and family database (see 3.12.7) Nucleic Acids Res. 32:D134-137(2004); PMID: 14681377
        # REBASE Restriction enzymes and methylases database (REBASE) Nucleic Acids Res. 31:418-420(2003); PMID: 12520038
        # RGD Rat Genome Database (RGD) Nucleic Acids Res. 30:125-128(2002); PMID: 11752273
        # SagaList Streptococcus agalactiae NEM316 / Serotype III genome database Microbiology 141:261-268(1995); PMID: 7704253
        # SGD Saccharomyces Genome Database (SGD) Nucleic Acids Res. 32:D311-D314(2004); PMID: 14681421
        # Siena-2DPAGE 2D-PAGE database from the Department of Molecular Biology, University of Siena, Italy http://www.bio-mol.unisi.it/
        # SMART Simple Modular Architecture Research Tool (SMART) Nucleic Acids Res. 32:D142-D144(2004); PMID: 14681379
        # StyGene Salmonella typhimurium LT2 genome database (StyGene)
        # SubtiList Bacillus subtilis 168 genome database (SubtiList) Nucleic Acids Res. 30:62-65(2002); PMID: 11752255
        # SWISS-2DPAGE 2D-PAGE database from the Geneva University Hospital (SWISS-2DPAGE) Nucleic Acids Res. 28:286-288(2000); PMID: 10592248
        # TIGR The bacterial databases of 'The Institute of Genome Research' (TIGR) Nucleic Acid Res. 29:159-164(2001); PMID: 11125077
        # TIGRFAMs TIGR protein family database (TIGRFAMs) Nucleic Acids Res. 29:41-43(2001); PMID: 11125044
        # TRANSFAC Transcription factor database (TRANSFAC) Nucleic Acids Res. 31:374-378(2004); PMID: 12520026
        # TubercuList Mycobacterium tuberculosis H37Rv genome database (TubercuList) FEBS Lett. 452:7-10(1999); PMID: 10376668
        # WormPep Caenorhabditis elegans genome sequencing project protein database (WormPep) Genomics 46:200-216(1997); PMID: 9417907
        # ZFIN Zebrafish Information Network genome database (ZFIN) Nucleic Acids Res. 31:241-243(2003); PMID: 12519991


        # First and second identifier of the cross reference. An identifier is
        # set to None when it is missing from the cross reference or unknown
        # (unknown identifiers are coded as "" or "-"), so that short cross
        # references do not make the parser fail.
        if len(cross_reference) > 1 and cross_reference[1] != "" and cross_reference[1] != "-":
            xref_first_id = cross_reference[1]
        else:
            xref_first_id = None

        if len(cross_reference) > 2 and cross_reference[2] != "" and cross_reference[2] != "-":
            xref_second_id = cross_reference[2]
        else:
            xref_second_id = None


        if cross_reference[0] == "PIR":
            # first identifier is PIR accession number
            # second identifier is PIR Entry name

            if xref_first_id is not None:

                # --- table pirAccession ---
                if mode == "scratch" or (PianaGlobals.pirAccession_table in tables_to_fill and mode =="tables"):

                    if verbose:
                        sys.stderr.write( "cross reference to database %s is: %s  \n" %(cross_reference[0], cross_reference[1:]) )

                    piana_access.insert_pirAccession_code(pirAccession_code_value= xref_first_id,
                                                          proteinPiana_value= inserted_proteinPiana,
                                                          pirAccession_source_value= sourceDB)

            if xref_second_id is not None:

                # --- table pirEntry ---
                if mode == "scratch" or (PianaGlobals.pirEntry_table in tables_to_fill and mode =="tables"):

                    if verbose:
                        sys.stderr.write( "cross reference to database %s is: %s  \n" %(cross_reference[0], cross_reference[1:]) )

                    piana_access.insert_pirEntry_code(pirEntry_code_value= xref_second_id,
                                                      proteinPiana_value= inserted_proteinPiana,
                                                      pirEntry_source_value= sourceDB)


        elif cross_reference[0] == "EMBL":
            # first identifier is EMBL accession number
            # second identifier is EMBL Protein_ID

            if xref_first_id is not None:

                # --- table emblAccession ---
                if mode == "scratch" or (PianaGlobals.emblAccession_table in tables_to_fill and mode =="tables"):

                    if verbose:
                        sys.stderr.write( "cross reference to database %s is: %s  \n" %(cross_reference[0], cross_reference[1:]) )

                    # insert_emblAccession_code takes care of processing the version after the point (if it is there)
                    piana_access.insert_emblAccession_code(emblAccession_code_value= xref_first_id,
                                                           proteinPiana_value= inserted_proteinPiana,
                                                           emblAccession_source_value= sourceDB)

            if xref_second_id is not None:

                # --- table emblPID ---
                if mode == "scratch" or (PianaGlobals.emblPID_table in tables_to_fill and mode =="tables"):

                    if verbose:
                        sys.stderr.write( "cross reference to database %s is: %s  \n" %(cross_reference[0], cross_reference[1:]) )

                    # insert_emblPID_code takes care of processing the version after the point (if it is there)
                    piana_access.insert_emblPID_code(emblPID_code_value= xref_second_id,
                                                     proteinPiana_value= inserted_proteinPiana,
                                                     emblPID_source_value= sourceDB)


        elif cross_reference[0] == "InterPro":
            # first identifier is the InterPro ID
            # second identifier is the InterPro description (not necessarily there)

            if xref_first_id is not None:

                # --- table interPro ---
                if mode == "scratch" or (PianaGlobals.interPro_table in tables_to_fill and mode =="tables"):

                    if verbose:
                        sys.stderr.write( "cross reference to database %s is: %s  \n" %(cross_reference[0], cross_reference[1:]) )

                    # a missing or unknown description is inserted as an empty string
                    if xref_second_id is not None:
                        interPro_description = xref_second_id
                    else:
                        interPro_description = ""

                    piana_access.insert_interPro_code(interProID_code_value= xref_first_id,
                                                      proteinPiana_value= inserted_proteinPiana,
                                                      interProDescription_value= interPro_description,
                                                      interPro_source_value= sourceDB)

        # TO DO!!! Add tables and code to populate other external references
        #          (GO, Pfam, PROSITE, TRANSFAC, ProDom...): currently they are ignored


    # END OF for cross_reference in swiss_record.cross_references:
    
                
    # --- table proteinKeyword ---
    # Runs when mode is "scratch", or when mode is "tables" and the
    # proteinKeyword table is listed in tables_to_fill.
    #
    # The KW (KeyWord) lines provide information that can be used to
    # generate indexes of the sequence entries based on functional,
    # structural, or other categories.
    if mode == "scratch" or (PianaGlobals.proteinKeyword_table in tables_to_fill and mode == "tables"):

        for keyword in swiss_record.keywords:

            # empty keywords are not inserted into piana
            if keyword == "":
                continue

            piana_access.insert_protein_keyword(
                keyword=keyword,
                proteinPiana_value=inserted_proteinPiana,
                proteinKeywordSource_value=sourceDB)

        # END OF for keyword in swiss_record.keywords:
            
    for feature in swiss_record.features:

        # The FT (Feature Table) lines provide a precise but simple means
        # for the annotation of the sequence data: regions or sites of
        # interest in the sequence (posttranslational modifications,
        # binding sites, enzyme active sites, local secondary structure or
        # other characteristics reported in the cited references, as well
        # as sequence conflicts between references).
        #
        # TO DO!!! Do something about this field!
        # For now nothing is inserted: the feature is only echoed to stderr
        # when mode is "scratch" and verbose is set.
        if mode == "scratch" and verbose:
            sys.stderr.write("feature of type %s is: %s  \n" % (feature[0], feature[1:]))

    # END OF for feature in swiss_record.features:



    # --- table uniprotInfo ---
    # Runs when mode is "scratch", or when mode is "tables" and the
    # uniprotInfo table is listed in tables_to_fill.
    if mode == "scratch" or (PianaGlobals.uniprotInfo_table in tables_to_fill and mode == "tables"):

        if verbose:
            # Diagnostic dump of the record to stderr. Message texts
            # (including the "lenght" spelling) are left untouched so the
            # output stays exactly as it was.
            sys.stderr.write("swissprot sequence lenght is: %s \n" % swiss_record.seqinfo[0])
            sys.stderr.write("swissprot molecular weight is: %s \n" % swiss_record.seqinfo[1])
            sys.stderr.write("swissprot crc32 is: %s \n" % swiss_record.seqinfo[2])
            sys.stderr.write("swissprot created is: %s \n" % swiss_record.created[0])
            sys.stderr.write("swissprot organism is:%s \n" % swiss_record.organism)
            # The OG (OrGanelle) line indicates if the gene coding for a
            # protein originates from the mitochondria, the chloroplast,
            # the cyanelle, the nucleomorph or a plasmid.
            sys.stderr.write("swissprot organelle is:%s \n" % swiss_record.organelle)
            sys.stderr.write("swissprot organism classification is:%s \n" % swiss_record.organism_classification)

        # Only the first accession of the record is passed on. Double quotes
        # and newlines in the description and gene name are replaced by
        # spaces, and the result is stripped, before insertion.
        piana_access.insert_uniprotInfo(
            proteinPiana_value=inserted_proteinPiana,
            swissProtID_value=swiss_record.entry_name,
            swissAccessionID_value=swiss_record.accessions[0],
            data_class_value=swiss_record.data_class,
            description_value=swiss_record.description.replace('"', " ").replace("\n", " ").strip(),
            geneName_value=swiss_record.gene_name.replace('"', " ").replace("\n", " ").strip(),
            organism_value=swiss_record.organism,
            organelle_value=swiss_record.organelle,
            proteinSequenceLength_value=swiss_record.seqinfo[0],
            proteinMW_value=swiss_record.seqinfo[1])
        
        
    for reference in swiss_record.references:

        # These lines comprise the literature citations. The citations
        # indicate the sources from which the data has been abstracted.
        # Nothing is inserted for them: they are only echoed to stderr when
        # mode is "scratch" and verbose is set.
        if mode != "scratch" or not verbose:
            continue

        sys.stderr.write("      reference positions: %s \n" % reference.positions)
        sys.stderr.write("      reference comments: %s \n" % reference.comments)
        sys.stderr.write("      reference references: %s \n" % reference.references)
        sys.stderr.write("      reference authors: %s \n" % reference.authors)
        sys.stderr.write("      reference title: %s \n" % reference.title)
        sys.stderr.write("      reference location: %s \n" % reference.location)

    # END OF  for reference in swiss_record.references:

    # Advance to the next record: the record fetched here is the one the
    # next pass of the enclosing loop works on.
    if verbose:
        sys.stderr.write("-- reading next record\n")
    swiss_record = swiss_iterator.next()
    
# END OF while swiss_record is not None
