"""
 File       : utilities.py
 Author     : Ramon Aragues & Joan Planas
 Creation   : 15.01.2004
 Contents   : miscellaneous utilities used by piana
=======================================================================================================

"""

# utilities.py: miscellaneous utilities used by piana
#
# Copyright (C) 2005  Ramon Aragues
# author email: ramon.aragues@upf.edu and boliva@imim.es
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#    http://www.gnu.org/copyleft/gpl.html
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# University Pompeu Fabra, hereby disclaims all copyright
# interest in the program 'PIANA'
# (software for working with protein-protein interaction networks) written 
# by Ramon Aragues


import sys, string, re, md5
import fnmatch
import os
import cPickle
import time

from sets import *

import math


from Bio.Blast import NCBIStandalone
from Bio import Fasta  # needed to read fasta format input files


import PianaGlobals

# Debugging switches: set any of them to a true value (e.g. 1) to activate the corresponding output.
# The descriptions only cover the uses that are visible in this part of the file.
verbose = 0                   # general progress messages on stderr (e.g. when formatdb has finished)
verbose_detailed = 0          # stderr line for each pair of proteins accepted as homologous in get_dic_representants
verbose_very_detailed = 0     # stderr line for each alignment checked in get_dic_representants
verbose_matrix = 0            # stderr summary of the maximum value search done by the matrix utilities
verbose_string_utilities = 0  # stderr trace of the parsing steps of parse_string_field_value
verbose_blast_report = 0      # get_dic_representants writes a blast report file and a representants file

# -------------------------------------
# utilities for dealing with redundancy
# -------------------------------------

def get_non_redundant_sets_sim(piana_access, set_1, set_2):
    """
    returns a tuple (non_redundant_set1, non_redundant_set2) where none of the proteins inside each set is redundant to the other proteins in the set

    Most importantly, both sets are synchronized, ie. they both use the same non-redundant representants for proteins

    For example, if set_1 is (1,2,3,4) and set_2 is (4,5,6) and 1 is similar to 4, the returned sets will be:
         (1,2,3) and (1,5,6)

         (depending on the order in the sets, the returned sets could be as well (4,2,3) and (4,5,6). The thing that we are sure
          will be respected is that both the original proteins 1 and 4 will be represented by the same protein)

    "piana_access" must provide get_similar_proteins_dic(proteinPiana_value=...), which returns an iterable (a dictionary) whose
                   keys are the proteins similar to the one given

    "set_1" and "set_2" are sets of proteins (they must provide union() and be iterable)

    The returned sets are Set objects (module sets)
    """

    dic_correspondences = {} # this dictionary contains which is the 'non redundant representant' for each protein
                             #
                             #     { protein1: non_redundant_representant,
                             #       protein2: non_redundant_representant,
                             #       ................................
                             #     }
                             #

    for one_protein in set_1.union(set_2):

        if one_protein in dic_correspondences:
            # correspondence was already established... skip this protein
            continue

        # 1. find all proteins that are similar to one_protein
        dic_similar_proteins = piana_access.get_similar_proteins_dic(proteinPiana_value = one_protein)

        # 2. collect the (unique) representants already given to those similar proteins
        different_representants = Set([dic_correspondences[similar_protein]
                                       for similar_protein in dic_similar_proteins
                                       if similar_protein in dic_correspondences])

        if not different_representants:
            # 2.3 none of the similar proteins has a representant yet: one_protein is its own representant
            dic_correspondences[one_protein] = one_protein
            continue

        # 2.1/2.2 take any of the representants found (there is no way we can know if one
        #         is more 'representative' than the other)
        this_representant = different_representants.pop()

        if different_representants:
            # more than one representant was in use for proteins similar to one_protein: every protein
            # that was using one of the discarded representants is moved to the consensus representant
            # (only values of existing keys are changed, so iterating over the dictionary is safe)
            for protein_to_update in dic_correspondences:
                if dic_correspondences[protein_to_update] in different_representants:
                    dic_correspondences[protein_to_update] = this_representant
            # END OF for protein_to_update in dic_correspondences:

        dic_correspondences[one_protein] = this_representant
    # END OF for one_protein in set_1.union(set_2):

    # Now, transform each input set into a set with protein representants
    set_1_of_representants = Set([dic_correspondences[original_protein] for original_protein in set_1])
    set_2_of_representants = Set([dic_correspondences[original_protein] for original_protein in set_2])

    return (set_1_of_representants, set_2_of_representants)


def get_non_redundant_sets_blast(piana_access, set_1, set_2, min_length=None, min_identity=None, max_evalue=None, prefix = ""):
    """
    returns a tuple (non_redundant_set1, non_redundant_set2, intersection_set) where each protein has been replaced by the
    representant that get_dic_representants (ie. blast) assigned to it

    Both sets are synchronized, ie. they both use the same non-redundant representants for proteins

    For example, if set_1 is (1,2,3,4) and set_2 is (4,5,6) and 1 is homologous to 4, the first two returned sets will be:
         (1,2,3) and (1,5,6)

         (1 would be chosen over 4 as a representant if 1 has a longer sequence than 4)

    The third returned set holds the representants of the proteins that appear in both set_1 and set_2


    "min_length" is the minimum length the alignment can have in order to consider two proteins homologous (has to be an integer)

    "min_identity" is the minimum identity the alignment can have in order to consider two proteins homologous (has to be between 0 and 1)

    "max_evalue" is the maximum evalue the alignment can have in order to consider two proteins homologous (has to be in the form 0.000xxx)

    "prefix" is handed over to get_dic_representants, which uses it to name its temporary files
    """

    every_protein = set_1.union(set_2)
    proteins_in_both = set_1.intersection(set_2)

    # one single call assigns a representant to each protein of both sets:
    #     { protein1: non_redundant_representant,
    #       protein2: non_redundant_representant,
    #       ................................
    #     }
    representant_of = get_dic_representants(piana_access= piana_access,
                                            list_proteinPiana= list(every_protein),
                                            min_length= min_length,
                                            min_identity= min_identity,
                                            max_evalue= max_evalue,
                                            prefix= prefix)

    # translate each group of original proteins into the group of their representants
    representants_1 = Set([representant_of[protein] for protein in set_1])
    representants_2 = Set([representant_of[protein] for protein in set_2])
    representants_in_both = Set([representant_of[protein] for protein in proteins_in_both])

    return (representants_1, representants_2, representants_in_both)




def _run_formatdb(fasta_file_name, blast_db_name, max_wait_seconds=18000, poll_seconds=5):
    """
    runs the external program formatdb in a child process to build a protein blast database named
    "blast_db_name" from "fasta_file_name", and waits for it to finish

    returns the exit status that os.waitpid reported for the child

    raises RuntimeError if formatdb has not finished after "max_wait_seconds" (the child is killed first)
    """
    import signal   # only needed here, to kill a child that does not finish

    formatdb_arguments = ["formatdb", "-p", "T", "-i", "%s" %fasta_file_name, "-n", "%s" %blast_db_name]

    pid = os.fork()
    if pid == 0:
        # the child: on success execvp never returns
        try:
            os.execvp("formatdb", formatdb_arguments)
        finally:
            # only reached if exec failed: leave right away so that the child never
            # continues executing a copy of the caller's code
            os._exit(127)

    # the father waits till completion of child
    time_to_finish = max_wait_seconds
    while time_to_finish > 0:
        # with os.WNOHANG, waitpid returns immediately: (0, 0) while the child is still
        # running, (pid, exit status) once it has finished
        (status_pid, status) = os.waitpid(pid, os.WNOHANG)

        if status_pid == pid:
            if verbose:   sys.stderr.write("blast database created" )
            return status

        # process did not finish... give some more time to child...
        time.sleep(poll_seconds)
        time_to_finish -= poll_seconds
    # END OF while time_to_finish > 0:

    # the child took too long to complete (it normally means the computer does not have
    # enough memory): kill it, reap it and report the problem
    os.kill(pid, signal.SIGKILL)
    os.waitpid(pid, 0)
    raise RuntimeError("formatdb did not finish within %s seconds and was killed" %max_wait_seconds)


def _write_blast_report_entry(blast_report_fd, this_protein, blast_record, evalue_threshold=0.04):
    """
    writes to "blast_report_fd" a readable summary of the hsps of "blast_record" (the blast results for
    query "this_protein") that have an evalue lower than "evalue_threshold"
    """
    blast_report_fd.write('\n****PROCESSING PROTEIN %s****\n' %(this_protein))
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < evalue_threshold:
                blast_report_fd.write('-->Alignment\n')
                blast_report_fd.write('sequence: %s\n' %alignment.title)
                blast_report_fd.write('length:%s\n' %alignment.length)
                blast_report_fd.write('e value:%s\n'  %hsp.expect)
                blast_report_fd.write('identity:%s\n' %(hsp.identities[0]/float(hsp.identities[1]) ))

                # only the first 75 characters of each line of the alignment are shown
                if len(hsp.query) > 75:
                    dots = '...'
                else:
                    dots = ''

                blast_report_fd.write(hsp.query[0:75] + dots + "\n")
                blast_report_fd.write(hsp.match[0:75] + dots + "\n")
                blast_report_fd.write(hsp.sbjct[0:75] + dots + "\n")
        # END OF for hsp in alignment.hsps:
    # END OF for alignment in blast_record.alignments:


def get_dic_representants(piana_access=None, list_proteinPiana=[], min_length=None, min_identity=None, max_evalue=None, prefix = ""):
    """
    function used to remove redundancy making an external call to blast


    returns a dictionary of the form { protein1: representant1,
                                       protein2: representant2,
                                       protein3: representant1,
                                       ....
                                     }

                                     where representants have been assigned to key proteins with blast.
                                     Proteins are processed from longest to shortest sequence: a protein without
                                     representant becomes its own representant, and then it is set as representant
                                     of those of its homologs that still do not have one.

    "piana_access" must provide get_protein_sequence(proteinPiana=...)

    "list_proteinPiana" is the list of proteins to process (it is not modified)

    "min_length" is the minimum length the alignment can have in order to consider two proteins homologous (has to be an integer)

    "min_identity" is the minimum identity the alignment can have in order to consider two proteins homologous (has to be between 0 and 1)

    "max_evalue" is the maximum evalue the alignment can have in order to consider two proteins homologous (has to be in the form 0.000xxx)

    "prefix" is used to control that the temporary files used by this function do not overwrite if multiple process are calling this function
           -> the user can control the prefix that the temporary file will have by setting a different prefix in each call

    raises ValueError if min_length, min_identity or max_evalue is None (checked before any file is created)
    raises RuntimeError if formatdb does not finish in time

    Attention!!! temporary files (fasta file, blast database files, formatdb.log) are not removed by this function.
                 If verbose_blast_report is set, two more files are written: <prefix>tmp_blast_db.blast_report
                 and <prefix>tmp_blast_db.dic_representants
    """
    if min_identity is None or max_evalue is None or min_length is None:
        raise ValueError("You didn-t give all the arguments needed to know if two proteins are homologous")

    blast_report_fd = None
    dic_representants_fd = None

    try:
        if verbose_blast_report:
            blast_report_fd = open(prefix + "tmp_blast_db.blast_report", "w")
            dic_representants_fd = open(prefix + "tmp_blast_db.dic_representants", "w")

        dic_representants = {}

        # -------
        # create fasta file with proteins in list_proteinPiana
        # -------
        fasta_file_name = prefix + "tmp_fasta_file.fasta"
        tmp_fasta_file_fd = open(fasta_file_name, "w")
        try:
            for one_proteinPiana in list_proteinPiana:
                sequence = piana_access.get_protein_sequence(proteinPiana= one_proteinPiana)
                tmp_fasta_file_fd.write(">%s|_\n" %(one_proteinPiana) )
                tmp_fasta_file_fd.write("%s\n" %(sequence) )
                dic_representants[one_proteinPiana] = None   # None means 'no representant assigned yet'
        finally:
            tmp_fasta_file_fd.close()

        # -------
        # do formatdb to create a blast database
        # -------
        blast_db_name = prefix + "tmp_blast_db.formatdb"
        _run_formatdb(fasta_file_name, blast_db_name)

        # -------
        # do a blastall (all proteins against all proteins) against the blast DB
        # -------
        # NOTE(review): the location of blastall is hard-coded for a particular installation
        my_blast_exe = '/usr/local/modelling/sequence/psi-blast/blastall'

        blast_out, error_info = NCBIStandalone.blastall(my_blast_exe, 'blastp',
                                                        blast_db_name, fasta_file_name )

        b_parser = NCBIStandalone.BlastParser()
        b_iterator = NCBIStandalone.Iterator(blast_out, b_parser)

        protein_lengths = []     # list that follows the structure:
                                 #    [ (protein_id, sequence_length), (protein_id, sequence_length), ...]

        protein_homologs = {}    # dictionary that follows the structure:
                                 #    { protein_id : [list of proteins that are homologous],
                                 #      protein_id : [list_of_proteins that are homologous],
                                 #      ..................................................
                                 #    }

        blast_record = b_iterator.next()

        while blast_record:

            # the fasta titles written above have the form 'proteinPiana|_'
            this_protein = int(blast_record.query.split("|")[0])

            if blast_report_fd is not None:
                _write_blast_report_entry(blast_report_fd, this_protein, blast_record)

            protein_homologs[this_protein] = []
            protein_lengths.append( (this_protein, blast_record.query_length) )

            for alignment in blast_record.alignments:
                if alignment.length < min_length:
                    # if the length of the alignment is lower than the value set by user, do not consider it as relevant
                    continue

                # alignment titles have the form '>proteinPiana|_': remove the leading '>'
                aligned_protein = int(alignment.title.split("|")[0][1:])

                for hsp in alignment.hsps:

                    percentage_identity = hsp.identities[0]/float(hsp.identities[1])
                    alignment_evalue = float(hsp.expect)

                    if this_protein == aligned_protein:
                        # a protein aligned against itself tells nothing about homology
                        continue

                    if verbose_very_detailed:
                        sys.stderr.write("Checking alignment between protein %s (size %s) and  protein %s (alignment size %s): evalue=%s, identity=%.2f, positive=%.2f\n" %(
                                this_protein,
                                blast_record.query_length,
                                aligned_protein,
                                alignment.length,
                                hsp.expect,
                                percentage_identity,
                                hsp.positives[0]/float(hsp.positives[1])  ))

                    if percentage_identity >= min_identity and alignment_evalue < max_evalue:
                        protein_homologs[this_protein].append(aligned_protein)
                        if verbose_detailed:
                            sys.stderr.write("protein %s (size %s) is homolog to protein %s (alignment size %s): evalue=%s, identity=%.2f, positive=%.2f\n" %(
                                this_protein,
                                blast_record.query_length,
                                aligned_protein,
                                alignment.length,
                                hsp.expect,
                                percentage_identity,
                                hsp.positives[0]/float(hsp.positives[1])  ))
                # END OF for hsp in alignment.hsps:
            # END OF for alignment in blast_record.alignments:

            # retrieve next record
            blast_record = b_iterator.next()
        # END OF while blast_record:

        # this will sort the list of (protein, length) from longest to shortest protein
        protein_lengths.sort(lambda x, y: compare_two_lists(x, 1, y, 1, -1) )

        for pair_protein_length in protein_lengths:
            # pair_protein_length[0] is proteinPiana
            # pair_protein_length[1] is length of sequence
            # they are processed from longest sequence to shortest sequence (thanks to previous sorting)
            current_proteinPiana = pair_protein_length[0]

            # 'is None' (and not truth value) is checked, so that a proteinPiana equal to 0 is handled correctly
            if dic_representants[current_proteinPiana] is None:
                # if no representant has been assigned to this protein, it is its own representant
                dic_representants[current_proteinPiana] = current_proteinPiana

            for homolog_proteinPiana in protein_homologs[current_proteinPiana]:
                # set all homologs of this protein to be represented by the current proteinPiana
                # (in case they didn't have a representant yet)
                if dic_representants[homolog_proteinPiana] is None:
                    dic_representants[homolog_proteinPiana] = current_proteinPiana

                # NOTE(review): a homolog that already has a representant keeps it. That happens when
                #   A is homologous to B
                #   B is homologous to C
                #   A is not homologous to C
                # Transitivity is not applied here: whether it should be is an open question
            # END OF for homolog_proteinPiana in protein_homologs[current_proteinPiana]:
        # END OF for pair_protein_length in protein_lengths:

        if dic_representants_fd is not None:
            for one_prot in dic_representants:
                dic_representants_fd.write("protein %s with representant %s\n" %(one_prot, dic_representants[one_prot]))

        return dic_representants

    finally:
        # report files are closed no matter how the function is left
        for open_fd in (blast_report_fd, dic_representants_fd):
            if open_fd is not None:
                open_fd.close()

# ---------------------------------------------
# utilities for calculating partners similarity
# ---------------------------------------------

def get_partners_similarity(set_a, set_b, intersection_set, sim_mode):
    """
    returns the similarity score for set_a and set_b, given its intersection_set

    only the number of elements of each argument is used

    sim_mode can be:

        - num_ints: number of common interaction partners (returned as it is, no normalization is applied here)
        - min_per: number_of_partners_in_common x 100 / min(number_of_partners(set_a), number_of_partners(set_b))
        - max_per: number_of_partners_in_common x 100 / max(number_of_partners(set_a), number_of_partners(set_b))
        - combined_per: (percentage with respect to set_a + percentage with respect to set_b) / 2

    percentages are integers (all divisions are integer divisions)

    returns the string "unknown" when a percentage cannot be calculated because of a division by zero (empty set)

    raises ValueError if sim_mode is not one of the modes listed above
    """

    num_ints_in_common = len(intersection_set)
    total_ints_protein_a = len(set_a)
    total_ints_protein_b = len(set_b)

    if sim_mode == "num_ints":
        return num_ints_in_common

    if sim_mode not in ("combined_per", "min_per", "max_per"):
        raise ValueError("sim_mode given to get_partners_similarity is incorrect: %s\n" %sim_mode)

    try:
        # '//' keeps the results as integers regardless of how '/' behaves in the interpreter
        if sim_mode == "combined_per":
            return ( (100*num_ints_in_common // total_ints_protein_a) +
                     (100*num_ints_in_common // total_ints_protein_b) ) // 2

        if sim_mode == "min_per":
            return 100*num_ints_in_common // min(total_ints_protein_a, total_ints_protein_b)

        # sim_mode == "max_per"
        return 100*num_ints_in_common // max(total_ints_protein_a, total_ints_protein_b)

    except ZeroDivisionError:
        # one of the sets was empty: the percentage is unknown
        return "unknown"
	
    



# ------------------------------
# utilities for loading file info
# ------------------------------

def return_dic_gi_vs_tax(file_name= None):
    """

    returns a dictionary { gi: tax_id,
                           gi: tax_id,
                           ......
                         }

    filled with info from "file_name" (gis and tax_ids are both integers)

    "file_name" is a file name of a file that has two columns
       1st one is gi code
       2nd one is tax id for that gi

    columns are split on any whitespace (tabs or spaces). Lines that do not have exactly two columns are ignored.
    If a gi appears more than once, the last tax id read is kept.

    raises ValueError if a line with two columns holds something that is not an integer
    """

    dic_gi_tax = {}

    file_fd = open(file_name, "r")
    try:
        for line in file_fd:
            line_fields = line.split()

            if len(line_fields) == 2:
                dic_gi_tax[int(line_fields[0])] = int(line_fields[1])
        # END OF for line in file_fd:
    finally:
        # the file is closed even if a line cannot be parsed
        file_fd.close()

    return dic_gi_tax
        
        

# --------------------------
# PLPLOT utilities
# --------------------------

# graphic_colours: list of colour indexes, number_of_colours: how many indexes are in that list
# NOTE(review): presumably these are the colour indexes used when plotting with plplot (see section title) and
#               the table below describes what each index looks like - confirm against the plotting code
graphic_colours = [3,9,10,11,13]         # 0 -> white
                                         # 1 -> red
                                         # 2 -> yellow 
                                         # 3 -> green     
                                         # 4 -> magenta   
                                         # 5 -> light red   
                                         # 6 -> light beige   
                                         # 7 -> light grey   
                                         # 8 -> bordeaux  
                                         # 9 -> bright blue marin  
                                         # 10 -> maroon
                                         # 11 -> bright magenta
                                         # 12 -> magenta
                                         # 13 -> bright pink
                                         # 14 -> light red
                                         # 15 -> white
number_of_colours = len(graphic_colours)

# --------------------------
# String utilities
# --------------------------

def parse_string_field_value(input_string= None, separator_field_value=None, global_separators=None):
    """
    splits a string made of 'field<separator_field_value>value' groups and returns the list of
    pairs found: [[field_name, value], [field_name, value], ...]

    "global_separators" is the list of strings that separate one group from the next one (e.g. [" ", "|", ";" ])

    "separator_field_value" is the character that separates a field name from its value (only one character)

    the string is expected to look like:

    [global_separator]*field[separator_field_value]value[global_separator]*field[separator_field_value]value[global_separator]*.....

    Groups that do not contain separator_field_value are ignored. If a group contains it more than once,
    only the text before the first one (field) and the text between the first and the second one (value) are used.

    for example, for stringX

    "    ;Name=Ramon    ;   and   Name=Pedro   , Synonim=Juan  ;  "

    parse_string_field_value(input_string=stringX, separator_field_value="=", global_separators=[" ",";"])

    returns [[Name, Ramon], [Name, Pedro], [Synonim, Juan]]

    Attention!!! field names and values are always strip()ped before being returned, even if space (ie " ") is not
    in global_separators. So, if trailing spaces are needed, something else has to be done...
    """
    if verbose_string_utilities:
        sys.stderr.write("Old input string: %s\n" %input_string)

    # make the string homogeneous: every additional separator is rewritten as the first separator
    for additional_separator in global_separators[1:]:
        input_string = input_string.replace(additional_separator, global_separators[0])

    if verbose_string_utilities:
        sys.stderr.write("New input string: %s\n" %input_string)

    pieces = input_string.split(global_separators[0])

    if verbose_string_utilities:
        sys.stderr.write("Global groups: %s\n" %pieces)

    # a piece holds a field and its value only if separator_field_value appears in it
    found_pairs = []
    for piece in pieces:
        if separator_field_value not in piece:
            continue

        field_and_value = piece.split(separator_field_value)
        field_name = field_and_value[0].strip()
        field_value = field_and_value[1].strip()
        found_pairs.append([field_name, field_value])
    # END OF for piece in pieces:

    return found_pairs

    
# ---------------------------
# list utilities                                               
# ---------------------------
def get_list_indexes_equal_to_value(list = None, value= None):
    """
    returns the positions (in increasing order) of the elements of "list" that are equal to "value"

    an empty list is returned if no element is equal to "value"
    """
    return [position for position, element in enumerate(list) if element == value]
        
def compare_two_lists(list1, el1, list2, el2, mode= 1):
    """
    comparison function for two lists (or tuples): the order is decided by comparing
    list1[el1] with list2[el2]

    eg ( if l1 = [200, 400] and l2 = [250, 450] and el1 is 0 and el2 is 1, then 200 is compared with 450)

    - mode 1  returns 1 if list1[el1] > list2[el2], 0 if they are equal, -1 otherwise (sorts from lower value to higher value)
    - mode -1 returns the opposite sign                                              (sorts from higher value to lower value)

    raises ValueError if mode is neither 1 nor -1

    Note: useful to sort a list of lists by one of their elements. For example, to sort by the third element:

    list_of_lists = [ (1,2,3), (1,1,1), (1,1,2) ]

    list_of_lists.sort(lambda x, y: utilities.compare_two_lists(x, 2, y, 2, 1) )

    will order the list as: [  (1,1,1), (1,1,2), (1,2,3) ]

    """
    if mode not in (-1, 1):
        raise ValueError("mode given to compare_to_lists is incorrect\n")

    first_value = list1[el1]
    second_value = list2[el2]

    if first_value > second_value:
        return mode

    if first_value < second_value:
        return -mode

    return 0
           
    
def _reverse_order(x,y):
    """
    comparison function that sorts a list from higher value to lower value:
    returns 1 if y is greater than x, -1 if x is greater than y, and 0 otherwise

    eg. list_fvalues.sort( lambda x,y : ScoringFunctionBenchmark._reverse_order(x,y))

    """
    if y > x:
        return 1

    if x > y:
        return -1

    return 0
 
# ---------------------------
# Matrix utilities                                               
# ---------------------------

def get_max_value_from_matrix(matrix= None):
    """
    returns the maximum value in the matrix

    input "matrix" must follow structure: { key1:{key1:value11, key2: value12, ...},
                                            key2:{key1:value21, key2: value22, ...},
                                             ......................................
                                          }

    cells that hold None are ignored

    the maximum is the real maximum of the cells, also when all values are negative

    returns 0 if the matrix does not have any cell with a value (empty matrix, or all cells are None)

    raises ValueError if matrix is None
    """
    if matrix is None:
        raise ValueError("A matrix is needed to calculate the maximum value")

    max_value = None   # None means 'no value seen yet': the first value found becomes the seed

    for row in matrix.keys():
        for cell_value in matrix[row].values():
            if cell_value is None:
                continue

            if max_value is None or max_value < cell_value:
                max_value = cell_value
        # END OF for cell_value in matrix[row].values():
    # END OF for row in matrix.keys():

    if max_value is None:
        # nothing to compare: keep returning 0, as callers compare the result with a threshold
        return 0

    return max_value



def get_max_value_cells_from_matrix(matrix= None, threshold= 1):
    """
    returns the list of positions of the matrix that hold its maximum value
       -> returned list has structure: max_value_positions = [ [row1, column1], [row2, column2], ... ]

     input "matrix" must follow structure: { key1:{key1:value11, key2: value12, ...},
                                             key2:{key1:value21, key2: value22, ...},
                                              ......................................
                                           }

    "threshold": positions are only returned if the maximum value of the matrix is greater than threshold
       (e.g. if maximum value in matrix is 2, and threshold is 5, then an empty list will be returned)

    the maximum value is the one given by get_max_value_from_matrix. Cells that hold None are never returned

    raises ValueError if matrix is None
    """
    if matrix is None:
        raise ValueError("A matrix is needed to find cells with maximum values")

    highest_value = get_max_value_from_matrix(matrix= matrix)

    if highest_value > threshold:
        positions = [ [row, column]
                      for row in matrix.keys()
                      for column in matrix[row].keys()
                      if matrix[row][column] is not None and highest_value == matrix[row][column] ]
    else:
        # maximum value does not pass the threshold: nothing to return
        positions = []

    if verbose_matrix:
        sys.stderr.write("maximum value found is %s, and threshold is %s. len(max_value_cells)=%s\n" %(highest_value,
                                                                                                       threshold,
                                                                                                       len(positions)) )

    return positions

def get_max_value_cells_from_symmetrical_matrix(matrix= None, threshold= 1):
    """
    same as get_max_value_cells_from_matrix, but intended for symmetrical matrices: from each
    symmetrical pair of positions only one is reported

      -> a position [row, column] is only considered when column >= row. This is, if [1,2] holds
         the maximum, [1,2] is returned and [2,1] is not

    "matrix" is a dictionary of dictionaries:  { row_key: {column_key: value, column_key: value, ...},
                                                 row_key: {column_key: value, column_key: value, ...},
                                                 ...
                                               }
       -> cells whose value is None are never reported

    "threshold": positions are only collected when the maximum value (as returned by
       get_max_value_from_matrix) is strictly greater than "threshold"
       (e.g. if maximum value in matrix is 2, and threshold is 5, then an empty list is returned)

    returns a list with structure [ [row1, column1], [row2, column2], ... ]

    raises ValueError if no matrix is given
    """
    if matrix is None:
        raise ValueError("A matrix is needed to find cells with maximum values")

    highest_value = get_max_value_from_matrix(matrix= matrix)

    cells_at_maximum = []

    if highest_value > threshold:
        for row_key, row_cells in matrix.items():
            for column_key, cell_value in row_cells.items():
                # "column >= row" keeps a single representative of each symmetrical pair
                if column_key >= row_key and cell_value is not None and highest_value == cell_value:
                    cells_at_maximum.append( [row_key, column_key] )

    if verbose_matrix:
        sys.stderr.write("maximum value found is %s, and threshold is %s. len(max_value_cells)=%s\n" %(highest_value,
                                                                                                       threshold,
                                                                                                       len(cells_at_maximum)) )

    return cells_at_maximum
  

def print_matrix(matrix_to_print= None):
    """
    prints a matrix (which has been implemented as a dictionary containing dictionaries with values)

    matrix = {row_id:{column_id:value, column_id:value, ...}
              row_id:{column_id:value, column_id:value, ...}
              .............................................
             }
    
    """
    if matrix_to_print is None:
        raise ValueError("A matrix is needed if you want to print something")
    
    # printing column headers (since it is a symmetrical matrix, row and column indexes are the same)
    sys.stdout.write("\t") # first column is empty to leave place for row names
    for column in matrix_to_print.keys():
        sys.stdout.write("c%s\t" %column)
    sys.stdout.write("\n")
        

    for row in matrix_to_print.keys():
        sys.stdout.write("r%s\t" %row)
        for column in matrix_to_print[row].keys():
            sys.stdout.write("%s\t" %(matrix_to_print[row][column]) )
        # END OF for column in matrix_to_print[row].keys():
        sys.stdout.write("\n")
    # END OF for row in matrix_to_print.keys():

# ---------------------------
# File Management utilities                                             
# ---------------------------
def get_ordered_files_in_directory(dir, pattern):
    """
    returns an ordered list of the files (full names, ie. 'dir' joined to the file name) in
    directory 'dir' whose name follows the fnmatch pattern 'pattern'

    Attention! The order given here is very specific... this has been implemented with CIR evaluation in mind...

    files are ordered by the integer X of their _X.<extension> label: files a_0.results a_11.results
    a_2.results and a_1.results in directory, would be ordered as follows:
    ['a_0.results', 'a_1.results', 'a_2.results', 'a_11.results']

    a normal 'ls' would place a_11.results before a_2.results, which is not what we want...

    files with the same X keep the relative order in which os.listdir returned them

    raises ValueError if a file matching 'pattern' does not have an integer between its last "_"
    and the following "."
    """

    def numeric_label(file_name):
        """
        returns int(X) for a file named <anything>_X.<extension>
        """
        label_and_extension = file_name.split("_")[-1]
        return int(label_and_extension.split(".")[0])

    # the label is parsed from the bare file name (before joining it to 'dir'), so that a "_" in
    # the directory path cannot be mistaken for the label separator
    matching_names = []
    for one_file in os.listdir(dir):
        if fnmatch.fnmatch(one_file, pattern):
            matching_names.append(one_file)

    # key-based sort: each label is parsed once per file (a comparison function would parse it on
    # every comparison, and is not accepted by list.sort in newer pythons)
    matching_names.sort(key=numeric_label)

    return [os.path.join(dir, one_file) for one_file in matching_names]
    

    

    
    
class GlobDirectoryWalker:
    # forward iterator over the files of a directory whose name matches a fnmatch pattern
    # (default pattern: all files). Iteration relies on the __getitem__ protocol: each call
    # returns the next matching full name, and IndexError signals the end.
    #
    #  set recursive to "yes" if you want to search in all the directory tree
    #                   "no" if you just want files in the very same directory
    #
    # directories whose name matches the pattern are returned as well
    #
    # example of use:
    #
    # for accumulative_file in utilities.GlobDirectoryWalker("." , "*.accumulative"):
    #
    #     accumulative_file_fd = file(accumulative_file, "r")


    def __init__(self, directory, pattern="*", recursive="no"):
        self.pattern = pattern
        self.recursive = recursive

        # directories waiting to be listed
        self.stack = [directory]

        # entries of the directory currently being traversed, and position of the next one to look at
        self.files = []
        self.index = 0

        # 1 until the initial directory has been listed (it must be read even in non recursive mode)
        self.directory_to_read = 1

    def __getitem__(self, index):
        # "index" is ignored: the walker keeps its own position
        while 1:
            if self.index < len(self.files):
                # there are entries left in the current directory
                entry_name = self.files[self.index]
                self.index += 1

                fullname = os.path.join(self.directory, entry_name)

                # real subdirectories (not links) are remembered for a later visit
                if os.path.isdir(fullname) and not os.path.islink(fullname):
                    self.stack.append(fullname)

                if fnmatch.fnmatch(entry_name, self.pattern):
                    return fullname

                continue

            # current directory exhausted: move to the next one, if allowed
            if self.recursive != "yes" and not self.directory_to_read:
                # no more files to read: raise Exception to stop iterator
                raise IndexError("no more files to read")

            # popping from an empty stack raises IndexError, which also stops the iterator
            self.directory = self.stack.pop()
            self.files = os.listdir(self.directory)
            self.index = 0
            self.directory_to_read = 0

                
def open_file(filename, opening_mode):
    """
    Opens file "filename" in a given mode "opening_mode" and returns the opened file object

    If the file cannot be opened (IOError), an error message is written to standard error and
    the program is terminated through sys.exit with exit status 1, so that calling shells and
    scripts can tell the failure apart from a successful run
    """
    try:
        return open(filename, opening_mode)
    except IOError:
        sys.stderr.write(': impossible to load file ' + filename + '\n')
        sys.exit(1)

# -----------------------------
# Basic timing functionality
# -----------------------------

# If possible (Unix), use the resource module instead of time.clock()
try:
    import resource
except ImportError:
    # no resource module on this platform: fall back to the clock of the time module
    sys.stderr.write("No module resource found\n")
    clock = time.clock
else:
    def clock():
        """clock() -> floating point number

        Return the CPU time in seconds (user + system) since the start of the
        process.  This is done via a call to resource.getrusage, so it avoids
        the wraparound problems in time.clock().

        Nothing is written to stderr here: this function is called around every
        timed piece of code, and a diagnostic message per call would both flood
        stderr and add I/O work to the measurements."""

        usage = resource.getrusage(resource.RUSAGE_SELF)
        return usage.ru_utime + usage.ru_stime

def timings_out(reps,func,*args,**kw):
    """timings_out(reps,func,*args,**kw) -> (t_total,t_per_call,output)

    Call func(*args,**kw) "reps" times and return a tuple with the total
    elapsed time, the average time per call and the output of the last call.

    Times are measured with the module-level clock() function, so their
    meaning (CPU seconds or wall clock seconds) is whatever clock() provides
    on the running platform.

    "reps" is converted with int() and must be >= 1 (checked with an assert)."""

    reps = int(reps)
    assert reps >=1, 'reps must be >= 1'

    # every repetition but the last one has its output discarded
    discarded_calls = xrange(reps-1)

    started_at = clock()
    for _unused in discarded_calls:
        func(*args,**kw)
    out = func(*args,**kw)  # last repetition: this is the output handed back
    tot_time = clock() - started_at

    return tot_time, tot_time / reps, out

def timings(reps,func,*args,**kw):
    """timings(reps,func,*args,**kw) -> (t_total,t_per_call)

    Call func(*args,**kw) "reps" times and return a tuple with the total
    elapsed time and the time per call, as measured by timings_out() (whose
    third value, the function output, is dropped)."""

    total_time, time_per_call, _output = timings_out(reps,func,*args,**kw)
    return total_time, time_per_call

def timing(reps,func,*args,**kw):
    """timing(reps,func,*args,**kw) -> t_total

    Call func(*args,**kw) "reps" times and return the total elapsed time, ie.
    the first of the values measured by timings_out()."""

    total_time, _time_per_call, _output = timings_out(reps,func,*args,**kw)
    return total_time

# -------------------------------------------------------
# Methods for conversion from one protein code to another                                              
# -------------------------------------------------------

def sequence2md5(sequence):
    """
    Return MD5 code for sequence "sequence"
    (MD5 hexdigestion of sequence + its leading 4 chars
    + its last 4 chars)

    Leading and trailing whitespace of "sequence" is removed before anything
    else is computed.
    """

    sequence = sequence.strip()

    # the md5 module is deprecated in favour of hashlib (and does not exist in recent pythons):
    # use hashlib whenever it is available, and the old module otherwise
    try:
        from hashlib import md5 as new_md5
    except ImportError:
        new_md5 = md5.new

    try:
        digested = new_md5(sequence).hexdigest()
    except TypeError:
        # text strings must be encoded to bytes before hashing in pythons where str is unicode
        digested = new_md5(sequence.encode("utf-8")).hexdigest()

    return digested + sequence[:4] + sequence[-4:]

# -------------------------------------------------
# Methods for identifying the protein code type
# Methods for locating the protein codes in pianaDB
# -------------------------------------------------

def get_code_table(code_type_col):
    """
    returns the pianaDB table where column "code_type_col" is located, or None if the column
    is not found in PianaGlobals.valid_protein_types

    "code_type_col" can have the values described in PianaGlobals.valid_protein_types for protein code columns (ie xxx_col)

    each value of PianaGlobals.valid_protein_types is read as a dictionary {table: column}, of
    which only the first item is looked at
    """

    # special case: descriptions are not codes, but we would sometimes use them as if they were...
    if code_type_col == PianaGlobals.proteinDescription_col:
        return PianaGlobals.proteinDescription_table

    for one_type_location in PianaGlobals.valid_protein_types.values():
        table_name, column_name = one_type_location.items()[0]

        if column_name == code_type_col:
            return table_name

    # no table holds that column
    return None

def get_code_column(code_type_name):
    """
    translates the easy-to-remember code type "code_type_name" into its pianaDB column

    "code_type_name" can have the values described in PianaGlobals.valid_protein_types.keys()

    returns None when "code_type_name" is not one of those keys
    """
    known_types = PianaGlobals.valid_protein_types

    if code_type_name not in known_types:
        return None

    return known_types[code_type_name].values()[0]

def get_code_table_column(code_type_name):
    """
    translates the easy-to-remember code type "code_type_name" into a list [pianaDB table, pianaDB column]

    "code_type_name" can have the values described in PianaGlobals.valid_protein_types.keys()

    returns None when "code_type_name" is not one of those keys
      -> None (and not something like "unknown") is returned on purpose: if the user set an
         incorrect type, he better knows it than not knowing it
    """
    known_types = PianaGlobals.valid_protein_types

    if code_type_name not in known_types:
        return None

    type_location = known_types[code_type_name]
    return [type_location.keys()[0], type_location.values()[0]]

def get_easy_to_remember_type_from_column(code_type_col):
    """
    returns the easy to remember piana type name that is used for a database column "code_type_col"

    returns None if no type in PianaGlobals.valid_protein_types uses that column

    Attention: this is currently not being used, but it might be helpful in the future...
    """
    for easy_to_remember_name in PianaGlobals.valid_protein_types:

        # table_column_pair is a dictionary with one item {protein_table: column_with_code}
        table_column_pair = PianaGlobals.valid_protein_types[easy_to_remember_name]

        # list() makes the first column reachable by index whatever kind of object values() returns
        if list(table_column_pair.values())[0] == code_type_col:
            return easy_to_remember_name

    return None

        
def get_code_type(proteinCode_value):
    """
    guesses which types of protein code (ie database columns) a protein name "proteinCode_value"
    of unknown type could belong to, and returns them as a list (empty if nothing matched)

    This method should be called prior to PianaDBaccess.get_list_protein_piana() if "proteinCodeType_value" is not known

    Attention!!! This function is only being used by the STRING parser string2piana: that is why
    only codes that might appear in STRING are looked for

    The patterns are tried in order and the first one that matches decides the answer. They are
    applied with re.match, ie. they only have to match the beginning of the code.

    Patterns for versioned embl accessions, emblPID, protein md5 and gi codes were considered in
    the past but are not applied.
    TO CHECK!!! Can I be more stringent in deciding that a code is a gi???
    I risk of taking things that are not a protein code as gi....
    """

    # this is for sure a unientry (ie. swissprot entry or trembl entry)
    if re.match("[a-zA-Z0-9]{1,6}_[a-zA-Z0-9]{1,5}", proteinCode_value):
        return [PianaGlobals.swissProtID_col]

    # this is a uniacc or emblAccession
    if re.match("[OPQ][0-9][a-zA-Z0-9]{3}[0-9]", proteinCode_value):
        return [PianaGlobals.swissAccessionID_col]

    # being very permissive to consider something a emblAccession...
    if re.match("[a-zA-Z]{1,3}[0-9]{5,7}", proteinCode_value):
        return [PianaGlobals.emblAccessionID_col, PianaGlobals.emblPID_col]

    # nothing matched before, and there are letters and numbers... try geneName
    if re.match(r"\w+", proteinCode_value):
        return [PianaGlobals.geneName_col]

    return []


def get_codeVersion_item(proteinCodeVersion_value, item2get):
    """
    Splits a versioned protein code "proteinCodeVersion_value" (ie. <code>.<version>) at its dot
    and returns the part asked for in "item2get"

    "item2get" can be:
       -> "proteinCodeType_value": the part before the dot is returned
       -> "proteinCodeType_version": the part after the dot is returned
       -> anything else: the string "unknown" is returned

    "proteinCodeVersion_value" must contain exactly one dot: otherwise the unpacking of the two
    parts raises a ValueError
    """

    code_part, version_part = re.split(r"\.", proteinCodeVersion_value)

    if item2get == "proteinCodeType_version":
        return version_part

    if item2get == "proteinCodeType_value":
        return code_part

    return "unknown"


def print_code_type_names():
    """
    Writes to standard error the easy-to-remember protein code types, ie. the keys of
    PianaGlobals.valid_protein_types (all on the same line, each one preceded by a dash)
    """
    for type_name in PianaGlobals.valid_protein_types:
        sys.stderr.write("     - %s " %type_name)

# ----------------------------------------
# Methods that deal with protein sequences                                              
# ----------------------------------------

def get_clean_sequence(input_sequence):
    """
    returns "input_sequence" without spaces, asterisks, newlines, tabs, carriage returns and
    underscores, leaving only a contiguous string of aminoacids
    """
    cleaned_sequence = input_sequence

    for unwanted_character in (" ", "*", "\n", "\t", "\r", "_"):
        cleaned_sequence = cleaned_sequence.replace(unwanted_character, "")

    return cleaned_sequence
    
# ----------------------------------------
# Miscellaneous methods that deal with proteins
# ----------------------------------------

def _get_root_font_color(root_code, dic_over_expressed, dic_infra_expressed, over_expressed_color):
    """
    returns the html font color used to print root protein "root_code", depending on whether it
    is a key of "dic_over_expressed", of "dic_infra_expressed", of both or of none of them

    "over_expressed_color" is the color used for roots that are only over expressed
    """
    is_over_expressed = root_code in dic_over_expressed
    is_infra_expressed = root_code in dic_infra_expressed

    if is_over_expressed and is_infra_expressed:
        return "9900CC"
    if is_over_expressed:
        return over_expressed_color
    if is_infra_expressed:
        return "33FF00"
    return "000000"


def print_proteins_to_file(file_object= None, proteins_list=[], piana_access=None, dic_ext_code_2_pp= {}, dic_root_ext_codes= {},
                           dic_special_labels= {}, dic_linkers= {},
                           dic_over_expressed={}, dic_infra_expressed={},  dic_root_linked={}, format_mode=None, files_prefix="", file_title="", pickle_file=None ):
    """
    takes a list of protein names and prints them to a file, one per line, with some extra information about the proteins

    "file_object" is the file where the output will be written

    "proteins_list" is the list of protein names that will be printed to the file (they must be in the type of identifier you wish to use for your output)

    "piana_access" is the PianaDBAccess object you use to access your piana database
        -> its methods get_protein_description and get_protein_function are called for each proteinPiana

    "dic_ext_code_2_pp" is a dictionary with keys external codes and values are lists of proteinPianas associated to those ext codes

    "dic_root_ext_codes" is a dictionary with keys external codes that are root proteins

    "dic_special_labels" is a dictionary with keys external codes and contents are lists of special labels for that protein ext code

    "dic_linkers" is a dictionary with keys external codes that are linker proteins (contents are the roots they connect)

    "dic_over_expressed" is a dictionary with keys external codes that are over expressed proteins
    "dic_infra_expressed" is a dictionary with keys external codes that are under expressed proteins

    "dic_root_linked" is a dictionary with keys external codes of proteins that are not linkers: its content is
        printed in the 'linked to roots' column, colored according to the expression dictionaries
        -> this dictionary doesn't need to have all proteins that will be printed
        -> NOTE(review): the content is used as if it were a root code, while previous documentation
           described the contents as "yes", "no" or "is_root": confirm with the callers

    "format_mode" must be 'txt' or 'html'
        -> ValueError is raised when a protein has to be printed and format_mode is something else

    "files_prefix" is the prefix that was added in front of files with interactions for each protein. (see PianaGraph.match_pathways)

    "file_title" is a label that will be placed as a title for the output file (only in HTML mode)

    "pickle_file" can be a file object or None. If it is a file object, a dictionary with key the protein name and
                  content the string printed for it is dumped to it (in html mode, keys 'header' and 'footer' are added as well)
                  It is used by parse_matched_pathway_files.py to print a compendium of interesting proteins

    Attention! All proteins in proteins_list must be a key in dic_ext_code_2_pp
    """
    if pickle_file:
        dic_to_pickle = {}    # is a dic with keys the protein ext codes and contents the string associated to each protein
                              # there is also two other keys which are: 'header' (with the headers associated to this info) and 'footer' (idem)

    if format_mode == "html":
        file_object.write("<br><center><b>PROTEIN LIST FOR: %s</b></center><br><br>" %(file_title))
        header_string="<table border=1>\n<tr><td align=center><b>Protein</b></td><td align=center><b>Source</b></td><td align=center><b>Linked to Roots</b></td><td align=center><b>Oncomine Info</b></td><td align=center><b>Special Labels</b></td><td align=center><b>Description</b></td><td align=center><b>Function</b></td></tr>\n"
        file_object.write(header_string)

        if pickle_file:
            dic_to_pickle['header'] = header_string

    for one_ext_code in proteins_list:

        # get descriptions, functions for all proteinPianas associated to this external code
        # (dictionaries are used to remove repeated descriptions and functions)
        dic_descriptions = {}
        dic_functions = {}
        for one_proteinPiana in dic_ext_code_2_pp[one_ext_code]:

            for one_description in piana_access.get_protein_description(proteinPiana_value =one_proteinPiana):
                dic_descriptions[one_description] = None

            for one_function in piana_access.get_protein_function(proteinPiana_value =one_proteinPiana):
                dic_functions[one_function] = None
        # END OF for one_proteinPiana in dic_ext_code_2_pp[one_ext_code]:

        # find out if this is a cancer gene or a prediction
        if one_ext_code in dic_root_ext_codes:
            was_seed = "Cancer_Gene"
        else:
            was_seed = "Predicted"

        # find out which roots are connected by this protein (and their expression level)
        roots_connected = ""
        if one_ext_code in dic_linkers:
            if format_mode == "txt":
                roots_connected = "".join([one_root + "-" for one_root in dic_linkers[one_ext_code]])

            elif format_mode == "html":
                for one_root in dic_linkers[one_ext_code]:
                    root_color = _get_root_font_color(one_root, dic_over_expressed, dic_infra_expressed, "FF33FF")
                    roots_connected += " <font color=%s> %s </font> " %(root_color, one_root)

        elif one_ext_code in dic_root_linked:
            # if it is not a linker, then look if it is connected to a root protein
            # NOTE(review): html markup is produced here in txt mode too, and the color for over expressed
            #               roots (FF3300) is not the one used for linkers (FF33FF): confirm both are intended
            linked_root = dic_root_linked[one_ext_code]
            root_color = _get_root_font_color(linked_root, dic_over_expressed, dic_infra_expressed, "FF3300")
            roots_connected = " <font color=%s> %s </font> " %(root_color, linked_root)

        # expression level of the protein itself
        is_over_expressed = one_ext_code in dic_over_expressed
        is_infra_expressed = one_ext_code in dic_infra_expressed

        expression_info_string = ""
        if is_over_expressed and is_infra_expressed:
            if format_mode == "txt":
                expression_info_string = "Over-expressed_and_Under-expressed"
            elif format_mode == "html":
                expression_info_string = "<font color=FF33FF>Over-expressed</font> <br>and <font color=33FF00>Under-expressed</font>"

        elif is_over_expressed:
            if format_mode == "txt":
                expression_info_string = "Over-expressed"
            elif format_mode == "html":
                expression_info_string = "<font color=FF33FF>Over-expressed</font>"

        elif is_infra_expressed:
            if format_mode == "txt":
                expression_info_string = "Under-expressed"
            elif format_mode == "html":
                expression_info_string = "<font color=33FF00>Under-expressed</font>"

        if one_ext_code in dic_special_labels:
            special_labels_string = "".join([one_label + " " for one_label in dic_special_labels[one_ext_code]])
        else:
            special_labels_string = " "

        if format_mode == "txt":
            info_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %(one_ext_code, was_seed, roots_connected, expression_info_string, special_labels_string,
                                                           list(dic_descriptions), list(dic_functions))

        elif format_mode == "html":
            # "/" and " " are both replaced by "_" to build the name of the interactions file of the protein
            # NOTE(review): confirm that the code writing those files (see PianaGraph.match_pathways) names them the same way
            interactions_file_name = files_prefix  + "." + one_ext_code.replace("/","_").replace(" ","_") + ".interaction_file.html"

            info_string = """<tr><td>%s <a href="%s">(ints)</a></td><td align=center>%s</center></td><td align=center>%s</center></td><td align=center>%s</center></td><td align=center>%s</center></td><td>%s</td><td>%s</td></tr>\n""" %(
                one_ext_code,
                interactions_file_name,
                was_seed,
                roots_connected,
                expression_info_string,
                special_labels_string,
                list(dic_descriptions),
                list(dic_functions))

        else:
            # there would be nothing to print for this protein: fail with a clear message
            raise ValueError('format_mode must be "txt" or "html"')
        # END OF else: (if  format_mode == "txt":)

        file_object.write(info_string)

        if pickle_file:
            dic_to_pickle[one_ext_code] = info_string

    # END OF for one_ext_code in proteins_list:

    if format_mode == "html":
        file_object.write("</table>")

        if pickle_file:
            dic_to_pickle['footer'] = "</table>"

    if pickle_file:
        cPickle.dump(dic_to_pickle, pickle_file , 2)  # 2 is the pickle protocol to be applied

def return_proteins_from_file(file_object= None, proteins_type=None):
    """
    returns a list of proteins read from a file descriptor (removing redundancies)

    file contains one protein per line unless it is a fasta file
        -> protein must be in the first column of the file. The other columns can contain other information
        -> columns are tab separated
        -> lines whose first column is empty (eg. blank lines) are skipped

      -> if proteins_type is not 'fasta' this method just returns the proteins in the file in the form of a list (no id type change involved)

      -> if it is a fasta file, set "proteins_type" to "fasta". For a fasta file, this method returns a list of sequences
         (if type is fasta, pianaDB will consider you are using "sequence" as a protein code)

    the order of the returned list is the order of the keys of the dictionary used to remove redundancies
    """
    # a dictionary is used to remove repeated proteins
    dic_input_protein_codes = {}

    if proteins_type != "fasta":

        for file_line in file_object:
            # reading lines of input file (which are protein codes)
            protein_name = file_line.split("\t")[0].strip()

            if protein_name:
                # an empty name is not a protein code
                dic_input_protein_codes[protein_name] = None
        # END OF for file_line in file_object:

    else:
        # if the input file is in fasta format, processing has to be different
        fasta_iterator = Fasta.Iterator(file_object, Fasta.RecordParser())

        # the loop relies on the iterator returning None when there are no more records
        fasta_record = fasta_iterator.next()

        while fasta_record is not None:
            dic_input_protein_codes[fasta_record.sequence.strip()] = None

            fasta_record = fasta_iterator.next()
        # END OF while fasta_record is not None:
    # END OF else: (if proteins_type != "fasta":)

    return list(dic_input_protein_codes)

def return_dic_proteinPianas_from_file(piana_access= None, tax_id_value=None, file_object= None, proteins_type=None):
    """
    returns a dictionary whose keys are the proteinPianas found for the proteins listed in a file
    (contents are always None: the dictionary is just a way of removing redundancies)

    "piana_access" is the object used to query the database: its method get_list_protein_piana is
    called for each protein code

    "tax_id_value" is passed untouched to get_list_protein_piana

    "file_object" is the file descriptor with the proteins, read through return_proteins_from_file:
        file contains one protein per line unless it is a fasta file
        -> protein must be in the first column of the file. The other columns can contain other information
        -> columns are tab separated

    "proteins_type" is the easy-to-remember id type of the proteins in the file (it is translated
    to a database column through get_code_column)
    """
    found_proteinPianas = {}

    for protein_code in return_proteins_from_file(file_object=file_object , proteins_type=proteins_type):

        proteinPianas_of_code = piana_access.get_list_protein_piana(proteinCode_value= protein_code,
                                                                    proteinCodeType_value= get_code_column(proteins_type),
                                                                    tax_id_value= tax_id_value,
                                                                    source_db_info= "no")

        found_proteinPianas.update(dict.fromkeys(proteinPianas_of_code))

    return found_proteinPianas
    
    
    
# ------------------------------
# Miscellaneous methods that deal with numbers
# ------------------------------

def get_ratio_group(numerator, denominator):
    """
    returns the label of the interval where the ratio numerator/denominator lies

       -> "-0-" for a ratio that is exactly 0
       -> "<from>-<to>" for a ratio in the interval (from, to]: steps of 0.1 up to 1 (labels
          written without the decimal point, eg. "02-03"), steps of 1 up to 10, and steps of
          10 up to 30
       -> "huge" for anything else (ratios above 30, and negative ratios as well)

    raises ZeroDivisionError if "denominator" is 0
    """
    # (upper limit of the interval, label of the interval), in ascending order: the lower limit
    # of each interval is the upper limit of the previous one (0 for the first interval)
    intervals = ( (0.1, "0-01"), (0.2, "01-02"), (0.3, "02-03"), (0.4, "03-04"), (0.5, "04-05"),
                  (0.6, "05-06"), (0.7, "06-07"), (0.8, "07-08"), (0.9, "08-09"), (1, "09-1"),
                  (2, "1-2"), (3, "2-3"), (4, "3-4"), (5, "4-5"), (6, "5-6"),
                  (7, "6-7"), (8, "7-8"), (9, "8-9"), (10, "9-10"),
                  (20, "10-20"), (30, "20-30") )

    ratio = numerator / float(denominator)

    if ratio == 0:
        return "-0-"

    if ratio > 0:
        for upper_limit, interval_label in intervals:
            if ratio <= upper_limit:
                return interval_label

    return "huge"

def get_range_group(value):
    """
    returns the label of the range where the value "value" lies

       -> "-0-" for a value that is exactly 0
       -> "<from>-<to>" for a value in the range (from, to]: steps of 10 up to 200, and then
          "200-250" and "250-300"
       -> "huge" for anything else (values above 300, and negative values as well)
    """
    # (upper limit of the range, label of the range), in ascending order: the lower limit of
    # each range is the upper limit of the previous one (0 for the first range)
    ranges = ( (10, "0-10"), (20, "10-20"), (30, "20-30"), (40, "30-40"), (50, "40-50"),
               (60, "50-60"), (70, "60-70"), (80, "70-80"), (90, "80-90"), (100, "90-100"),
               (110, "100-110"), (120, "110-120"), (130, "120-130"), (140, "130-140"), (150, "140-150"),
               (160, "150-160"), (170, "160-170"), (180, "170-180"), (190, "180-190"), (200, "190-200"),
               (250, "200-250"), (300, "250-300") )

    if value == 0:
        return "-0-"

    if value > 0:
        for upper_limit, range_label in ranges:
            if value <= upper_limit:
                return range_label

    return "huge"

def calculate_std_deviation( all_values= None, with_mean = 0):
    """
    calculates the std deviation of all numerical values in list "all_values"

    returns 'unknown' when all_values is None, an empty list, or a list without any usable value
    (this is returned regardless of the value of "with_mean")

    if "with_mean" is 1, then it returns a tuple (std_deviation, mean)

                                                                2
    std deviation defined as                  sum [ (value-mean)  ]
                             square root {   -----------------------  }
                                                  num values

    (the divisor should be (num values - 1), but since most of the time there are just a few cases,
     subtracting 1 would have a very high impact on the stats)

    When a value is "None" or None or "unknown", it is ignored and used neither for calculating
    the mean nor for calculating the std deviation
    """
    if all_values is None:
        # no values given: behave as with an empty list
        all_values = []

    # filter once, so that the mean and the squared differences are computed on exactly the same values
    # (the placeholder "unknown" used to be skipped for the mean but not for the differences, which
    #  raised a TypeError when subtracting the mean from a string)
    valid_values = [value for value in all_values
                    if value is not None and value != "None" and value != "unknown"]

    number_of_values = len(valid_values)
    if number_of_values == 0:
        return "unknown"

    mean = sum(valid_values) / float(number_of_values)

    # now, calculate the sum of (value-mean) to the power of 2
    total_sum_powered = 0
    for value in valid_values:
        total_sum_powered += math.pow(value - mean, 2)
    # END OF for value in valid_values:

    # divide it by n (population std deviation) and take the square root
    std_deviation = math.sqrt(total_sum_powered / float(number_of_values))

    if with_mean:
        return (std_deviation, mean)

    return std_deviation
        


# ------------------------------
# Methods that should not be here ;-)                                             
# ------------------------------


def get_patchgroup_graph_from_file_name(file_name = None):
        """
        returns a PatchGroup object from a file name "file_name" that was previously created with cPickle.dump

        "file_name" is the path of the pickle file: it is opened in binary mode and closed before returning

        whatever exception is raised while opening the file or while unpickling is propagated to the caller
        """
        # TO DO!!! Move this to another file
        #  --> this is here because I don't manage to make it work as staticmethod in class PatchDecomposition

        # WARNING: unpickling can execute arbitrary code, so only load files that come from a trusted source
        pickle_fd = open(file_name, "rb")
        try:
            return cPickle.load(pickle_fd)
        finally:
            # make sure the file is closed even if unpickling fails
            pickle_fd.close()
    

