# --------------------------------------------------------------------- # File : general_template.piana_conf # Author : Ramon Aragues # Creation : 2.10.2004 # Updated : 6.4.2007 # Contents : template to be used to create your own piana # configuration files # --------------------------------------------------------------------- # This file is used to configure parameters for piana.py for some # specific use described below # # if you want piana.py to be configured by this file, set command line # option --configuration-file=this_file_name # # configuration files can be used for both execution modes: batch or # interactive # # -> in execution mode interactive, the execution section will be # ignored (ie. will only read parameters section) # # ----------description of this particular configuration file --------- # # This is a template for creating your own piana configuration files: # edit, move, delete options in this file to tell piana what you want # to do: in your configuration file, write here what is the use it has # (eg creates a network from an input file and prints the interactions # table to the screen) # # # In your configuration file, write here which are the parameters that # must be set through the command line # - apart from this configuration file, the user must use piana.py # command line options: # # (For example, you could write something like this: # # --> input-file-name # --> input-id-type # --> output-id-type # --> results-prefix # --> piana-dbname # --> piana-dbhost # --> piana-dbuser (depends on the system) # --> piana-dbpass (depends on the system) # --> depth # ) # # These parameters are required in the command line! In this file, # they are set to blank, obliging the user to set them on the # command line (although, they could have values assigned and still # be ignored, since the command line has preference over the # parameter values in this file) # ------------------------------------------------------------------- # # # # Attention! 
# # - All non-configuration lines in this file must start with '#' # (unless empty line) # # - A configuration line that is preceded by '#' is not taken # into account # # - Configuration file parameters equal to blank are ignored # # ----------------------------------------------------------------- # # Remark: in many parameters and commands, the PIANA reference card is # mentioned as the place to look for method names, id types, etc. To # print the PIANA reference card, run # "python piana.py --print-configuration-file --piana-dbname=your_pianaDB --piana-dbhost=your_piana_host" # # ------------------------------------------------------------------ # ================================================================== # configuration of execution parameters # ------------------------------------------------------------------ # set here the input parameters for this specific configuration that # are not required in the command-line # ================================================================== # exec-mode can be interactive or batch. # - if interactive is chosen, the execution commands of this file are # ignored, and the user can chose commands from a text menu. # - if batch is chosen, piana.py will execute the commands described # in the execution commands section of this file exec-mode=batch # *************************** # Memory usage # *************************** # Networks can be built and managed in different ways. This parameter # can have the following values: # "high": all information of the network is stored in memory. # - slower to build the network # - faster when information is printed more than once. # "low": information is retrieved from database when needed. # - faster to build the network # - slower to print and to post-process the network. 
# By default, if this parameter is not used, its value will be "high" memory-usage=high # **************************** # Input proteins configuration # **************************** # Proteins can be added to the network using commands (eg add-protein # and add-proteins-file), via the command line by setting parameter # input-file or setting it here (ie. input parameters section) # - If no input-file is set, the network will be empty unless you add # proteins or interactions afterwards # - If an input-file is set, then the initial network will be built # using proteins in this file (more proteins can be added afterwards # using PIANA commands) # # Set here the file name that contains one protein per line # - all proteins must be of the same type # - if you have proteins in different code type (eg. gis and uniprot) # then you must separate them into different files to make # sure each file only contains proteins of the same type # Then, you can use command add-proteins-file to add each # of your files # - example input files are in piana/code/execs/dummy_input_files input-file=blank # Set here the type of code for proteins that will be used by default # This parameter is also used (and required) if you set an input-file # # - valid input id types are those defined in # the PIANA reference card # -> type python piana.py --print-configuration-file --piana-dbname=your_pianaDB --piana-dbhost=your_piana_host" # to print the PIANA reference card input-id-type=blank # Set here the protein species that will be used by default # --> valid values are all and NCBI species names # (egs: all, yeast, human, Candida albicans SC5314,...) # -> you must write the complete name or it won't work. 
# If you don't know which is the complete name for # your species you can look at the website: # http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=Taxonomy # (alternatively, you can look into table species of your # PIANA mysql database) # # This parameter is required if you set an input-file # # -> Normally, this can be set to all, since the protein code # already refers to a unique species. Use this parameter to fix the # protein species when the protein code you are using does not # implicitly refer to a species: for example one geneName can refer # to different proteins of many species # # -> if you write an unknown species name or a species name # that is ambiguous (ie. corresponds to more than one tax id) # PIANA will halt the execution # input-proteins-species=all # ***************************** # Piana Database configuration # ***************************** # # Set here which piana database to use, where it is and how to access # it: # - piana-dbname (eg pianaDB_limited) and piana-dbhost (eg sefarad) # are required # - piana-dbuser and piana-dbpass are required in mysql password # protected systems # # If your system uses a mysql port different than the default one, you # can change it by setting the correct value for variable piana_dbport # on file piana/code/utils/piana_configuration_parameters.py piana-dbname=blank piana-dbhost=blank piana-dbuser=blank piana-dbpass=blank # (the following parameter is only for advanced users... leave it to # blank if you don't understand what it does) # # Set here if you want to use a secondary PIANA database or not # -> this secondary database will be used if the primary database # (which is set above) does not contain any interactions for # a given protein of interest. # -> for proteins with known interactions in the primary database, # the secondary database will not be used # -> this is a parameter that lets you do something like this: # "I want to build a network for my proteins of interest. 
For # those proteins for which experimental interactions are known # I only want to use those. If no experimental interactions # are known for a given protein, then I want to look for # predictions for that specific protein and add them to the # network." In this scenario, the user has two synchronized # PIANA databases, with one containing only experimental # interactions and the other one containing experimental # and predicted interactions (see section "PIANA databases # contain all the information PIANA needs" on file # README.piana_tutorial for more information on why you # would want to have two separate synchronized PIANA databases) # -> if you do not want to generalize the use of the secondary # database you have the possibility of forcing the use of # the secondary database only for a specific protein or # proteins using argument piana-db of commands add-protein # and add-proteins-file # # -> Attention! The primary and the secondary PIANA databases # must be synchronized. Otherwise you cannot use this feature. # By synchronized I mean that they must differ only in the # interaction tables: their protein information must be the # same (same proteinPianas, same values in the protein tables, # same protein databases have been parsed for both, etc) (see # section "PIANA databases contain all the information PIANA # needs" on file README.piana_tutorial for a detailed description # on how to keep two separate synchronized PIANA databases) # # -> the secondary database that will be used is to be set by you # on file piana/code/utilities/piana_configuration_parameters.py, # section "PIANA secondary database" # -> Attention: even if you set use-secondary-db to 'no', # if piana_configuration_parameters.py has a secondary # db set, PIANA will create the connection to the # secondary DB. It will not use it (since the parameter # use-secondary-db is set to 'no') but it will create # the connection.
If you do not want to create that # connection, you must set to None all parameters in # piana_configuration_parameters.py related to the # secondary DB. # -> I am thinking about including all the secondary # database parameters in the piana configuration files # (ie. here) but I don't want the 'standard' PIANA user # to be confused, so maybe I'll leave it like this... # # -> Attention! There are two cases in which this parameter might not # have the effect you intended (ie. predictions might not be added # for a protein which had no experimental interactions). These two # cases are: # -> the protein interacts with itself according to experimental # evidence: PIANA will consider that it has experimental # interactions and will not look into the secondary DB # -> you can avoid this by setting the input parameter # use-self-ints to 'no' # -> even when telling PIANA not to use self interactions, # it might happen that there is an experimental interaction # between two different proteinPianas (ie. sequences) which # translated to your protein identifier become a single node # in the network (ie. two proteinPianas pointing to the same # identifier). Since PIANA does not know beforehand that that # interaction is in fact a self-interaction, it will use it # and therefore will not look at predictions for that protein. # # You can solve this problem by deciding which are the proteins # for which you want to use the predictions and then setting # argument piana-db to secondary in commands add-protein and # add-proteins-file # # Valid values for use-secondary-db are yes, no or blank (which is # defaulted to no) # use-secondary-db=blank # ***************************** # Network options # ***************************** # Set here the depth to which the network will be developed (ie. how # many interaction steps will be searched for each root protein) # # --> Setting depth to 1, the partners of the proteins in the input # file will be used to build the network.
Setting depth to 2, # the partners of the partners of the seed proteins will be # used. Etc, etc. # --> Setting depth to 0 will search for interactions between # the input proteins # --> Setting depth to -1 will not build a network (useful when # using commands that do not require a network, such as # translating between protein identifiers) # depth=blank # Set here the hub threshold # # Use this option to limit the interactions that will be added to # the network # # If a protein has hub-threshold interactions or more, these # interactions will not be added # -> this parameter is used to avoid inserting in the network # those proteins that bind to "everything" # -> set it to 0 if you don't want any thresholds to be applied # # Attention: this threshold applies at the time of creating the # network. Due to the idiosyncrasy of PIANA (PIANA # keeps interactions between protein sequences, not # between protein identifiers), you might observe # in some cases that your output network does not # respect this threshold. Read README.piana_tutorial # for more details on this.
hub-threshold=blank # Set here whether self interactions should be added to network or # not # -> in some analyses, self interactions perturb the results # because they cause all proteins to be at all possible # distances from a given protein # # -> a self interaction is a protein that is known to interact # with itself # # -> valid values are 'yes' (ie use all interactions) and no (ie # do not add self interactions to network) use-self-ints=yes # Set here the interaction databases that you want to use # -> list-source-dbs=all will use interactions from all source # databases in your PIANA database # -> write colon-separated database names to limit the source # databases # - valid interaction database names are those defined # in PianaGlobals.interaction_databases # --> do python2.3 piana.py --help to get a list of # valid database names # - for example, write list-source-dbs=dip:string:mips # or for just one database... # list-source-dbs=dip # # -> if inverse-dbs is yes, then this parameter does the opposite: # dbs in list-source-dbs will not be taken into account. # # - if list-source-dbs is all, inverse-dbs is ignored (it # doesn't make sense to ignore all databases) # # # Attention: any configuration different from (all, no, no) will # slow down PIANA, since it has to introduce # restrictions when searching for interactions # list-source-dbs=all inverse-dbs=no # Set here the interaction methods that you want to use # -> list-source-methods=all will use interactions from all types # of methods in piana-dbname # -> write colon-separated method names to limit the methods # - valid method names are those defined in # PianaGlobals.method_names.keys() # - for example, write list-source-methods=y2h:copurif # or for just one method...
# list-source-methods=y2h # # -> if inverse-methods is yes, then this parameter does the opposite: # methods in list-source-methods will not be taken into account # -> if list-source-methods is all, inverse-methods is ignored # (it doesn't make sense to ignore all methods) list-source-methods=all inverse-methods=no # ***************************** # Output options # ***************************** # # Set here how the output results look like # # Set here the type of protein code to be used in your output # # - valid output id types are those defined in # the PIANA reference card # -> type python piana.py --print-configuration-file --piana-dbname=your_pianaDB --piana-dbhost=your_piana_host" # to print the PIANA reference card # output-id-type=blank # Set here alternative types of protein identifiers for your output # # - alternative-id-types determines which protein identifier types # will be used in case no code is found for output-id-type # # -> write a colon-separated list of easy-to-remember id types # (for example: uniacc:unientry:gi:md5 ) # -> valid id types are those defined in # the PIANA reference card # -> alternative-id-types cannot be set through the command line # -> it is recommended to write md5 as last id type code to # be used, so output has the protein md5 at least # -> md5 is a checksum of the protein sequence (sequence is # transformed to a unique code (shorter than the sequence # itself)) # # Attention! If you do not set at least one type of code for which you # can be sure there will be a value (eg. 
md5) PIANA might # have an error when outputting results (because it won't # know which name to use for that protein) # alternative-id-types=md5 # Set here which species you want your output proteins to be # # - output-proteins-species determines the species that the # output_proteins must be in order to be printed # # valid values are: # -> all: will print network proteins regardless of the # species # -> and those names in the NCBI database # (egs: all, yeast, human, Candida albicans SC5314,...) # # -> you must write the complete name or it won't work. # If you don't know which is the complete name for # your species you can look at the website: # http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=Taxonomy # (alternatively, you can look into table species of your # PIANA mysql database) # # -> this parameter only affects what you get in the output! The # network is allowed to contain proteins from other species. # You can control which species is used to build the network # with parameter input-proteins-species # # -> when printing interactions, setting this parameter to a species # guarantees that at least one of the two proteins belongs to # that species. Then, you can make the distinction between # interactions where both proteins are of the same species # by looking at the species field ('same' when both proteins # are of the same species, 'different' when only one of the # two proteins is of the species you chose) # # -> when printing information, only proteins of this species # will be shown # # -> connecting information (ie. linker proteins) of proteins is # not affected by output-proteins-species # # -> setting a species is recommended (ie.
do it) when using geneName # as protein code (duplications across species) # # -> this currently applies to outputting the following information: # - proteins information # - translating codes # - printing interactions and network # - printing expansions (not to mode "add" in expansions) # output-proteins-species=all # Set here the results prefix that will identify your results # # - results-prefix will precede each results file. A file extension # describing the results file content will be added to results-prefix # (eg. interaction table will be printed to results-prefix.print-table) # results-prefix=blank # Set here the directory where your results will be saved # # - results-dir is the directory where results will be saved # (e.g. /home/pepito/piana/results/ ) # -> if it is equal to blank, results are saved in piana execution # directory # -> you must place the slash at the end of the path! # results-dir=./ # ********************************* # Extra data to be added to network # ********************************* # Set here the keywords that you want to use for your analysis: this # is a way that PIANA provides to detect proteins related to specific # words # # -> write a colon-separated list of keywords # - these are the keywords that are used to detect if the # proteins of the network are related to something you are # interested in # - for example, if you are interested in cancer, you could write: # list-keywords=cancer:carcinoma:onco:tumor:apoptosis # # -> the network produced by command print-network will highlight in # red those proteins that contain one or more keywords in their # function, description or name # -> the output produced by commands print-*-prots-info will: # - in format-mode 'txt': write tokens user_keyword=word for # those proteins that contain the keyword in their function, # description or name # - in format-mode 'html': underline the proteins that contain # the keyword in their function, description or name # -> for command
print-table, when using format-mode 'html', this list # of keywords is used to underline the proteins that contain the # keyword # # -> list-keywords cannot be set through the command line: it must be # done here # -> if you do not want to use keywords for your analysis, leave this # to blank list-keywords=blank # Set here the names of files that contain proteins that you have found to # be over/under expressed. # # These files will be used for several things: # # 1) to highlight over/under expressed proteins in the network # (in the image file (dot or node attributes for cytoscape) # over expressed proteins will appear as a box with red border. # under-expressed proteins as a box with green border) # -> Colors can be changed in PianaGlobals.node_border_colors # dictionary # Text and HTML tables will also provide information on the # expression associated to each protein in the network. # # 2) to be able to select proteins from the network that are over/under # expressed (eg. in PIANA commands such as match-proteins-to-pathways) # # -> these files contain one protein per line, using a code of type # indicated in parameter expression-id-type # -> first column must be the protein name. 
Subsequent columns # can have extra information (which will be ignored) # -> expression-id-type can be any of the PIANA id types # --> you can do $> python piana.py --help, which # will display valid input values for proteins id types # # -> input-proteins-species will be used to retrieve the proteins in these # files: therefore, if you are using a list of geneNames for your # over/under expressed lists, set your input-proteins-species to a # valid value (other ids are less ambiguous and do not necessarily # need to set the species) file-over-expressed=blank file-infra-expressed=blank expression-id-type=blank # Set here the file names that contain proteins that you want # to label as "special" in your results # # -> you should use this parameter to highlight specific proteins # that are 'important' to you. For example, if you are # creating a network for proteins obtained in an experiment, and # you have a list of proteins that are interesting because # they are known to be related to a disease, you can set # here the files that contain those disease-related proteins. # These proteins can then be visualized in cytoscape with # a different color by mapping their labels to colours # (cytoscape attributes *.special_prots.noa) # Text and HTML tables will also provide information on # the labels associated to each protein # # -> Moreover, you can add one extra column to special files. This # extra column will be taken as a property of the protein, # and will be printed along the protein in output files. # For example, if you have data on which is the probability of # a protein of being related to cancer, you can create a special # file that contains rows in the form of: # protein probability # .................. # These probabilities will then be printed in output files using # different formats.
For example, when classifying network proteins # according to their connections to root proteins, an extra column # will be added that says label=prob@0.67 # # -> for each file with relevant proteins you must specify the # type of protein identifier you wish to use and a label. # The label will be used when printing out results so that # you know in which file was that network protein found. # For example, you could have files for 5 pathways of # interest, with the corresponding labels, and then # your results will show to which of the 5 pathways # does each protein belong to. # # -> the format you must follow is the following: # special-proteins=path_to_file1,id_type1,label1:path_to_file2,id_type_2,label2:... # -> Eg. special-proteins=/home/brain_cancer.txt,geneName,brain:/home/liver_cancer.txt,geneName,liver # -> Attention! Do not end up the line with ':' # -> Attention! Paths must be absolute paths (i.e. ../../xxxx.txt and ./dfsdf.txt are wrong!!) # # -> each special file is simply a text file with one protein per # line (and an optional extra column) # special-proteins=blank >>> Do not remove this line: marks transition from parameters to commands <<< # ========================================================================== # configuration of execution commands # ========================================================================== # Set here the commands that piana.py will execute # # these commands can be ordered as desired by the user: it is up to # the user to make sure the command sequence makes sense (eg. not # asking to write a table without building a network first) # # commands that can be used are those listed in piana.py --help # # some commands require extra information that can be set in this file # as well (eg. 
command "species-network" requires a species_name, # which will be provided as well in this configuration file) # # even if you don't want to give any value to the command arguments, # you must leave the argument and set it to blank # # You should choose the commands you want to execute from the list # following these lines. Remove those commands that you are not # interested in, and set arguments appropriately for those commands that # you need. You can see some configuration file examples in this same # directory under *.piana_conf # # ========================================================================== # # the following commands perform the actions described in the # description of this particular configuration file # # - The commands will be executed in the same order as they appear in this # file # # - All commands must be followed by ";", even if they do not have arguments # --> the command arguments are separated by ";" # --> configuration lines with arguments should not finish with ";" # # - if you don't want the configuration line to set a given argument, write # "blank" after the "=" sign # # - default names for output files (used in case you set it to blank) are: # results_prefix.command_name_creating_output[.format_mode] # --> in some cases, extra information will be added to the results file # name (eg.
"_compact" or "_extended" output mode) # --> format_mode is usually added to indicate whether it is a text file # or an html page that has to be visualized with a browser # # - in all commands that set id_type, if nothing is found for id_type # (or input-id-type) alternative-id-types are used instead (and will # print protein codes as "alternative_id_type:protein_id" ) # # ============================================================================ # ************* # create-report: this command makes PIANA keep a report with links to results # ************* # # Place this command before the commands for which you want their results # to be 'centralized' in a global PIANA report. Once PIANA has finished, # you will find under the name you chose (see below) an HTML page describing # the results and providing links to all files produced by PIANA. You can # open this file using any internet browser. # # # It is recommended that if you want a global report you place this command # at the beginning of the execution section of your configuration file. # # Attention! This commmand only works if you are printing results in HTML # format. If you are outputting results in TXT format you will # still be able to access the results from the report but # raw text is not very nice to read on a browser... # # Attention! In order for the global report to point correctly to the # results files, all PIANA files must be in the same directory. # # - If you do not want the report to be written to a default file name, # then you must specify a value for argument file-name # - leave it to blank if you want the report to be written to the # default name (ie. results_prefix + "create-report.html") # Otherwise, the report will be saved to file_name in directory # results-dir (specified in parameters section) # ( an extension '.html' will be automatically added by PIANA to # this file name ) # # Attention! 
if you are asking PIANA to produce more than one report, # you must specify a file-name in order to distinguish # between them. Otherwise, PIANA will only write (to # the default name) one report (for the last set of # commands executed). # create-report;file-name=blank # ************* # reset-network: this command resets the network currently in memory # ************* # # Attention! It just resets the network (the input and output parameters are # not reset: input-id-type, output-id-type, etc will be the same) # If you need to work with different parameters, I recommend creating a separate # configuration file # # - no required parameters # # - it can be used to do operations on several different networks with a single # configuration file # (eg. build one network, get its results, reset the network, build a new # network, get new results) reset-network; # ************ # save-network: this command saves the current network into a disk file # ************ # # - file_name is required # - the network will be saved to file_name in directory results-dir # (specified in parameters section) # - the saved file is not human-readable: it is managed by python # (using cPickle) # save-network;file-name=file_name # ************ # load-network: this command loads into memory the network that was # saved in a file using save-network # ************ # # - file_name is required # - the network will be loaded from file_name in directory # results-dir (specified in parameters section) # # Attention!
A network can only be loaded to be used with the same # database from which the network was created (due to internal piana # distribution: proteinPiana identifiers are not coherent across # different piana databases) load-network;file-name=file_name # *********** # add-protein: this command adds a protein (and its interactions from # the piana database) to the network # *********** # # - network doesn't have to exist previously: it can be built from # a single protein through "add-protein" # # - required parameters (either from command line or from "parameters # section of this file") are: # -> depth and input-id-type (in case argument # id-type is blank in the execution below) # # - protein_name is required. # # - if protein code type used for protein-name is different from # input-id-type, then set id-type with the new type of # identifier as shown (eg. input-id-type is geneName but here # you want to use GenBank gi to refer to your protein). # if protein-name used is of input-id-type, then set # id-type to blank # -> Valid id types are blank (ie use input-id-type) # and those defined in the PIANA reference card # --> see comment on input-id-type # # - if the species of this protein is different from # input-proteins-species, then set species-name to the new species # as shown. # if the species of the protein is input-proteins-species, then set # species-name to blank # Valid protein species are blank (ie use input-proteins-species), # all (use all species) and those defined in NCBI (eg. human, yeast, # ...) # -> this parameter is mainly useful for protein codes that do not # implicitly establish their species (eg. geneName) # # # - if you do not want to use the primary PIANA database (the one you specified # in the input parameters) for retrieving interactions for this protein you # can set a different one using argument 'piana-db' # (reasons why you would want to do this explained on input parameter # use-secondary-db) # -> valid values are blank (ie. 
use primary) and secondary # -> if you set it to secondary, the database that will be used to # retrieve interactions for this protein will be the one that is # written as secondary on variables piana_secondary_* of file # piana/code/utilities/piana_configuration_parameters.py # -> Attention: primary and secondary database must be synchronized # Read more about this on description of input parameter # use-secondary-db # -> Attention> if you want to force a specific PIANA database for a # protein, make sure of the following: # -> either both databases are completely synchronized (the # secondary PIANA database is a superset of the primary, both # for proteins and interactions) # -> or make sure that this protein does not appear in other # input lists to this network. # add-protein;protein-name=protein_name;id-type=blank;species-name=blank;piana-db=blank # ***************** # add-proteins-file: this command adds proteins (and their # interactions from the piana database) from # a file to the network # ***************** # # - network doesn't have to exist previously: it can be built # with "add-proteins-file" # # - complete_path_to_file is required # -> file with input proteins must have one protein per line # -> proteins in this file must all be of the same type # # - if the type of protein code used in the file is different # from input-id-type then set id-type # -> valid id types are blank (ie use # input-id-type) or those defined in # the PIANA reference card # --> see comment on input-id-type # # - if species-name is different from input-proteins-species, # then set the new species as shown. Otherwise, set to blank # Valid protein species are blank (ie use input-proteins-species), # all (use all species) and those defined in NCBI # -> this parameter is mainly useful for protein codes that # do not implicitly establish their species (eg. 
geneName) # # - if you do not want to use the primary PIANA database (the one you specified # in the input parameters) to retrieve interactions for the proteins in this # file you can set a different one using argument 'piana-db' # (reasons why you would want to do this explained on input parameter # use-secondary-db) # -> valid values are blank (use primary) and secondary # -> if you set it to secondary, the database that will be used to # retrieve interactions for these proteins will be the one that is # written as secondary on variables piana_secondary_* of file # piana/code/utilities/piana_configuration_parameters.py # -> Attention: primary and secondary database must be synchronized # Read more about this on description of input parameter # use-secondary-db # -> Attention> if you want to force a specific PIANA database for a # list of proteins, make sure of the following: # -> either both databases are completely synchronized (the # secondary PIANA database is a superset of the primary, both # for proteins and interactions) # -> or make sure that none of the proteins in this list appears # in other input lists to this network. # # # Attention!!! proteins in this file must all be of the same # type of identifier. 
If you have proteins that are of a different code type, # you must split the proteins in as many files as different types of # identifiers there are, and add file by file separately using this command add-proteins-file;file-name=complete_path_to_file;id-type=blank;species-name=blank;piana-db=blank # ********************* # add-interactions-file: this command adds interactions from a file to # the network # ********************* # # - network doesn't have to exist previously: it can be built with # "add-interactions-file" # # - complete_path_to_file is required # -> file with input interactions must have one interaction per # line # -> the input file format must be the following: # protein_a<TAB>protein_b<TAB>source_db<TAB>method<TAB>confidence # -> the format is described in detail on file # piana/code/dbParsers/piana_text_intParser/README.piana_interaction_data_format # -> source_db must be a db appearing in # PianaGlobals.interaction_databases # -> if you are using interactions from a # database that does not appear in this # list and you do not want to add a label # to PianaGlobals.interaction_databases # you can use 'user' as your source db # -> method must be a method appearing in # PianaGlobals.method_names # # - if the type of protein code used in the file is different # from input-id-type then set id-type # -> valid id types are blank (ie use input-id-type) # or those defined in the PIANA reference card # --> see comment on input-id-type # # - proteins in the interactions file must all be of the same type # # - this command does not add any interactions from the piana # database: if you also want the interactions from the # database you must create a file with proteins and use # command add-proteins-file # # Attention: all interactions in the file will be added! # No restrictions applied...
that means list-source-dbs # list-source-methods and use-self-ints have no effect # on this command # You are responsible for having the interactions you # want to use on your interactions file # (if you think it is important for you to apply # restrictions to your file, send me an email and # I will do it...) add-interactions-file;file-name=complete_path_to_file;id-type=blank # *************** # species-network: this command builds a network for all proteins in a # given species # *************** # # Executing this command will replace the existing network with a new # network. Moreover, this command ignores the input list of proteins # and the species set in the input section: it will build a network # using all proteins and all interactions of a given species. # # - a network must not previously exist (ie. commands build-network # and add-protein* not active) # # - to set the species for which you want to load the network, # you have two options (one of the two arguments must be set # to blank and the other to a correct value): # # - tax_id # -> valid taxonomy ids are those defined by the NCBI # (9606 for human, 7227 for drosophila meg, ... ) # # - species_name # -> valid species names are those in the NCBI database # (human, yeast, Arabidopsis thaliana, ...) # -> if the species name given has multiple corresponding # taxonomy ids, the network will contain proteins # from these multiple taxonomy ids # (eg. "rat" is tax_id 10116 and 10114) # # Attention: if both arguments are different from blank, # tax_id will be used # if none of the arguments is set to a value, # an error will be raised # # Attention! use this command at your own discretion... 
# networks can be huge species-network;species-name=blank;tax-id=blank # *********************** # database-method-network: this command builds a network for all # interactions in a given database and/or # a given method # *********************** # # This command can be useful to build a network that contains # all interactions in a given database... for example, if you # want to visualize the network for a database that you have # inserted into a piana database # # - a network must not previously exist # (ie. commands add-protein* can not appear in the same # configuration file as database-method-network) # --> this command does not require a list of proteins, # since it takes all interactions for a given database # and method # # - database_name is required # -> valid databases names are all (all databases taken into # consideration) and those in # PianaGlobals.interaction_databases # -> use at your own discretion... networks can be huge # # - method_name is required # -> valid method names are all (all methods are taken into # consideration) and those in PianaGlobals.method_names # -> use at your own discretion... networks can be huge # # - species_name is required: network will only contain # interactions between proteins of species_name # -> valid species names are all and those in the NCBI # database # -> if the species name given has multiple corresponding # taxonomy ids, the network will contain proteins from # these multiple taxonomy ids (eg. "rat" is tax_id 10116 # and 10114) # # Attention! 
hub_threshold parameter does not affect this # command: all interactions will be added regardless # of the hub_threshold value # database-method-network;database-name=database_name;method-name=method_name;species-name=species_name # *********** # print-table: this command prints a table with all interactions in # current network # *********** # # - if you do not want the table to be printed to default name # then you should set output-target to your own file name # -> output-target can be blank (ie. use default name), a # file name or screen if you want the results printed # to stdout # # - if you want the type of protein code used for printing # the table different from output-id-type then set # id-type. # -> valid id types are blank (ie use output-id-type) # or those defined in the PIANA reference card # --> see comment on input-id-type # # - print-mode is required: set which proteins will appear in # output # -> all: prints all interactions in the network # -> all_root: prints all interactions in the network where # at least one of the proteins is a root protein # -> only_root: prints only interactions between root proteins # in network # -> connecting: prints only interactions between root # proteins and those proteins that connect more than one # root protein (linker proteins) # # - format-mode is required: set which format will be used for # printing the output # -> txt: prints flat text # -> html: prints in html format # # # -> the output for all print-table commands in format-mode == html # is described in the output file itself # # -> the output for all print-table commands in format-mode == txt # is (all in one line... separated in lines for the sake of clarity): # # connectivity=num of root prots connected by the not-root-prot of the pair (CURRENTLY NOT WORKING!)
# # protein_1=protein_1 code using the type chosen by user # # neighbours_1=number of neighbours protein 1 # # root_1=(is-root or not-root) for protein 1 # # expression_1=(None, over_expressed or infra_expressed) expression info prot 1 # # protein_2=protein_2 code using the type chosen by user # # neighbours_2=number of neighbours protein 2 # # root_2=is-root or not-root for protein 2 # # expression_2=(None, over_expressed or infra_expressed) expression info prot 2 # # location=proteins are in same cellular location (y or n) # # species=proteins are of same species (y or n) # # db=source database where interaction appears # # db=source database where interaction appears # # db=.............. # # method=method used to detect interaction # # method=method used to detect interaction # # method=.............. # print-table;output-target=blank;id-type=blank;print-mode=print_mode;format-mode=format_mode # *************************** # print-table-db-intersection: this command prints a table with # interactions that appear in the # intersection of several databases # *************************** # # This command will only print those interactions that appear in all # the databases given by the user as argument For example, if the # network only has two interactions, one extracted from dip and the # other one extracted from dip and mips and the user sets list-dbs to # dip:mips, then only the second interaction will be printed by this # command # # - if you do not want the table to be printed to default name then # you should set output-target to your own file name # -> output-target can be blank (ie. use default name), a file # name or screen if you want the results printed to stdout # # - if you want the type of protein code used for printing the # table different from output-id-type then set id-type. # -> valid id-type values are blank (ie use # output-id-type) or those defined in # the PIANA reference card. 
# -> see comment on input-id-type # # - list-dbs is required: used to set the databases where the # interactions must appear in order to be printed # -> valid database names are those defined in # PianaGlobals.interaction_databases # -> for example: list-dbs=dip:string:ori # # - print-mode is required: set which proteins will appear in # output # -> all prints all interactions in the network # -> all_root prints all interactions in the network where at # least one of the proteins is a root protein # -> only_root prints only interactions between root proteins # in network # -> connecting prints only interactions between root # proteins and those proteins that connect more than one # root protein # # - format-mode is required: set which format will be used for # printing the output # -> txt: prints flat text # -> html: prints in html format # # - for obtaining several intersections, just repeat this command # changing the dbnames # # - the default results file name will describe the intersecting # dbs # # output format described in command print-table print-table-db-intersection;output-target=blank;id-type=blank;print-mode=print_mode;list-dbs=dbname1:dbname2:dbname3:...;format-mode=format_mode # ************* # print-network: this command prints a file with all the interactions # in current network. Output formats: DOT and SIF # ************* # # - if you do not want the network to be printed to default name # then you should set output-target to your own file name # -> output-target can be blank (ie. use default name), a file # name or screen if you want the results printed to stdout # # - if you want the type of protein code used for printing the # network different from output-id-type then set id-type. # -> valid id-type values are blank # (ie use output-id-type) or those defined in # the PIANA reference card.
# --> see comment on input-id-type # # - print-mode is required: set which proteins will appear in output # -> all prints all interactions in the network # -> all_root prints all interactions in the network where at # least one of the proteins is a root protein # -> only_root prints only interactions between root proteins # in network # -> connecting prints only proteins that are either root or # linkers (proteins that connect root nodes between them) # # - format-mode is required: defines which format the output will follow # - dot : DOT format (compatible with graphviz package and others) # - sif : SIF format (compatible with cytoscape and others) # # # -> The parameters that PIANA will use to generate the .DOT file # can be easily changed in file piana/code/PianaDB/PianaGlobals.py # -> section "PARAMETERS FOR OUTPUT .DOT NETWORK" describes all # the parameters that you can modify # # # --> The color of the node box is an indication of the type of # protein # Node fill colors can be easily changed in file # piana/code/PianaDB/PianaGlobals.py # (dictionary node_fill_colors in section COLOR CODES FOR # OUTPUT NETWORK) # # Currently, these are the meanings of the colors: # - blue: standard protein # - yellow: root protein # - red: protein that contains a keyword (see list-keywords # in input parameters) in its function, description # or name # - orange: root protein that contains a keyword (see # list-keywords in input parameters) in its # function, description or name # # --> The color of the border of the node box is an indication # of how that node was added to the network # Border colors can be easily changed in file # piana/code/PianaDB/PianaGlobals.py # (dictionary node_border_colors in section # COLOR CODES FOR OUTPUT NETWORK) # # Currently, these are the meanings of the border colors: # - black: protein from the database # - blue: protein added to the network after a prediction # based on interologs # - green: protein found in the file with under expressed #
proteins (from a microarray experiment) # - red: protein found in the file with over expressed # proteins (from a microarray experiment) # # # --> The color of the edge line is an indication of the source # database that had that interaction # Edge colors can be easily changed in file # piana/code/PianaDB/PianaGlobals.py # (dictionary interaction_source_databases_colors in section # COLOR CODES FOR OUTPUT NETWORK) # # Currently, these are the meanings of the edge line colors: # - red: DIP # - green: ori (predictions by distant # sequence/structure patterns similarity) # - magenta: STRING # - orange: prediction using interologs # (either by COG, SCOP, ...) # - dark green: MIPS # - blue: HPRD # - grey: BIND # - yellow: user (an interaction added by user with # command add-interactions-file given label # 'user') # - cyan: interaction appears in more than one database # (you can see the list of all the dbs where it # appears by looking at the result file .print-table) # # Attention!: since the colors change depending on the graphics # card, we have created a GIF image indicating which database # each edge color corresponds to: # piana/docs/documentation/network_colors.gif # # Attention!: if you add a new database, you have to add the # name of the database and a new color # in dictionary interaction_source_databases_colors # # --> The style of the edge line is an indication of how that # interaction was added to the network # Edge styles can be easily changed in file # piana/code/PianaDB/PianaGlobals.py # (dictionary interaction_line_styles in section # COLOR CODES FOR OUTPUT NETWORK) # # Currently, these are the meanings of the edge line styles: # - solid: interaction from the database # - dashed: added to the network as a db interaction # of a node that was added when doing # predictions # - dotted: prediction by interologs # # The output of this command can be used to create an image of the # network (read piana/code/execs/README.visualize_piana_network)
print-network;output-target=blank;id-type=blank;print-mode=print_mode;format-mode=format_mode # ***************************** # print-network-db-intersection: this command prints a DOT file with # interactions that appear in the # intersection of several databases # ***************************** # # This command will only print those interactions that appear in all # the databases given by the user as argument For example, if the # network only has two interactions, one extracted from dip and the # other one extracted from dip and mips and the user sets list-dbs to # dip:mips, then only the second interaction will be printed by this # command # # - if you do not want the network to be printed to default name # then you should set output-target to your own file name # output-target can be blank (ie. use default name), a file name # or screen if you want the results printed to stdout # # - if you want the type of protein code used for printing the # network different from output-id-type then set # id-type. # -> valid id-type values are blank (ie use # output-id-type) or those defined in # the PIANA reference card. 
# --> see comment on input-id-type # # - list-dbs is required: used to set the databases where the # interactions must appear in order to be printed # -> valid database names are those defined in # PianaGlobals.interaction_databases # -> for example: list-dbs=dip:string:ori # # - print-mode is required: set which proteins will appear in # output # -> all prints all interactions in the network # -> all_root prints all interactions in the network where # at least one of the proteins is a root protein # -> only_root prints only interactions between root proteins # in network # -> connecting prints only proteins that are either root or # linkers (proteins that connect root nodes between them) # # - format-mode is required: defines which format will follow the output # - dot : DOT format (compatible with graphviz package and others) # - sif : SIF format (compatible with cytoscape and others) # # - for obtaining several intersections, just repeat this command # changing the dbnames # # - the default results file name will describe the intersecting dbs # # Read the description of command print-network for a detailed # explanation of the output of this command # # The output of this command can be used to create an image of the # network (read piana/code/execs/README.visualize_piana_network) print-network-db-intersection;output-target=blank;id-type=blank;print-mode=print_mode;list-dbs=dbname1:dbname2:dbname3:...;format-mode=dot # ******************** # print-all-prots-info: this command prints information (protein # description, other codes, ...) about all # proteins in network # ******************** # # - if you do not want the information to be printed to default # name then you should set output-target to your own file name # output-target can be blank (ie. use default name), a file # name or screen if you want the results printed to stdout # # - if you want the type of protein code used for printing the # information different from output-id-type then set # id-type. 
# -> valid id-type values are blank (ie use # output-id-type) or those defined in # the PIANA reference card. # --> see comment on input-id-type # # - output-mode is required: used to set how information will be # printed # -> valid output-mode values are: # - extended (multiple lines, all available information) # - compact (one line, connected root proteins and # description) # -> default results file name will describe if output_mode # is extended or compact by placing '.compact.' or # '.extended.' in the file name # -> in compact mode, when a user keyword from list-keywords # appears, "user_keyword=the_word_that_appears" will be # printed to the protein information line # -> in compact mode, a list of the protein names associated # to the protein is printed after the protein information. # The first name, the one that is used to identify the # protein, is the "unique" name that PIANA has assigned # to that protein by means of inference and name checking # -> Please, read README.piana_tutorial section "PIANA and # protein names" to better understand how PIANA # handles protein names # # - format-mode is required: set which format will be used for # printing the output # -> txt: prints flat text # -> html: prints in html format # # # the output for print-all-prots-info and print-root-prots-info # commands in format-mode == txt and output-mode 'compact' is: # # (format-mode == txt and output-mode 'compact' is the only # type of output you might find yourself parsing... # the other outputs are meant to be read directly, # as html tables have column titles) # # (for clarity, the format described here appears in multiple # lines... the real output is all in the same line: one line # of information for each protein) # # protein name # # ['protein description 1', 'prot desc 2', ...] (a protein can have several descriptions associated) # # ['protein function 1', 'prot funct 2', ...]
(a protein can have several functions associated) # # root=value (where value is 1 when the protein is a root protein, and 0 otherwise) # # expression=expression_value (None, over_expressed or infra_expressed) # # special_labels=['special_label_1', 'special_label_2', ...] # # tax_ids=['tax_id1', 'tax_id2', ...] # # id_type:proteinid_type:protein.... (as many fields as identifiers this protein has) # -> valid id types are described in the PIANA reference card) # # # (if you need more information about these proteins, # just look for it in the results files of command print-all-prots-info) print-all-prots-info;output-target=blank;id-type=blank;output-mode=output_mode;format-mode=format_mode # ********************* # print-root-prots-info: this command prints information (protein # description, other codes, ...) about root # proteins in network # ********************* # # - if you do not want the information to be printed to default # name then you should set output-target to your own file name # output-target can be blank (ie. use default name), a file # name or screen if you want the results printed to stdout # # - if you want the type of protein code used for printing the # information different from output-id-type then set # id-type. # -> valid id-type values are blank (ie use # output-id-type) or those defined in # the PIANA reference card. # --> see comment on input-id-type # # - output-mode is required: used to set how information will be # printed # -> valid output-mode values are: extended (multiple lines, # all available information) or compact (one line, # connected root proteins and description) # -> default results file name will describe if output_mode # is extended or compact # -> in compact mode, when a user keyword from list-keywords # appears, "user_keyword=the_word_that_appears" will be # printed to the protein information line # -> in compact mode, a list of the protein names associated # to the protein is printed after the protein information. 
# The first name, the one that is used to identify the # protein, is the "unique" name that PIANA has assigned # to that protein by means of inference and name checking # -> Please, read README.piana_tutorial section "PIANA and # protein names" for better understanding how PIANA # handles protein names # # - format-mode is required: set which format will be used for # printing the output # -> txt: prints flat text # -> html: prints in html format # # the output of this command is described in command # print-all-prots-info print-root-prots-info;output-target=blank;id-type=blank;output-mode=output_mode;format-mode=format_mode # ************************ # print-connect-prots-info: this command identifies linker proteins # (proteins that connect root nodes between # them) and prints information (protein # description, other codes, linked roots, # ...) about them # ************************ # # - if you do not want the information to be printed to default # name then you should set output-target to your own file name # output-target can be blank (ie. use default name), a file # name or screen if you want the results printed to stdout # # - if you want the type of protein code used for printing the # information different from output-id-type then set # id-type. # -> valid id-type values are blank (ie use # output-id-type) or those defined in # the PIANA reference card. 
# --> see comment on input-id-type # # - output-mode is required: used to set how information will be # printed # -> valid output-mode values are: extended (multiple lines, # all available information) or compact (one line, # connected root proteins and description) # -> default results file name will describe if output_mode # is extended or compact # -> in compact mode, when a user keyword from list-keywords # appears, "user_keyword=the_word_that_appears" will be # printed to the protein information line # -> in compact mode, a list of the protein names associated # to the protein is printed after the protein information. # The first name, the one that is used to identify the # protein, is the "unique" name that PIANA has assigned # to that protein by means of inference and name checking # -> Please, read README.piana_tutorial section "PIANA and # protein names" for better understanding how PIANA # handles protein names # # - format-mode is required: set which format will be used for # printing the output # -> txt: prints flat text # -> html: prints in html format # # the output of this command looks like this: # # (for clarity, the format described here appears in multiple # lines... the real output is all in the same line: one line of # information for each protein) # # # 'protein name' connects # root_proteins (with # being the number of root proteins that this linker protein connects) # # ['root_protein_1', 'root_protein_2', ...] # # ['source interaction database 1', 'source inter db 2', ...] (list of interaction dbs where these interactions were retrieved from) # # ['protein description 1', 'protein desc 2', ...] (a protein can have several descriptions associated) # # ['protein function 1', 'protein func 2', ...] (a protein can have several functions associated) # # expression=expression_value (None, over_expressed or infra_expressed) # # special_labels=['special_label_1', 'special_label_2', ...] # # tax_ids=['tax_id1', 'tax_id2', ...]
# # # (if you need more information about these linker proteins, # just look for it in the results files of command print-all-prots-info) print-connect-prots-info;output-target=blank;id-type=blank;output-mode=output_mode;format-mode=format_mode # ************************ # classify-network-proteins: this command writes to a file all # network proteins with a label # indicating whether they are a root # protein, a linker or a partner, as # well as "special" labels associated # to them. # ************************ # # This command can be useful for some analyses where you need to # know all proteins in the network and whether they were originally # given by the user (root proteins), they connect two or more of the # root proteins (linker proteins) or they are just connected to # one root protein (partner proteins). # # Moreover, next to the classification information, if desired, this # command also writes labels associated to the protein, according to # input parameter special-proteins. Leave this input parameter blank # if you do not want this information # # Moreover, next to the previous information, if desired, this command # also writes expression information associated to the protein, # according to the expression files of parameters file-over-expressed # and file-infra-expressed # # Attention: this command has not been prepared to interpret networks # that were built for depths 2 or higher. A protein that # is connected to a root protein via another protein will # be considered as a partner. # # # - if you do not want the information to be printed to default # name then you should set output-target to your own file name # output-target can be blank (ie. use default name), a file # name or screen if you want the results printed to stdout # # - if you want the type of protein code used for printing the # information different from output-id-type then set # id-type.
# -> valid id-type values are blank (ie use # output-id-type) or those defined in # the PIANA reference card. # --> see comment on input-id-type # # the output of this command looks like this (everything in one line): # # protein_idclass=valueexpression=value # label=value1label=value2... # # -> where class value can be: "root", "linker_N" or "partner" # -> N will have as value the number of root proteins # connected by that linker # -> and label values are labels specified in special-proteins # parameter # -> and expression value can be over_and_under_expressed, # over_expressed, under_expressed or None # # (if you need more information about these proteins, # just look for it in the results files of command print-all-prots-info) classify-network-proteins;output-target=blank;id-type=blank # *************************** # protein-code-2-protein-code: transforms codes from input-file (which # are of type input-id-type) to # output-id-type # *************************** # # This command is thought to be used independently from other # commands: it uses piana modules to transform proteins from one # code to another. It doesn't make use of the network itself, it # just outputs a table with protein code equivalences. Moreover, # most of the input and output parameters are not used when # executing this command (ie. all parameters ignored except for # input-file, input-id-type and output-id-type). For # example, even if you set output-proteins-species to yeast, the # output of this command can contain proteins from all species. # (reason: this is not building a network, and therefore, this # command is considered as an 'extra' to PIANA and does not # use the other parameters) # # - if you do not want the information to be printed to a default # file name then you should set output-target to your own file # name # -> output-target can be blank (ie. 
use default name), a file # name or screen if you want the results printed to stdout # # - retrieves proteins to be "translated" from input-file (which # is set through the command line or above in this file) # -> uses input-id-type as the type of code of proteins # in the input file # -> uses output-id-type as the type of code to which # proteins will be "translated" # # - format-mode is required: set which format will be used for # printing the output # -> txt: prints flat text # -> html: prints in html format protein-code-2-protein-code;output-target=blank;format-mode=format_mode # *********************** # protein-code-2-all-info: gets information for proteins in input-file # (which are of type input-id-type) # *********************** # # This command is thought to be used as a stand alone tool: it uses # piana modules to get information from proteins It doesn't make use # of the network itself, it just outputs a table with protein # information. Moreover, most of the input and output parameters # are not used when executing this command (see comments on previous # command protein-code-2-protein-code) # # The format followed for the output is described in command # print-all-prots-info # # - if you do not want the information to be printed to a default # file name then you should set output-target to your own file name # -> output-target can be blank (ie. use default name), a file # name or screen if you want the results printed to stdout # # - will output information for proteins in input-file (which is # set through the command line or above in this file) # -> uses input-id-type as the type of code of proteins # in the input file # # - if you want the type of protein code used for printing the # proteins different from output-id-type then set # id-type. # -> valid id types are blank (ie use output-id-type) # or those defined in the PIANA reference card. 
# --> see comment on input-id-type # # - format-mode is required: set which format will be used for printing # the output # -> txt: prints flat text # -> html: prints in html format # # - output-mode is required: used to set how information will be # printed # -> valid output-mode values are: extended (multiple lines, # all available information), compact (one line, # connected root proteins and description) # -> default results file name will describe if output_mode # is extended or compact # -> in compact mode, when a user keyword from list-keywords # appears, "user_keyword=the_word_that_appears" will be # printed to the protein information line # -> in compact mode, a list of the protein names associated # to the protein is printed after the protein information. # The first name, the one that is used to identify the # protein, is the "unique" name that PIANA has assigned # to that protein by means of inference and name checking # -> Please, read README.piana_tutorial section "PIANA and # protein names" for better understanding how PIANA # handles protein names # protein-code-2-all-info;output-target=blank;id-type=blank;format-mode=format_mode;output-mode=output_mode # *************************** # create-random-files: creates files that contain random # proteins # *************************** # # This command is thought to be used independently from other # commands: it uses piana modules to create text files that # contain random groups of proteins. It doesn't make use of # the network itself, it just creates the files. Moreover, # most of the input and output parameters are not used when # executing this command # # This command can be useful for certain bioinformatics # experiments where you have to check how a random group # of proteins would perform in a given analysis. 
You can # use this command to create the random group of proteins # (and write it to a text file) and then run piana again # (with a separate configuration file) to use these random # proteins as input file # # This command can also be used to get all proteins in # the pianaDB for which there are known interactions # # If you set the appropriate arguments to this command, you # can get information about these random proteins on # their expression (according to the expression input parameters) # and their special labels (according to input parameter # special-proteins) # # output-id-type will be used as the identifier for # the proteins written to the files # # output-proteins-species will be used to make sure that the random # proteins are of that species. If the species is not # relevant to you, set it to all # # - if you do not want the files to be named with the default name # (ie. results_dir/results_prefix.*) then you should set # files-prefix to the prefix you want to use for your files # (eg. my_random_file) # -> output files names are formed by adding consecutive integers # to results-prefix (set in the input parameters section) # -> the output files will contain one protein per line, using the # type of identifier specified by the user on parameter # output-id-type (from the parameters section of the # configuration file) # # - num-files is required: set how many files with random proteins # you want to create # # - num-prots is required: set how many proteins will be written # to each file (ie. the size of your random group) # --> if num-prots is 0, no random sampling is done: all # proteins that respect the criteria are written to # the output file # --> ie. one can use this command to create a # file that has all proteins in PIANA that # have interactions, with special labels # attached to them.
# # - force-ints is required: set whether you require the random # proteins to have interactions or not # Valid values are: # - yes: only proteins that have at least one interaction # will be used to create the random groups # - no: having interactions is not required in order to # appear in the random groups # # -> all input parameters are used to decide whether a # protein has interactions or not. IE. use-self-ints # list-source-dbs, etc etc are relevant to this command! # # - check-expression is required: set whether you want to write next # to the proteins info about the expression of that protein # - yes: expression (from expression input parameters ) will be # written next to the protein if it appears in the # expression files # # -> a field expression=value will be added to the # protein line # - no: file will only contain one column # # - check-special is required: set whether you want to write next # to the proteins the labels that indicate what type of special # protein they are. # - yes: labels (from parameter special-proteins) will be written # next to the protein if it appears in the special-proteins # files # - no: file will only contain one column # # # The output of this command will therefore be: # (all in one line, separated by TABs and ending with NEWLINE) # # column 1 (always appears): protein # column N (optional): expression=expression_value (expression_value # can be: # - None # - over_expressed # - under_expressed # - over_and_under_expressed # column N+1 (optional): label=special_label # column N+2 (optional): label=special_label # column N+3 (optional): .... 
# create-random-files;files-prefix=files_prefix;num-files=num_files;num-prots=num_prots;force-ints=force_ints;check-expression=check_expression;check-special=check_special # ******************* # expand-interactions: this command predicts interactions of proteins # in the network using interologs (or other # methods) # ******************* # # This command propagates interactions between nodes that share # a certain characteristic. For example, this command can be # used to transfer interactions between proteins that have the # same domain (scop) or that belong to the same orthologous # group (cog) # # Each expand-interactions piana command does the following: # # For each protein in the network: # # 1. find interactions of this protein in the current network # 2. find proteins in the database that share a certain # characteristic with this protein (e.g cog code) # 3. for each protein that shares that characteristic: # - find interactions for protein that shares the # characteristic in the database # - find interactions for protein that shares the # characteristic in the network # - assign to protein being processed all interactions # of protein that shares the characteristic # - assign to protein that shares that characteristic # all interactions of protein being processed # # This process can be repeated more than once, to reach # far-fetched deductions # # For example, if root protein is A, and if we know # that C and D (yeast) interact, and that A =cog= C # and B =cog= D # # - simple expansion will predict that A interacts # with D # - double expansion will predict that A interacts # with D and that A interacts with B # (ie double expansion predicts interactions from # a previous prediction) # (this is achieved by executing two consecutive # expand-interactions piana commands # # - the new interactions (predictions) can be added to # the network or printed out to a results file # # - expansion_type is required: use to know the type # of expansion to perform # -> 
valid expansion-type values are those defined # in PianaGlobals.expansion_types (currently can # be cog, scop (ie. scop family), interpro or ec) # -> if two proteins share expansion-type, interactions # are interpropagated # # - expansion-nodes is required: used to define which # proteins will be expanded # -> valid expansion-nodes values are: all (all # proteins in network are expanded) or root # (only root proteins are expanded) # -> if you are looking for new interactions (predictions) # for your input proteins, use root # -> if you want to expand all the proteins in the # network (partners of root proteins as well) use all # -> root proteins are the proteins used to build the # network (eg. the proteins in input-file) # # - expansion-threshold is required: used to avoid propagating # interactions when there are too many nodes that share the # expansion type # -> valid values are: 0 (no thresholds applied) and # positive integers # -> depending on the expansion type, the expansion-threshold # to be used varies # # - exp-output-mode is required: used to define if predictions # should be added to network or printed to file # -> valid exp-output-mode values are: add (add predictions # to network) and print (print to output-target) # -> 'add' will add to the network the predictions found # by expansion # -> 'print' will print to output-target (or to default # results file) the list of predictions found by expansion # -> for example, if you wanted to get predictions for root # nodes using double cog expansion you would first use # command expand-interactions with expansion-nodes=all # and mode=add and then, another command # expand-interactions with expansion-nodes=root # and mode=print doing this "double expansion" you will # be predicting interactions based on a previous expansion # # - if exp-output-mode is add, the following arguments # can be ignored: leave them to blank: # - if exp-output-mode is "print" then : # # -> if you do not want the information to be 
printed # to a default file name then you should set output-target # to your own file name # -> output-target can be blank (ie. use default name), # a file name or screen if you want the results printed # to stdout # # -> if you want the type of protein code used for printing # the information different from output-id-type then # set id-type. # -> valid id-type values are blank (ie use # output-id-type) or those defined in # the PIANA reference card # --> see comment on input-id-type # # -> the results will follow the following format (one # interaction per line): # # protein1[TAB]protein2[TAB]expansion_type[TAB]source_interactionPiana[TAB]source_proteinPiana # # This file can then be used to insert predictions into # a PIANA database using parser expansion2piana.py # # If you are going to insert these predictions into a # PIANA database, I recommend that your output type # for protein codes is proteinPiana (to make sure that # the prediction refers to that protein sequence). In any # case, never use geneNames for creating a list of # predictions that is going to be inserted into a # PIANA database: geneNames do not implicitly contain # the species and can be ambiguous within a species. # # To learn more about inserting predictions into # PIANA databases, read README.populate_piana_db and # README.piana_examples # # # - We do not recommend doing predictions based on predictions: # ie. we do not recommend executing command expand-interactions # on networks that were built from a database with predictions. # To avoid this, you can use parameters list-source-dbs and # list-source-methods or do what we do internally in our lab: # have two piana databases, one with only experimentally # detected interactions and another one with all interactions.
# # - for expansions, I recommend using program # run_piana_protein_by_protein.py instead of piana.py # -> the result will be the same if you work with all proteins # at the same time as if you do it one by one # -> it is much faster to manage the expansion separately # for each protein # -> read README.piana_examples for more info on this # expand-interactions;expansion-type=expansion_type;expansion-nodes=expansion_nodes;expansion-threshold=expansion_threshold;exp-output-mode=mode;output-target=blank;id-type=blank # ******************* # find-shortest-route: this command finds the shortest route between # two proteins in the network # ******************* # # ATTENTION: this command requires the piana 'advanced mode' or # 'developer mode'. By default, all users work in # 'simple mode'. Therefore, if you want to use this # command you'll have to modify your working mode as # indicated in section 'PIANA types of users' of file # README.piana_tutorial # # - network must exist before running this command # # - if you do not want the information to be printed to default name # then you should set output-target to your own file name # output-target can be blank (ie. use default name), a file name # or screen if you want the results printed to stdout # # - protein_a_name and protein_b_name are required. # # - if protein code type used for protein-name is different from # input-id-type, then set id-type with the new type # as shown.
# if protein-name used is of input-id-type, then set # id-type to blank # -> Valid id types are blank (ie use input-id-type) # and those defined in the PIANA reference card # --> see comment on input-id-type # # - the output will be written using output-id-type (read from # the input parameters section) # # # format of the txt output is: # # first line: protein_a=protein_a_nameprotein_b=protein_b_namedistance=distance_of_route # second line: START=protein_a_name<-->protein_in_route<-->protein_in_route<-->...<-->END=protein_b_name find-shortest-route;protein-a-name=protein_a_name;protein-b-name=protein_b_name;id-type=blank;output-target=blank # ******************* # find-distance-group: this command finds a group of proteins that are # at distance N from a query protein # ******************* # # - network must exist before running this command # # - if you do not want the information to be printed to default name # then you should set output-target to your own file name # output-target can be blank (ie. use default name), a file name # or screen if you want the results printed to stdout # # - protein_name is required. # # - if protein code type used for protein-name is different from # input-id-type, then set id-type with the new type # as shown. 
# if protein-name used is of input-id-type, then set # id-type to blank # -> Valid id types are blank (ie use input-id-type) # and those defined in the PIANA reference card # --> see comment on input-id-type # # - distance is required: set the distance between your query # protein and the group of proteins you are searching # - when distance is "all", groups 1, 2 and 3 are printed # out # # - info is used to choose the information that will be printed # next to the proteins at distance N # - values admitted are: # - blank: no info printed # - all: all info known about protein # - scop: scop codes # - cath: cath codes # # - format-mode is required: set which format will be used for # printing the output # -> txt: prints flat text # -> html: prints in html format # find-distance-group;protein-name=protein_name;id-type=blank;distance=distance;output-target=blank;info=blank;format-mode=format_mode # ************************** # match-proteins-to-pathways: this command finds which pathways # have proteins that appear in the # current network # ************************** # # this command can be used to study the pathways that are # 'involved' in the current network, where 'involved' refers # to how many proteins of the current network appear as well # in a given pathway # # pathways have to be defined by the user, and each pathway # must be represented by a different file where all pathway # members have been written. Read below for details # # # A network must exist before running this command. # # - pathways-dir must be set to the directory that holds the # files with the pathways. Each pathway is defined by a # different file, and the name of the file must uniquely # identify the pathway. The file name will be used # in the output to identify the matched pathways. # # - if protein code type used for the pathways proteins is # different from input-id-type, then set pathway-type # with the new type as shown.
# if identifiers used are of input-id-type, then set # pathway-type to blank # -> Valid id types are blank (ie use input-id-type) # and those defined in the PIANA reference card # --> see comment on input-id-type # # - force-expression can be yes or no # - no: all proteins in the network will be matched # against the proteins in the pathways # # - yes: only proteins in the network that are over/under # expressed will be matched against the proteins in the # pathways # -> if you set force-expression to yes, then you # must have set as well the expression files # in the input parameters section # # - if you do not want the information to be printed to default # name then you should set output-target to your own file name # -> output-target can be blank (ie. use default name), a file # name or screen if you want the results printed to stdout # # - if you do not want the proteins to be printed out using the # identifier type output-id-type, then you must write a # new id type in parameter id-type. Otherwise, leave # it to blank # -> Valid id types are blank (ie use output-id-type) # and those defined in the PIANA reference card # --> see comment on input-id-type # # Attention! If you want to get a report with interaction files # for each protein, it is mandatory that immediately # before this command you make a call to print-table # using the same type of protein identifier # (eg.
uniacc) than the one you are using here # # - format-mode is required: set which format will be used for # printing the output # -> txt: prints flat text # -> html: prints in html format # match-proteins-to-pathways;pathways-dir=pathways_dir;pathway-type=blank;force-expression=force_expression;output-target=blank;format-mode=format_mode;id-type=blank # ************************** # match-proteins-to-gos: this command finds which GO terms # appear more frequently in the current network # ************************** # # this command can be used to study the GO terms that are # 'involved' in the current network, where 'involved' refers # to the frequency a GO term appears in the network # # # A network must exist before running this command. # # - if protein code type you want for outputting results is # different from output-id-type, then set id-type # with the new type as shown. # if protein-name used is of output-id-type, then set # id-type to blank # -> Valid id types are blank (ie use input-id-type) # and those defined in the PIANA reference card # --> see comment on input-id-type # # - force-expression can be yes or no # - no: all proteins in the network will be used for # calculating GO frequencies # # - yes: only proteins in the network that are over/under # expressed will be used for calculating GO frequencies # # -> if you set force-expression to yes, then you # must have set as well the expression files # in the input parameters section # # - if you do not want the information to be printed to default # name then you should set output-target to your own file name # -> output-target can be blank (ie. 
use default name), a file # name or screen if you want the results printed to stdout # # - format-mode is required: set which format will be used for # printing the output # -> txt: prints flat text # -> html: prints in html format # # - term-type sets the kind of GO terms that will be used for # calculating the frequencies # -> term-type can be "molecular_function", "biological_process" # or "cellular_component" # # - level-threshold is the lowest level of the go terms that will # be retrieved for calculating the frequency # -> all GO terms associated to each protein will be retrieved, # then parents of those GO terms until reaching # level-threshold will also be retrieved # # -> GO is a hierarchy organized from an initial root level # (level 0) that makes the terms increasingly specific. # Therefore, the higher the level used the more specific the # GO terms will be. To obtain relevant GO terms use level # thresholds between 3 and 5. However, it all depends on how # general you want to be in the interpretation of the network. # # # - go-dbname, go-dbhost, go-dbuser and go-dbpass are the mysql # parameters for your external GO database (in the same way as # piana-dbname, piana-dbhost, etc) # -> attention! this database is different from your standard # pianaDB. This is a different database that has to be created # by following the instructions on README.populate_piana_db # section "parse GO" match-proteins-to-gos;id-type=blank;force-expression=force_expression;output-target=blank;format-mode=format_mode;term-type=term_type;level-threshold=level_threshold;go-dbname=go_dbname;go-dbhost=go_dbhost;go-dbuser=go_dbuser;go-dbpass=go_dbpass # *********************** # match-proteins-to-spots: this command identifies spots in a 2D gel # by matching MW and/or IP to proteins in # the network # *********************** # # we have spot ids from a 2D electrophoresis gel, with their # molecular weights (MW) and isoelectric points (IP).
Some of those # spots were identified by mass spectrometry but other spots were # unassigned. We can use PIANA to identify some of those # unassigned spots, by comparing the MW and IP of the spots with # the MW and IP of the proteins in the network. # # - if you do not want the information to be printed to default # name then you should set output-target to your own file name # -> output-target can be blank (ie. use default name), a file # name or screen if you want the results printed to stdout # # - format-mode is required: set which format will be used for # printing the output # -> txt: prints flat text # -> html: prints in html format # # - if you want the type of protein code used for printing the # information different from output-id-type then set # id-type. # -> valid id-type values are blank (ie use # output-id-type) or those defined in # the PIANA reference card. # --> see comment on input-id-type # # - spots-file-name is a file name following the structure # (one spot per line): # spot_id[TAB]molecular_weight[TAB]isoelectric_point # -> where decimals are expressed with "." # -> complete_path_to_spots_file can be blank if # spots-file-name was set in the command line # # - list-mw-error and list-ip-error set the error bounds # admitted for the matching of molecular weight and # isoelectric point # -> the number of error bounds for mw and ip must # be identical: values can be different, but the # number of values cannot # -> use "." for decimals # -> set to blank if you prefer to use the default # error bounds (which are hard-coded in piana.py) # -> to set your own error bounds, write colon-separated # values # (e.g.
list-mw-error=0.01:0.02:0.05;list-ip-error=0.1:0.2:0.5 # # Attention!: correspondences that appear in a given error level will # not be shown in higher error levels match-proteins-to-spots;output-target=blank;format-mode=format_mode;id-type=blank;spots-file-name=complete_path_to_spots_file;list-mw-error=blank;list-ip-error=blank # ******************** # cluster-by-go-terms: this command clusters the proteins of the # network using GO terms # ******************** # # - In order to cluster a network using go terms, a # protein-protein interaction network must previously exist # # - if you do not want the clustered network to be printed to # the default file name then you should set output-target to # your own file name # -> output-target can be blank (ie. use default name), a # file name or screen if you want the results printed to # stdout # # - term-type sets the kind of GO terms that will be used for # the clustering (required) # -> term-type can be molecular_function, biological_process # or cellular_component # # - score-threshold is the lowest score obtained by the # similarity function allowed for continuing the clustering # -> can be any real number from 0 to 100 (0 will group # all proteins, 100 will not group any proteins). # To obtain a relevant clustered network use score # thresholds between 0.1 and 1 # # - sim-mode sets how to calculate distances between two # clusters # - random takes a random element from each cluster # and evaluates similarity between them # - min takes the minimal distance between elements # of each cluster # - max takes the maximal distance between elements # of each cluster # - average takes the average distance between all # elements of each cluster # # - level-threshold is the lowest level of the go term in # the cluster allowed for continuing the clustering # -> GO is a hierarchy organized from a initial root # level (ie. 0) that increasingly makes more specific # the terms. 
# Therefore, the higher the level used the less # clustering will be performed. To obtain a relevant # clustered network use level thresholds between 1 # and 3. It all depends on how general you want to # be in the interpretation of the network. # # - distance-threshold is the maximum distance allowed # between two proteins in order to be clustered # -> can be any integer between 1 and ... # # - rep-term sets which of the GO terms of the cluster # will be used for printing output # -> can be min (term of minimal depth in the hierarchy) # or max (maximal depth) # # - print-id sets which id will be used for identifying # the clusters in the printed output # -> can be "no" (default id: go term name) or "yes" # (a more complex id) # cluster-by-go-terms;output-target=blank;term-type=term_type;score-threshold=score_threshold;sim-mode=sim_mode;level-threshold=level_threshold;distance-threshold=distance_threshold;rep-term=rep_term;print-id=print_id # ****************** # calculate-imotifs: calculate iMotifs of a protein as described in the paper # "Characterization of protein hubs by inferring interacting motifs # from binary protein interactions" # ****************** # # - similarity-mode is the type of similarity function that will be applied to do the clustering # # -> valid values are: (in all cases, term1 is 1 if number_of_protein_partners_in_common>0, and 0 otherwise) # # - 'num_ints': number of common interaction partners (N in the paper) # -> number_of_protein_partners_in_common(proteins in cluster1, proteins in cluster2) # # # - 'min_per': Rmin in the paper # -> number_of_protein_partners_in_common(proteins in cluster1, proteins in cluster2)x100/ # min(number_of_protein_partners(proteins in cluster1), number_of_protein_partners(proteins in cluster2)) # # # - 'max_per': Rmax in the paper # -> number_of_protein_partners_in_common(proteins in cluster1, proteins in cluster2)x100/ # max(number_of_protein_partners(proteins in cluster1), number_of_protein_partners(proteins
in cluster2)) # # - 'combined_per': Rave in the paper # -> (min_per + max_per) / 2 # # # - min_score is the minimum score allowed in order to fuse two clusters. Any positive integer is a valid value # # # - imotif_mode determines which proteins will be grouped in iMotifs # --> valid values are: # - all: all proteins in the network will be (tried to) grouped in iMotifs # - roots: only root proteins will be (tried to) grouped in iMotifs # # Note: imotif_mode is an important parameter, because it will determine the way the iMotifs and iMotif-iMotif interactions # will be formed # # In terms of which proteins will appear in each iMotif, the consequence of using imotif_mode roots is that iMotifs # will only have more than one protein in them if a root protein appears in that iMotif. In imotif_mode all, # iMotifs will be formed for all proteins, regardless of whether they have a root protein or not # # In terms of iMotif-iMotif interactions, in imotif_mode all we will get extra lines "imotif_imotif" in files *.results # describing iMotif-iMotif interactions inferred from the clusters interactions. Read comments in # piana/code/Graph/GraphCluster/GraphCluster.py (method print_imotif_imotif_interactions) for more # info on this. 
# # - num_ints_thres sets the threshold for considering a protein well described in terms of interactions # --> this is the minimum number of interactions that a protein must have in order to be considered in the clustering # --> valid values are from 1 to infinite # # - common_ints_thres sets the threshold for considering that two proteins share a relevant number of proteins # --> this is the minimum number of interactions that two proteins must share in order to be considered # --> valid values are from 1 to infinite # # - global-map can be used to produce files with all interactions in the form of *.results files # (leave to blank unless you are evaluating the results) # # These files will be used to calculate with train_and_test_imotif.py whether we improve the trivial # results for PDB interactions. See README.imotif_evaluation_explained for more info # # -> valid values are blank (don't do anything) or a directory name # --> global result files are written to this directory # --> directory name must end with the slash!!!! # # -> attention: if global-map is different from blank, execution is stopped after creating the results file for interactions, # and iMotifs will not be identified for the protein # # Attention! Results from this command are always written using proteinPiana identifier. You should then do yourself the extra # step of translating them to your favorite type of identifier (e.g. using protein_code_2_protein_code.piana_conf) calculate-imotifs;similarity-mode=similarity_mode;min-score=min_score;imotif-mode=imotif_mode;num-ints-thres=num_ints_thres;common-ints-thres=common_ints_thres;global-map=blank # **** # exit: this command exits # **** # # - required in all piana configuration files, unless interactive mode # used exit; ## ## # THE FOLLOWING OPTIONS ARE CURRENTLY UNAVAILABLE # ## ## # ***************** # modify-parameters: this command modifies some parameter values (NOT WORKING!!!) # ********************* # # ATTENTION! 
DO NOT USE, NOT WORKING # # # use this command to modify parameter values at any point of # piana execution. # # - set to blank those parameters that you do not want to modify modify-parameters;results-prefix=blank;file-over-expressed=blank;file-infra-expressed=blank;expression-id-type=blank