<!-- PanExplorer.xml -->

<tool id="PanExplorer2" name="PanExplorer2" version="2.0">
  <description> Bacterial pan-genome analysis </description>
  <requirements>
    <!-- Packages resolved for the job itself; the Snakemake workflow is executed through "singularity exec". -->
    <requirement type="package" version="1.30">perl-yaml</requirement>
    <requirement type="package" version="3.8.7">singularity</requirement>
    <!--
      Disabled requirements, kept for reference only:
      <requirement type="package" version="2.2.26">blast-legacy</requirement>
      <requirement type="package" version="7.480">mafft</requirement>
      <requirement type="package" version="14.137">mcl</requirement>
      <requirement type="package" version="3.697">phylip</requirement>
      <requirement type="package" version="1.7.2">perl-bioperl</requirement>
    -->
  </requirements>

    <command><![CDATA[
## Build config.yaml for the chosen input mode, pick the Snakefile matching the
## chosen pan-genome software, run it inside the PanExplorer Singularity image
## and copy the workflow results onto the Galaxy output datasets.
## Every shell statement is terminated with ';' so the script stays valid
## however the rendered lines end up being joined.

## Parameters declared inside the "mode" conditional must be addressed through
## it; they are converted to plain strings once so comparisons are reliable.
#set $input_mode = str($mode.mode)
#set $software = str($mode.software)

export PANEX_PATH='${__tool_directory__}';

## generateConfig.pl arguments as used here:
## <genbank-or-GFF zip | None> <accession list | None> <output yaml> <FASTA zip | None>
#if $input_mode == "accessions"
perl '${__tool_directory__}/Perl/generateConfig.pl' 'None' '$mode.input' config.yaml 'None';
#else if $input_mode == "genbanks"
perl '${__tool_directory__}/Perl/generateConfig.pl' '$mode.private_genomes' 'None' config.yaml 'None';
#else if $input_mode == "fasta"
perl '${__tool_directory__}/Perl/generateConfig.pl' '$mode.private_genomes' 'None' config.yaml '$mode.private_genomes_fasta';
#end if

## The log file starts with the generated configuration.
cat config.yaml > '$logfile';

## Fetch the container image once. It is downloaded under a temporary name and
## renamed only on success, so an interrupted download cannot leave a truncated
## panexplorer.sif that later runs would mistake for a valid image.
if [ ! -f "\$PANEX_PATH/panexplorer.sif" ]; then wget -O "\$PANEX_PATH/panexplorer.sif.part" https://panexplorer.southgreen.fr/singularity/panexplorer.sif >> '$logfile' 2>&1 && mv "\$PANEX_PATH/panexplorer.sif.part" "\$PANEX_PATH/panexplorer.sif"; fi;

## Copy the Snakefile of the selected software into the working directory,
## replacing its built-in identity threshold with the requested one.
## Any software value not listed explicitly falls back to PanACoTA.
#if $software == "pgap"
sed "s/identity\=80/identity\=$min_identity/g" "\$PANEX_PATH/Snakemake_files/Snakefile_wget_PGAP_heatmap_upset_COG" > snakefile;
#else if $software == "roary"
sed "s/identity\=80/identity\=$min_identity/g" "\$PANEX_PATH/Snakemake_files/Snakefile_wget_roary_heatmap_upset_COG" > snakefile;
#else if $software == "orthofinder"
sed "s/identity\=80/identity\=$min_identity/g" "\$PANEX_PATH/Snakemake_files/Snakefile_orthofinder_heatmap_upset" > snakefile;
#else if $software == "cactus"
## The Minigraph-Cactus Snakefile is used unchanged (no identity substitution).
cp -rf "\$PANEX_PATH/Snakemake_files/Snakefile_wget_cactus_heatmap_upset_COG2" snakefile;
#else if $software == "pggb"
## The PGGB Snakefile ships with identity=30 instead of 80.
sed "s/identity\=30/identity\=$min_identity/g" "\$PANEX_PATH/Snakemake_files/Snakefile_wget_pggb_heatmap_upset_COG" > snakefile;
#else
sed "s/identity\=80/identity\=$min_identity/g" "\$PANEX_PATH/Snakemake_files/Snakefile_wget_panacota_heatmap_upset_COG" > snakefile;
#end if

singularity exec "\$PANEX_PATH/panexplorer.sif" snakemake --cores 1 -s snakefile >> '$logfile' 2>&1;

## Results produced for every software.
cp -rf outputs/upsetr.svg '$upset';
cp -rf outputs/pav_matrix.tsv '$output';
cp -rf outputs/heatmap.svg.gz '$heatmap';
cp -rf outputs/heatmap.svg.heatmap_plotly.html '$heatmap_html';
cp -rf outputs/rarefaction_curves.txt '$rarefaction_curves';
cp -rf outputs/rarefaction_curves.svg '$rarefaction_curves_svg';
cp -rf outputs/heaps.tsv '$heaps';
cp -rf outputs/heatmap.svg.complete.pdf.distance_matrix.txt '$distance_matrix';
cp -rf outputs/heatmap.svg.complete.pdf.distance_matrix.hclust.newick '$njtree';

## GC, COG, gene and ANI results are not collected for OrthoFinder runs.
#if $software != "orthofinder"
cp -rf outputs/GCskew.txt '$gcfile';
cp -rf outputs/cog_output.txt '$cogfile';
cp -rf outputs/cog_stats.txt '$outcog_stat';
cp -rf outputs/cog_stats2.txt '$outcog_stat2';
cp -rf outputs/cog_of_clusters.txt '$outcog_clusters';
cp -rf outputs/genomes/genes.txt '$genes';
cp -rf outputs/fastani.out.matrix.complete '$fastani';
cp -rf outputs/fastani.out.svg '$ani_svg';
#end if

## A VCF is collected for PGGB only; otherwise the dataset is created empty.
#if $software == "pggb"
cp -rf outputs/all_genomes.vcf '$vcf';
#else
touch '$vcf';
#end if
]]></command>


  <inputs>
    <!-- The input mode selects the genome source and the pan-genome software offered for it. -->
    <conditional name="mode">
      <param name="mode" type="select" label="What are your inputs?">
        <option value="accessions">Prokaryote genomes: List of Genbank assembly accessions (GCA)</option>
        <option value="genbanks">Prokaryote genomes: Genbank files</option>
        <option value="fasta">Eukaryote genomes: FASTA + GFF files</option>
      </param>
      <when value="accessions">
        <param name="input" type="text" label="List of genbank identifiers" help="Comma-separated list (ex: GCA_000007385.1,GCA_000010025.1,GCA_000019585.2)"/>
        <param name="software" type="select" label="Choose the pan-genome software">
          <option value="roary">Roary</option>
          <option value="panacota">PanACoTA</option>
          <option value="pggb">PanGenome Graph Builder (PGGB)</option>
        </param>
      </when>
      <when value="genbanks">
        <!-- NOTE(review): declared optional although this mode passes no other genome source to the command; confirm whether it should be required. -->
        <param name="private_genomes" type="data" format="zip" label="Zip of genbank files" optional="true"/>
        <param name="software" type="select" label="Choose the pan-genome software">
          <option value="roary">Roary</option>
          <option value="panacota">PanACoTA</option>
          <option value="pggb">PanGenome Graph Builder (PGGB)</option>
        </param>
      </when>
      <when value="fasta">
        <param name="private_genomes_fasta" type="data" format="zip" label="Zip of Fasta files" optional="true"/>
        <param name="private_genomes" type="data" format="zip" label="Zip of GFF files" optional="true"/>
        <param name="software" type="select" label="Choose the pan-genome software">
          <option value="orthofinder">OrthoFinder</option>
          <option value="cactus">Minigraph-Cactus</option>
          <option value="pggb">PanGenome Graph Builder (PGGB)</option>
        </param>
      </when>
    </conditional>

    <!-- The value is substituted into a sed expression by the command, so it is restricted to a plain percentage (0-100). -->
    <param name="min_identity" type="text" value="80" label="Minimum percentage identity for BlastP">
      <validator type="regex" message="Enter a percentage between 0 and 100 (digits with an optional decimal part)">^(100(\.0+)?|[0-9]{1,2}(\.[0-9]+)?)$</validator>
    </param>
  </inputs>

 <outputs>
   <!--
     How the command section fills these datasets:
     * genes, cogfile, gcfile, outcog_stat, outcog_stat2, outcog_clusters, fastani and ani_svg
       are copied only when the selected software is not OrthoFinder.
     * vcf is copied from the workflow only for PGGB; for any other software it is created empty.
     * NOTE(review): heatmap is declared as svg but is filled from outputs/heatmap.svg.gz; confirm the format.
     * NOTE(review): no command line writes to roary_log.
   -->
   <data name="output" format="txt" label="Pangenome presence absence matrix"/>
   <data name="njtree" format="newick" label="PanBased NJ tree"/>
   <data name="genes" format="txt" label="Genes"/>
   <data name="cogfile" format="txt" label="COG assignation"/>
   <data name="gcfile" format="txt" label="GC_percent"/>
   <data name="upset" format="svg" label="Upset Diagram"/>
   <data name="heatmap" format="svg" label="Presence Absence Heatmap"/>
   <data name="heatmap_html" format="html" label="Presence Absence Heatmap interactive"/>
   <data name="outcog_stat" format="tabular" label="COG category counts"/>
   <data name="outcog_stat2" format="tabular" label="COG category 2 counts"/>
   <data name="outcog_clusters" format="tabular" label="COG of clusters"/>
   <data name="fastani" format="tabular" label="ANI"/>
   <data name="ani_svg" format="svg" label="ANI heatmap"/>
   <data name="rarefaction_curves" format="txt" label="Rarefaction curves data"/>
   <data name="rarefaction_curves_svg" format="svg" label="Rarefaction curves"/>
   <data name="heaps" format="txt" label="Heaps law alpha"/>
   <data name="distance_matrix" format="txt" label="Accessory based distance matrix"/>
   <data name="vcf" format="vcf" label="VCF file"/>
   <data name="logfile" format="txt" label="Logfile"/>
   <data name="roary_log" format="txt" label="Roary Logfile"/>
 </outputs>
<tests>
    <!--
      Accession mode with PanACoTA on three public assemblies.
      The input mode is selected explicitly and only parameters that exist in
      the "accessions" branch are set.
      NOTE(review): presumably needs network access (container image and genome downloads); confirm.
    -->
    <test>
        <conditional name="mode">
            <param name="mode" value="accessions"/>
            <param name="input" value="GCA_000007385.1,GCA_000010025.1,GCA_000019585.2"/>
            <param name="software" value="panacota"/>
        </conditional>
        <param name="min_identity" value="80"/>
        <output name="distance_matrix" value="Accessory_based_distance_matrix.txt"/>
        <output name="fastani" value="ANI.txt"/>
    </test>
</tests>
 <help>

PanExplorer
===========

	 The PanExplorer workflow is a Snakemake workflow that can be run in the backend of the PanExplorer web application.

	 Homepage: https://panexplorer.southgreen.fr/

	 It performs a pan-genome analysis on published, annotated bacterial genomes, using one of several tools that can be invoked: Roary, PGAP, PanACoTA.

	 Pan-genome graph builders have recently been added to the pipeline: Minigraph-Cactus and PGGB (PanGenome Graph Builder).

	 It provides a presence/absence matrix of genes, an UpSetR diagram summarizing the matrix information, and a COG assignment summary for each strain.
	 
Please visit the GitHub page for the PanExplorer workflow at: https://github.com/SouthGreenPlatform/PanExplorer_workflow


Inputs
------

	 Inputs can be provided as one of the following:

	 * **List of genbank assembly identifiers** comma-separated (ex: GCA_000007385.1,GCA_000010025.1,GCA_000019585.2)
	 * **Zip of genbank files** They must include the gene annotation and the complete sequence data
	 * **Zip of FASTA file of genomes + Zip of GFF annotation files**: In order to make the association between sequence and annotation, they must be named with the same basename as follows: genome1.fasta, genome1.gff, myspeciesXXX.fasta, myspeciesXXX.gff...

	
Outputs
-------

	 Among the outputs:

	 * **Pangenome presence absence matrix** Pangene presence/absence matrix indicating the PAV (Presence Absence Variation) of clustered genes.
	 * **PanBased NJ tree** Distance tree based on PAV data
	 * **Heaps law alpha** Estimating if a pan-genome is open or closed based on a Heaps law model.
	 * **Rarefaction curves** A rarefaction curve is the cumulative number of gene clusters we observe as more and more genomes are being considered
	 * **ANI** Average Nucleotide Identity between genomes
	 * **ANI heatmap** image as SVG
	 * **VCF file** If a pan-genome graph software has been selected, it provides a VCF of variations among all samples.


    </help>
    <!-- Publication to cite when using this tool, referenced by DOI (presumably the PanExplorer article; confirm). -->
    <citations>
        <citation type="doi">10.1093/bioinformatics/btac504</citation>
    </citations>

</tool>