# Snakefile_Regions

# Pull in the base workflow; rules referenced below through `rules.<name>`
# (mask, download, ...) and names such as REGIONS, files and config are not
# defined in this file -- presumably they come from there.
include: "Snakefile"

# When both a regional rule (defined in this file) and its base counterpart
# could produce the same output file, prefer the regional one.
ruleorder: subsample_regions > subsample
ruleorder: adjust_metadata_regions > adjust_metadata

# How to run: if no region is specified, it'll run a subsampled global build
#       (40 sequences per division per month)
# If a region is selected, it'll take 260 per division per month for that region,
#       and 20 per country per month from the rest of the world
#       -- preferentially sequences near the focal sequences
#
# To run a regional build, be sure the region is included in the list of regions in Snakefile
#
# You can run all builds in parallel!
#   snakemake -s Snakefile_Regions --cores 2 all_regions
#
# Or you can specify final or intermediate output files like so:
#   snakemake -s Snakefile_Regions --cores 2 auspice/ncov_europe.json (subsampled regional focal)
#   snakemake -s Snakefile_Regions --cores 2 auspice/ncov.json (subsampled global)
#
# To update ordering/lat_longs after AWS download:
#   snakemake --touch --forceall -s Snakefile_Regions
#   snakemake -s Snakefile_Regions clean_export_regions
#   snakemake -s Snakefile_Regions export_all_regions
# When done adjusting lat-longs & orders, remember to run
#   snakemake -s Snakefile_Regions
# to produce the final Auspice files!

# Aggregate target: requests the full set of Auspice outputs for every entry
# in REGIONS. It has no outputs or commands of its own; asking for it makes
# Snakemake build all of the files listed below.
rule all_regions:
    input:
        # Main build and its tip-frequency companion, one pair per region
        auspice_json = expand("auspice/ncov{region}.json", region=REGIONS),
        tip_frequencies_json = expand("auspice/ncov{region}_tip-frequencies.json", region=REGIONS),
        # Copies of the above whose file names carry the value returned by get_todays_date()
        dated_auspice_json = expand("auspice/ncov{region}_{date}.json", date=get_todays_date(), region=REGIONS),
        dated_tip_frequencies_json = expand("auspice/ncov{region}_{date}_tip-frequencies.json", date=get_todays_date(), region=REGIONS),
        # GISAID and ZH variants of each regional build
        auspice_json_gisaid = expand("auspice/ncov{region}_gisaid.json", region=REGIONS),
        auspice_json_zh = expand("auspice/ncov{region}_zh.json", region=REGIONS)

# Cleans out the export products so that the 'normal' build (not ZH or GISAID)
# can be re-run through `export` to check lat-longs & orderings.
# Note that this also *removes* the ZH & GISAID files, so that the final run
# (after all errors are cleared) rebuilds those builds as well.
rule clean_export_regions:
    message: "Removing export files: {params}"
    # Glob patterns, expanded by the shell when the command runs.
    params:
        "results/ncov_with_accessions*.json",
        "results/ncov_gisaid_with_accessions*.json",
        "results/ncov_zh_with_accessions*.json",
        "auspice/ncov*_gisaid.json",
        "auspice/ncov*_zh.json",
        "config/colors*.tsv"
    # -f: a pattern that matches nothing (for instance ZH/GISAID files that were
    # never built, or a second clean in a row) must not make the rule fail.
    shell:
        "rm -f {params}"

# Allows the 'normal' run of export to be forced, to correct lat-longs & ordering.
# Requests only the normal export products -- not ZH & GISAID -- to speed things
# up & reduce errors.
rule export_all_regions:
    input:
        # One colors file and one exported JSON per entry in REGIONS
        colors_file = expand("config/colors{region}.tsv", region=REGIONS),
        auspice_json = expand("results/ncov_with_accessions{region}.json", region=REGIONS),

# Builds the focal sequence set for one build.
#
# The `region` wildcard selects the mode:
#   "_global"       -> keep every region, seq_per_group_global per group
#   anything else   -> keep only that region, seq_per_group_regional per group
# A regional wildcard is turned into a region name by dropping underscores and
# replacing hyphens with spaces (e.g. "_north-america" -> "north america").
rule subsample_focus:
    message:
        """
        Subsample all sequences into a focal set
        """
    input:
        sequences = rules.mask.output.alignment,
        metadata = rules.download.output.metadata,
        include = files.include
    output:
        sequences = "results/subsample_focus{region}.fasta"
    params:
        group_by = "division year month",
        seq_per_group_global = 40, # i.e. if not regional build
        seq_per_group_regional = 260
    shell:
        """
        # Work out which region is wanted from the wildcard
        rgn="{wildcards.region}"

        if [ "$rgn" = "_global" ]; then
            seq_per_group={params.seq_per_group_global}
            # Placeholder filter: excluding a region value that is not expected
            # to occur keeps every sequence while the augur call below stays
            # the same for both modes.
            regionarg="--exclude-where region="
            region="frog"   #I don't know! It wouldn't work without something!
            echo "Filtering for a global run - $seq_per_group per division"
        else
            seq_per_group={params.seq_per_group_regional}
            # Drop underscores, then turn hyphens into spaces. Only those two
            # characters are touched, so letters in the region name survive.
            region="${{rgn//_/}}"
            region="${{region//-/ }}"
            echo "Filtering for a focal run on $region - $seq_per_group per division"
            regionarg="--exclude-where region!="
            echo "   This is passed to augur as $regionarg'$region'"
        fi

        augur filter \
            --sequences {input.sequences} \
            --metadata {input.metadata} \
            --include {input.include} \
            $regionarg"$region" \
            --group-by {params.group_by} \
            --sequences-per-group $seq_per_group \
            --output {output.sequences}
        """

# Writes the priorities file used when picking context sequences.
#
# For the "_global" wildcard no context is needed, so an empty priorities file
# is written. For a regional build, scripts/priorities.py is run on the full
# alignment, the metadata and the focal alignment from subsample_focus.
rule make_priorities:
    message:
        """
        determine priority for inclusion in as phylogenetic context by
        genetic similiarity to sequences in focal set.
        """
    input:
        alignment = rules.mask.output.alignment,
        metadata = rules.download.output.metadata,
        focal_alignment = rules.subsample_focus.output.sequences
    output:
        priorities = "results/subsampling_priorities{region}.tsv"
    resources:
        mem_mb = 4000
    shell:
        """
        # Work out which region is wanted from the wildcard
        rgn="{wildcards.region}"

        if [ "$rgn" = "_global" ]; then
            echo "Global run - no priorities needed"
            # Still create the (empty) output so downstream rules have their input
            echo -n >{output.priorities}
        else
            # Region name is only used for the log line below: drop underscores,
            # then turn hyphens into spaces, leaving all letters intact.
            region="${{rgn//_/}}"
            region="${{region//-/ }}"
            echo "Creating priorities for focal build on $region"
            python3 scripts/priorities.py --alignment {input.alignment} \
                            --metadata {input.metadata} \
                            --focal-alignment {input.focal_alignment} \
                            --output {output.priorities}
        fi
        """

# Builds the context set: sequences from outside the focal region, chosen with
# the priorities from make_priorities.
#
# For the "_global" wildcard there is no context, so an empty FASTA is written.
# Otherwise the focal region is excluded and `sequences_per_group` sequences
# are kept per `group_by` group.
rule subsample_context:
    message:
        """
        Subsample the non-focal sequences to provide phylogenetic context
        """
    input:
        sequences = rules.mask.output.alignment,
        metadata = rules.download.output.metadata,
        priorities = rules.make_priorities.output.priorities
    output:
        sequences = "results/subsample_context{region}.fasta"
    params:
        group_by = "country year month",
        sequences_per_group = 20
    shell:
        """
        # Work out which region is wanted from the wildcard
        rgn="{wildcards.region}"

        if [ "$rgn" = "_global" ]; then
            echo "Global run - no context needed"
            # Still create the (empty) output so downstream rules have their input
            echo -n >{output.sequences}
        else
            # Drop underscores, then turn hyphens into spaces. Only those two
            # characters are touched, so letters in the region name survive.
            region="${{rgn//_/}}"
            region="${{region//-/ }}"
            echo "Creating a filtered context for a focal run on $region. Context is {params.sequences_per_group}seqs per '{params.group_by}'"
            regionarg="--exclude-where region="
            # Single quotes inside the double-quoted string are literal, so the
            # region value is expanded and shown between them.
            echo "   (This is passed to 'augur filter' as $regionarg'$region')"

            augur filter \
                $regionarg"$region" \
                --sequences {input.sequences} \
                --metadata {input.metadata} \
                --priority {input.priorities} \
                --group-by {params.group_by} \
                --sequences-per-group {params.sequences_per_group} \
                --output {output.sequences}
        fi
        """

# Merges the focal and context FASTAs of one build into the subsampled
# alignment by handing both files to scripts/combine-and-dedup-fastas.py.
rule subsample_regions:
    message:
        """
        Combine and deduplicate FASTAs
        """
    # Unnamed inputs: {input} in the command expands to both paths, focal first.
    input:
        rules.subsample_focus.output.sequences,
        rules.subsample_context.output.sequences
    output:
        "results/subsampled_alignment{region}.fasta"
    shell:
        """
        python3 scripts/combine-and-dedup-fastas.py \
            --input {input} \
            --output {output}
        """

# Produces the per-build metadata file.
#
# For the "_global" wildcard the downloaded metadata is copied unchanged. For a
# regional build, scripts/adjust_regional_meta.py is run with the region name
# derived from the wildcard (underscores dropped, hyphens turned into spaces).
rule adjust_metadata_regions:
    input:
        metadata = rules.download.output.metadata
    output:
        metadata = "results/metadata_adjusted{region}.tsv"
    shell:
        """
        # Work out which region is wanted from the wildcard
        rgn="{wildcards.region}"

        if [ "$rgn" = "_global" ]; then
            echo "Global run - no metadata adjustment needed"
            cp {input.metadata} {output.metadata}
        else
            # Drop underscores, then turn hyphens into spaces. Only those two
            # characters are touched, so the name handed to --region keeps
            # all of its letters.
            region="${{rgn//_/}}"
            region="${{region//-/ }}"
            echo "Focal run for $region - adjusting metadata"

            python3 scripts/adjust_regional_meta.py \
                                    --region "$region" \
                                    --metadata {input.metadata} \
                                    --output {output.metadata}
        fi
        """


#
# Deployment and error handlers, including Slack messaging integrations.
#

from getpass import getuser
from os import environ
from socket import getfqdn

# Slack credentials come from the workflow config and are also exported into
# the environment, where the shell commands below read them as $SLACK_TOKEN
# and $SLACK_CHANNEL. A missing or empty config entry becomes "", which
# switches the Slack notifications off.
SLACK_TOKEN   = environ["SLACK_TOKEN"]   = config.get("slack_token")   or ""
SLACK_CHANNEL = environ["SLACK_CHANNEL"] = config.get("slack_channel") or ""

# Human-readable description of who/what triggered the run, used in the Slack
# messages: the AWS Batch job id when AWS_BATCH_JOB_ID is set, otherwise
# user@host of the local machine.
try:
    deploy_origin = (
        f"from AWS Batch job `{environ['AWS_BATCH_JOB_ID']}`"
        if environ.get("AWS_BATCH_JOB_ID") else
        f"by the hands of {getuser()}@{getfqdn()}"
    )
except Exception:
    # getuser() and getfqdn() may not always succeed; falling back here means
    # the Snakefile won't crash. Only Exception is caught so that Ctrl-C and
    # SystemExit still propagate.
    deploy_origin = "by an unknown identity"

# Uploads every file requested by all_regions to the staging location with
# `nextstrain deploy`, then announces the deployment on Slack when both
# SLACK_TOKEN and SLACK_CHANNEL are non-empty.
rule deploy_to_staging:
    # Re-use the complete input list of all_regions, so deploying first builds
    # (or requires) all of those files.
    input:
        *rules.all_regions.input
    params:
        slack_message = f"Deployed <https://nextstrain.org/staging/ncov|nextstrain.org/staging/ncov> {deploy_origin}",
        s3_staging_url = config["s3_staging_url"]
    # The `:q` format flag shell-quotes the substituted values.
    shell:
        """
        nextstrain deploy {params.s3_staging_url:q} {input:q}

        if [[ -n "$SLACK_TOKEN" && -n "$SLACK_CHANNEL" ]]; then
            curl https://slack.com/api/chat.postMessage \
                --header "Authorization: Bearer $SLACK_TOKEN" \
                --form-string channel="$SLACK_CHANNEL" \
                --form-string text={params.slack_message:q} \
                --fail --silent --show-error \
                --include
        fi
        """

# Failure handler: when Slack credentials are configured, uploads the run's
# log file to the Slack channel with a short "build failed" comment.
onerror:
    slack_message = f"Build {deploy_origin} failed."

    if SLACK_TOKEN and SLACK_CHANNEL:
        # The doubled braces survive the f-string as single braces, leaving
        # {slack_message:q} and {log:q} for shell() to fill in (shell-quoted).
        # `log` is not defined in this file -- presumably it is the log file
        # path Snakemake makes available to onerror handlers.
        shell(f"""
            curl https://slack.com/api/files.upload \
                --header "Authorization: Bearer $SLACK_TOKEN" \
                --form-string channels="$SLACK_CHANNEL" \
                --form-string initial_comment={{slack_message:q}} \
                --form file=@{{log:q}} \
                --form filetype=text \
                --fail --silent --show-error \
                --include
        """)