use one ref when the sample is from pure strain

hzi-bifo · May 5, 2020 · 0111773 · 0111773
1 parent 4c49ba4
commit 0111773
Show file tree

Hide file tree

Showing 6 changed files with 66 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -9,11 +9,13 @@ In this benchmarking study: variants callers `BCFtools` (v1.9), `VarScan` (v2.4.
 
 To reproduce the output, you need to use `Bioconda`.
 
-Please follow the instruction [here](https://bioconda.github.io) to install `Bioconda`. And then you need to install `snakemake` and Python package `click`:
+Please follow the instruction [here](https://bioconda.github.io) to install `Bioconda`. And then you need to install `snakemake`, `csvtk` and Python package `click`:
 
 ```shell
 conda install snakemake=5.3.0
+conda install csvtk=0.18.2
 conda install click=7.0
+
 ```
 
 After this has been done, download the pipeline onto your system:

diff --git a/config/config.yaml b/config/config.yaml
@@ -3,7 +3,7 @@ MerlinRef: ref/Merlin.BAC.fa
 TB40ERef: ref/TB40E.GFP.fa
 AD169Ref: ref/AD169.BAC.fa
 PhixRef: ref/Phix.fa
-outpath: ../revision_output_3
+outpath: ../revision_output_4
 threads: 20
 runOnReads: false
 rmHumanEcoli: true

diff --git a/eval_assembly.smk b/eval_assembly.smk
@@ -20,6 +20,23 @@ metaquast_criteria = ["num_contigs", "Largest_contig", "Genome_fraction",
 def make_mix():
     return ["{}/{}".format(sample.split("-")[0], sample) for sample in sample_list]
 
+def get_assembly_ref(wc):
+    if wc.sample.startswith('TA'):
+        if wc.sample.endswith('-1-0'):
+            ref_list = [tb_ref]
+        elif wc.sample.endswith('-0-1'):
+            ref_list = [ad_ref]
+        else:
+            ref_list = [tb_ref, ad_ref]
+    else:
+        if wc.sample.endswith('-1-0'):
+            ref_list = [tb_ref]
+        elif wc.sample.endswith('-0-1'):
+            ref_list = [merlin_ref]
+        else:
+            ref_list = [tb_ref, merlin_ref]
+    return ','.join(ref_list)
+
 
 onsuccess:
     print("The assembly evaluation is done!")
@@ -85,8 +102,9 @@ rule metaquast:
     threads: threads
     params:
         metaquast_outdir = metaquast_dir + "/{mix}/{sample}",
-        ref = lambda wc: ",".join(
-            [tb_ref, ad_ref]) if wc.mix == "TA" else ",".join([tb_ref, merlin_ref])
+        ref = get_assembly_ref
+        # ref = lambda wc: ",".join(
+        #     [tb_ref, ad_ref]) if wc.mix == "TA" else ",".join([tb_ref, merlin_ref])
     shell:
         """
         metaquast.py --unique-mapping -o {params.metaquast_outdir} -R {params.ref} {input.scaffolds} -t {threads}
@@ -99,16 +117,20 @@ rule summarize:
                strain_sample=make_mix())
     output:
         metaquast_dir + "/summary_for_figure/{mix}.{criteria}.merged.tsv"
-    conda:
-        "config/conda_env.yaml"
+    # conda:
+    #     "config/conda_env.yaml"
     params:
-        input_files = metaquast_dir + "/{mix}/*/summary/TSV/{criteria}.tsv"
+        input_files = metaquast_dir + "/{mix}/*/summary/TSV/{criteria}.tsv",
+        joiner = cd + '/program/join_tsv.py'
     shell:
         """
-        paste -d"\t" {params.input_files}|sed '1s/\.scaffolds//g' |csvtk transpose -Tt -|\
+        python {params.joiner} {params.input_files}|sed '1s/\.scaffolds//g' |csvtk transpose -Tt -|\
             awk 'NR==1{{print}}$1!="Assemblies"{{print}}'|sed '1s/\.GFP\|\.BAC//g' > {output}
         """
 
+#    paste -d"\t" {params.input_files}|sed '1s/\.scaffolds//g' |csvtk transpose -Tt -|\
+#             awk 'NR==1{{print}}$1!="Assemblies"{{print}}'|sed '1s/\.GFP\|\.BAC//g' > {output}
+
 # Visualize the evaluation
 rule visualize:
     input:

diff --git a/program/join_tsv.py b/program/join_tsv.py
@@ -0,0 +1,28 @@
+import click
+from sys import stdout
+import pandas as pd
+
+
+@click.command()
+@click.argument('tsv', nargs=-1)
+@click.option('-o', '--out', help='Output joined tsv')
+def join_tsv(tsv, out):
+    df_list = []
+    for file in tsv:
+        df = pd.read_csv(file, sep='\t', index_col=0)
+        df_list.append(df)
+
+    joined_df = pd.concat(df_list, axis=1, sort=False)
+
+    out_file = out if out else stdout
+
+    joined_df.index.name = 'Assemblies'
+    joined_df.reset_index(level=0, inplace=True)
+
+    # joined_df['Assemblies'] = joined_df.index
+
+    joined_df.to_csv(out_file, sep='\t', index=False)
+
+
+if __name__ == '__main__':
+    join_tsv()
diff --git a/run_benchmark.py b/run_benchmark.py
@@ -15,7 +15,7 @@
 import snakemake
 
 wd = os.path.dirname(os.path.realpath(__file__))
-VERSION = '0.2'
+VERSION = '0.3'
 
 
 class SpecialHelpOrder(click.Group):

diff --git a/scripts/metaquast_visualize.R b/scripts/metaquast_visualize.R
@@ -25,8 +25,11 @@ read_metaquast <- function(file_name, dir = input_dir) {
 
   metaquast <- read.table(file, sep = "\t", header = T, na.strings = "-", check.names = F) %>%
     separate(Assemblies, c("sample", "assembler"), sep = "\\.")
-  ref1 <- 3
-  ref2 <- 4
+  ref1 <- 'TB40E'
+  ref2 <- ifelse(startsWith(basename(file_name), 'TA'), 'AD169', 'Merlin')
+
+  # ref1 <- 3
+  # ref2 <- 4
   if (grepl("NGA50", file_name)) {
     metaquast[, ref1] <- ifelse(endsWith(metaquast$sample, "_0_1"), -1, metaquast[, ref1])
     metaquast[, ref2] <- ifelse(endsWith(metaquast$sample, "_1_0"), -1, metaquast[, ref2])