Skip to content

Commit

Permalink
use one ref when the sample is from pure strain
Browse files Browse the repository at this point in the history
  • Loading branch information
dawnmy committed May 5, 2020
1 parent 4c49ba4 commit 0111773
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 11 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@ In this benchmarking study: variants callers `BCFtools` (v1.9), `VarScan` (v2.4.

To reproduce the output, you need to use `Bioconda`.

Please follow the instruction [here](https://bioconda.github.io) to install `Bioconda`. And then you need to install `snakemake` and Python package `click`:
Please follow the instruction [here](https://bioconda.github.io) to install `Bioconda`. And then you need to install `snakemake`, `csvtk` and Python package `click`:

```shell
conda install snakemake=5.3.0
conda install csvtk=0.18.2
conda install click=7.0

```

After this has been done, download the pipeline onto your system:
Expand Down
2 changes: 1 addition & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ MerlinRef: ref/Merlin.BAC.fa
TB40ERef: ref/TB40E.GFP.fa
AD169Ref: ref/AD169.BAC.fa
PhixRef: ref/Phix.fa
outpath: ../revision_output_3
outpath: ../revision_output_4
threads: 20
runOnReads: false
rmHumanEcoli: true
Expand Down
34 changes: 28 additions & 6 deletions eval_assembly.smk
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,23 @@ metaquast_criteria = ["num_contigs", "Largest_contig", "Genome_fraction",
def make_mix():
return ["{}/{}".format(sample.split("-")[0], sample) for sample in sample_list]

def get_assembly_ref(wc):
if wc.sample.startswith('TA'):
if wc.sample.endswith('-1-0'):
ref_list = [tb_ref]
elif wc.sample.endswith('-0-1'):
ref_list = [ad_ref]
else:
ref_list = [tb_ref, ad_ref]
else:
if wc.sample.endswith('-1-0'):
ref_list = [tb_ref]
elif wc.sample.endswith('-0-1'):
ref_list = [merlin_ref]
else:
ref_list = [tb_ref, merlin_ref]
return ','.join(ref_list)


onsuccess:
print("The assembly evaluation is done!")
Expand Down Expand Up @@ -85,8 +102,9 @@ rule metaquast:
threads: threads
params:
metaquast_outdir = metaquast_dir + "/{mix}/{sample}",
ref = lambda wc: ",".join(
[tb_ref, ad_ref]) if wc.mix == "TA" else ",".join([tb_ref, merlin_ref])
ref = get_assembly_ref
# ref = lambda wc: ",".join(
# [tb_ref, ad_ref]) if wc.mix == "TA" else ",".join([tb_ref, merlin_ref])
shell:
"""
metaquast.py --unique-mapping -o {params.metaquast_outdir} -R {params.ref} {input.scaffolds} -t {threads}
Expand All @@ -99,16 +117,20 @@ rule summarize:
strain_sample=make_mix())
output:
metaquast_dir + "/summary_for_figure/{mix}.{criteria}.merged.tsv"
conda:
"config/conda_env.yaml"
# conda:
# "config/conda_env.yaml"
params:
input_files = metaquast_dir + "/{mix}/*/summary/TSV/{criteria}.tsv"
input_files = metaquast_dir + "/{mix}/*/summary/TSV/{criteria}.tsv",
joiner = cd + '/program/join_tsv.py'
shell:
"""
paste -d"\t" {params.input_files}|sed '1s/\.scaffolds//g' |csvtk transpose -Tt -|\
python {params.joiner} {params.input_files}|sed '1s/\.scaffolds//g' |csvtk transpose -Tt -|\
awk 'NR==1{{print}}$1!="Assemblies"{{print}}'|sed '1s/\.GFP\|\.BAC//g' > {output}
"""

# paste -d"\t" {params.input_files}|sed '1s/\.scaffolds//g' |csvtk transpose -Tt -|\
# awk 'NR==1{{print}}$1!="Assemblies"{{print}}'|sed '1s/\.GFP\|\.BAC//g' > {output}

# Visualize the evaluation
rule visualize:
input:
Expand Down
28 changes: 28 additions & 0 deletions program/join_tsv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import click
from sys import stdout
import pandas as pd


@click.command()
@click.argument('tsv', nargs=-1)
@click.option('-o', '--out', help='Output joined tsv')
def join_tsv(tsv, out):
df_list = []
for file in tsv:
df = pd.read_csv(file, sep='\t', index_col=0)
df_list.append(df)

joined_df = pd.concat(df_list, axis=1, sort=False)

out_file = out if out else stdout

joined_df.index.name = 'Assemblies'
joined_df.reset_index(level=0, inplace=True)

# joined_df['Assemblies'] = joined_df.index

joined_df.to_csv(out_file, sep='\t', index=False)


if __name__ == '__main__':
join_tsv()
2 changes: 1 addition & 1 deletion run_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import snakemake

wd = os.path.dirname(os.path.realpath(__file__))
VERSION = '0.2'
VERSION = '0.3'


class SpecialHelpOrder(click.Group):
Expand Down
7 changes: 5 additions & 2 deletions scripts/metaquast_visualize.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,11 @@ read_metaquast <- function(file_name, dir = input_dir) {

metaquast <- read.table(file, sep = "\t", header = T, na.strings = "-", check.names = F) %>%
separate(Assemblies, c("sample", "assembler"), sep = "\\.")
ref1 <- 3
ref2 <- 4
ref1 <- 'TB40E'
ref2 <- ifelse(startsWith(basename(file_name), 'TA'), 'AD169', 'Merlin')

# ref1 <- 3
# ref2 <- 4
if (grepl("NGA50", file_name)) {
metaquast[, ref1] <- ifelse(endsWith(metaquast$sample, "_0_1"), -1, metaquast[, ref1])
metaquast[, ref2] <- ifelse(endsWith(metaquast$sample, "_1_0"), -1, metaquast[, ref2])
Expand Down

0 comments on commit 0111773

Please sign in to comment.