
change samples.tsv to be referred to as 'metadata' rather than 'samples', and change metadata samples column name to sampleID
sanjaynagi committed Jun 4, 2021
1 parent a655e69 commit 1deb55b
Showing 25 changed files with 132 additions and 104 deletions.
2 changes: 1 addition & 1 deletion .test/config/config.yaml
@@ -51,7 +51,7 @@ GSEA:

VariantCalling:
activate: True
ploidy: 10
ploidy: 1
chunks: 9 # Number of chunks to split the genome into when parallelising freebayes
# Number of chunks to split the genome into when parallelising freebayes

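As a rough, hypothetical illustration of what the 'chunks' setting above controls (the real workflow derives regions from the reference index, and the chromosome length below is invented), the genome is split into equally sized windows so freebayes can be run on them in parallel:

```python
import numpy as np

# Hypothetical sketch: cut one chromosome into 'chunks' windows for parallel freebayes runs.
chunks = 9                      # VariantCalling: chunks, as set in .test/config/config.yaml
chrom_length = 49_364_325       # invented length for chromosome 2L
breaks = np.linspace(1, chrom_length, chunks + 1, dtype=int)
windows = list(zip(breaks[:-1], breaks[1:]))

print(len(windows))             # 9 regions, one per freebayes job
```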
2 changes: 1 addition & 1 deletion .test/config/samples.tsv
@@ -1,4 +1,4 @@
samples treatment species strain
sampleID treatment species strain
ContTia1-X-test ContTia coluzzii Tiassale
ContTia2-X-test ContTia coluzzii Tiassale
ContTia4-X-test ContTia coluzzii Tiassale
2 changes: 1 addition & 1 deletion README.md
@@ -17,7 +17,7 @@ This workflow performs various analyses of illumina paired-end RNA-Sequencing da
* *Anopheles gambiae s.l* - Reports if DE genes are found underneath known selective sweep signals in the [Ag1000g](https://www.nature.com/articles/nature24995).
* *Anopheles gambiae s.l* - Determines Karyotype of chromosome 2 inversions using [compkaryo](https://academic.oup.com/g3journal/article/9/10/3249/6026680) - [GH](https://github.com/sanjaynagi/compkaryo)

The workflow is generalised, and will function with any trimmed Illumina paired-end RNA-sequencing. However, certain modules, such as the AIMs analysis, are only appropriate for specific species. These can be activated in the configuration file (config.yaml).
The workflow is generalised, and will function with any trimmed Illumina paired-end RNA-sequencing. However, certain modules, such as the AIMs analysis, are only appropriate for specific species. These can be activated in the configuration file (config.yaml). The workflow works with pooled samples, or diploid or haploid individuals.

The workflow is still in construction, and not yet ready for release. If you have any feedback on how the workflow may be improved, please get in touch, or feel free to fork the github repo and create a pull request for any additional features you would like to implement. If you are using the workflow and would like to give feedback or troubleshoot, consider joining the discord server [here](https://discord.gg/RaXjP8APCq)

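The README paragraph above notes that optional modules are activated in config.yaml. A minimal sketch of how that switch is consumed, mirroring the GetDesiredOutputs pattern in common.smk further down this diff (the inline dict simply stands in for the parsed config.yaml; the GSEA branch is left empty because its output paths are not shown here):

```python
# Collect requested outputs only for modules whose 'activate' flag is set.
config = {
    "IRmutations": {"activate": True},
    "GSEA": {"activate": False},
}

wanted_input = []
if config["IRmutations"]["activate"]:
    wanted_input.extend(["results/alleleBalance/alleleBalance.xlsx"])
if config["GSEA"]["activate"]:
    pass  # GSEA outputs would be requested here

print(wanted_input)  # ['results/alleleBalance/alleleBalance.xlsx']
```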
3 changes: 2 additions & 1 deletion config/exampleconfig.yaml
@@ -39,7 +39,8 @@ ref:
genenames:
"resources/exampleGeneNames.tsv"

# Chromosome names for the appropriate species.
# Chromosome names for the appropriate species.
# Currently, the variant calling and analysis part of workflow requires genes mapped to chromosomes and their physical location.
# Please ensure that these correspond exactly to the reference fasta/gff files. Extra unwanted chromosomes (unplaced contigs) can be ignored.
# if you need to remove a prefix (Such as "AgamP4_"or "AaegL5_") from a file, use sed - i.e - "sed s/AgamP4_// An.gambiae.fasta > An.gambiae_clean.fasta"
chroms: ['2L', '2R', '3L', '3R', 'X']
2 changes: 1 addition & 1 deletion config/examplefastq.tsv
@@ -1,4 +1,4 @@
samples fq1 fq2
sampleID fq1 fq2
Kisumu1 resources/reads/a.chr21.1.fq.gz resources/reads/a.chr21_2.fq.gz
B resources/reads/b.chr21.1.fq.gz resources/reads/b.chr21.2.fq.gz
B resources/reads/b.chr21.1.fq.gz resources/reads/
4 changes: 2 additions & 2 deletions config/examplesamples.tsv
@@ -1,4 +1,4 @@
samples treatment species strain
sampleID treatment species strain
Kisumu1 Kisumu gambiae Kisumu
Kisumu2 Kisumu gambiae Kisumu
Kisumu3 Kisumu gambiae Kisumu
@@ -10,4 +10,4 @@ gambiaeCont4 gambiaeCont gambiae Bouake
gambiaePM1 gambiaePM gambiae Bouake
gambiaePM2 gambiaePM gambiae Bouake
gambiaePM3 gambiaePM gambiae Bouake
gambiaePM4 gambiaePM gambiae Bouake
gambiaePM4 gambiaePM gambiae Bouake
4 changes: 2 additions & 2 deletions workflow/Snakefile
@@ -15,9 +15,9 @@ configfile: "config/config.yaml"

metadata = pd.read_csv(config["samples"], sep="\t")
validate(metadata, schema="schemas/samples.schema.yaml")
samples = metadata.samples
samples = metadata['sampleID']

# IR allele balance mutations file
# AlleleBalance mutations file
if config["IRmutations"]["activate"]:
mutationData = pd.read_csv(config["IRmutations"]["path"], sep="\t")
else:
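For context, here is a minimal standalone sketch of the metadata-loading pattern this hunk changes; the literal file paths are assumptions (in the Snakefile they come from config['samples'] and the workflow's schemas/ directory):

```python
import pandas as pd
from snakemake.utils import validate

# Load the sample metadata (the table formerly headed 'samples').
metadata = pd.read_csv("config/samples.tsv", sep="\t")

# Validate against the schema shown later in this diff:
# sampleID, treatment, species and strain are required columns.
validate(metadata, schema="workflow/schemas/samples.schema.yaml")

# After this commit the sample identifiers live in the 'sampleID' column.
samples = metadata["sampleID"]
```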
3 changes: 2 additions & 1 deletion workflow/envs/diffexp.yaml
@@ -12,4 +12,5 @@ dependencies:
- r-glue=1.4.2
- r-rcolorbrewer=1.1_2
- r-pheatmap=1.0.12
- bioconductor-enhancedvolcano=1.8.0
- bioconductor-enhancedvolcano=1.8.0
- r-jsonlite
10 changes: 5 additions & 5 deletions workflow/rules/common.smk
@@ -17,15 +17,15 @@ def getFASTQs(wildcards, rules=None):
if config["fastq"]["auto"]:
units = pd.read_csv(config["samples"], sep="\t")
units = (
units.assign(fq1=f"resources/reads/" + units["samples"] + "_1.fastq.gz")
.assign(fq2=f"resources/reads/" + units["samples"] + "_2.fastq.gz")
.set_index("samples")
units.assign(fq1=f"resources/reads/" + units["sampleID"] + "_1.fastq.gz")
.assign(fq2=f"resources/reads/" + units["sampleID"] + "_2.fastq.gz")
.set_index("sampleID")
)
else:
assert os.path.isfile(
config["fastq"]["table"]
), f"config['fastq']['table'] (the config/fastq.tsv file) does not seem to exist. Please create one, or use the 'auto' option and name the fastq files as specified in the config/README.md"
units = pd.read_csv(config["fastq"]["table"], sep="\t", index_col="samples")
units = pd.read_csv(config["fastq"]["table"], sep="\t", index_col="sampleID")

if len(wildcards) > 1:
u = units.loc[wildcards.sample, f"fq{wildcards.n}"]
@@ -142,7 +142,7 @@ def GetDesiredOutputs(wildcards):
)

if config["IRmutations"]["activate"]:
wanted_input.extend(["results/allele_balance/allele_balance.xlsx"])
wanted_input.extend(["results/alleleBalance/alleleBalance.xlsx"])

if config["GSEA"]["activate"]:
wanted_input.extend(
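As a standalone illustration of the 'auto' branch of getFASTQs() after the rename (the metadata path is an assumption, and the sample name is borrowed from examplesamples.tsv):

```python
import pandas as pd

# Build expected FASTQ paths from the sampleID column and index by it.
units = pd.read_csv("config/samples.tsv", sep="\t")
units = (
    units.assign(fq1="resources/reads/" + units["sampleID"] + "_1.fastq.gz")
         .assign(fq2="resources/reads/" + units["sampleID"] + "_2.fastq.gz")
         .set_index("sampleID")
)

# Look up the forward read for one sample.
print(units.loc["Kisumu1", "fq1"])  # resources/reads/Kisumu1_1.fastq.gz
```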
10 changes: 5 additions & 5 deletions workflow/rules/diff.smk
@@ -48,7 +48,7 @@ rule DifferentialGeneExpression:
Produce PCAs, heatmaps, volcano plots
"""
input:
samples=config["samples"],
metadata=config["samples"],
gene_names=config["ref"]["genenames"],
counts=expand("results/quant/{sample}", sample=samples),
output:
@@ -78,7 +78,7 @@ rule DifferentialIsoformExpression:
Produce volcano plots
"""
input:
samples=config["samples"],
metadata=config["samples"],
gene_names=config["ref"]["genenames"],
counts=expand("results/quant/{sample}", sample=samples),
output:
@@ -122,7 +122,7 @@ rule progressiveGenesDE:
direction=["up", "down"],
),
params:
samples=config["samples"],
metadata=config["samples"],
comps=config["progressiveGenes"]["groups"],
pval=config["progressiveGenes"]["padj_threshold"],
fc=config["progressiveGenes"]["fc_threshold"],
@@ -139,7 +139,7 @@ rule GeneSetEnrichment:
Perform Gene Set Enrichment analysis with fgsea, on GO terms and KEGG pathways
"""
input:
samples=config["samples"],
metadata=config["samples"],
gaf=config["GSEA"]["gaf"],
DEresults=expand("results/genediff/{comp}.csv", comp=config["contrasts"]),
diffsnps=(
@@ -216,7 +216,7 @@ rule GeneCategoryContribution:
"""
input:
normcounts="results/quant/normcounts.tsv",
samples=config["samples"],
metadata=config["samples"],
output:
"results/quant/percentageContributionGeneCategories.tsv",
log:
22 changes: 11 additions & 11 deletions workflow/rules/variantAnalysis.smk
@@ -10,7 +10,7 @@ rule mpileupIR:
bam="results/alignments/{sample}.bam",
index="results/alignments/{sample}.bam.bai",
output:
"results/allele_balance/counts/{sample}_{mut}_allele_counts.tsv",
"results/alleleBalance/counts/{sample}_{mut}_allele_counts.tsv",
conda:
"../envs/variants.yaml"
priority: 10
@@ -35,19 +35,19 @@ rule AlleleBalanceIR:
"""
input:
counts=expand(
"results/allele_balance/counts/{sample}_{mut}_allele_counts.tsv",
"results/alleleBalance/counts/{sample}_{mut}_allele_counts.tsv",
sample=samples,
mut=mutationData.Name,
),
samples=config["samples"],
metadata=config["samples"],
mutations=config["IRmutations"]["path"],
output:
expand(
"results/allele_balance/csvs/{mut}_allele_balance.csv",
"results/alleleBalance/csvs/{mut}_alleleBalance.csv",
mut=mutationData.Name,
),
allele_balance="results/allele_balance/allele_balance.xlsx",
mean_allele_balance="results/allele_balance/mean_allele_balance.xlsx",
alleleBalance="results/alleleBalance/alleleBalance.xlsx",
mean_alleleBalance="results/alleleBalance/mean_alleleBalance.xlsx",
conda:
"../envs/diffexp.yaml"
priority: 10
@@ -89,7 +89,7 @@ rule DifferentialSNPs:
Test to see if any alleles are enriched in one condition versus the other
"""
input:
samples=config["samples"],
metadata=config["samples"],
gff=config["ref"]["gff"],
geneNames="resources/gene_names.tsv",
tables=expand(
@@ -127,7 +127,7 @@ rule StatisticsAndPCA:
"results/variants/vcfs/annot.variants.{chrom}.vcf.gz",
chrom=config["chroms"],
),
samples=config["samples"],
metadata=config["samples"],
gff=config["ref"]["gff"],
output:
PCAfig=expand(
@@ -162,7 +162,7 @@ rule WindowedFstPBS:
Calculate Fst and PBS in windows
"""
input:
samples=config["samples"],
metadata=config["samples"],
vcf=expand(
"results/variants/vcfs/annot.variants.{chrom}.vcf.gz",
chrom=config["chroms"],
@@ -208,7 +208,7 @@ rule PerGeneFstPBS:
Calculate Fst and PBS for each gene
"""
input:
samples=config["samples"],
metadata=config["samples"],
gff=config["ref"]["gff"],
geneNames="resources/gene_names.tsv",
vcf=expand(
@@ -243,7 +243,7 @@ rule AncestryInformativeMarkers:
"results/variants/vcfs/annot.variants.{chrom}.vcf.gz",
chrom=config["chroms"],
),
samples=config["samples"],
metadata=config["samples"],
aims_zarr_gambcolu=config["AIMs"]["gambcolu"],
aims_zarr_arab=config["AIMs"]["arab"],
output:
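A small sketch of how the expand() calls in these rules enumerate per-sample, per-mutation files; the sample names are taken from the example metadata, while the mutation names are purely hypothetical stand-ins for mutationData.Name entries:

```python
from snakemake.io import expand

samples = ["Kisumu1", "Kisumu2"]          # values from metadata["sampleID"]
muts = ["Vgsc_L995F", "Ace1_G280S"]       # hypothetical IRmutations names

counts = expand(
    "results/alleleBalance/counts/{sample}_{mut}_allele_counts.tsv",
    sample=samples,
    mut=muts,
)
print(len(counts))  # 4 files: one per sample x mutation combination
```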
7 changes: 2 additions & 5 deletions workflow/schemas/samples.schema.yaml
@@ -4,7 +4,7 @@ description: Schema to validate the input config/sample.tsv file.

# columns that the config/samples.tsv file can have and which type they should be
properties:
samples:
sampleID:
type: string
description: sample name/identifier, should be a string ending in the sample number. Should match the input FASTQ files.
treatment:
@@ -13,17 +13,14 @@ properties:
species:
type: string
description: Name of species.
insecticide:
type: string
description: Name of insecticide, necessary in case of a dataset with multiple different insecticides.
lab:
type: boolean
description: If the sample is a lab-susceptible strain, enter TRUE here, to ensure it is compared to the 'control' (unexposed) resistant strain.
default: FALSE

# columns that the config/samples.tsv file must have to pass schema validation
required:
- samples
- sampleID
- treatment
- species
- strain
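A quick standalone check that mirrors the schema's required columns after this commit (the metadata path is illustrative):

```python
import pandas as pd

required = {"sampleID", "treatment", "species", "strain"}  # from the 'required' block above
metadata = pd.read_csv("config/samples.tsv", sep="\t")

missing = required - set(metadata.columns)
assert not missing, f"metadata is missing required columns: {missing}"
```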
20 changes: 10 additions & 10 deletions workflow/scripts/AncestryInformativeMarkers.py
@@ -9,8 +9,8 @@
from tools import *

### AIMS ###
samples = pd.read_csv(snakemake.input['samples'], sep="\t")
samples = samples.sort_values(by='species').reset_index(drop=True)
metadata = pd.read_csv(snakemake.input['metadata'], sep="\t")
metadata = metadata.sort_values(by='species').reset_index(drop=True)
chroms = snakemake.params['chroms']
ploidy = snakemake.params['ploidy']
numbers = get_numbers_dict(ploidy)
@@ -33,7 +33,7 @@
path = f"results/variants/vcfs/annot.variants.{chrom}.vcf.gz"
vcf, geno, acsubpops, pos, depth, snpeff, subpops, pops = readAndFilterVcf(path=path,
chrom=chrom,
samples=samples,
samples=metadata,
numbers=numbers,
ploidy=ploidy,
qualflt=qualflt,
@@ -84,7 +84,7 @@

# for each sample, get proportion of gambiae/coluzzii alleles
# alleles that are different to both will be missed here
for sample in samples.treatment.unique():
for sample in metadata.treatment.unique():
alleles = gn.take(subpops[sample], axis=1).flatten()

# at each AIM, do we have gamb or colu alleles
@@ -105,7 +105,7 @@
prop_colu = {}
n_aims_per_sample = {}

for sample in samples.treatment.unique():
for sample in metadata.treatment.unique():

prop_gambiae[sample] = np.nanmean(np.array(list(gambscores[sample].values())))
all_gamb[sample].append(np.nanmean(np.array(list(gambscores[sample].values()))))
@@ -156,7 +156,7 @@
########### The same but for arabiensis v gambiae/coluzzii
# script should be modularised but no time atm, isnt necessary

if samples['species'].isin(['arabiensis']).any():
if metadata['species'].isin(['arabiensis']).any():

aims = zarr.open(snakemake.input['aims_zarr_arab'], mode='r')

@@ -172,8 +172,8 @@
path = f"results/variants/vcfs/annot.variants.{chrom}.vcf.gz"
vcf, geno, acsubpops, pos, depth, snpeff, subpops, pops = readAndFilterVcf(path=path,
chrom=chrom,
samples=samples,
numbers=numbers,
samples=metadata,
numbers=numbers,
qualflt=qualflt,
missingfltprop=missingprop)
aimspos = aims[chrom]['POS'][:]
@@ -222,7 +222,7 @@

# for each sample, get proportion of gambiae/arabiensis alleles
# alleles that are different to both will be missed here
for sample in samples.treatment.unique():
for sample in metadata.treatment.unique():
alleles = gn.take(subpops[sample], axis=1).flatten()

# at each AIM, do we have gamb or arab alleles
@@ -243,7 +243,7 @@
prop_arab = {}
n_aims_per_sample = {}

for sample in samples.treatment.unique():
for sample in metadata.treatment.unique():

prop_gambiae[sample] = np.nanmean(np.array(list(gambscores[sample].values())))
all_gamb[sample].append(np.nanmean(np.array(list(gambscores[sample].values()))))
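To illustrate the summary computed in the loops above: for each treatment group the proportion of gambiae-like alleles is averaged with np.nanmean, so entries without usable AIMs are simply skipped. The numbers below are invented:

```python
import numpy as np

# Hypothetical per-chromosome proportions of gambiae-like alleles for one group.
gambscores = {"2L": 0.92, "2R": np.nan, "3L": 0.88}

prop_gambiae = np.nanmean(np.array(list(gambscores.values())))
print(round(float(prop_gambiae), 2))  # 0.9, the NaN entry is ignored
```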
