From 1deb55befaa03758232ac9a80551e83514ad76aa Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Fri, 4 Jun 2021 13:54:40 +0100 Subject: [PATCH] change samples.tsv to be referred to as 'metadata' rather than 'samples', and change metadata samples column name to sampleID --- .test/config/config.yaml | 2 +- .test/config/samples.tsv | 2 +- README.md | 2 +- config/exampleconfig.yaml | 3 +- config/examplefastq.tsv | 2 +- config/examplesamples.tsv | 4 +- workflow/Snakefile | 4 +- workflow/envs/diffexp.yaml | 3 +- workflow/rules/common.smk | 10 +-- workflow/rules/diff.smk | 10 +-- workflow/rules/variantAnalysis.smk | 22 +++--- workflow/schemas/samples.schema.yaml | 7 +- .../scripts/AncestryInformativeMarkers.py | 20 +++--- workflow/scripts/DeseqGeneDE.R | 71 +++++++++++++------ workflow/scripts/DifferentialSNPs.R | 4 +- workflow/scripts/GeneCategoryContribution.R | 4 +- workflow/scripts/GeneSetEnrichment.R | 2 +- workflow/scripts/GenerateFreebayesParams.R | 2 +- workflow/scripts/MutAlleleBalance.R | 14 ++-- workflow/scripts/PerGeneFstPBS.py | 6 +- workflow/scripts/SleuthIsoformsDE.R | 12 ++-- workflow/scripts/StatisticsAndPCA.py | 14 ++-- workflow/scripts/WindowedFstPBS.py | 6 +- workflow/scripts/checkInputs.py | 4 +- workflow/scripts/tools.py | 6 +- 25 files changed, 132 insertions(+), 104 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index e13f9f5..85b6558 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -51,7 +51,7 @@ GSEA: VariantCalling: activate: True - ploidy: 10 + ploidy: 1 chunks: 9 # Number of chunks to split the genome into when parallelising freebayes # Number of chunks to split the genome into when parallelising freebayes diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv index e8ed29d..b5c1c79 100755 --- a/.test/config/samples.tsv +++ b/.test/config/samples.tsv @@ -1,4 +1,4 @@ -samples treatment species strain +sampleID treatment species strain ContTia1-X-test ContTia coluzzii Tiassale 
ContTia2-X-test ContTia coluzzii Tiassale ContTia4-X-test ContTia coluzzii Tiassale diff --git a/README.md b/README.md index 645334b..ba20887 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ This workflow performs various analyses of illumina paired-end RNA-Sequencing da * *Anopheles gambiae s.l* - Reports if DE genes are found underneath known selective sweep signals in the [Ag1000g](https://www.nature.com/articles/nature24995). * *Anopheles gambiae s.l* - Determines Karyotype of chromosome 2 inversions using [compkaryo](https://academic.oup.com/g3journal/article/9/10/3249/6026680) - [GH](https://github.com/sanjaynagi/compkaryo) -The workflow is generalised, and will function with any trimmed Illumina paired-end RNA-sequencing. However, certain modules, such as the AIMs analysis, are only appropriate for specific species. These can be activated in the configuration file (config.yaml). +The workflow is generalised, and will function with any trimmed Illumina paired-end RNA-sequencing. However, certain modules, such as the AIMs analysis, are only appropriate for specific species. These can be activated in the configuration file (config.yaml). The workflow works with pooled samples, or diploid or haploid individuals. The workflow is still in construction, and not yet ready for release. If you have any feedback on how the workflow may be improved, please get in touch, or feel free to fork the github repo and create a pull request for any additional features you would like to implement. If you are using the workflow and would like to give feedback or troubleshoot, consider joining the discord server [here](https://discord.gg/RaXjP8APCq) diff --git a/config/exampleconfig.yaml b/config/exampleconfig.yaml index eac6cd7..66f5f90 100755 --- a/config/exampleconfig.yaml +++ b/config/exampleconfig.yaml @@ -39,7 +39,8 @@ ref: genenames: "resources/exampleGeneNames.tsv" -# Chromosome names for the appropriate species. +# Chromosome names for the appropriate species. 
+# Currently, the variant calling and analysis part of workflow requires genes mapped to chromosomes and their physical location. # Please ensure that these correspond exactly to the reference fasta/gff files. Extra unwanted chromosomes (unplaced contigs) can be ignored. # if you need to remove a prefix (Such as "AgamP4_"or "AaegL5_") from a file, use sed - i.e - "sed s/AgamP4_// An.gambiae.fasta > An.gambiae_clean.fasta" chroms: ['2L', '2R', '3L', '3R', 'X'] diff --git a/config/examplefastq.tsv b/config/examplefastq.tsv index a98a501..4a7c50d 100644 --- a/config/examplefastq.tsv +++ b/config/examplefastq.tsv @@ -1,4 +1,4 @@ -samples fq1 fq2 +sampleID fq1 fq2 Kisumu1 resources/reads/a.chr21.1.fq.gz resources/reads/a.chr21_2.fq.gz B resources/reads/b.chr21.1.fq.gz resources/reads/b.chr21.2.fq.gz B resources/reads/b.chr21.1.fq.gz resources/reads/ diff --git a/config/examplesamples.tsv b/config/examplesamples.tsv index 407de17..212dfd0 100755 --- a/config/examplesamples.tsv +++ b/config/examplesamples.tsv @@ -1,4 +1,4 @@ -samples treatment species strain +sampleID treatment species strain Kisumu1 Kisumu gambiae Kisumu Kisumu2 Kisumu gambiae Kisumu Kisumu3 Kisumu gambiae Kisumu @@ -10,4 +10,4 @@ gambiaeCont4 gambiaeCont gambiae Bouake gambiaePM1 gambiaePM gambiae Bouake gambiaePM2 gambiaePM gambiae Bouake gambiaePM3 gambiaePM gambiae Bouake -gambiaePM4 gambiaePM gambiae Bouake +gambiaePM4 gambiaePM gambiae Bouake \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index ac78c64..e7d7c70 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -15,9 +15,9 @@ configfile: "config/config.yaml" metadata = pd.read_csv(config["samples"], sep="\t") validate(metadata, schema="schemas/samples.schema.yaml") -samples = metadata.samples +samples = metadata['sampleID'] -# IR allele balance mutations file +# AlleleBalance mutations file if config["IRmutations"]["activate"]: mutationData = pd.read_csv(config["IRmutations"]["path"], sep="\t") else: diff 
--git a/workflow/envs/diffexp.yaml b/workflow/envs/diffexp.yaml index 9538435..8f4532c 100644 --- a/workflow/envs/diffexp.yaml +++ b/workflow/envs/diffexp.yaml @@ -12,4 +12,5 @@ dependencies: - r-glue=1.4.2 - r-rcolorbrewer=1.1_2 - r-pheatmap=1.0.12 - - bioconductor-enhancedvolcano=1.8.0 \ No newline at end of file + - bioconductor-enhancedvolcano=1.8.0 + - r-jsonlite \ No newline at end of file diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index ded08a6..cc10eca 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -17,15 +17,15 @@ def getFASTQs(wildcards, rules=None): if config["fastq"]["auto"]: units = pd.read_csv(config["samples"], sep="\t") units = ( - units.assign(fq1=f"resources/reads/" + units["samples"] + "_1.fastq.gz") - .assign(fq2=f"resources/reads/" + units["samples"] + "_2.fastq.gz") - .set_index("samples") + units.assign(fq1=f"resources/reads/" + units["sampleID"] + "_1.fastq.gz") + .assign(fq2=f"resources/reads/" + units["sampleID"] + "_2.fastq.gz") + .set_index("sampleID") ) else: assert os.path.isfile( config["fastq"]["table"] ), f"config['fastq']['table'] (the config/fastq.tsv file) does not seem to exist. 
Please create one, or use the 'auto' option and name the fastq files as specified in the config/README.md" - units = pd.read_csv(config["fastq"]["table"], sep="\t", index_col="samples") + units = pd.read_csv(config["fastq"]["table"], sep="\t", index_col="sampleID") if len(wildcards) > 1: u = units.loc[wildcards.sample, f"fq{wildcards.n}"] @@ -142,7 +142,7 @@ def GetDesiredOutputs(wildcards): ) if config["IRmutations"]["activate"]: - wanted_input.extend(["results/allele_balance/allele_balance.xlsx"]) + wanted_input.extend(["results/alleleBalance/alleleBalance.xlsx"]) if config["GSEA"]["activate"]: wanted_input.extend( diff --git a/workflow/rules/diff.smk b/workflow/rules/diff.smk index 02faf5b..2d5bcdc 100644 --- a/workflow/rules/diff.smk +++ b/workflow/rules/diff.smk @@ -48,7 +48,7 @@ rule DifferentialGeneExpression: Produce PCAs, heatmaps, volcano plots """ input: - samples=config["samples"], + metadata=config["samples"], gene_names=config["ref"]["genenames"], counts=expand("results/quant/{sample}", sample=samples), output: @@ -78,7 +78,7 @@ rule DifferentialIsoformExpression: Produce volcano plots """ input: - samples=config["samples"], + metadata=config["samples"], gene_names=config["ref"]["genenames"], counts=expand("results/quant/{sample}", sample=samples), output: @@ -122,7 +122,7 @@ rule progressiveGenesDE: direction=["up", "down"], ), params: - samples=config["samples"], + metadata=config["samples"], comps=config["progressiveGenes"]["groups"], pval=config["progressiveGenes"]["padj_threshold"], fc=config["progressiveGenes"]["fc_threshold"], @@ -139,7 +139,7 @@ rule GeneSetEnrichment: Perform Gene Set Enrichment analysis with fgsea, on GO terms and KEGG pathways """ input: - samples=config["samples"], + metadata=config["samples"], gaf=config["GSEA"]["gaf"], DEresults=expand("results/genediff/{comp}.csv", comp=config["contrasts"]), diffsnps=( @@ -216,7 +216,7 @@ rule GeneCategoryContribution: """ input: normcounts="results/quant/normcounts.tsv", - 
samples=config["samples"], + metadata=config["samples"], output: "results/quant/percentageContributionGeneCategories.tsv", log: diff --git a/workflow/rules/variantAnalysis.smk b/workflow/rules/variantAnalysis.smk index 54c1457..67d30be 100644 --- a/workflow/rules/variantAnalysis.smk +++ b/workflow/rules/variantAnalysis.smk @@ -10,7 +10,7 @@ rule mpileupIR: bam="results/alignments/{sample}.bam", index="results/alignments/{sample}.bam.bai", output: - "results/allele_balance/counts/{sample}_{mut}_allele_counts.tsv", + "results/alleleBalance/counts/{sample}_{mut}_allele_counts.tsv", conda: "../envs/variants.yaml" priority: 10 @@ -35,19 +35,19 @@ rule AlleleBalanceIR: """ input: counts=expand( - "results/allele_balance/counts/{sample}_{mut}_allele_counts.tsv", + "results/alleleBalance/counts/{sample}_{mut}_allele_counts.tsv", sample=samples, mut=mutationData.Name, ), - samples=config["samples"], + metadata=config["samples"], mutations=config["IRmutations"]["path"], output: expand( - "results/allele_balance/csvs/{mut}_allele_balance.csv", + "results/alleleBalance/csvs/{mut}_alleleBalance.csv", mut=mutationData.Name, ), - allele_balance="results/allele_balance/allele_balance.xlsx", - mean_allele_balance="results/allele_balance/mean_allele_balance.xlsx", + alleleBalance="results/alleleBalance/alleleBalance.xlsx", + mean_alleleBalance="results/alleleBalance/mean_alleleBalance.xlsx", conda: "../envs/diffexp.yaml" priority: 10 @@ -89,7 +89,7 @@ rule DifferentialSNPs: Test to see if any alleles are enriched in one condition versus the other """ input: - samples=config["samples"], + metadata=config["samples"], gff=config["ref"]["gff"], geneNames="resources/gene_names.tsv", tables=expand( @@ -127,7 +127,7 @@ rule StatisticsAndPCA: "results/variants/vcfs/annot.variants.{chrom}.vcf.gz", chrom=config["chroms"], ), - samples=config["samples"], + metadata=config["samples"], gff=config["ref"]["gff"], output: PCAfig=expand( @@ -162,7 +162,7 @@ rule WindowedFstPBS: Calculate Fst and PBS 
in windows """ input: - samples=config["samples"], + metadata=config["samples"], vcf=expand( "results/variants/vcfs/annot.variants.{chrom}.vcf.gz", chrom=config["chroms"], @@ -208,7 +208,7 @@ rule PerGeneFstPBS: Calculate Fst and PBS for each gene """ input: - samples=config["samples"], + metadata=config["samples"], gff=config["ref"]["gff"], geneNames="resources/gene_names.tsv", vcf=expand( @@ -243,7 +243,7 @@ rule AncestryInformativeMarkers: "results/variants/vcfs/annot.variants.{chrom}.vcf.gz", chrom=config["chroms"], ), - samples=config["samples"], + metadata=config["samples"], aims_zarr_gambcolu=config["AIMs"]["gambcolu"], aims_zarr_arab=config["AIMs"]["arab"], output: diff --git a/workflow/schemas/samples.schema.yaml b/workflow/schemas/samples.schema.yaml index 00648db..733c069 100644 --- a/workflow/schemas/samples.schema.yaml +++ b/workflow/schemas/samples.schema.yaml @@ -4,7 +4,7 @@ description: Schema to validate the input config/sample.tsv file. # columns that the config/samples.tsv file can have and which type they should be properties: - samples: + sampleID: type: string description: sample name/identifier, should be a string ending in the sample number. Should match the input FASTQ files. treatment: @@ -13,9 +13,6 @@ properties: species: type: string description: Name of species. - insecticide: - type: string - description: Name of insecticide, necessary in case of a dataset with multiple different insecticides. lab: type: boolean description: If the sample is a lab-susceptible strain, enter TRUE here, to ensure it is compared to the 'control' (unexposed) resistant strain. 
@@ -23,7 +20,7 @@ # columns that the config/samples.tsv file must have to pass schema validation required: - - samples + - sampleID - treatment - species - strain diff --git a/workflow/scripts/AncestryInformativeMarkers.py b/workflow/scripts/AncestryInformativeMarkers.py index eaa7ccf..c771ff3 100644 --- a/workflow/scripts/AncestryInformativeMarkers.py +++ b/workflow/scripts/AncestryInformativeMarkers.py @@ -9,8 +9,8 @@ from tools import * ### AIMS ### -samples = pd.read_csv(snakemake.input['samples'], sep="\t") -samples = samples.sort_values(by='species').reset_index(drop=True) +metadata = pd.read_csv(snakemake.input['metadata'], sep="\t") +metadata = metadata.sort_values(by='species').reset_index(drop=True) chroms = snakemake.params['chroms'] ploidy = snakemake.params['ploidy'] numbers = get_numbers_dict(ploidy) @@ -33,7 +33,7 @@ path = f"results/variants/vcfs/annot.variants.{chrom}.vcf.gz" vcf, geno, acsubpops, pos, depth, snpeff, subpops, pops = readAndFilterVcf(path=path, chrom=chrom, - samples=samples, + samples=metadata, numbers=numbers, ploidy=ploidy, qualflt=qualflt, @@ -84,7 +84,7 @@ # for each sample, get proportion of gambiae/coluzzii alleles # alleles that are different to both will be missed here - for sample in samples.treatment.unique(): + for sample in metadata.treatment.unique(): alleles = gn.take(subpops[sample], axis=1).flatten() # at each AIM, do we have gamb or colu alleles @@ -105,7 +105,7 @@ prop_colu = {} n_aims_per_sample = {} - for sample in samples.treatment.unique(): + for sample in metadata.treatment.unique(): prop_gambiae[sample] = np.nanmean(np.array(list(gambscores[sample].values()))) all_gamb[sample].append(np.nanmean(np.array(list(gambscores[sample].values())))) @@ -156,7 +156,7 @@ ########### The same but for arabiensis v gambiae/coluzzii # script should be modularised but no time atm, isnt necessary -if samples['species'].isin(['arabiensis']).any(): +if metadata['species'].isin(['arabiensis']).any(): aims = 
zarr.open(snakemake.input['aims_zarr_arab'], mode='r') @@ -172,8 +172,8 @@ path = f"results/variants/vcfs/annot.variants.{chrom}.vcf.gz" vcf, geno, acsubpops, pos, depth, snpeff, subpops, pops = readAndFilterVcf(path=path, chrom=chrom, - samples=samples, - numbers=numbers, + samples=metadata, + numbers=numbers, qualflt=qualflt, missingfltprop=missingprop) aimspos = aims[chrom]['POS'][:] @@ -222,7 +222,7 @@ # for each sample, get proportion of gambiae/arabiensis alleles # alleles that are different to both will be missed here - for sample in samples.treatment.unique(): + for sample in metadata.treatment.unique(): alleles = gn.take(subpops[sample], axis=1).flatten() # at each AIM, do we have gamb or arab alleles @@ -243,7 +243,7 @@ prop_arab = {} n_aims_per_sample = {} - for sample in samples.treatment.unique(): + for sample in metadata.treatment.unique(): prop_gambiae[sample] = np.nanmean(np.array(list(gambscores[sample].values()))) all_gamb[sample].append(np.nanmean(np.array(list(gambscores[sample].values())))) diff --git a/workflow/scripts/DeseqGeneDE.R b/workflow/scripts/DeseqGeneDE.R index 601dbb6..e0ac2cb 100644 --- a/workflow/scripts/DeseqGeneDE.R +++ b/workflow/scripts/DeseqGeneDE.R @@ -18,9 +18,10 @@ library(glue) library(RColorBrewer) library(EnhancedVolcano) library(tidyverse) +library(jsonlite) #read metadata and get contrasts -samples = fread(snakemake@input[['samples']], sep="\t") %>% as.data.frame() +metadata = fread(snakemake@input[['metadata']], sep="\t") %>% as.data.frame() gene_names = fread(snakemake@input[['gene_names']], sep="\t") %>% dplyr::rename("GeneID" = "Gene_stable_ID") @@ -56,13 +57,13 @@ vst_pca = function(counts, samples, colourvar, name="PCA", st="", comparison="") #### write pca of samples to pdf pca2=prcomp(t(vstcounts),center=TRUE) - pc = data.frame(pca2$x) %>% rownames_to_column("samples") + pc = data.frame(pca2$x) %>% rownames_to_column("sampleID") pc = left_join(pc, samples) pdf(glue("results/plots/{name}{st}{comparison}.pdf")) 
print(ggplot(data=pc,aes(x=PC1, y=PC2, colour=treatment)) + geom_point(size=6, alpha=0.8) + - geom_text_repel(aes(label=samples), colour="black") + + geom_text_repel(aes(label=sampleID), colour="black") + theme_light() + labs(title=glue("{name} {st} {comparison}"), x=glue("PC1 - Variance explained - {round(summary(pca2)$importance[2,][1], 3)}"), @@ -87,19 +88,19 @@ cat("\n", "------------- Kallisto - DESeq2 - RNASeq Differential expression ---- #### Read counts for each sample df = list() -for (sample in samples$samples){ +for (sample in metadata$sampleID){ df[[sample]]= fread(glue("results/quant/{sample}/abundance.tsv"), sep = "\t") } counts = data.frame('GeneID' = df[[1]]$target_id) #get read counts for each gene and fill table -for (sample in samples$samples){ +for (sample in metadata$sampleID){ reads = df[[sample]]$est_counts counts = cbind(counts, reads) } #rename columns -colnames(counts) = c("GeneID", samples$samples) +colnames(counts) = c("GeneID", metadata$sampleID) ## aggregate to gene level counts$GeneID = substr(counts$GeneID, 1, 10) #get first 10 letters, (remove -RA,-RB etc of transcripts) @@ -118,20 +119,20 @@ count_stats %>% fwrite(., "results/quant/count_statistics.tsv",sep="\t") print("Counting and plotting total reads per sample...") pdf("results/quant/total_reads_counted.pdf") -ggplot(count_stats, aes(x=Sample, y=total_counts, fill=samples$treatment)) + +ggplot(count_stats, aes(x=Sample, y=total_counts, fill=metadata$treatment)) + geom_bar(stat='identity') + theme_light() + ggtitle("Total reads counted (mapped to Ag transcriptome (PEST))") + theme(axis.text.x = element_text(angle=45), plot.title = element_text(hjust = 0.5)) null = dev.off() - + # round numbers to be whole, they are not due averaging across transcripts counts = counts %>% rownames_to_column('GeneID') %>% mutate_if(is.numeric, round) %>% column_to_rownames('GeneID') ############ Plots PCA with all data, and performs DESeq2 normalisation ######################## -res = 
vst_pca(counts, samples, colourvar = 'strain', name="PCA") +res = vst_pca(counts, metadata, colourvar = 'strain', name="PCA") vstcounts = res[[1]] dds = res[[2]] normcounts = res[[3]] @@ -154,16 +155,16 @@ pheatmap(correlations) garbage = dev.off() # add column if it doesnt exist -if(!("lab" %in% colnames(samples))) +if(!("lab" %in% colnames(metadata))) { - samples$lab = 'FALSE' + metadata$lab = 'FALSE' } # for each strain in the dataset, do a PCA plot # analysis of case v control with DESEq2 and make PCAs and volcano plots. -if ("strain" %in% colnames(samples)){ - for (sp in unique(samples$species)){ - df_samples = samples %>% filter(species == sp) +if ("strain" %in% colnames(metadata)){ + for (sp in unique(metadata$species)){ + df_samples = metadata %>% filter(species == sp) df_samples = df_samples %>% filter(lab == 'FALSE') #remove lab samples for (st in unique(df_samples$strain)){ ai_list = list() @@ -172,7 +173,7 @@ if ("strain" %in% colnames(samples)){ if (length(unique(ai_list[[st]]$cohort)) > 1){ cat(glue("\n Running PCA for all {st} samples \n")) # subset counts data to our strains of interest - subcounts = counts[colnames(counts) %in% ai_list[[st]]$samples] + subcounts = counts[colnames(counts) %in% ai_list[[st]]$sampleID] # perform PCA on the data at strain level vstcounts = vst_pca(subcounts, ai_list[[st]], 'treatment', "PCA_", st=st)[[1]] } @@ -187,12 +188,12 @@ ngenes_list = list() for (cont in contrasts){ control = str_split(cont, "_")[[1]][1] # get first of string, which is control case = str_split(cont, "_")[[1]][2] # get case - controls = which(samples$treatment %in% control) - cases = which(samples$treatment %in% case) + controls = which(metadata$treatment %in% control) + cases = which(metadata$treatment %in% case) ## Perform PCA for each comparison subcounts = counts[,c(controls, cases)] - subsamples = samples[c(controls, cases),] + subsamples = metadata[c(controls, cases),] # make treatment a factor with the 'susceptible' as reference 
subsamples$treatment = as.factor(as.character(subsamples$treatment)) subsamples$treatment = relevel(subsamples$treatment, control) @@ -221,12 +222,19 @@ for (cont in contrasts){ TRUE ~ Gene_name)) %>% select(Gene_name) %>% deframe() #get number of sig genes - ngenes_list[[cont]] = results %>% filter(padj < 0.05) %>% + res1 = results %>% filter(padj < 0.05) %>% + count("direction" = FC > 1) %>% + mutate("direction" = case_when(direction == FALSE ~ "Downregulated, padj = 0.05", + direction == TRUE ~ "Upregulated, padj = 0.05")) %>% + dplyr::rename(!!glue("{cont}_ngenes") := "n") + + res2 = results %>% filter(padj < 0.001) %>% count("direction" = FC > 1) %>% - mutate("direction" = case_when(direction == FALSE ~ "Downregulated", - direction == TRUE ~ "Upregulated")) %>% + mutate("direction" = case_when(direction == FALSE ~ "Downregulated, padj = 0.001", + direction == TRUE ~ "Upregulated, padj = 0.001")) %>% dplyr::rename(!!glue("{cont}_ngenes") := "n") + ngenes_list[[cont]] = bind_rows(res1, res2) # store names of comparisons for xlsx report sheets results_list[[cont]] = results @@ -257,4 +265,25 @@ for (i in 1:length(sheets)){ saveWorkbook(wb,file=snakemake@output[['xlsx']], overwrite = TRUE) +### Get kallisto statistics ### + +totalReads = c() +totalAligned = c() + +# open kallisto json info and store stats, save to file +for (sample in metadata$sampleID){ + run = fromJSON(glue("results/quant/{sample}/run_info.json"), flatten=TRUE) + + totalReads = c(totalReads, run$n_processed) + totalAligned = c(totalAligned, run$n_pseudoaligned) + +} + +df = data.frame("sample" = c(metadata$sampleID, "Total"), + "totalReads" = c(totalReads, sum(totalReads)), + "totalAligned" = c(totalAligned, sum(totalAligned))) %>% + mutate("percentage" = (totalAligned/totalReads)*100) %>% + fwrite("results/quant/KallistoQuantSummary.tsv", sep="\t", col.names = TRUE) + + sessionInfo() diff --git a/workflow/scripts/DifferentialSNPs.R b/workflow/scripts/DifferentialSNPs.R index 898eeeb..682a6ea 
100644 --- a/workflow/scripts/DifferentialSNPs.R +++ b/workflow/scripts/DifferentialSNPs.R @@ -17,7 +17,7 @@ library(rtracklayer) ######## parse inputs ############# chroms = snakemake@params[['chroms']] -metadata = fread(snakemake@input[['samples']]) +metadata = fread(snakemake@input[['metadata']]) contrasts = data.frame("contrast" = snakemake@params[['DEcontrasts']]) comparisons = contrasts %>% separate(contrast, into = c("control", "case"), sep = "_") mincounts = snakemake@params[['mincounts']] @@ -43,7 +43,7 @@ for (i in 1:nrow(comparisons)){ nrepscontrol = sum(metadata$treatment %in% control) nrepscase = sum(metadata$treatment %in% case) - samples = metadata[metadata$treatment %in% c(case, control)]$samples + samples = metadata[metadata$treatment %in% c(case, control)]$sampleID print(glue("Extracting allele tables for {case}, {control}")) sample_list = list() diff --git a/workflow/scripts/GeneCategoryContribution.R b/workflow/scripts/GeneCategoryContribution.R index 276e013..f56eea6 100644 --- a/workflow/scripts/GeneCategoryContribution.R +++ b/workflow/scripts/GeneCategoryContribution.R @@ -17,13 +17,13 @@ round_df = function(df, digits) { (df) } -samples = fread(snakemake@input[['samples']], sep="\t") %>% as.data.frame() +metadata = fread(snakemake@input[['metadata']], sep="\t") %>% as.data.frame() #samples = fread("config/samples.tsv", sep="\t") %>% as.data.frame() gene_names = fread("resources/gene_names.tsv", sep="\t")[1:14459,] %>% distinct() # Read in normalised counts and sum biological replicates counts = fread("results/quant/normcounts.tsv", sep="\t") %>% column_to_rownames("GeneID") -counts = data.frame(t(rowsum(t(counts), group = samples$treatment, na.rm = T))) +counts = data.frame(t(rowsum(t(counts), group = metadata$treatment, na.rm = T))) apply(counts, 2, var) diff --git a/workflow/scripts/GeneSetEnrichment.R b/workflow/scripts/GeneSetEnrichment.R index db7b892..1c19a38 100644 --- a/workflow/scripts/GeneSetEnrichment.R +++ 
b/workflow/scripts/GeneSetEnrichment.R @@ -41,7 +41,7 @@ runEnrich = function(rankedList, GeneSetList, outName){ ###### configuration - metadata and parameters ###### -samples = fread(snakemake@input[['samples']]) %>% as.data.frame() +metadata = fread(snakemake@input[['metadata']]) %>% as.data.frame() comparisons = data.frame("contrast" = snakemake@params[['DEcontrasts']]) gaffile = snakemake@input[['gaf']] variantCalling = snakemake@params[['VariantCalling']] diff --git a/workflow/scripts/GenerateFreebayesParams.R b/workflow/scripts/GenerateFreebayesParams.R index 86f94e6..753af5e 100644 --- a/workflow/scripts/GenerateFreebayesParams.R +++ b/workflow/scripts/GenerateFreebayesParams.R @@ -38,7 +38,7 @@ for (chrom in chroms){ metadata = fread(snakemake@params[['metadata']], sep="\t") -metadata$bams = paste0("results/alignments/", metadata$samples,".bam") +metadata$bams = paste0("results/alignments/", metadata$sampleID,".bam") metadata %>% select(bams, strain) %>% diff --git a/workflow/scripts/MutAlleleBalance.R b/workflow/scripts/MutAlleleBalance.R index f35d5e8..0b82622 100644 --- a/workflow/scripts/MutAlleleBalance.R +++ b/workflow/scripts/MutAlleleBalance.R @@ -9,8 +9,8 @@ library(glue) library(openxlsx) #### allele imbalance #### -metadata = fread(snakemake@input[['samples']], sep="\t") -samples = metadata$samples +metadata = fread(snakemake@input[['metadata']], sep="\t") +samples = metadata$sampleID # Read IR mutation data mutation_data = fread(snakemake@input[['mutations']], sep="\t") @@ -29,7 +29,7 @@ for (m in mutation_data$Name){ allele_list = list() # read data for each sample and subset to what we want for (sample in samples){ - allele_list[[sample]] = fread(glue("results/allele_balance/counts/{sample}_{m}_allele_counts.tsv"))[,c(1:8)] #for each sample read data, and store first 8 columns + allele_list[[sample]] = fread(glue("results/alleleBalance/counts/{sample}_{m}_allele_counts.tsv"))[,c(1:8)] #for each sample read data, and store first 8 columns 
allele_list[[sample]]$sample = sample #add sample column allele_list[[sample]]$treatment = metadata$treatment[samples == sample] #add treatment column allele_list[[sample]]$mutation = m @@ -49,8 +49,8 @@ for (m in mutation_data$Name){ mean_alleles = alleles %>% group_by(chrom, pos,ref, mutation, treatment) %>% summarise_at(.vars = c("cov","A","C","G","T", propstring) , .funs = c(mean="mean")) #write to file, reorder mean_kdr_alleles - fwrite(alleles, glue("results/allele_balance/csvs/{m}_allele_balance.csv")) - fwrite(mean_alleles, glue("results/allele_balance/csvs/mean_{m}_allele_balance.csv")) + fwrite(alleles, glue("results/alleleBalance/csvs/{m}_alleleBalance.csv")) + fwrite(mean_alleles, glue("results/alleleBalance/csvs/mean_{m}_alleleBalance.csv")) all_list[[m]] = alleles mean_list[[m]] = mean_alleles @@ -66,7 +66,7 @@ for (i in 1:length(sheets)){ writeData(wb, sheets[i], results_list[[i]], rowNames = TRUE, colNames = TRUE) } #### save workbook to disk once all worksheets and data have been added #### -saveWorkbook(wb,file=snakemake@output[['allele_balance']], overwrite = TRUE) +saveWorkbook(wb,file=snakemake@output[['alleleBalance']], overwrite = TRUE) ### mean balance #### @@ -80,6 +80,6 @@ for (i in 1:length(sheets)){ writeData(wb, sheets[i], results_list[[i]], rowNames = TRUE, colNames = TRUE) } #### save workbook to disk once all worksheets and data have been added #### -saveWorkbook(wb,file=snakemake@output[['mean_allele_balance']], overwrite = TRUE) +saveWorkbook(wb,file=snakemake@output[['mean_alleleBalance']], overwrite = TRUE) sessionInfo() diff --git a/workflow/scripts/PerGeneFstPBS.py b/workflow/scripts/PerGeneFstPBS.py index 9f786bb..f19eb2f 100644 --- a/workflow/scripts/PerGeneFstPBS.py +++ b/workflow/scripts/PerGeneFstPBS.py @@ -12,8 +12,8 @@ warnings.filterwarnings('ignore') # suppress numpy runtime warnings, this is a bit dangerous, should be removed for release or resolve source of warnings # snakemake inputs and params -metadata_path = 
snakemake.input['samples'] -samples = pd.read_csv(metadata_path, sep="\s+") +metadata_path = snakemake.input['metadata'] +metadata = pd.read_csv(metadata_path, sep="\s+") gffpath = snakemake.input['gff'] pbs = snakemake.params['pbs'] pbscomps = snakemake.params['pbscomps'] @@ -51,7 +51,7 @@ # function to read in vcfs and associated SNP data vcf, geno, acsubpops, pos, depth, snpeff, subpops, pops = readAndFilterVcf(path=path, chrom=chrom, - samples=samples, + samples=metadata, numbers=numbers, ploidy=ploidy, qualflt=30, diff --git a/workflow/scripts/SleuthIsoformsDE.R b/workflow/scripts/SleuthIsoformsDE.R index 542f1b8..842c08c 100644 --- a/workflow/scripts/SleuthIsoformsDE.R +++ b/workflow/scripts/SleuthIsoformsDE.R @@ -21,12 +21,12 @@ library(EnhancedVolcano) print("------------- Kallisto - Sleuth - RNASeq isoform Differential expression ---------") #### read data #### -samples = fread(snakemake@input[['samples']], sep="\t") %>% +metadata = fread(snakemake@input[['metadata']], sep="\t") %>% as.data.frame() %>% - dplyr::rename('sample' = "samples") + dplyr::rename('sample' = "sampleID") #add path column for sleuth object -samples$path = paste0("results/quant/",samples$sample) +metadata$path = paste0("results/quant/", metadata$sample) #read metadata and get contrasts gene_names = fread(snakemake@input[['gene_names']], sep="\t") %>% @@ -43,11 +43,11 @@ names_list = list() for (cont in contrasts){ control = str_split(cont, "_")[[1]][1] #get first of string, which is control (or susceptible) case = str_split(cont, "_")[[1]][2] #get second of string, which is case (or resistant) - controls = which(samples$treatment %in% control) #get indices of case/control - cases = which(samples$treatment %in% case) + controls = which(metadata$treatment %in% control) #get indices of case/control + cases = which(metadata$treatment %in% case) ##subset to subcounts of our contrast - subsamples = samples[c(controls, cases),] + subsamples = metadata[c(controls, cases),] #make treatment a 
factor with the 'susceptible' as reference subsamples$treatment = as.factor(as.character(subsamples$treatment)) diff --git a/workflow/scripts/StatisticsAndPCA.py b/workflow/scripts/StatisticsAndPCA.py index ac8c8ae..c53f113 100644 --- a/workflow/scripts/StatisticsAndPCA.py +++ b/workflow/scripts/StatisticsAndPCA.py @@ -12,8 +12,8 @@ # Read in parameters from snakemake dataset = snakemake.params['dataset'] -samples = pd.read_csv(snakemake.input['samples'], sep="\t") -samples = samples.sort_values(by='species') +metadata = pd.read_csv(snakemake.input['metadata'], sep="\t") +metadata = metadata.sort_values(by='species') chroms = snakemake.params['chroms'] ploidy = snakemake.params['ploidy'] numbers = get_numbers_dict(ploidy) @@ -48,7 +48,7 @@ def isnotmissing(gn): path = f"results/variants/vcfs/annot.variants.{chrom}.vcf.gz" vcf, geno, acsubpops, pos, depth, snpeff, subpops, populations = readAndFilterVcf(path=path, chrom=chrom, - samples=samples, + samples=metadata, numbers=numbers, ploidy=ploidy, qualflt=qualflt, @@ -67,7 +67,7 @@ def isnotmissing(gn): snps_per_gene = {} snps_per_sample = {} - for sample in samples['samples']: + for sample in metadata['sampleID']: bool_ = sample == populations gn = geno.compress(bool_, axis=1) @@ -81,7 +81,7 @@ def isnotmissing(gn): # locate_ranges() to get a boolean, needed as locate_range() will throw errors if no snps found in gene gene_bool = pos.locate_ranges([gene['start']], [gene['end']], strict=False) - for sample in samples['samples']: + for sample in metadata['sampleID']: presentSNPs = missing_array[sample].compress(gene_bool, axis=0) snps_per_sample[sample] = presentSNPs.sum() @@ -106,7 +106,7 @@ def isnotmissing(gn): # Run PCA function defined in tools.py print(f"Performing PCA on {dataset} chromosome {chrom}") - pca(geno, chrom, ploidy, dataset, populations, samples, pop_colours, prune=True, scaler=None) + pca(geno, chrom, ploidy, dataset, populations, metadata, pop_colours, prune=True, scaler=None) ######## Plot 
variant density over genome (defined in tools.py) ######## plot_density(pos, window_size=100000, title=f"Variant Density chromosome {chrom}", path=f"results/variants/plots/{dataset}_SNPdensity_{chrom}.png") @@ -117,7 +117,7 @@ def isnotmissing(gn): coefdict= {} allcoef = defaultdict(list) - for pop in samples.treatment.unique(): + for pop in metadata.treatment.unique(): # Sequence diversity seqdivdict[pop] = allel.sequence_diversity(pos, acsubpops[pop]) diff --git a/workflow/scripts/WindowedFstPBS.py b/workflow/scripts/WindowedFstPBS.py index 855447f..74d749c 100644 --- a/workflow/scripts/WindowedFstPBS.py +++ b/workflow/scripts/WindowedFstPBS.py @@ -9,8 +9,8 @@ from tools import * -samples = pd.read_csv(snakemake.input['samples'], sep="\t") -samples = samples.sort_values(by='species') +metadata = pd.read_csv(snakemake.input['metadata'], sep="\t") +metadata = metadata.sort_values(by='species') chroms = snakemake.params['chroms'] ploidy = snakemake.params['ploidy'] numbers = get_numbers_dict(ploidy) @@ -35,7 +35,7 @@ path = f"results/variants/vcfs/annot.variants.{chrom}.vcf.gz" vcf, geno, acsubpops, pos, depth, snpeff, subpops, populations = readAndFilterVcf(path=path, chrom=chrom, - samples=samples, + samples=metadata, numbers=numbers, ploidy=ploidy, qualflt=qualflt, diff --git a/workflow/scripts/checkInputs.py b/workflow/scripts/checkInputs.py index ab440e6..99d5dc6 100644 --- a/workflow/scripts/checkInputs.py +++ b/workflow/scripts/checkInputs.py @@ -47,10 +47,10 @@ # Check that samples in fastq.tsv match those in metadata.tsv if autofastq is False: fastq = pd.read_csv(snakemake.params['table'], sep="\t") - assert np.isin(fastq['samples'], metadata['samples']).all(), f"all samples specified in fastq table do not match those in samples.tsv, please check your fastq.tsv file" + assert np.isin(fastq['sampleID'], metadata['sampleID']).all(), f"all samples specified in fastq table do not match those in samples.tsv, please check your fastq.tsv file" else: # Check to see 
if fastq files match the metadata - for sample in metadata['samples']: + for sample in metadata['sampleID']: for n in [1,2]: fqpath = f"resources/reads/{sample}_{n}.fastq.gz" assert os.path.isfile(fqpath), f"all sample names in 'samples.tsv' do not match a .fastq.gz file in the {snakemake.workflow.basedir}/resources/reads/ directory, please rename or consider using the fastq table option" diff --git a/workflow/scripts/tools.py b/workflow/scripts/tools.py index eb8ab79..b163575 100644 --- a/workflow/scripts/tools.py +++ b/workflow/scripts/tools.py @@ -101,7 +101,7 @@ def readAndFilterVcf(path, chrom, samples, numbers, ploidy, qualflt=30, missingf ind = defaultdict(list) for s,names in enumerate(samplenames): - idx = np.where(np.isin(samples['samples'],names))[0][0] + idx = np.where(np.isin(samples['sampleID'],names))[0][0] t = samples.treatment[idx] ind[t].append(s) subpops = dict(ind) @@ -212,7 +212,7 @@ def plot_pca_coords(coords, model, pc1, pc2, ax, sample_population, samples, pop x = coords[:, pc1] y = coords[:, pc2] for pop in sample_population: - treatment = samples[samples['samples'] == pop]['treatment'].values[0] + treatment = samples[samples['sampleID'] == pop]['treatment'].values[0] flt = (sample_population == pop) ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=pop_colours[treatment], label=treatment, markersize=14, mec='k', mew=.5) @@ -225,7 +225,7 @@ def plot_pca_coords(coords, model, pc1, pc2, ax, sample_population, samples, pop def fig_pca(coords, model, title, path, samples, pop_colours,sample_population=None): if sample_population is None: - sample_population = samples.samples.values + sample_population = samples['sampleID'].values # plot coords for PCs 1 vs 2, 3 vs 4 fig = plt.figure(figsize=(10, 5)) ax = fig.add_subplot(1, 2, 1)