diff --git a/Dockerfile.base b/Dockerfile.base index f635260..bf0b3db 100644 --- a/Dockerfile.base +++ b/Dockerfile.base @@ -1,39 +1,45 @@ # This Dockerfile builds the quatradis base image with all main dependencies which are not # dependent on any other source code. Using this image as the base provides a big speed up # when building and testing quatradis. -FROM debian:bullseye-slim +FROM debian:bookworm-slim # Install the system dependencies -RUN apt-get update -qq && \ - apt-get install -y sudo bzip2 curl default-jre gcc gzip locales make procps unzip wget && \ - apt-get install -y dirmngr gnupg apt-transport-https ca-certificates software-properties-common && \ - apt-get install -y bwa minimap2 smalt +RUN apt-get update -qq + +# Install build tools +RUN apt-get install -y sudo bzip2 curl default-jre gcc gzip locales make procps unzip wget dirmngr gnupg apt-transport-https ca-certificates software-properties-common + +# Install aligners +RUN apt-get install -y bwa minimap2 smalt # Setup R RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-key '95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7' && \ - add-apt-repository 'deb http://cloud.r-project.org/bin/linux/debian bullseye-cran40/' && \ + add-apt-repository 'deb http://cloud.r-project.org/bin/linux/debian bookworm-cran40/' && \ apt-get update -qq && \ apt-get install -y r-base # Install R dependencies RUN Rscript -e "install.packages('BiocManager')" -e "BiocManager::install()" -e "BiocManager::install(c('edgeR','getopt', 'MASS'))" +# Install headers for build samtools and htslib +RUN apt-get install -y libcurl4-openssl-dev libdeflate-dev + # Install HTSlib to get bgzip installed as an executable -RUN wget https://github.com/samtools/htslib/releases/download/1.3.2/htslib-1.3.2.tar.bz2 -O htslib.tar.bz2 && \ +RUN wget https://github.com/samtools/htslib/releases/download/1.20/htslib-1.20.tar.bz2 -O htslib.tar.bz2 && \ tar -xjvf htslib.tar.bz2 && \ - cd htslib-1.3.2 && \ + cd htslib-1.20 && \ make && \ make install && \ - rm /htslib.tar.bz2 && rm -r /htslib-1.3.2 + rm /htslib.tar.bz2 && rm -r /htslib-1.20 # Install samtools -RUN wget https://github.com/samtools/samtools/releases/download/1.17/samtools-1.17.tar.bz2 -O samtools.tar.bz2 && \ +RUN wget https://github.com/samtools/samtools/releases/download/1.20/samtools-1.20.tar.bz2 -O samtools.tar.bz2 && \ tar -xjvf samtools.tar.bz2 && \ - cd samtools-1.17 && \ + cd samtools-1.20 && \ ./configure && \ make && \ make install && \ - rm /samtools.tar.bz2 && rm -r /samtools-1.17 + rm /samtools.tar.bz2 && rm -r /samtools-1.20 # Install snakemake @@ -51,7 +57,7 @@ RUN /bin/bash -c "curl -L https://repo.anaconda.com/miniconda/Miniconda3-latest- bash miniconda.sh -b -p /opt/conda && \ rm miniconda.sh" RUN /bin/bash -c "conda install -y -c conda-forge mamba && \ - mamba create -q -y -c conda-forge -c bioconda -n quatradis python=3.10 snakemake snakemake-minimal && \ + mamba create -q -y -c conda-forge -c bioconda -n quatradis python=3.11 snakemake snakemake-minimal && \ source activate quatradis && \ mamba install -q -y -c conda-forge -c bioconda snakemake snakemake-minimal && \ mamba install -q -y -c conda-forge singularity && \ @@ -60,4 +66,4 @@ RUN /bin/bash -c "conda install -y -c conda-forge mamba && \ RUN echo "source activate quatradis" > ~/.bashrc ENV PATH /opt/conda/envs/quatradis/bin:/home/quatradis/.local/bin:${PATH} -RUN pip install cython pysam pytest-cov semantic-version matplotlib scipy \ No newline at end of file +RUN pip install cython pysam pytest-cov semantic-version matplotlib scipy diff --git a/README.md b/README.md index 619dbb1..3fd588b 100644 --- a/README.md +++ b/README.md @@ -2,53 +2,55 @@ A set of tools to analyse the output from Transposon Directed Insertion Sequencing (TraDIS) experiments. - -[![Build Status](https://img.shields.io/travis/com/quadram-institute-bioscience/QuaTradis/master)](https://app.travis-ci.com/github/quadram-institute-bioscience/QuaTradis) +[![Build Status](https://img.shields.io/travis/com/quadram-institute-bioscience/QuaTradis/master)](https://app.travis-ci.com/github/quadram-institute-bioscience/QuaTradis) [![Docker Pulls](https://img.shields.io/docker/pulls/sbastkowski/quatradis.svg)](https://hub.docker.com/r/sbastkowski/quatradis) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/quatradis/README.html) [![Anaconda-Server Badge](https://anaconda.org/bioconda/quatradis/badges/downloads.svg)](https://anaconda.org/bioconda/quatradis) -[![License: GPL v3](https://img.shields.io/badge/License-GPL%20v3-brightgreen.svg)](https://github.com/quadram-institute-bioscience/QuaTradis/blob/master/LICENSE) [![status](https://img.shields.io/badge/Bioinformatics-10.1093-brightgreen.svg)](https://doi.org/10.1093/bioinformatics/btw022) +[![License: GPL v3](https://img.shields.io/badge/License-GPL%20v3-brightgreen.svg)](https://github.com/quadram-institute-bioscience/QuaTradis/blob/master/LICENSE) [![status](https://img.shields.io/badge/Bioinformatics-10.1093-brightgreen.svg)](https://doi.org/10.1093/bioinformatics/btw022) + + ## Contents - * [Introduction](#introduction) - * [Installation](#installation) - * [From source](#from-source) - * [From bioconda](#from-bioconda) - * [From docker](#from-docker) - * [Usage](#usage) - * [Scripts](#scripts) - * [Functions](#functions) - * [Docker](#docker) - * [Contributing](#contributing) - * [Building QuaTradis](#building-quatradis-from-source) - * [Running unit tests](#running-unit-tests) - * [Versioning](#versioning) - * [CI](#ci) - * [License](#license) - * [Feedback/Issues](#feedbackissues) - * [References](#references) - * [Citation](#citation) - -## Introduction - -The QuaTraDIS pipeline provides software utilities for the processing, mapping, and analysis of transposon directed insertion -sequencing data. The pipeline was designed with the data from the TraDIS sequencing protocol in mind, but should work + +- [Introduction](#introduction) +- [Installation](#installation) + - [From source](#from-source) + - [From bioconda](#from-bioconda) + - [From docker](#from-docker) +- [Usage](#usage) + - [Scripts](#scripts) + - [Functions](#functions) + - [Docker](#docker) +- [Contributing](#contributing) + - [Building QuaTradis](#building-quatradis-from-source) + - [Running unit tests](#running-unit-tests) + - [Versioning](#versioning) + - [CI](#ci) +- [License](#license) +- [Feedback/Issues](#feedbackissues) +- [References](#references) +- [Citation](#citation) + +## Introduction + +The QuaTraDIS pipeline provides software utilities for the processing, mapping, and analysis of transposon directed insertion +sequencing data. The pipeline was designed with the data from the TraDIS sequencing protocol in mind, but should work with a variety of transposon directed insertion sequencing protocols as long as they produce data in the expected format. -The primary tools and pipelines in QuaTraDIS consume reads produced multiple TraDIS sequencing experiments to produce -transposon insertion site plot files. Typically this is then followed up with rapid large-scale comparative analysis of -the plots whilst also predicting the impact of these insertion sites on nearby genes. However, a variety of tools and functions -are available to further process, manipulate or analyse this data. For a full list of functions see the [Usage](#usage) +The primary tools and pipelines in QuaTraDIS consume reads produced multiple TraDIS sequencing experiments to produce +transposon insertion site plot files. Typically this is then followed up with rapid large-scale comparative analysis of +the plots whilst also predicting the impact of these insertion sites on nearby genes. However, a variety of tools and functions +are available to further process, manipulate or analyse this data. For a full list of functions see the [Usage](#usage) section below. -For more information on the TraDIS method, see http://bioinformatics.oxfordjournals.org/content/32/7/1109 and +For more information on the TraDIS method, see http://bioinformatics.oxfordjournals.org/content/32/7/1109 and http://genome.cshlp.org/content/19/12/2308. ## Installation -QuaTraDIS is designed to run on debian linux distros. It may also run on other linux distros or MacOS but this has +QuaTraDIS is designed to run on debian linux distros. It may also run on other linux distros or MacOS but this has not been tested. There are multiple paths to installing QuaTraDIS depending on your use case. @@ -64,30 +66,30 @@ Each installation method is described below. [![Github last commit](https://img.shields.io/github/last-commit/quadram-institute-bioscience/quatradis)](https://github.com/quadram-institute-bioscience/QuaTradis) [![Github last release](https://img.shields.io/github/release-date/quadram-institute-bioscience/quatradis)](https://github.com/quadram-institute-bioscience/QuaTradis) [![Github downloads](https://img.shields.io/github/downloads/quadram-institute-bioscience/quatradis/total)](https://github.com/quadram-institute-bioscience/QuaTradis) -[![Github latest version](https://img.shields.io/github/tag/quadram-institute-bioscience/quatradis?sort=semver)](https://github.com/quadram-institute-bioscience/QuaTradis) +[![Github latest version](https://img.shields.io/github/tag/quadram-institute-bioscience/quatradis?sort=semver)](https://github.com/quadram-institute-bioscience/QuaTradis) -QuaTradis has the following dependencies so install these first. We provide some guidelines on how to install on a debian -based system here, but if you are using another OS the actual commands may vary. Also you may wish to install some of -these packages through alternate methods, e.g. from source. In which case you'll need to lookup and follow the +QuaTradis has the following dependencies so install these first. We provide some guidelines on how to install on a debian +based system here, but if you are using another OS the actual commands may vary. Also you may wish to install some of +these packages through alternate methods, e.g. from source. In which case you'll need to lookup and follow the instructions for each package instead of running the commands below. -* System packages: - * git - * bwa - * smalt - * make - * gcc - * python3 - * python3-pip - * r >= 3.6 +- System packages: + - git + - bwa + - smalt + - make + - gcc + - python3 + - python3-pip + - r >= 3.6 ```bash sudo apt-get update -qq && \ sudo apt-get install -y bzip2 default-jre gcc locales make python3-pip r-base unzip wget && \ sudo apt-get install -y bwa minimap2 smalt ``` - -With the core packages installed we also need some python packages. These can be installed using pip as described here: + +With the core packages installed we also need some python packages. These can be installed using pip as described here: ```bash pip3 install Bio cython pysam numpy pytest-cov semantic-version snakeviz @@ -99,7 +101,6 @@ Finally, we also need a few R packages: sudo Rscript -e "install.packages('BiocManager')" -e "BiocManager::install()" -e "BiocManager::install(c('edgeR','getopt', 'MASS'))" ``` - Next move to a location to which you want to put the source code then clone the github repo and create a development build: ```buildoutcfg @@ -107,12 +108,12 @@ git clone https://github.com/quadram-institute-bioscience/QuaTradis.git cd QuaTradis make dev ``` -QuaTradis scripts should now be available. +QuaTradis scripts should now be available. #### Troubleshooting installations on old distributions -Older distributions of linux such as ubuntu 18 come packaged with R 3.4. In this case R packages such as MASS and EdgeR will not install correctly with the commands mentioned above. In this case we would recommend updating to R 3.6+. In this snippet we show how to update R to 4. However be cautious as this snippet will uninstall the current version of R, so be sure this is what you want to do, or figure out another way of running the versions side by side. +Older distributions of linux such as ubuntu 18 come packaged with R 3.4. In this case R packages such as MASS and EdgeR will not install correctly with the commands mentioned above. In this case we would recommend updating to R 3.6+. In this snippet we show how to update R to 4. However be cautious as this snippet will uninstall the current version of R, so be sure this is what you want to do, or figure out another way of running the versions side by side. ```bash sudo apt install libssl-dev libcurl4-openssl-dev libxml2-dev @@ -123,7 +124,6 @@ sudo apt update sudo apt install r-base ``` - ### From Bioconda [![Anaconda-Server Badge](https://anaconda.org/bioconda/quatradis/badges/version.svg)](https://anaconda.org/bioconda/quatradis) @@ -131,10 +131,10 @@ sudo apt install r-base [![Anaconda-Server Badge](https://anaconda.org/bioconda/quatradis/badges/platforms.svg)](https://anaconda.org/bioconda/quatradis) [![Anaconda-Server Badge](https://anaconda.org/bioconda/quatradis/badges/downloads.svg)](https://anaconda.org/bioconda/quatradis) -Before installing QuaTraDIS via conda, first [install conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/linux.html) -and enable the bioconda channel as described [here](https://bioconda.github.io/). After which if you +Before installing QuaTraDIS via conda, first [install conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/linux.html) +and enable the bioconda channel as described [here](https://bioconda.github.io/). After which if you don't have mamba installed then we recommend you do this to make the installation process much faster. -We also recommend using a virtual environment with python pinned to 3.10. This should help to avoid version mixups. +We also recommend using a virtual environment with python pinned to 3.11. This should help to avoid version mixups. [conda virtual environment](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). Rough steps are described here (adapt as needed for your system): @@ -143,7 +143,7 @@ Rough steps are described here (adapt as needed for your system): # Create a virtual environment for quatradis and activate it. You'll need to reactivate it in every session # in which you wish to use quatradis. -conda create -n "quatradis" python=3.10-9 +conda create -n "quatradis" python=3.11 conda activate quatradis # Install mamba if you haven't already @@ -156,7 +156,6 @@ mamba install -c conda-forge -c bioconda bioconductor-edger mamba install -c conda-forge -c bioconda quatradis ``` - Remember that if using virtual environments you will need to activate it everytime prior to using QuaTraDIS: ```bash @@ -170,26 +169,24 @@ tradis conda deactivate ``` - ### From Docker [![Docker Pulls](https://img.shields.io/docker/pulls/sbastkowski/quatradis.svg)](https://hub.docker.com/r/sbastkowski/quatradis) [![Docker Version](https://img.shields.io/docker/v/sbastkowski/quatradis?sort=semver)](https://hub.docker.com/r/sbastkowski/quatradis) -[![Docker Image Size](https://img.shields.io/docker/image-size/sbastkowski/quatradis?sort=semver)](https://hub.docker.com/r/sbastkowski/quatradis) +[![Docker Image Size](https://img.shields.io/docker/image-size/sbastkowski/quatradis?sort=semver)](https://hub.docker.com/r/sbastkowski/quatradis) QuaTradis can be run in a Docker container. First install Docker, then pull the QuaTraDIS image from dockerhub: docker pull sbastkowski/quatradis -if you wish to use a specific version, check what tags are available in [dockerhub](https://hub.docker.com/r/sbastkowski/quatradis) +if you wish to use a specific version, check what tags are available in [dockerhub](https://hub.docker.com/r/sbastkowski/quatradis) and add `:` to the previous command (replacing with whatever version you wish to use). - ## Usage QuaTraDIS creates a single executable tool called `tradis`, all functions are available within this tool and can therefore be -used in many different ways. However, a typical use case maybe be to first convert your TraDIS sequenced reads into -transposon insertion site plot files. This can be done with a command line such as this below, which has each fastq +used in many different ways. However, a typical use case maybe be to first convert your TraDIS sequenced reads into +transposon insertion site plot files. This can be done with a command line such as this below, which has each fastq file (optionally gzipped) listed in `my_fastq_list.txt`: ```bash @@ -198,16 +195,16 @@ tradis pipeline create_plots --output_dir plots_output my_fastq_list.txt my_refe The output will be a series of directories for each fastq, containing the plot file for each sequence in the given reference, along with the aligned reads, and some statistics regarding transposon insertion sites both for each sample, and across -all samples. The workflow for `tradis pipeline create_plots` is shown in this diagram: +all samples. The workflow for `tradis pipeline create_plots` is shown in this diagram: ![create_plots pipeline](docs/pipeline_create_plots.png) -A set of plot files can then be analysed in relation to each other using the `tradis pipeline compare` command. For example: +A set of plot files can then be analysed in relation to each other using the `tradis pipeline compare` command. For example: ```bash tradis pipeline compare --output_dir analysis_output --annotations my_annotations.embl \ --condition_files cond1.fq.gz cond2.fq.fz \ - --control_files cont1.fq.gz cont2.fq.gz + --control_files cont1.fq.gz cont2.fq.gz ``` The workflow for `tradis pipeline compare` is shown in this diagram: @@ -216,54 +213,51 @@ The workflow for `tradis pipeline compare` is shown in this diagram: The `tradis` tool has multiple functions, as shown below, and can also be found using the tools help message `tradis --help`: -* `pipeline` - * `create_plots` - Creates transposon insertion site plot files from multiple fastq files. This uses a snakemake pipeline and is capable of distributing work over a cluster, running in parallel where possible. - * `compare` - Comparative analysis of multiple plot files across controls, conditions and replicates. Predicts impact of inserts on nearby genes. -* `plot` - * `create` - * `from_fastq` - From a fastq file and reference, maps the reads to the reference and produces the transposon insertion site plot files. - * `from_alignments` - From pre-mapped reads, creates the transposon insertion site plot file - * `combine` - Combines several plots into 1. - * `count` - Takes genome annotation in embl format along with plot files produced by "tradis pipeline" and generates tab-delimited files containing gene-wise annotations of insert sites and read counts. - * `normalise` - Scales the insertion site counts of plot files based on the sample with the maximum number of reads. -* `comparison` - * `logfc_plot` - Run logfc analysis for a specific plot file direction: forward, reverse, combined, original over all experiments - * `presence_absence` - Take in gene report files and produce a heatmap - * `figures` - Create graphics of controls vs conditions - * `gene_report` - Generate gene report from logfc_plot analysis data - * `split` - Splits a plot file into forward only, reverse only and combined plot files. - * `essentiality` - Determines how essential each gene is based on the transposon insertion sites - * `essentiality_analysis` - Compares essentiality across condition and control samples - * `prepare_embl` - Prepares an embl annotations file for comparative analysis from a plotfile. If an existing embl file is supplied then genes in that file are expanded based on data from the plot file -* `utils` - * `index` - Indexes a reference using the specified alignment tool - * `extract_names` - Creates a file containing the sequence names in a fasta file - * `annotation` - Take in an EMBL file and add flanking 3 prime and 5 prime annotation. - * `artemis_project` - Create an artemis project file. Sometimes you want to view the insert site plots in Artemis. It can be quite a manual task to open up different replicates and combinations. This script will generate a project.properties file from a spreadsheet which gets automatically loaded by Artemis (from the current working directory). This then makes it quicker to view multiple different insert site plots. - * `tags` - * `add` - Generates a BAM file with tags added to read strings. - * `check`- Prints 1 if tags are present in alignment file, prints 0 if not. - * `filter` - Create a fastq file containing reads that match the supplied tag. - * `remove` - Creates a fastq file containing reads with the supplied tag removed from the sequences. - -A help menu for each script can be accessed by running the script by adding with "--help". While perusing these tools, +- `pipeline` + - `create_plots` - Creates transposon insertion site plot files from multiple fastq files. This uses a snakemake pipeline and is capable of distributing work over a cluster, running in parallel where possible. + - `compare` - Comparative analysis of multiple plot files across controls, conditions and replicates. Predicts impact of inserts on nearby genes. +- `plot` + - `create` + - `from_fastq` - From a fastq file and reference, maps the reads to the reference and produces the transposon insertion site plot files. + - `from_alignments` - From pre-mapped reads, creates the transposon insertion site plot file + - `combine` - Combines several plots into 1. + - `count` - Takes genome annotation in embl format along with plot files produced by "tradis pipeline" and generates tab-delimited files containing gene-wise annotations of insert sites and read counts. + - `normalise` - Scales the insertion site counts of plot files based on the sample with the maximum number of reads. +- `comparison` + - `logfc_plot` - Run logfc analysis for a specific plot file direction: forward, reverse, combined, original over all experiments + - `presence_absence` - Take in gene report files and produce a heatmap + - `figures` - Create graphics of controls vs conditions + - `gene_report` - Generate gene report from logfc_plot analysis data + - `split` - Splits a plot file into forward only, reverse only and combined plot files. + - `essentiality` - Determines how essential each gene is based on the transposon insertion sites + - `essentiality_analysis` - Compares essentiality across condition and control samples + - `prepare_embl` - Prepares an embl annotations file for comparative analysis from a plotfile. If an existing embl file is supplied then genes in that file are expanded based on data from the plot file +- `utils` + - `index` - Indexes a reference using the specified alignment tool + - `extract_names` - Creates a file containing the sequence names in a fasta file + - `annotation` - Take in an EMBL file and add flanking 3 prime and 5 prime annotation. + - `artemis_project` - Create an artemis project file. Sometimes you want to view the insert site plots in Artemis. It can be quite a manual task to open up different replicates and combinations. This script will generate a project.properties file from a spreadsheet which gets automatically loaded by Artemis (from the current working directory). This then makes it quicker to view multiple different insert site plots. + - `tags` + - `add` - Generates a BAM file with tags added to read strings. + - `check`- Prints 1 if tags are present in alignment file, prints 0 if not. + - `filter` - Create a fastq file containing reads that match the supplied tag. + - `remove` - Creates a fastq file containing reads with the supplied tag removed from the sequences. + +A help menu for each script can be accessed by running the script by adding with "--help". While perusing these tools, note that default parameters are set for comparative experiments, and will need to be modified for gene essentiality studies. - - ### Functions -The functions behind the command line tools described above are available for use in the `quatradis` python package. Details of using -these functions is beyond the scope of this document and left to individual developers to figure out. Please contact the core +The functions behind the command line tools described above are available for use in the `quatradis` python package. Details of using +these functions is beyond the scope of this document and left to individual developers to figure out. Please contact the core QuaTraDIS developers in case you wish to collaborate on new features or just wish to find out more. - ### Docker -The QuaTraDIS docker image is recommended for production use. This is because docker containers allows +The QuaTraDIS docker image is recommended for production use. This is because docker containers allows for better portability and repeatability when compared to a direct installation from source, or even from python packages. -For example, if running QuaTraDIS over multiple nodes in a cluster, in different environments, or if you want to repeat -an experiment some time in the future, docker ensures the complete software configuration is exactly the same each time. +For example, if running QuaTraDIS over multiple nodes in a cluster, in different environments, or if you want to repeat +an experiment some time in the future, docker ensures the complete software configuration is exactly the same each time. An example docker command might look like this: @@ -271,49 +265,49 @@ An example docker command might look like this: docker run --rm -it -u $(id -u ${USER}):$(id -g ${USER}) -v /home/ubuntu/data:/data sbastkowski/quatradis tradis ``` -To explain what is happening here you should familiarise yourself with [docker](https://docs.docker.com/engine/reference/commandline/run/) -if you haven't already. But a quick summary is: +To explain what is happening here you should familiarise yourself with [docker](https://docs.docker.com/engine/reference/commandline/run/) +if you haven't already. But a quick summary is: + - `--rm` Remove the container after it has completed. - `-it` Run in an interactive terminal session -- `-u $(id -u ${USER}):$(id -g ${USER})` Run container as the current host user. This stops all generated files being owned by root. -- `-v /home/ubuntu/data:/data` Mount the local directory `/home/ubuntu/data` to `/data` in the container. Change the first half of the parameter as necessary to mount in your data files. +- `-u $(id -u ${USER}):$(id -g ${USER})` Run container as the current host user. This stops all generated files being owned by root. +- `-v /home/ubuntu/data:/data` Mount the local directory `/home/ubuntu/data` to `/data` in the container. Change the first half of the parameter as necessary to mount in your data files. - `sbastkowski/quatradis` The QuaTraDIS docker image path from dockerhub -Replace `` with whatever command line arguments whatever arguments make sense for your case. Keep in mind +Replace `` with whatever command line arguments whatever arguments make sense for your case. Keep in mind that any file paths need to reference the paths inside the container, not those on the host, e.g. /data/ Once you have something that works for you, then making a simple wrapper script can help simplify future usage. ### Running pipelines on a cluster -Because the pipeline functions use snakemake it is possible to execute each stage as a separate scheduled job. Furthermore, -it is possible to run each job in its own docker container if so configured. This enables us to scale by distributing -work across a cluster, whilst still guaranteeing reproducibility. This behaviour can be achieved by providing a custom -snakemake profile to the `tradis pipeline` function. Details of how to do this are left to the user for now. - +Because the pipeline functions use snakemake it is possible to execute each stage as a separate scheduled job. Furthermore, +it is possible to run each job in its own docker container if so configured. This enables us to scale by distributing +work across a cluster, whilst still guaranteeing reproducibility. This behaviour can be achieved by providing a custom +snakemake profile to the `tradis pipeline` function. Details of how to do this are left to the user for now. ## Contributing QuaTraDIS uses git for version control, with its public repo sited on [github](https://github.com/quadram-institute-bioscience/QuaTradis/tree/master). -Contributions go through a standard feature branch workflow, so for most users this means that changes should be created -in a feature branch off of the default master branch. Changes are merged into `master` by submitting a -[github pull request](https://github.com/quadram-institute-bioscience/QuaTradis/pulls). +Contributions go through a standard feature branch workflow, so for most users this means that changes should be created +in a feature branch off of the default master branch. Changes are merged into `master` by submitting a +[github pull request](https://github.com/quadram-institute-bioscience/QuaTradis/pulls). -Before submitting the PR please check that your code changes pass all unit tests (see below) to save time. Also ideally -write some unit tests to cover your new functionality. PRs require 1 approval from an admin as well as a successful travis +Before submitting the PR please check that your code changes pass all unit tests (see below) to save time. Also ideally +write some unit tests to cover your new functionality. PRs require 1 approval from an admin as well as a successful travis build before the PR can be merged. ### Building QuaTraDIS from source -QuaTraDIS is mostly python code, managed with [setuptools](https://pythonhosted.org/an_example_pypi_project/setuptools.html), -so all the usual methods for building python via this method apply. However, to make things simpler a -[Makefile](Makefile) is provided which various targets for some common actions a developer might want to take. For example -if you want to build a development release type `make dev`. If you would like to install into your local python environment -type `make install`. A local docker image can be built by typing `make docker-build`. +QuaTraDIS is mostly python code, managed with [setuptools](https://pythonhosted.org/an_example_pypi_project/setuptools.html), +so all the usual methods for building python via this method apply. However, to make things simpler a +[Makefile](Makefile) is provided which various targets for some common actions a developer might want to take. For example +if you want to build a development release type `make dev`. If you would like to install into your local python environment +type `make install`. A local docker image can be built by typing `make docker-build`. ### Running unit tests -The suite of unit tests can be run with `make test` from the top-level directory. Alternatively you can run `pytest` +The suite of unit tests can be run with `make test` from the top-level directory. Alternatively you can run `pytest` directly from the `tests` directory. ### CI @@ -321,15 +315,15 @@ directly from the `tests` directory. [![Build Status](https://img.shields.io/github/actions/workflow/status/quadram-institute-bioscience/QuaTradis/create_release.yml?branch=master)](https://github.com/quadram-institute-bioscience/QuaTradis/actions/workflows/create_release.yml) Continuous integration is delivered via [Github Actions](https://github.com/quadram-institute-bioscience/QuaTradis/actions). -There are two pipelines in the [.github/workflows](.github/workflows) directory. One will build and test QuaTradis on -any branch except for master. The status of these builds should be checked before any PRs or merges are applied into the -master branch. For commit to the `master` branch no builds are executed by default, however if a git tag is applied, -then not only is this commit built and tested, but also the docker image is published to dockerhub. See the next section +There are two pipelines in the [.github/workflows](.github/workflows) directory. One will build and test QuaTradis on +any branch except for master. The status of these builds should be checked before any PRs or merges are applied into the +master branch. For commit to the `master` branch no builds are executed by default, however if a git tag is applied, +then not only is this commit built and tested, but also the docker image is published to dockerhub. See the next section for details on how we recommend tags are applied using semantic versioning. ### Versioning -[![Github latest version](https://img.shields.io/github/tag/quadram-institute-bioscience/quatradis?sort=semver)](https://github.com/quadram-institute-bioscience/QuaTradis) +[![Github latest version](https://img.shields.io/github/tag/quadram-institute-bioscience/quatradis?sort=semver)](https://github.com/quadram-institute-bioscience/QuaTradis) When administrators (as defined by github) feel it is time to create a new release and bump the version there are 3 different make targets to choose from depending on which kind of release is required: @@ -341,23 +335,21 @@ make targets to choose from depending on which kind of release is required: However, before running these commands make sure you have the `dev` python packages installed otherwise these will fail. You can do this by typing: `pip install quatradis[dev].` - For details of what each of these do see the [Makefile](Makefile) but this increases the dot release number -in the [VERSION](VERSION) file and commits this to the `master` branch and pushes it to github. The new master commit -is tagged appropriately with the new version. This then triggers travis to build and publish a docker image to dockerhub. - -If administrators need help figuring out which release number to bump then check the [semver2](https://semver.org/) guidelines. +in the [VERSION](VERSION) file and commits this to the `master` branch and pushes it to github. The new master commit +is tagged appropriately with the new version. This then triggers travis to build and publish a docker image to dockerhub. +If administrators need help figuring out which release number to bump then check the [semver2](https://semver.org/) guidelines. ## License -QuaTradis is free software, licensed under [GPLv3](LICENSE). +QuaTradis is free software, licensed under [GPLv3](LICENSE). ## Feedback/Issues + Please report any issues to the [issues page](https://github.com/quadram-institute-bioscience/QuaTradis/issues), or contact the [developers](AUTHORS). - ## References QuaTraDIS is built on top of the work done in BioTraDIS and AlbaTraDIS: @@ -365,7 +357,6 @@ QuaTraDIS is built on top of the work done in BioTraDIS and AlbaTraDIS: - [TraDIS Toolkit](https://bioinformatics.oxfordjournals.org/content/32/7/1109) - [AlbaTraDIS: Comparative analysis of large datasets from parallel transposon mutagenesis experiments](https://doi.org/10.1101/593624) - ## Citation -The QuaTraDIS academic paper is currently work in progress. For now please cite the references above. +The QuaTraDIS academic paper is currently work in progress. For now please cite the references above. diff --git a/quatradis/comparison/cli.py b/quatradis/comparison/cli.py index e97e384..b22078b 100644 --- a/quatradis/comparison/cli.py +++ b/quatradis/comparison/cli.py @@ -2,73 +2,183 @@ from quatradis.comparison.scatterplot import scatterplot_run from quatradis.comparison.presence_absence import presence_absence_run -from quatradis.comparison.comparison import prep_essentiality_pipeline_output_for_comparison, run_comparisons, \ - insertion_site_comparison +from quatradis.comparison.comparison import ( + prep_essentiality_pipeline_output_for_comparison, + run_comparisons, + insertion_site_comparison, +) from quatradis.comparison.gene_stats import gene_statistics from quatradis.comparison.plot_logfc import PlotLogOptions from quatradis.comparison.split import split_plot as sp from quatradis.comparison.essentiality import essentiality as ess from quatradis.embl.prepare import PrepareEMBLFile -from quatradis.comparison.essentiality_analysis import essentiality_analysis as ess_an, EssentialityInput +from quatradis.comparison.essentiality_analysis import ( + essentiality_analysis as ess_an, + EssentialityInput, +) def add_subparser(subparsers): - compare_parser_desc = "Comparative analysis and visualisation of TraDIS experiments (albatradis)" + compare_parser_desc = ( + "Comparative analysis and visualisation of TraDIS experiments (albatradis)" + ) compare_parser = subparsers.add_parser("compare", help=compare_parser_desc) compare_subparsers = compare_parser.add_subparsers(title=compare_parser_desc) - create_parser("insertion_sites", compare_subparsers, insertion_site_comparison_cli, add_insertion_site_comparison_options, - "Run an insertion site comparison for a specific plot file direction: forward, reverse, combined, original over all experiments to produce a file including edgeR output (logfc, p and q values)", - usage="tradis compare insertion_sites [options] --condition_dirs + --control_dirs + ") - create_parser("presence_absence", compare_subparsers, presence_absence, add_pa_options, - "Take in gene report files and produce a heatmap", - usage="tradis compare presence_absence [options] +") - create_parser("figures", compare_subparsers, figures, figures_options, - "Create graphics of controls vs conditions", - usage="tradis compare figures [options] --controls control1.plot control2.plot --conditions condition1.plot condition2.plot") - create_parser("gene_report", compare_subparsers, gene_report, add_genereport_options, - "Generate gene report from comparison analysis data", - usage="tradis compare gene_report [options] ") - create_parser("split", compare_subparsers, split_plot, split_plot_options, - "Splits a plot file into forward, reverse and combined files", - usage="tradis compare split [options] ") - create_parser("essentiality", compare_subparsers, essentiality, essentiality_utils_options, - "Determines how essential each gene is based on the transposon insertion site plot counts", - usage="tradis compare essentiality [options] ") - create_parser("prepare_embl", compare_subparsers, prepare_embl, prepare_embl_utils_options, - "Prepares an embl annotations file for comparative analysis from a plotfile. If an existing embl file is supplied then genes in that file are expanded based on data from the plot file", - usage="tradis compare prepare_embl [options] ") - create_parser("essentiality_analysis", compare_subparsers, essentiality_analysis, essentiality_analysis_options, - "Compares essentiality across condition and control samples", - usage="tradis compare essentiality_analysis [options] --controls --conditions --ess_controls --ess_conditions ") + create_parser( + "insertion_sites", + compare_subparsers, + insertion_site_comparison_cli, + add_insertion_site_comparison_options, + "Run an insertion site comparison for a specific plot file direction: forward, reverse, combined, original over all experiments to produce a file including edgeR output (logfc, p and q values)", + usage="tradis compare insertion_sites [options] --condition_dirs + --control_dirs + ", + ) + create_parser( + "presence_absence", + compare_subparsers, + presence_absence, + add_pa_options, + "Take in gene report files and produce a heatmap", + usage="tradis compare presence_absence [options] +", + ) + create_parser( + "figures", + compare_subparsers, + figures, + figures_options, + "Create graphics of controls vs conditions", + usage="tradis compare figures [options] --controls control1.plot control2.plot --conditions condition1.plot condition2.plot", + ) + create_parser( + "gene_report", + compare_subparsers, + gene_report, + add_genereport_options, + "Generate gene report from comparison analysis data", + usage="tradis compare gene_report [options] ", + ) + create_parser( + "split", + compare_subparsers, + split_plot, + split_plot_options, + "Splits a plot file into forward, reverse and combined files", + usage="tradis compare split [options] ", + ) + create_parser( + "essentiality", + compare_subparsers, + essentiality, + essentiality_utils_options, + "Determines how essential each gene is based on the transposon insertion site plot counts", + usage="tradis compare essentiality [options] ", + ) + create_parser( + "prepare_embl", + compare_subparsers, + prepare_embl, + prepare_embl_utils_options, + "Prepares an embl annotations file for comparative analysis from a plotfile. If an existing embl file is supplied then genes in that file are expanded based on data from the plot file", + usage="tradis compare prepare_embl [options] ", + ) + create_parser( + "essentiality_analysis", + compare_subparsers, + essentiality_analysis, + essentiality_analysis_options, + "Compares essentiality across condition and control samples", + usage="tradis compare essentiality_analysis [options] --controls --conditions --ess_controls --ess_conditions ", + ) def add_insertion_site_comparison_options(parser): - parser.add_argument('emblfile', help='Annotation file in EMBL format', type=str) - parser.add_argument('analysis_type', help='Analysis type: forward, reverse, combined, original', type=str) - parser.add_argument('--controls', - help='Control plot files (optionally gzipped). There must be an equal number of condition and control files.', - nargs='+', type=str) - parser.add_argument('--conditions', - help='Condition plot files (optionally gzipped). There must be an equal number of condition and control files.', - nargs='+', type=str) - - parser.add_argument('--span_gaps', '-s', help='Span a gap if it is this multiple of a window size', type=int, - default=1) - parser.add_argument('--minimum_block', '-b', help='Minimum number of reads which must be in 1 block in comparison', - type=int, default=100) - parser.add_argument('--minimum_logfc', '-f', help='Minimum log fold change +/-', type=float, default=1) - parser.add_argument('--minimum_logcpm', '-c', help='Minimum log counts per million +/-', type=float, default=8.0) - parser.add_argument('--minimum_proportion_insertions', '-d', - help='If the proportion of insertions is too low compared to control, do not call decreased insertions below this level', - type=float, default=0.1) - parser.add_argument('--prefix', '-o', help='Output directory prefix', type=str, default='output') - parser.add_argument('--pvalue', '-p', help='Do not report anything above this p-value', type=float, default=0.05) - parser.add_argument('--qvalue', '-q', help='Do not report anything above this q-value', type=float, default=0.05) - parser.add_argument('--window_size', '-w', help='Window size', type=int, default=100) - parser.add_argument('--dont_report_decreased_insertions', '-r', - help="Whether or not to report decreased insertions", action='store_true', default=False) - parser.add_argument('--genome_length', '-g', help="The length of the genome or sequence being analysed", type=int, - required=True) + parser.add_argument("emblfile", help="Annotation file in EMBL format", type=str) + parser.add_argument( + "analysis_type", + help="Analysis type: forward, reverse, combined, original", + type=str, + ) + parser.add_argument( + "--controls", + help="Control plot files (optionally gzipped). There must be an equal number of condition and control files.", + nargs="+", + type=str, + ) + parser.add_argument( + "--conditions", + help="Condition plot files (optionally gzipped). There must be an equal number of condition and control files.", + nargs="+", + type=str, + ) + + parser.add_argument( + "--span_gaps", + "-s", + help="Span a gap if it is this multiple of a window size (default: 1)", + type=int, + default=1, + ) + parser.add_argument( + "--minimum_block", + "-b", + help="Minimum number of reads which must be in 1 block in comparison (default: 100)", + type=int, + default=100, + ) + parser.add_argument( + "--minimum_logfc", + "-f", + help="Minimum log fold change +/- (default: 1)", + type=float, + default=1, + ) + parser.add_argument( + "--minimum_logcpm", + "-c", + help="Minimum log counts per million +/- (default: 8.0)", + type=float, + default=8.0, + ) + parser.add_argument( + "--minimum_proportion_insertions", + "-d", + help="If the proportion of insertions is too low compared to control, do not call decreased insertions below this level (default: 0.1)", + type=float, + default=0.1, + ) + parser.add_argument( + "--prefix", "-o", help="Output directory prefix", type=str, default="output" + ) + parser.add_argument( + "--pvalue", + "-p", + help="Do not report anything above this p-value (default: 0.05)", + type=float, + default=0.05, + ) + parser.add_argument( + "--qvalue", + "-q", + help="Do not report anything above this q-value (default: 0.05)", + type=float, + default=0.05, + ) + parser.add_argument( + "--window_size", "-w", help="Window size (default: 100)", type=int, default=100 + ) + parser.add_argument( + "--dont_report_decreased_insertions", + "-r", + help="Whether or not to report decreased insertions", + action="store_true", + default=False, + ) + parser.add_argument( + "--genome_length", + "-g", + help="The length of the genome or sequence being analysed", + type=int, + required=True, + ) def insertion_site_comparison_cli(args): @@ -78,7 +188,7 @@ def insertion_site_comparison_cli(args): if len(args.controls) == 0: raise "Must have at least one control and condition file to process" - #controls_all, controls_only_ess, conditions_all, conditions_only_ess = prep_essentiality_pipeline_output_for_comparison( + # controls_all, controls_only_ess, conditions_all, conditions_only_ess = prep_essentiality_pipeline_output_for_comparison( # args.control_dirs, args.condition_dirs, args.analysis_type) options = PlotLogOptions( @@ -90,18 +200,30 @@ def insertion_site_comparison_cli(args): window_size=args.window_size, span_gaps=args.span_gaps, report_decreased_insertions=not args.dont_report_decreased_insertions, - minimum_block=args.minimum_block) - - plotfile, scoresfile = insertion_site_comparison(args.analysis_type, args.controls, args.conditions, - args.emblfile, args.prefix, options, args.verbose) + minimum_block=args.minimum_block, + ) + + plotfile, scoresfile = insertion_site_comparison( + args.analysis_type, + args.controls, + args.conditions, + args.emblfile, + args.prefix, + options, + args.verbose, + ) return plotfile, scoresfile def add_pa_options(parser): - parser.add_argument('emblfile', help='Annotation file in EMBL format', type=str) - parser.add_argument('genereports', help='Gene report spreadsheets', nargs='+', type=str) - parser.add_argument('--prefix', '-o', help='Output directory prefix', type=str, default='output') + parser.add_argument("emblfile", help="Annotation file in EMBL format", type=str) + parser.add_argument( + "genereports", help="Gene report spreadsheets", nargs="+", type=str + ) + parser.add_argument( + "--prefix", "-o", help="Output directory prefix", type=str, default="output" + ) def presence_absence(args): @@ -109,11 +231,29 @@ def presence_absence(args): def figures_options(parser): - parser.add_argument('--controls', '-c', help='control files (use 2 or more)', type=str, nargs='+') - parser.add_argument('--conditions', '-d', help='condition files (use 2 or more)', type=str, nargs='+') - parser.add_argument('--window_size', '-w', help='Window size', type=int, default=50) - parser.add_argument('--prefix', '-p', help='Output prefix', type=str, default='figures') - parser.add_argument('--normalise', '-n', action='store_true', help='normalise the files', default=False) + parser.add_argument( + "--controls", "-c", help="control files (use 2 or more)", type=str, nargs="+" + ) + parser.add_argument( + "--conditions", + "-d", + help="condition files (use 2 or more)", + type=str, + nargs="+", + ) + parser.add_argument( + "--window_size", "-w", help="Window size (default: 50)", type=int, default=50 + ) + parser.add_argument( + "--prefix", "-p", help="Output prefix", type=str, default="figures" + ) + parser.add_argument( + "--normalise", + "-n", + action="store_true", + help="normalise the files", + default=False, + ) def figures(args): @@ -121,38 +261,68 @@ def figures(args): def add_genereport_options(parser): - parser.add_argument('embl', help='Original EMBL file used for analysis', type=str) - parser.add_argument('--combined', help='Combined plotfile', type=str) - parser.add_argument('--forward', help='Forward plotfile', type=str) - parser.add_argument('--reverse', help='Reverse plotfile', type=str) - parser.add_argument('--scores', help='Combined scores', type=str) - parser.add_argument('--window_size', '-w', help='Window size', type=int, default=50) - parser.add_argument('--output_dir', '-o', help='Output filename prefix', type=str, default='.') - parser.add_argument('--annotations', '-a', help='EMBL file used for annotations', type=str) + parser.add_argument("embl", help="Original EMBL file used for analysis", type=str) + parser.add_argument("--combined", help="Combined plotfile", type=str) + parser.add_argument("--forward", help="Forward plotfile", type=str) + parser.add_argument("--reverse", help="Reverse plotfile", type=str) + parser.add_argument("--scores", help="Combined scores", type=str) + parser.add_argument("--window_size", "-w", help="Window size", type=int, default=50) + parser.add_argument( + "--output_dir", "-o", help="Output filename prefix", type=str, default="." + ) + parser.add_argument( + "--annotations", "-a", help="EMBL file used for annotations", type=str + ) def gene_report(args): - gene_statistics(combined_plotfile=args.combined, forward_plotfile=args.forward, reverse_plotfile=args.reverse, - combined_scorefile=args.scores, window_size=args.window_size, embl_file=args.embl, output_dir=args.output_dir, - annotation_file=args.annotations) + gene_statistics( + combined_plotfile=args.combined, + forward_plotfile=args.forward, + reverse_plotfile=args.reverse, + combined_scorefile=args.scores, + window_size=args.window_size, + embl_file=args.embl, + output_dir=args.output_dir, + annotation_file=args.annotations, + ) def split_plot_options(parser): - parser.add_argument('plot_file', help='Plot file to split', type=str) - parser.add_argument('--minimum_threshold', '-m', help='Minimum number of insertions required to pass filter', - type=int, default=5) - parser.add_argument('--output_dir', '-o', help='Output directory', type=str, required=True) - parser.add_argument('--gzipped', '-g', help='Whether to output gzipped plot files', action='store_true') + parser.add_argument("plot_file", help="Plot file to split", type=str) + parser.add_argument( + "--minimum_threshold", + "-m", + help="Minimum number of insertions required to pass filter (default: 5)", + type=int, + default=5, + ) + parser.add_argument( + "--output_dir", "-o", help="Output directory", type=str, required=True + ) + parser.add_argument( + "--gzipped", + "-g", + help="Whether to output gzipped plot files", + action="store_true", + ) def split_plot(args): - sp(args.plot_file, output_dir=args.output_dir, minimum_threshold=args.minimum_threshold, gzipped=args.gzipped) + sp( + args.plot_file, + output_dir=args.output_dir, + minimum_threshold=args.minimum_threshold, + gzipped=args.gzipped, + ) def essentiality_utils_options(parser): - parser.add_argument('count_file', - help='Counts from a transposon insertion site plot file to be used. i.e. generated from "tradis plot count"', - type=str) + parser.add_argument( + "count_file", + help='Counts from a transposon insertion site plot file to be used. i.e. generated from "tradis plot count"', + type=str, + ) def essentiality(args): @@ -160,36 +330,102 @@ def essentiality(args): def prepare_embl_utils_options(parser): - parser.add_argument('plotfile', help='Transposon insertion site plot file to be used', type=str) - parser.add_argument('--output', '-o', help='Output file path to store the prepared EMBL file', type=str, - default='prepared.embl') - parser.add_argument('--emblfile', '-e', - help='If provided genes in this EMBL annotations file will expanded based on data in the plotfile.', - type=str, default=None) - parser.add_argument('--minimum_threshold', '-m', - help='Only include insert sites with this number or greater insertions', type=int, default=5) - parser.add_argument('--prime_feature_size', '-z', - help='Feature size when adding 5/3 prime block when --use_annotation', type=int, default=198) - parser.add_argument('--window_interval', '-l', help='Window interval', type=int, default=25) - parser.add_argument('--window_size', '-w', help='Window size', type=int, default=100) + parser.add_argument( + "plotfile", help="Transposon insertion site plot file to be used", type=str + ) + parser.add_argument( + "--output", + "-o", + help="Output file path to store the prepared EMBL file", + type=str, + default="prepared.embl", + ) + parser.add_argument( + "--emblfile", + "-e", + help="If provided genes in this EMBL annotations file will expanded based on data in the plotfile.", + type=str, + default=None, + ) + parser.add_argument( + "--minimum_threshold", + "-m", + help="Only include insert sites with this number or greater insertions (default: 5)", + type=int, + default=5, + ) + parser.add_argument( + "--prime_feature_size", + "-z", + help="Feature size when adding 5/3 prime block when --use_annotation (default: 198)", + type=int, + default=198, + ) + parser.add_argument( + "--window_interval", + "-l", + help="Window interval (default: 25)", + type=int, + default=25, + ) + parser.add_argument( + "--window_size", "-w", help="Window size (default: 100)", type=int, default=100 + ) def prepare_embl(args): - pef = PrepareEMBLFile(args.plotfile, args.minimum_threshold, args.window_size, args.window_interval, - args.prime_feature_size, args.emblfile) + pef = PrepareEMBLFile( + args.plotfile, + args.minimum_threshold, + args.window_size, + args.window_interval, + args.prime_feature_size, + args.emblfile, + ) pef.create_file(args.output) def essentiality_analysis(args): - i = EssentialityInput(args.controls, args.conditions, args.ess_controls, args.ess_conditions) + i = EssentialityInput( + args.controls, args.conditions, args.ess_controls, args.ess_conditions + ) ess_an(i, args.output_dir, args.type) def essentiality_analysis_options(parser): - parser.add_argument('--controls', '-c', help='control files (use 2 or more)', type=str, nargs='+') - parser.add_argument('--conditions', '-d', help='condition files (use 2 or more)', type=str, nargs='+') - parser.add_argument('--ess_controls', '-e', help='control files (use 2 or more)', type=str, nargs='+') - parser.add_argument('--ess_conditions', '-f', help='condition files (use 2 or more)', type=str, nargs='+') - parser.add_argument('--output_dir', '-o', help='Output directory', type=str, default=".") - parser.add_argument('--type', '-t', help='Analysis type: original, combined, forward, reverse', type=str, required=True) \ No newline at end of file + parser.add_argument( + "--controls", "-c", help="control files (use 2 or more)", type=str, nargs="+" + ) + parser.add_argument( + "--conditions", + "-d", + help="condition files (use 2 or more)", + type=str, + nargs="+", + ) + parser.add_argument( + "--ess_controls", + "-e", + help="control files (use 2 or more)", + type=str, + nargs="+", + ) + parser.add_argument( + "--ess_conditions", + "-f", + help="condition files (use 2 or more)", + type=str, + nargs="+", + ) + parser.add_argument( + "--output_dir", "-o", help="Output directory", type=str, default="." + ) + parser.add_argument( + "--type", + "-t", + help="Analysis type: original, combined, forward, reverse", + type=str, + required=True, + ) + diff --git a/quatradis/embl/expand_genes.py b/quatradis/embl/expand_genes.py index f4b051e..47dbe5d 100644 --- a/quatradis/embl/expand_genes.py +++ b/quatradis/embl/expand_genes.py @@ -1,10 +1,13 @@ -''' Given an annotation file, take each gene, and create a new feature at the start and end to capture promotors''' +""" Given an annotation file, take each gene, and create a new feature at the start and end to capture promotors""" + from quatradis.embl.reader import EMBLReader from quatradis.embl.sequence import EMBLSequence class FeatureProperties: - def __init__(self, start=0, end=0, direction="", gene_name="", locus_tag="", product=""): + def __init__( + self, start=0, end=0, direction="", gene_name="", locus_tag="", product="" + ): self.start = start self.end = end self.direction = direction @@ -30,16 +33,39 @@ def create_3_5_prime_features(self): # The gene itself new_features.append( - FeatureProperties(feature.location.start, feature.location.end, feature.strand, gene_name, locus_tag, - product)) + FeatureProperties( + feature.location.start, + feature.location.end, + feature.location.strand, + gene_name, + locus_tag, + product, + ) + ) # forward direction - if feature.strand == 1: - new_features.append(self.construct_start_feature(feature, gene_name, "__5prime", locus_tag, product)) - new_features.append(self.construct_end_feature(feature, gene_name, "__3prime", locus_tag, product)) + if feature.location.strand == 1: + new_features.append( + self.construct_start_feature( + feature, gene_name, "__5prime", locus_tag, product + ) + ) + new_features.append( + self.construct_end_feature( + feature, gene_name, "__3prime", locus_tag, product + ) + ) else: - new_features.append(self.construct_end_feature(feature, gene_name, "__5prime", locus_tag, product)) - new_features.append(self.construct_start_feature(feature, gene_name, "__3prime", locus_tag, product)) + new_features.append( + self.construct_end_feature( + feature, gene_name, "__5prime", locus_tag, product + ) + ) + new_features.append( + self.construct_start_feature( + feature, gene_name, "__3prime", locus_tag, product + ) + ) return new_features @@ -53,7 +79,14 @@ def construct_end_feature(self, feature, gene_name, suffix, locus_tag, product): if start >= end or end - start < 10: return None - return FeatureProperties(start, end, feature.strand, gene_name + suffix, locus_tag + suffix, product) + return FeatureProperties( + start, + end, + feature.location.strand, + gene_name + suffix, + locus_tag + suffix, + product, + ) def construct_start_feature(self, feature, gene_name, suffix, locus_tag, product): start = feature.location.start - self.feature_size @@ -63,10 +96,17 @@ def construct_start_feature(self, feature, gene_name, suffix, locus_tag, product start = 1 if start >= end or end - start < 10: return None - return FeatureProperties(start, end, feature.strand, gene_name + suffix, locus_tag + suffix, product) + return FeatureProperties( + start, + end, + feature.location.strand, + gene_name + suffix, + locus_tag + suffix, + product, + ) def construct_file(self, filename): - with open(filename, 'w') as emblfile: + with open(filename, "w") as emblfile: emblfile.write(self.header()) for f in self.create_3_5_prime_features(): @@ -106,20 +146,32 @@ def header(self): FH FT source 1..{length} FT /organism="Bacteria" -""".format(length=str(self.genome_length)) +""".format( + length=str(self.genome_length) + ) def construct_feature_forward(self, feature): return """FT CDS {window_start}..{window_end} FT /gene="{gene_name}" FT /locus_tag="{locus_tag}" FT /product="{product}" -""".format(gene_name=feature.gene_name, window_start=str(feature.start + 1), window_end=str(feature.end), - locus_tag=feature.locus_tag, product=feature.product) +""".format( + gene_name=feature.gene_name, + window_start=str(feature.start + 1), + window_end=str(feature.end), + locus_tag=feature.locus_tag, + product=feature.product, + ) def construct_feature_reverse(self, feature): return """FT CDS complement({window_start}..{window_end}) FT /gene="{gene_name}" FT /locus_tag="{locus_tag}" FT /product="{product}" -""".format(gene_name=feature.gene_name, window_start=str(feature.start + 1), window_end=str(feature.end), - locus_tag=feature.locus_tag, product=feature.product) +""".format( + gene_name=feature.gene_name, + window_start=str(feature.start + 1), + window_end=str(feature.end), + locus_tag=feature.locus_tag, + product=feature.product, + ) diff --git a/quatradis/pipelines/compare.smk b/quatradis/pipelines/compare.smk index d940219..1f953f1 100644 --- a/quatradis/pipelines/compare.smk +++ b/quatradis/pipelines/compare.smk @@ -1,5 +1,6 @@ import os import shutil +import sys input_files=[] norm_files=[] @@ -33,6 +34,8 @@ if not shutil.which('gzcat'): else: raise Error("Couldn't find gzcat or zcat on your system. Please install and try again.") +ZCAT_JOIN=' <' if sys.platform == "darwin" else '' + def make_no_normalise_cmd(): cmds = [] for i in range(len(input_files)): @@ -148,9 +151,17 @@ rule insertion_site_comparison: tt=lambda wildcards: wildcards.tt, combined=os.path.join(config["output_dir"], "analysis", controlnames[0], "combined.plot.gz"), output_dir="--prefix=" + os.path.join(config["output_dir"], "comparison", "{tt}"), - zcat=ZCAT_CMD + span_gaps="--span_gaps=" + config["span_gaps"], + minimum_block="--minimum_block=" + config["minimum_block"], + minimum_logfc="--minimum_logfc=" + config["minimum_logfc"], + minimum_logcpm="--minimum_logcpm=" + config["minimum_logcpm"], + minimum_proportion_insertions="--minimum_proportion_insertions=" + config["minimum_proportion_insertions"], + p_value="--pvalue=" + config["p_value"], + q_value="--qvalue=" + config["q_value"], + window_size="--window_size=" + config["window_size"], + zcat=ZCAT_CMD + ZCAT_JOIN message: "Calculating logfc for {output}" - shell: "SEQLENGTH=$({params.zcat} {params.combined} | wc -l); tradis compare insertion_sites {input.embl} {params.tt} {params.output_dir} -g ${{SEQLENGTH}} --verbose --controls {input.controls} --conditions {input.conditions} > {log} 2>&1" + shell: "SEQLENGTH=$({params.zcat} {params.combined} | wc -l | awk '{{$1=$1}};1'); tradis compare insertion_sites {input.embl} {params.tt} {params.output_dir} --genome_length=${{SEQLENGTH}} {params.span_gaps} {params.window_size} {params.p_value} {params.q_value} {params.minimum_block} {params.minimum_logfc} {params.minimum_logcpm} {params.minimum_proportion_insertions} --verbose --controls {input.controls} --conditions {input.conditions} > {log} 2>&1" rule gene_stats: diff --git a/quatradis/pipelines/create_plots.smk b/quatradis/pipelines/create_plots.smk index 1091284..c073384 100644 --- a/quatradis/pipelines/create_plots.smk +++ b/quatradis/pipelines/create_plots.smk @@ -32,24 +32,25 @@ rule create_plot: fq=os.path.join(config["fastq_dir"], "{fq}"), ref=config["reference"] output: - stats=os.path.join(config["output_dir"], "{fq}", "tradis_out.plot.stats") + stats=os.path.join(config["output_dir"], "{fq}", "{fq}.plot.stats") params: aligner=("--aligner=" + config["aligner"]) if config["aligner"] else "", tag=("--tag=" + config["tag"]) if config["tag"] else "", mismatch=("--mismatch=" + str(config["mismatch"])) if config["mismatch"] else "", mapping_score=("--mapping_score=" + str(config["mapping_score"])) if config["mapping_score"] else "", threads="--threads=" + str(config["threads"]) if config["threads"] else "", - output_dir=os.path.join(config["output_dir"], "{fq}") + output_dir=os.path.join(config["output_dir"], "{fq}"), + output_prefix="{fq}" threads: int(config["threads"]) message: "Creating transposon insertion site plot file for {input.fq}" shell: """tradis plot create {params.aligner} {params.threads} {params.tag} {params.mismatch} {params.mapping_score} \ - --output_dir={params.output_dir} --output_prefix=tradis_out --no_ref_index {input.fq} {input.ref}""" + --output_dir={params.output_dir} --output_prefix={params.output_prefix} --no_ref_index {input.fq} {input.ref}""" rule combine_stats: input: - stats=expand(os.path.join(config["output_dir"], "{fq}", "tradis_out.plot.stats"), fq=config["fastqs"]) + stats=expand(os.path.join(config["output_dir"], "{fq}", "{fq}.plot.stats"), fq=config["fastqs"]) output: os.path.join(config["output_dir"], "combined.plot.stats") message: "Combining plot stats" shell: diff --git a/quatradis/pipelines/pipeline.py b/quatradis/pipelines/pipeline.py index 856dba9..5ea8899 100644 --- a/quatradis/pipelines/pipeline.py +++ b/quatradis/pipelines/pipeline.py @@ -52,6 +52,12 @@ def create_plots_options(parser): default="results", help="The output directory to use for all output files (default: results)", ) + parser.add_argument( + "--cores", + type=int, + default=1, + help="Number of processing cores to use for paralleising snakemake tasks (default: 1)", + ) parser.add_argument( "-n", "--threads", @@ -126,7 +132,7 @@ def create_plots_pipeline(args): start_snakemake( pipeline, snakemake_config, - threads=args.threads, + cores=args.cores, snakemake_profile=args.snakemake_profile, verbose=args.verbose, ) @@ -151,18 +157,25 @@ def compare_options(parser): default="results", help="The output directory to use for all output files (default: results)", ) + parser.add_argument( + "--cores", + type=int, + default=1, + help="Number of processing cores to use for paralleising snakemake tasks (default: 1)", + ) parser.add_argument( "--disable_normalisation", - action='store_true', + action="store_true", default=False, help="Don't normalise the plots prior to comparison (default: false)", ) + parser.add_argument( "-n", "--threads", type=int, default=1, - help="number of threads to use when processing (default: 1)", + help="number of threads to use when processing (default: 1) (deprecated: use --cores instead)", ) parser.add_argument( "--annotations", @@ -174,28 +187,74 @@ def compare_options(parser): parser.add_argument( "--minimum_threshold", "-m", - help="Only include insert sites with this number or greater insertions", + help="Only include insert sites with this number or greater insertions (default: 5)", type=int, default=5, ) parser.add_argument( "--minimum_proportion_insertions", - help="If the proportion of insertions is too low compared to control, dont call decreased insertions below this level", + help="If the proportion of insertions is too low compared to control, dont call decreased insertions below this level (default: 0.1)", type=float, default=0.1, ) parser.add_argument( "--prime_feature_size", "-z", - help="Feature size when adding 5/3 prime block when --use_annotation", + help="Feature size when adding 5/3 prime block when --use_annotation (default: 198)", type=int, default=198, ) parser.add_argument( - "--window_interval", "-l", help="Window interval", type=int, default=25 + "--window_interval", + "-l", + help="Window interval (default: 25)", + type=int, + default=25, + ) + parser.add_argument( + "--minimum_block", + "-b", + help="Minimum number of reads which must be in 1 block in comparison (default: 10)", + type=int, + default=10, ) parser.add_argument( - "--window_size", "-w", help="Window size", type=int, default=100 + "--minimum_logfc", + "-f", + help="Minimum log fold change +/- (default: 1)", + type=float, + default=1, + ) + parser.add_argument( + "--minimum_logcpm", + "-c", + help="Minimum log counts per million +/- (default: 8.0)", + type=float, + default=8.0, + ) + parser.add_argument( + "--pvalue", + "-p", + help="Do not report anything above this p-value (default: 0.05)", + type=float, + default=0.05, + ) + parser.add_argument( + "--qvalue", + "-q", + help="Do not report anything above this q-value (default: 0.05)", + type=float, + default=0.05, + ) + parser.add_argument( + "--span_gaps", + "-s", + help="Span a gap if it is this multiple of a window size (default: 1)", + type=int, + default=1, + ) + parser.add_argument( + "--window_size", "-w", help="Window size (default: 100)", type=int, default=100 ) parser.add_argument( "-sp", @@ -229,13 +288,20 @@ def compare_pipeline(args): ofql.write("control_files:\n") for x in args.control_files: ofql.write("- " + x + "\n") - ofql.write(create_yaml_option("normalise", not args.disable_normalisation, bool=True)) - ofql.write(create_yaml_option("threads", args.threads, num=True)) + ofql.write( + create_yaml_option("normalise", not args.disable_normalisation, bool=True) + ) ofql.write(create_yaml_option("annotations", args.annotations)) ofql.write(create_yaml_option("minimum_threshold", args.minimum_threshold)) ofql.write(create_yaml_option("prime_feature_size", args.prime_feature_size)) ofql.write(create_yaml_option("window_interval", args.window_interval)) ofql.write(create_yaml_option("window_size", args.window_size)) + ofql.write(create_yaml_option("minimum_block", args.minimum_block)) + ofql.write(create_yaml_option("minimum_logfc", args.minimum_logfc)) + ofql.write(create_yaml_option("minimum_logcpm", args.minimum_logcpm)) + ofql.write(create_yaml_option("p_value", args.pvalue)) + ofql.write(create_yaml_option("q_value", args.qvalue)) + ofql.write(create_yaml_option("span_gaps", args.span_gaps)) ofql.write( create_yaml_option( "minimum_proportion_insertions", args.minimum_proportion_insertions @@ -247,14 +313,19 @@ def compare_pipeline(args): start_snakemake( pipeline, snakemake_config, - threads=args.threads, + cores=( + # For backward compatibility + args.cores + if args.cores > args.threads + else args.threads + ), snakemake_profile=args.snakemake_profile, verbose=args.verbose, ) def start_snakemake( - snakefile, snakemake_config, threads=1, snakemake_profile=None, verbose=False + snakefile, snakemake_config, cores=1, snakemake_profile=None, verbose=False ): if verbose: print("Using snakemake config files at: " + ", ".join([snakemake_config])) @@ -264,7 +335,7 @@ def start_snakemake( "snakemake", "--snakefile=" + snakefile, "--configfile=" + snakemake_config, - "--cores=" + str(threads), + "--cores=" + str(cores), "--printshellcmds", ] diff --git a/quatradis/tisp/combine.py b/quatradis/tisp/combine.py index 642caf8..dd6e3d9 100644 --- a/quatradis/tisp/combine.py +++ b/quatradis/tisp/combine.py @@ -3,6 +3,7 @@ """ import csv import os +import sys from Bio import bgzf @@ -31,7 +32,9 @@ def prepare_and_create_tabix_for_combined_plots(tabix_plot, combined_dir): for line in tabix_plot: tabix_plot_fh.write(line + "\n") - os.system("zcat " + tabix_plot_name + " | sort -k1,1 -k2,2n | bgzip > " + sorted_tabix_plot_name + + joiner = " < " if sys.platform == "darwin" else " " + + os.system("zcat" + joiner + tabix_plot_name + " | sort -k1,1 -k2,2n | bgzip > " + sorted_tabix_plot_name + " && tabix -b 2 -e 2 " + sorted_tabix_plot_name) os.remove(tabix_plot_name) diff --git a/quatradis/tisp/generator/from_alignments.py b/quatradis/tisp/generator/from_alignments.py index 8f83ce6..e908fc8 100644 --- a/quatradis/tisp/generator/from_alignments.py +++ b/quatradis/tisp/generator/from_alignments.py @@ -4,6 +4,7 @@ import collections import os import subprocess +import sys import pysam from Bio import bgzf @@ -144,12 +145,15 @@ def create_plot_files(mapped_reads, plot_out_prefix="tradis.plot", cutoff_score= def get_number_reads(seq_file_in): + # If running on mac take form stdin + joiner = " < " if sys.platform == "darwin" else " " + # Make sure we can handle gzipped or non-gzipped fastq input if seq_file_in.endswith('.gz'): cat_cmd = "zcat" else: cat_cmd = "cat" - lines = int(subprocess.check_output(["bash", "-c", cat_cmd + " " + seq_file_in + " | wc -l"]).strip()) + lines = int(subprocess.check_output(["bash", "-c", cat_cmd + joiner + seq_file_in + " | wc -l"]).strip()) return lines / 4 diff --git a/quatradis/tisp/parser.py b/quatradis/tisp/parser.py index a7a6f3c..3ea2df7 100644 --- a/quatradis/tisp/parser.py +++ b/quatradis/tisp/parser.py @@ -2,6 +2,7 @@ import os import pandas import subprocess +import sys from Bio import bgzf @@ -133,8 +134,9 @@ def create_file_handle(plot_file, working_dir=""): @staticmethod def get_plot_file_length(plot_file, working_dir): plot_file = working_dir + os.sep + plot_file if working_dir else plot_file + joiner = " < " if sys.platform == "darwin" else " " cat = "zcat" if plot_file[-3:] == ".gz" else "cat" - wc = subprocess.check_output(["bash", "-c", cat + " " + plot_file + " | wc | awk '{print $1}'"]) + wc = subprocess.check_output(["bash", "-c", cat + joiner + plot_file + " | wc | awk '{print $1}'"]) file_length = str(wc, 'UTF-8').strip() return int(file_length)