Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge dev changes #5

Merged
merged 13 commits into from
Sep 30, 2024
25 changes: 16 additions & 9 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
matrix:
python-version: ["3.8", "3.12"]
poetry-version: ["1.5.1"]
sentieon-version: ["202308.01", "202308.02"]
sentieon-version: ["202308.01", "202308.03"]
os: [ubuntu-22.04] #, macos-latest, windows-latest]
runs-on: ${{ matrix.os }}
steps:
Expand Down Expand Up @@ -48,6 +48,13 @@ jobs:
run: |
sudo curl -L -o /usr/local/bin/bedtools "https://github.com/arq5x/bedtools2/releases/download/v2.31.0/bedtools.static"
sudo chmod ugo+x /usr/local/bin/bedtools
- name: Install mosdepth
run: |
sudo curl -L -o /usr/local/bin/mosdepth "https://github.com/brentp/mosdepth/releases/download/v0.3.8/mosdepth"
sudo chmod ugo+x /usr/local/bin/mosdepth
- name: Install multiqc
run: |
pip install multiqc
- name: Install sentieon
run: |
curl -L https://s3.amazonaws.com/sentieon-release/software/sentieon-genomics-$SENTIEON_VERSION.tar.gz | tar -zxf -
Expand Down Expand Up @@ -78,7 +85,7 @@ jobs:
- name: Smoke test - short-read
run: |
. .venv/bin/activate
sentieon-cli dnascope -t 1 -r "tests/smoke/r ef.fa" --pcr-free -g \
sentieon-cli -v dnascope -t 1 -r "tests/smoke/r ef.fa" --pcr-free -g \
--duplicate-marking rmdup --consensus --align \
--input_ref "tests/smoke/r ef.fa" -i "tests/smoke/illumina.cram" \
-m "DNAscope IlluminaWGS2.0.bundle" "output_sr.vcf.gz"
Expand All @@ -90,7 +97,7 @@ jobs:
. .venv/bin/activate
samtools fastq --reference "tests/smoke/r ef.fa" -1 sr_r1.fastq.gz \
-2 sr_r2.fastq.gz "tests/smoke/illumina.cram" > /dev/null
sentieon-cli dnascope -t 1 -r "tests/smoke/r ef.fa" --pcr-free \
sentieon-cli -v dnascope -t 1 -r "tests/smoke/r ef.fa" --pcr-free \
--r1-fastq sr_r1.fastq.gz --r2-fastq sr_r2.fastq.gz \
--readgroups "@RG\tID:HG002-1\tSM:HG002" \
-m "DNAscope IlluminaWGS2.0.bundle" --assay WES \
Expand All @@ -101,21 +108,21 @@ jobs:
- name: Smoke test - long-read
run: |
. .venv/bin/activate
sentieon-cli dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \
sentieon-cli -v dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \
-i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \
--repeat-model tests/smoke/sample_repeat.model -g \
-b "tests/smoke/diploid.bed" \
--haploid-bed "tests/smoke/haploid.bed" \
"output hifi.vcf.gz"
sentieon driver -r "tests/smoke/r ef.fa" -t 1 --algo GVCFtyper \
-v "output hifi.g.vcf.gz" output_hifi_gvcftyper.vcf.gz
if [ ! -f "output hifi.sv.vcf.gz" -o ! -f "output hifi.haploid.vcf.gz" ]; then
if [ ! -f "output hifi.sv.vcf.gz" ]; then
exit 1
fi
- name: Smoke test - gVCF
run: |
. .venv/bin/activate
sentieon-cli dnascope-longread --tech ONT -t 1 -r "tests/smoke/r ef.fa" \
sentieon-cli -v dnascope-longread --tech ONT -t 1 -r "tests/smoke/r ef.fa" \
-i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \
--repeat-model tests/smoke/sample_repeat.model -g "output ont.vcf.gz"
sentieon driver -r "tests/smoke/r ef.fa" -t 1 --algo GVCFtyper \
Expand All @@ -126,7 +133,7 @@ jobs:
- name: Smoke test - realignment
run: |
. .venv/bin/activate
sentieon-cli dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \
sentieon-cli -v dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \
-i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \
--repeat-model tests/smoke/sample_repeat.model --align \
--input_ref "tests/smoke/r ef.fa" "output realigned.vcf.gz"
Expand All @@ -139,11 +146,11 @@ jobs:
samtools fastq --reference "tests/smoke/r ef.fa" \
"tests/smoke/sam ple.cram" | \
gzip -c > sample.fq.gz
sentieon-cli dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \
sentieon-cli -v dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \
--fastq sample.fq.gz --readgroups '@RG\tID:sample-1\tSM:sample' \
-m "DNAscope PacBio2.1.bundle" \
--repeat-model tests/smoke/sample_repeat.model "output fq.vcf.gz"
if [ ! -f "output fq.vcf.gz" -o ! -f "output fq_mm2_sorted_fq_0.cram" -o ! -f "output fq.sv.vcf.gz" ]; then
exit 1
fi


127 changes: 127 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# Build stage: compiles samtools, bcftools and igzip from source, downloads the
# prebuilt bedtools, mosdepth and Sentieon releases, and builds the
# sentieon-cli wheel. Only files explicitly copied with
# `COPY --from=downloader` reach the final image.
FROM debian:stable-20240904-slim AS downloader

# SENTIEON_VERSION must be passed with --build-arg; fail early when it is empty
ARG SENTIEON_VERSION
RUN test -n "$SENTIEON_VERSION"

# NOTE(review): labels set here apply to this stage only; a later stage that
# starts from a fresh base image does not inherit them.
LABEL container.base.image="debian:stable-20240904-slim" \
      software.version="${SENTIEON_VERSION}" \
      software.website="https://www.sentieon.com/"

# Fail a `curl ... | tar ...` pipeline when the download itself fails, instead
# of only looking at the exit status of the last command in the pipe.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Install samtools
RUN apt-get update && apt-get install -y \
        autoconf \
        automake \
        bzip2 \
        curl \
        gcc \
        libbz2-dev \
        libcurl4-gnutls-dev \
        libdeflate-dev \
        liblzma-dev \
        libncurses5-dev \
        libssl-dev \
        make \
        perl \
        zlib1g-dev && \
    mkdir -p /opt/samtools/ && \
    curl -fL "https://github.com/samtools/samtools/releases/download/1.20/samtools-1.20.tar.bz2" | \
        tar -C /opt/samtools/ -jxf - && \
    cd /opt/samtools/samtools-1.20 && \
    ./configure && \
    make install

# Install bcftools
# curl is listed explicitly so this layer does not depend on the samtools layer
RUN apt-get update && apt-get install -y \
        autoconf \
        automake \
        bzip2 \
        curl \
        gcc \
        libbz2-dev \
        libcurl4-gnutls-dev \
        libgsl0-dev \
        liblzma-dev \
        libperl-dev \
        libssl-dev \
        make \
        perl \
        zlib1g-dev && \
    mkdir -p /opt/bcftools/ && \
    curl -fL "https://github.com/samtools/bcftools/releases/download/1.20/bcftools-1.20.tar.bz2" | \
        tar -C /opt/bcftools/ -jxf - && \
    cd /opt/bcftools/bcftools-1.20/ && \
    ./configure && \
    make install

# Install bedtools
# -f makes curl exit non-zero on an HTTP error instead of saving the error page
RUN apt-get update && apt-get install -y curl && \
    curl -fL -o /usr/local/bin/bedtools-2.30.0 "https://github.com/arq5x/bedtools2/releases/download/v2.30.0/bedtools.static.binary"

# Install igzip
RUN apt-get update && apt-get install -y \
        autoconf \
        automake \
        curl \
        gcc \
        libtool \
        make \
        nasm && \
    mkdir -p /opt/isa-l && \
    curl -fL "https://github.com/intel/isa-l/archive/refs/tags/v2.30.0.tar.gz" | \
        tar -C /opt/isa-l -zxf - && \
    cd /opt/isa-l/isa-l-2.30.0 && \
    ./autogen.sh && \
    ./configure --prefix=/usr --libdir=/usr/lib && \
    make install

# Install mosdepth
RUN apt-get update && apt-get install -y curl && \
    curl -fL -o /usr/local/bin/mosdepth-0.3.9 "https://github.com/brentp/mosdepth/releases/download/v0.3.9/mosdepth"

# Download the Sentieon software
RUN apt-get update && apt-get install -y curl && \
    mkdir -p /opt/sentieon/ && \
    curl -fL "https://s3.amazonaws.com/sentieon-release/software/sentieon-genomics-${SENTIEON_VERSION}.tar.gz" | \
        tar -zxf - -C /opt/sentieon/

# Install poetry into its own venv so it stays separate from the system python
RUN apt-get update && apt-get install -y git python3 python3-venv && \
    python3 -m venv /opt/poetry-venv && \
    VIRTUAL_ENV=/opt/poetry-venv PATH=/opt/poetry-venv/bin:$PATH pip install poetry

# Build the sentieon-cli
# The project sources are copied last so that edits to them do not invalidate
# the cached tool-build layers above.
COPY data /opt/sentieon-cli/data
COPY pyproject.toml /opt/sentieon-cli/pyproject.toml
COPY sentieon_cli /opt/sentieon-cli/sentieon_cli
COPY README.md /opt/sentieon-cli/README.md
RUN cd /opt/sentieon-cli/ && \
    /opt/poetry-venv/bin/poetry build -f wheel

# Build the container
# Runtime stage: a fresh slim base plus only the artifacts copied out of the
# `downloader` stage.
FROM debian:stable-20240904-slim
ARG SENTIEON_VERSION
ENV SENTIEON_VERSION=$SENTIEON_VERSION

# Image metadata is declared in this stage so that it is attached to the image
# that is actually produced by the build.
LABEL container.base.image="debian:stable-20240904-slim" \
      software.version="${SENTIEON_VERSION}" \
      software.website="https://www.sentieon.com/"

COPY --from=downloader /opt/sentieon/sentieon-genomics-${SENTIEON_VERSION} /opt/sentieon/sentieon-genomics-${SENTIEON_VERSION}
COPY --from=downloader /usr/bin/igzip /usr/bin/igzip
COPY --from=downloader /usr/lib/libisal.a /usr/lib/libisal.a
COPY --from=downloader /usr/lib/libisal.so.2.0.30 /usr/lib/libisal.so.2.0.30
COPY --from=downloader /usr/lib/libisal.la /usr/lib/libisal.la
COPY --from=downloader /usr/local/bin/samtools /usr/local/bin/samtools
COPY --from=downloader /usr/local/bin/bcftools /usr/local/bin/bcftools
COPY --from=downloader /usr/local/bin/bedtools-2.30.0 /usr/local/bin/bedtools-2.30.0
COPY --from=downloader /usr/local/bin/mosdepth-0.3.9 /usr/local/bin/mosdepth-0.3.9
COPY --from=downloader /opt/sentieon-cli/dist /opt/sentieon-cli/dist

# Create links
# The libisal links are created in /usr/lib, next to the copied
# libisal.so.2.0.30; the link targets are relative, so links placed in any
# other directory would dangle.
RUN cd /usr/lib && \
    ln -sf libisal.so.2.0.30 libisal.so.2 && \
    ln -sf libisal.so.2 libisal.so && \
    cd /usr/local/bin/ && \
    ln -s bedtools-2.30.0 bedtools && \
    ln -s mosdepth-0.3.9 mosdepth && \
    chmod ugo+x bedtools-2.30.0 mosdepth-0.3.9

# Install jemalloc as the recommended memory allocation tool, see https://support.sentieon.com/appnotes/jemalloc/
# Install procps for process monitoring
# Install other dependencies
# The apt package lists are removed in the same layer to keep the image small
RUN apt-get update && \
    apt-get install -y \
        libbz2-dev \
        libcurl4-gnutls-dev \
        libdeflate-dev \
        libgsl0-dev \
        libjemalloc2 \
        liblzma-dev \
        libncurses5-dev \
        libperl-dev \
        libssl-dev \
        procps && \
    rm -rf /var/lib/apt/lists/*

ENV SENTIEON_INSTALL_DIR=/opt/sentieon/sentieon-genomics-$SENTIEON_VERSION
ENV PATH=$SENTIEON_INSTALL_DIR/bin/:$PATH
# NOTE(review): this path is specific to x86_64 Debian multiarch layouts;
# confirm before building for another architecture.
ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2

# A default jemalloc configuration that should work well for most use-cases, see http://jemalloc.net/jemalloc.3.html
ENV MALLOC_CONF=metadata_thp:auto,background_thread:true,dirty_decay_ms:30000,muzzy_decay_ms:30000

# Create a venv for the sentieon-cli
RUN apt-get update && apt-get install -y git python3 python3-venv curl && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m venv /opt/sentieon-cli-venv
ENV VIRTUAL_ENV=/opt/sentieon-cli-venv
ENV PATH=/opt/sentieon-cli-venv/bin:$PATH

# Install multiqc into the venv
RUN pip install --no-cache-dir multiqc

# Install the sentieon-cli into the venv
RUN pip install --no-cache-dir /opt/sentieon-cli/dist/*.whl

# Test the container
# The build fails here if any of the bundled tools cannot start
RUN sentieon driver --help && \
    igzip --help && \
    samtools --help && \
    bcftools --help && \
    bedtools --help && \
    multiqc -h && \
    mosdepth -h && \
    sentieon-cli -h

CMD ["/bin/bash"]
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ A command-line interface for the Sentieon software

Download the latest tar.gz file from the GitHub release page, https://github.com/sentieon/sentieon-cli/releases/ and install the package with pip:
```sh
curl -LO https://github.com/sentieon/sentieon-cli/releases/download/v0.4.0/sentieon_cli-1.0.0.tar.gz
curl -LO https://github.com/sentieon/sentieon-cli/releases/download/v1.0.0/sentieon_cli-1.0.0.tar.gz
pip install sentieon_cli-1.0.0.tar.gz
```

Expand Down Expand Up @@ -54,4 +54,4 @@ The `sentieon-cli` supports the following global arguments:
- [**DNAscope LongRead**](docs/dnascope-longread.md) - DNAscope LongRead pipeline implementations for germline SNV and indel calling from long read data.

## License
Unless otherwise indicated, files in this repository are licensed under a BSD 2-Clause License.
Unless otherwise indicated, files in this repository are licensed under a BSD 2-Clause License.
52 changes: 52 additions & 0 deletions sentieon_cli/command_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,32 @@ def cmd_pyexec_vcf_mod_haploid_patch2(
return shlex.join(cmd)


def bcftools_concat(
    out_vcf: pathlib.Path,
    in_vcfs: List[pathlib.Path],
) -> str:
    """Build the shell pipeline that concatenates VCFs into ``out_vcf``.

    The returned string pipes ``bcftools concat -aD <in_vcfs...>`` into
    ``sentieon util vcfconvert - <out_vcf>``. Each stage is quoted with
    ``shlex.join`` so paths containing spaces survive shell parsing.

    Args:
        out_vcf: Path given to ``vcfconvert`` as its output.
        in_vcfs: Input VCF paths, passed to ``bcftools concat`` in order.

    Returns:
        The two quoted commands joined by ``" | "``.
    """
    concat_stage = ["bcftools", "concat", "-aD", *map(str, in_vcfs)]
    convert_stage = ["sentieon", "util", "vcfconvert", "-", str(out_vcf)]
    quoted_stages = (
        shlex.join(stage) for stage in (concat_stage, convert_stage)
    )
    return " | ".join(quoted_stages)


def get_rg_lines(
input_aln: pathlib.Path,
dry_run: bool,
Expand Down Expand Up @@ -234,6 +260,7 @@ def cmd_samtools_fastq_minimap2(
sample_name: str,
input_ref: Optional[pathlib.Path] = None,
fastq_taglist: str = "*",
minimap2_args: str = "-Y",
util_sort_args: str = "--cram_write_options version=3.0,compressor=rans",
) -> str:
"""Re-align an input BAM/CRAM/uBAM/uCRAM file with minimap2"""
Expand Down Expand Up @@ -262,6 +289,7 @@ def cmd_samtools_fastq_minimap2(
"-t",
str(cores),
"-a",
minimap2_args,
"-x",
f"{model_bundle}/minimap2.model",
str(reference),
Expand Down Expand Up @@ -408,6 +436,7 @@ def cmd_fastq_minimap2(
model_bundle: pathlib.Path,
cores: int,
unzip: str = "gzip",
minimap2_args: str = "-Y",
util_sort_args: str = "--cram_write_options version=3.0,compressor=rans",
) -> str:
"""Align an input fastq file with minimap2"""
Expand All @@ -423,6 +452,7 @@ def cmd_fastq_minimap2(
"-t",
str(cores),
"-a",
minimap2_args,
"-x",
f"{model_bundle}/minimap2.model",
"-R",
Expand Down Expand Up @@ -536,3 +566,25 @@ def cmd_multiqc(
]
)
return shlex.join(cmd)


def cmd_mosdepth(
    sample_input: pathlib.Path,
    output_directory: pathlib.Path,
    fasta: Optional[pathlib.Path] = None,
    threads: int = 1,
    xargs: str = (
        "--by 500 --no-per-base --use-median -T 1,3,5,10,15,20,30,40,50"
    ),
) -> str:
    """Build a shell-quoted ``mosdepth`` command line.

    Args:
        sample_input: Alignment file, passed as the last positional argument.
        output_directory: Passed as the first positional argument, ahead of
            ``sample_input``.
        fasta: Optional reference passed with ``--fasta``. The option is left
            out entirely when this is ``None``.
        threads: Value passed with ``--threads``.
        xargs: Extra options as one string; it is split with ``shlex.split``
            and inserted before the positional arguments.

    Returns:
        The command as a single string quoted with ``shlex.join``.
    """
    cmd = ["mosdepth"]
    # Only emit --fasta when a reference was supplied; str(None) would
    # otherwise put the literal text "None" on the command line.
    if fasta is not None:
        cmd.extend(["--fasta", str(fasta)])
    cmd.extend(["--threads", str(threads)])
    cmd.extend(shlex.split(xargs))
    cmd.append(str(output_directory))
    cmd.append(str(sample_input))
    return shlex.join(cmd)
7 changes: 5 additions & 2 deletions sentieon_cli/dnascope.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,18 @@
SVSolver,
WgsMetricsAlgo,
)
from .logging import get_logger
from .util import (
__version__,
check_version,
library_preloaded,
logger,
path_arg,
tmp,
)

logger = get_logger(__name__)


ALN_MIN_VERSIONS = {
"sentieon driver": packaging.version.Version("202308"),
"samtools": packaging.version.Version("1.16"),
Expand Down Expand Up @@ -651,7 +654,7 @@ def dnascope(
assert model_bundle
assert str(output_vcf).endswith(".vcf.gz")

logger.setLevel(kwargs["loglevel"])
logger.parent.setLevel(kwargs["loglevel"])
logger.info("Starting sentieon-cli version: %s", __version__)

if not library_preloaded("libjemalloc.so"):
Expand Down
Loading
Loading