From ee7464473bd26b0dc9859f294d07d30251792b4c Mon Sep 17 00:00:00 2001 From: Nicole Deflaux Date: Thu, 26 Jan 2017 14:29:58 -0800 Subject: [PATCH 1/4] Fix parameter name and allow for any scala version. Change-Id: Iefd87158f5b93cc0f6b5feecd487bb642fd1fa5f --- docs/source/conf.py | 2 +- docs/source/includes/spark_setup.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index a6eb119..626474a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -252,7 +252,7 @@ .. GLOBAL SUBSTITUTIONS CAN GO HERE -.. |sparkADC| replace:: If the `Application Default Credentials`_ are not sufficient, use ``--secretsFile=PATH/TO/YOUR/client_secrets.json``. If you do not already have this file, see the `authentication instructions`_ to obtain it. +.. |sparkADC| replace:: If the `Application Default Credentials`_ are not sufficient, use ``--client-secrets=PATH/TO/YOUR/client_secrets.json``. If you do not already have this file, see the `authentication instructions`_ to obtain it. .. |dataflowADC| replace:: If the `Application Default Credentials`_ are not sufficient, use ``--client-secrets PATH/TO/YOUR/client_secrets.json``. If you do not already have this file, see the `authentication instructions`_ to obtain it. .. |dataflowSomeRefs| replace:: Use a comma-separated list to run over multiple disjoint regions. For example to run over `BRCA1`_ and `BRCA2`_ ``--references=chr13:32889610:32973808,chr17:41196311:41277499``. .. |dataflowAllRefs| replace:: To run this pipeline over the entire genome, use ``--allReferences`` instead of ``--references=chr17:41196311:41277499``. 
diff --git a/docs/source/includes/spark_setup.rst b/docs/source/includes/spark_setup.rst index a37078b..2c292c2 100644 --- a/docs/source/includes/spark_setup.rst +++ b/docs/source/includes/spark_setup.rst @@ -43,6 +43,6 @@ cd spark-examples sbt assembly - cp target/scala-2.10/googlegenomics-spark-examples-assembly-*.jar ~/ + cp target/scala-2.*/googlegenomics-spark-examples-assembly-*.jar ~/ cd ~/ From 21e28cae84e700b5e7ca8eda807a1a6085f1b976 Mon Sep 17 00:00:00 2001 From: Nicole Deflaux Date: Fri, 27 Jan 2017 12:44:39 -0800 Subject: [PATCH 2/4] Add PrecisionFDA dataset. Change-Id: I0409a9c9554d360d1725f0f298d0e3fa5649832b --- docs/source/conf.py | 1 + .../discover_public_data/genomic_data_toc.rst | 1 + .../platinum_genomes_deepvariant.rst | 4 +- .../discover_public_data/precision_fda.rst | 38 +++++++++++++++++++ .../reference_genomes.rst | 26 +++++++++++++ 5 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 docs/source/use_cases/discover_public_data/precision_fda.rst diff --git a/docs/source/conf.py b/docs/source/conf.py index 626474a..b7b59b6 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -186,6 +186,7 @@ .. _VariantSet: https://cloud.google.com/genomics/reference/rest/v1/variantsets .. _Load Genomic Variants: https://cloud.google.com/genomics/v1/load-variants .. _Understanding the BigQuery Variants Table Schema: https://cloud.google.com/genomics/v1/bigquery-variants-schema +.. _Verily DeepVariant: https://cloud.google.com/genomics/v1alpha2/deepvariant .. _Using Google Cloud Storage with Big Data: https://cloud.google.com/storage/docs/working-with-big-data .. 
_gsutil: https://cloud.google.com/storage/docs/gsutil diff --git a/docs/source/use_cases/discover_public_data/genomic_data_toc.rst b/docs/source/use_cases/discover_public_data/genomic_data_toc.rst index 658f5bb..b02f26a 100644 --- a/docs/source/use_cases/discover_public_data/genomic_data_toc.rst +++ b/docs/source/use_cases/discover_public_data/genomic_data_toc.rst @@ -23,6 +23,7 @@ __ RenderedVersion_ 1000_genomes platinum_genomes platinum_genomes_deepvariant + precision_fda reference_genomes mssng_data isb_cgc_data diff --git a/docs/source/use_cases/discover_public_data/platinum_genomes_deepvariant.rst b/docs/source/use_cases/discover_public_data/platinum_genomes_deepvariant.rst index 85f8c81..672101e 100644 --- a/docs/source/use_cases/discover_public_data/platinum_genomes_deepvariant.rst +++ b/docs/source/use_cases/discover_public_data/platinum_genomes_deepvariant.rst @@ -11,13 +11,13 @@ Platinum Genomes DeepVariant | **If you are reading this on github, you should instead click** `here`__. | +-----------------------------------------------------------------------------------+ -.. _RenderedVersion: http://googlegenomics.readthedocs.org/en/latest/use_cases/discover_public_data/platinum_genomes.html +.. _RenderedVersion: http://googlegenomics.readthedocs.org/en/latest/use_cases/discover_public_data/platinum_genomes_deepvariant.html __ RenderedVersion_ .. comment: end: goto-read-the-docs -This dataset comprises the `6 member CEPH pedigree 1463 `_ called using the DeepVariant toolchain and reference genome GRCh38. See the `DeepVariant preprint `_ for full details: +This dataset comprises the `6 member CEPH pedigree 1463 `_ called using the the alpha version of the `Verily DeepVariant`_ toolchain aligned to :ref:`vgrch38` reference genome. See the `DeepVariant preprint `_ for full details: | `Creating a universal SNP and small indel variant caller with deep neural networks `_ | Ryan Poplin, Dan Newburger, Jojo Dijamco, Nam Nguyen, Dion Loy, Sam Gross, Cory Y. 
McLean, Mark A. DePristo diff --git a/docs/source/use_cases/discover_public_data/precision_fda.rst b/docs/source/use_cases/discover_public_data/precision_fda.rst new file mode 100644 index 0000000..d49c31d --- /dev/null +++ b/docs/source/use_cases/discover_public_data/precision_fda.rst @@ -0,0 +1,38 @@ +PrecisionFDA Truth Challenge +============================ + +.. comment: begin: goto-read-the-docs + +.. container:: visible-only-on-github + + +-----------------------------------------------------------------------------------+ + | **The properly rendered version of this document can be found at Read The Docs.** | + | | + | **If you are reading this on github, you should instead click** `here`__. | + +-----------------------------------------------------------------------------------+ + +.. _RenderedVersion: http://googlegenomics.readthedocs.org/en/latest/use_cases/discover_public_data/precision_fda.html + +__ RenderedVersion_ + +.. comment: end: goto-read-the-docs + +This dataset includes both: + +* the input for the `PrecisionFDA Truth Challenge `_ comprised of whole-genome sequences for HG001 (NA12878) and HG002 (NA24385) +* the output from the alpha version of the `Verily DeepVariant`_ toolchain aligned to :ref:`vgrch38` reference genome. See the `DeepVariant preprint `_ for full details: + + | `Creating a universal SNP and small indel variant caller with deep neural networks `_ + | Ryan Poplin, Dan Newburger, Jojo Dijamco, Nam Nguyen, Dion Loy, Sam Gross, Cory Y. McLean, Mark A. DePristo + | DOI: https://doi.org/10.1101/092890 + | + +Google Cloud Platform data locations +------------------------------------ + +* Google Cloud Storage folder `gs://genomics-public-data/precision-fda `_ + +Provenance +---------- + +* The FASTQ files in `gs://genomics-public-data/precision-fda/input `_ were run through the `Verily DeepVariant`_ alpha toolchain to produce the corresponding files in `gs://genomics-public-data/precision-fda/output/deepvariant-alpha `_. 
diff --git a/docs/source/use_cases/discover_public_data/reference_genomes.rst b/docs/source/use_cases/discover_public_data/reference_genomes.rst index d0fc6cd..3db344d 100644 --- a/docs/source/use_cases/discover_public_data/reference_genomes.rst +++ b/docs/source/use_cases/discover_public_data/reference_genomes.rst @@ -58,6 +58,32 @@ Genome Reference Consortium Human Build 38 includes data from 39 gzipped fasta f More information on this source data can be found in this `NCBI article `__ and in the `FTP README `__. + +.. _vgrch38: + +Verily's GRCh38 +^^^^^^^^^^^^^^^ + +Verily's GRCh38 reference genome is fully compatible with any b38 genome in the autosome. + +Verily elected to use a version of GRCh38 that excludes all patch sequences, omits alternate haplotype chromosomes, includes decoy sequences, and masks out duplicate copies of centromeric regions. There is an existing genome assembly version, named `GRCh38_no_alt_plus_hs38d1 `_, for which these transformations have all already been performed by the GRC. This assembly version was created specifically for analysis, with its rationale and exact genome modifications thoroughly documented in its `README file `_. Consequently, we elected to use this as our base genome assembly, but applied a few modifications described below. + +A common variation in representation in genome assemblies is the decision on chromosome naming. The two most prevalent naming schemes for the nuclear and mitochondrial chromosomes are: + +'Chr' naming: + chr{1..22}, chrX, chrY, chrM + +'Int' naming: + {1..22}, X, Y, MT + +Ensuring consistency across datasets is of paramount importance, to avoid the annoying-at-best, error-prone-at-worst conversion between the naming schemes. Because much of the additional data files we use are provided by GENCODE, which uses 'chr' naming when referring to the same major genome assembly (GRCh38), we elected to use 'chr' naming as well. 
This necessitated transformation from the 'int' naming used by the GRCh38_no_alt_plus_hs38d1. + +We noticed that the carefully-curated GRCh38_no_alt_plus_hs38d1 genome sequence contains extended IUPAC code representations for some base pairs. Because these codes cause issues for many tools, we decided to follow the VCF 4.3 specification which recommends converting all extended IUPAC codes to the first matching alphabetical base pair. + +Because we modified both the chromosome naming scheme and the sequence of 74 extended IUPAC characters, a unique naming scheme for the genome reference we use is imperative. We elected to use the generic naming scheme + +``_Verily_v`` + hg19 ^^^^ From 5a6ae0289224634bd7eaa130cf8bd130d3acf309 Mon Sep 17 00:00:00 2001 From: Nicole Deflaux Date: Fri, 27 Jan 2017 17:00:59 -0800 Subject: [PATCH 3/4] Update to more concise description. Change-Id: I1d5bc8484127b6ee9362dd492800d5bd46a8dbf4 --- .../reference_genomes.rst | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/docs/source/use_cases/discover_public_data/reference_genomes.rst b/docs/source/use_cases/discover_public_data/reference_genomes.rst index 3db344d..8b2f894 100644 --- a/docs/source/use_cases/discover_public_data/reference_genomes.rst +++ b/docs/source/use_cases/discover_public_data/reference_genomes.rst @@ -66,23 +66,27 @@ Verily's GRCh38 Verily's GRCh38 reference genome is fully compatible with any b38 genome in the autosome. -Verily elected to use a version of GRCh38 that excludes all patch sequences, omits alternate haplotype chromosomes, includes decoy sequences, and masks out duplicate copies of centromeric regions. There is an existing genome assembly version, named `GRCh38_no_alt_plus_hs38d1 `_, for which these transformations have all already been performed by the GRC. This assembly version was created specifically for analysis, with its rationale and exact genome modifications thoroughly documented in its `README file `_. 
Consequently, we elected to use this as our base genome assembly, but applied a few modifications described below. +Verily's GRCh38: -A common variation in representation in genome assemblies is the decision on chromosome naming. The two most prevalent naming schemes for the nuclear and mitochondrial chromosomes are: +* excludes all patch sequences +* omits alternate haplotype chromosomes +* includes decoy sequences +* masks out duplicate copies of centromeric regions -'Chr' naming: - chr{1..22}, chrX, chrY, chrM +The base assembly is `GRCh38_no_alt_plus_hs38d1 `_. This assembly version was created specifically for analysis, with its rationale and exact genome modifications thoroughly documented in its `README `_ file. -'Int' naming: - {1..22}, X, Y, MT +Verily applied the following modifications to the base assembly: -Ensuring consistency across datasets is of paramount importance, to avoid the annoying-at-best, error-prone-at-worst conversion between the naming schemes. Because much of the additional data files we use are provided by GENCODE, which uses 'chr' naming when referring to the same major genome assembly (GRCh38), we elected to use 'chr' naming as well. This necessitated transformation from the 'int' naming used by the GRCh38_no_alt_plus_hs38d1. +* Reference segment names are prefixed with "chr". -We noticed that the carefully-curated GRCh38_no_alt_plus_hs38d1 genome sequence contains extended IUPAC code representations for some base pairs. Because these codes cause issues for many tools, we decided to follow the VCF 4.3 specification which recommends converting all extended IUPAC codes to the first matching alphabetical base pair. + +--------------------------------------------------------------+ + | Many of the additional data files we use are provided | + | by GENCODE, which uses the "chr" naming convention. 
| + +--------------------------------------------------------------+ -Because we modified both the chromosome naming scheme and the sequence of 74 extended IUPAC characters, a unique naming scheme for the genome reference we use is imperative. We elected to use the generic naming scheme +* All 74 extended IUPAC codes are converted to the first matching alphabetical base pair as recommended in the VCF 4.3 specification. -``_Verily_v`` +* This release of the genome reference is named ``GRCh38_Verily_v1`` hg19 ^^^^ From a5e5915153a7ef1b47422da66b8b8cca725ef80a Mon Sep 17 00:00:00 2001 From: Nicole Deflaux Date: Mon, 30 Jan 2017 10:40:40 -0800 Subject: [PATCH 4/4] Fix typo. Change-Id: I32d6ff651a56649eba1c5eddb81ff721c004a79a --- .../discover_public_data/platinum_genomes_deepvariant.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/use_cases/discover_public_data/platinum_genomes_deepvariant.rst b/docs/source/use_cases/discover_public_data/platinum_genomes_deepvariant.rst index 672101e..49dac99 100644 --- a/docs/source/use_cases/discover_public_data/platinum_genomes_deepvariant.rst +++ b/docs/source/use_cases/discover_public_data/platinum_genomes_deepvariant.rst @@ -17,7 +17,7 @@ __ RenderedVersion_ .. comment: end: goto-read-the-docs -This dataset comprises the `6 member CEPH pedigree 1463 `_ called using the the alpha version of the `Verily DeepVariant`_ toolchain aligned to :ref:`vgrch38` reference genome. See the `DeepVariant preprint `_ for full details: +This dataset comprises the `6 member CEPH pedigree 1463 `_ called using the alpha version of the `Verily DeepVariant`_ toolchain aligned to :ref:`vgrch38` reference genome. See the `DeepVariant preprint `_ for full details: | `Creating a universal SNP and small indel variant caller with deep neural networks `_ | Ryan Poplin, Dan Newburger, Jojo Dijamco, Nam Nguyen, Dion Loy, Sam Gross, Cory Y. McLean, Mark A. DePristo