Add new docs

PacificBiosciences · Mar 31, 2021 · 8f3aefa · 8f3aefa
1 parent 0427d51
commit 8f3aefa
Show file tree

Hide file tree

Showing 45 changed files with 848 additions and 0 deletions.
diff --git a/doc/img/isoseq-dedup-faq.png b/doc/img/isoseq-dedup-faq.png
diff --git a/docs/CNAME b/docs/CNAME
@@ -0,0 +1 @@
+isoseq.how
diff --git a/docs/_config.yml b/docs/_config.yml
@@ -0,0 +1,38 @@
+remote_theme: pmarsceill/just-the-docs
+
+# Aux links for the upper right navigation
+aux_links:
+  "File an issue":
+    - "https://github.com/PacificBiosciences/pbbioconda/issues/new?template=bug_report.md"
+
+# Makes Aux links open in a new tab. Default is false
+aux_links_new_tab: true
+
+color_scheme: custom
+
+# Footer content
+# appears at the bottom of every page's main content
+footer_content: "THIS WEBSITE AND CONTENT AND ALL SITE-RELATED SERVICES, INCLUDING ANY DATA, ARE PROVIDED \"AS IS,\" WITH ALL FAULTS, WITH NO REPRESENTATIONS OR WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, SATISFACTORY QUALITY, NON-INFRINGEMENT OR FITNESS FOR A PARTICULAR PURPOSE. YOU ASSUME TOTAL RESPONSIBILITY AND RISK FOR YOUR USE OF THIS SITE, ALL SITE-RELATED SERVICES, AND ANY THIRD PARTY WEBSITES OR APPLICATIONS. NO ORAL OR WRITTEN INFORMATION OR ADVICE SHALL CREATE A WARRANTY OF ANY KIND. ANY REFERENCES TO SPECIFIC PRODUCTS OR SERVICES ON THE WEBSITES DO NOT CONSTITUTE OR IMPLY A RECOMMENDATION OR ENDORSEMENT BY PACIFIC BIOSCIENCES."
+
+# Footer last edited timestamp
+last_edit_timestamp: true # show or hide edit time - page must have `last_modified_date` defined in the frontmatter
+last_edit_time_format: "%b %e %Y at %I:%M %p" # uses ruby's time format: https://ruby-doc.org/stdlib-2.7.0/libdoc/time/rdoc/Time.html
+
+# Footer "Edit this page on GitHub" link text
+gh_edit_link: false # show or hide edit this page link
+
+
+title: "Iso-Seq Docs"
+tagline: "Scalable De Novo Isoform Discovery from PacBio HiFi Reads"
+
+search_enabled: true
+
+ga_tracking: G-SY7XDRP17G
+ga_tracking_anonymize_ip: true
+
+url: "https://isoseq.how"
+
+plugins:
+  - jekyll-redirect-from
+  - jekyll-sitemap
+  - jekyll-seo-tag
diff --git a/docs/_includes/head_custom.html b/docs/_includes/head_custom.html
@@ -0,0 +1,8 @@
+<link rel="apple-touch-icon" sizes="180x180" href="/assets/images/apple-touch-icon.png">
+<link rel="icon" type="image/png" sizes="32x32" href="/assets/images/favicon-32x32.png">
+<link rel="icon" type="image/png" sizes="16x16" href="/assets/images/favicon-16x16.png">
+<link rel="manifest" href="/assets/images/site.webmanifest">
+<link rel="shortcut icon" href="/assets/images/favicon.ico">
+<meta name="msapplication-TileColor" content="#da532c">
+<meta name="theme-color" content="#ffffff">
+<meta http-equiv="Content-Security-Policy" content="upgrade-insecure-requests">
diff --git a/docs/_sass/color_schemes/custom.scss b/docs/_sass/color_schemes/custom.scss
@@ -0,0 +1,5 @@
+$link-color: $blue-000;
+$content-width: 900px;
+$nav-width: 224px;
+$nav-width-md: 200px;
+$sidebar-color: $grey-lt-000;
diff --git a/docs/assets/images/android-chrome-192x192.png b/docs/assets/images/android-chrome-192x192.png
diff --git a/docs/assets/images/android-chrome-512x512.png b/docs/assets/images/android-chrome-512x512.png
diff --git a/docs/assets/images/apple-touch-icon.png b/docs/assets/images/apple-touch-icon.png
diff --git a/docs/assets/images/favicon-16x16.png b/docs/assets/images/favicon-16x16.png
diff --git a/docs/assets/images/favicon-32x32.png b/docs/assets/images/favicon-32x32.png
diff --git a/docs/assets/images/favicon.ico b/docs/assets/images/favicon.ico
diff --git a/docs/assets/images/site.webmanifest b/docs/assets/images/site.webmanifest
@@ -0,0 +1 @@
+{"name":"","short_name":"","icons":[{"src":"/assets/images/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/assets/images/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"}
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -0,0 +1,47 @@
+---
+layout: default
+title: Changelog
+nav_order: 99
+---
+
+# Version changelog
+
+ * **3.4.0**
+   * SMRT Link release 10.0.0
+   * Add support for UMI and cell barcode handling, by adding `tag` and `dedup`
+   * Add `refine --min-rq` to support RQ filtering for unfiltered
+     `<movie>.reads.bam` input
+
+ * 3.3.0
+   * SMRT Link release 9.0.0
+
+ * 3.2.2
+   * Fix `polish` not generating fasta/q output. This bug was introduced in
+     v3.2.0
+
+ * 3.2.1
+   * Fix a gff index 1-off bug in `collapse`
+   * We have removed implicit dependencies from the bioconda recipe. Please
+     install `pbccs`, `lima`, and `pbcoretools` as needed.
+
+ * 3.2.0
+   * **`polish` dropped support for RS II datasets!**
+   * Add `collapse` step for aligned transcript BAM input
+   * Enable CCS-only workflow `cluster --use-qvs`
+   * Add `refine --min-polya-length`
+   * Add `cluster --singletons` to output unclustered FLNCs; potential sample
+     prep artifacts!
+   * Fix minimap2 bugs. Outputs might change slightly.
+
+ * 3.1.2
+   * Reduce `polish` memory footprint
+
+ * 3.1.1
+   * Edge case fix where `polish` would not finish and stall
+   * Improve `polish` run time for large scale datasets (> 1M CCS)
+   * Improve `polish` result quality
+
+ * 3.1.0
+   * We outsourced the poly(A) tail removal and concatemer detection into a new
+     tool called `refine`. Your custom `primers.fasta` is used in this step to
+     detect concatemers.
diff --git a/docs/clustering/cli-workflow.md b/docs/clustering/cli-workflow.md
@@ -0,0 +1,145 @@
+---
+layout: default
+parent: Clustering
+title: CLI Workflow
+nav_order: 3
+---
+
+# CLI Workflow
+
+The low-level workflow explained via CLI calls. All necessary dependencies are
+installed via bioconda.
+
+## Step 1 - Input
+### CLR data from Sequel / Sequel II / Sequel IIe
+For each SMRT cell a `movieX.subreads.bam` is needed for processing.
+
+Each sequencing run is processed by [*ccs*](https://github.com/PacificBiosciences/ccs)
+to generate one representative circular consensus sequence (CCS) for each ZMW.
+It is advised to use the latest CCS version 4.2.0 or newer.
+_ccs_ can be installed with `conda install pbccs`.
+
+    $ ccs movieX.subreads.bam movieX.ccs.bam --min-rq 0.9
+
+You can easily parallelize _ccs_ generation by chunking; please follow [this how-to](https://ccs.how/faq/parallelize).
+
+### CCS data from Sequel IIe
+If on-instrument CCS was performed, you can use the `reads.bam` or
+`hifi_reads.bam` as input.
+
+The `hifi_reads.bam` contains only HiFi reads, with predicted accuracy ≥Q20. No
+additional filtering is required.
+
+The `reads.bam` contains one representative sequence per productive ZMW,
+irrespective of quality and passes. Do not forget to use `isoseq3 refine
+--min-rq 0.9` in step 3!
+
+## Step 2 - Primer removal and demultiplexing
+Removal of primers and identification of barcodes is performed using [*lima*](https://lima.how/),
+which can be installed with \
+`conda install lima` and offers a specialized `--isoseq` mode.
+Even in the case that your sample is not barcoded, primer removal is performed
+by *lima*.
+If there are more than two sequences in your `primer.fasta` file or, more
+precisely, more than one pair of 5' and 3' primers, please use *lima* with
+`--peek-guess` to remove spurious false positive signal.
+More information about how to name input primer(+barcode)
+sequences can be found in this [lima Iso-Seq FAQ](https://lima.how/faq/isoseq).
+
+    $ lima movieX.ccs.bam barcoded_primers.fasta movieX.fl.bam --isoseq --peek-guess
+
+**Example 1:**
+Following is the `primer.fasta` for the Clontech SMARTer and NEB cDNA library
+prep, which are the officially recommended protocols:
+
+    >NEB_5p
+    GCAATGAAGTCGCAGGGTTGGG
+    >Clontech_5p
+    AAGCAGTGGTATCAACGCAGAGTACATGGGG
+    >NEB_Clontech_3p
+    GTACTCTGCGTTGATACCACTGCTT
+
+**Example 2:**
+Following are examples for barcoded primers using a 16bp barcode followed by
+Clontech primer:
+
+    >primer_5p
+    AAGCAGTGGTATCAACGCAGAGTACATGGGG
+    >brain_3p
+    CGCACTCTGATATGTGGTACTCTGCGTTGATACCACTGCTT
+    >liver_3p
+    CTCACAGTCTGTGTGTGTACTCTGCGTTGATACCACTGCTT
+
+*Lima* will remove unwanted combinations and orient sequences to 5' → 3' orientation.
+
+Output files will be called according to their primer pair. Example for
+single sample libraries:
+
+    movieX.fl.NEB_5p--NEB_Clontech_3p.bam
+
+If your library contains multiple samples, execute the following workflow
+for each primer pair:
+
+    movieX.fl.primer_5p--brain_3p.bam
+    movieX.fl.primer_5p--liver_3p.bam
+
+## Step 3 - Refine
+Your data now contains full-length reads, but still needs to be refined by:
+ - [Trimming](https://github.com/PacificBiosciences/trim_isoseq_polyA) of poly(A) tails
+ - Rapid concatemer [identification](https://github.com/jeffdaily/parasail) and removal
+
+**Input**\
+The input file for *refine* is one demultiplexed CCS file with full-length reads
+and the primer fasta file:
+ - `<movie.primer--pair>.fl.bam` or `<movie.primer--pair>.fl.consensusreadset.xml`
+ - `primers.fasta`
+
+**Output**\
+The following output files of *refine* contain full-length non-concatemer reads:
+ - `<movie>.flnc.bam`
+ - `<movie>.flnc.transcriptset.xml`
+
+Actual command to refine:
+
+    $ isoseq refine movieX.NEB_5p--NEB_Clontech_3p.fl.bam primers.fasta movieX.flnc.bam
+
+If your sample has poly(A) tails, use `--require-polya`.
+This filters for FL reads that have a poly(A) tail
+with at least 20 base pairs (`--min-polya-length`) and removes the identified tail:
+
+    $ isoseq refine movieX.NEB_5p--NEB_Clontech_3p.fl.bam primers.fasta movieX.flnc.bam --require-polya
+
+**Attention!**\
+If your workflow input is `reads.bam`, use `--min-rq 0.9`
+
+## Step 3b - Merge SMRT Cells
+If you used more than one SMRT cell, list all of your `<movie>.flnc.bam` in one
+`flnc.fofn`, a file of filenames:
+
+    $ ls movie*.flnc.bam > flnc.fofn
+
+## Step 4 - Clustering
+Compared to previous IsoSeq approaches, *IsoSeq v3* performs a single clustering
+technique.
+Due to the nature of the algorithm, it can't be efficiently parallelized.
+It is advised to give this step as many cores as possible.
+The individual steps of *cluster* are as follows:
+
+ - Clustering using hierarchical n*log(n) [alignment](https://github.com/lh3/minimap2) and iterative cluster merging
+ - Polished [POA](https://github.com/rvaser/spoa) sequence generation, using a QV guided consensus approach
+
+**Input**\
+The input file for *cluster* is one FLNC file:
+ - `<movie>.flnc.bam` or `flnc.fofn`
+
+**Output**\
+The following output files of *cluster* contain polished isoforms:
+ - `<prefix>.bam`
+ - `<prefix>.hq.fasta.gz` with predicted accuracy ≥ 0.99
+ - `<prefix>.lq.fasta.gz` with predicted accuracy < 0.99
+ - `<prefix>.bam.pbi`
+ - `<prefix>.transcriptset.xml`
+
+Example invocation:
+
+    $ isoseq cluster flnc.fofn clustered.bam --verbose --use-qvs
diff --git a/docs/clustering/examples.md b/docs/clustering/examples.md
@@ -0,0 +1,91 @@
+---
+layout: default
+parent: Clustering
+title: Examples
+nav_order: 4
+---
+
+## Real-world example
+
+### Single sample
+This is an example of an end-to-end cmd-line-only workflow to get from
+subreads to transcripts. It's a 1% subsampled Alzheimer dataset.
+You can either download the subreads and call HiFi on your own or skip this step
+and download the HiFi reads generated by CCS v4.2:
+
+    $ wget https://downloads.pacbcloud.com/public/dataset/IsoSeq_sandbox/2020_Alzheimer8M_subset/alz.1perc.subreads.bam
+
+    $ ccs --version
+    ccs 4.0.0
+
+    $ ccs alz.1perc.subreads.bam alz.1perc.ccs.bam --min-rq 0.9
+
+    # Or download the pre-computed HiFi reads
+    $ wget https://downloads.pacbcloud.com/public/dataset/IsoSeq_sandbox/2020_Alzheimer8M_subset/alz.1perc.ccs.bam
+
+    $ cat primers.fasta
+    >NEB_5p
+    GCAATGAAGTCGCAGGGTTGGGG
+    >Clontech_5p
+    AAGCAGTGGTATCAACGCAGAGTACATGGGG
+    >NEB_Clontech_3p
+    GTACTCTGCGTTGATACCACTGCTT
+
+    $ lima --version
+    lima 1.11.0 (commit v1.11.0)
+
+    $ lima alz.1perc.ccs.bam primers.fasta alz.fl.bam --isoseq --peek-guess
+
+    $ ls alz.fl*
+    alz.fl.json         alz.fl.lima.summary
+    alz.fl.lima.clips   alz.fl.NEB_5p--NEB_Clontech_3p.bam
+    alz.fl.lima.counts  alz.fl.NEB_5p--NEB_Clontech_3p.bam.pbi
+    alz.fl.lima.guess   alz.fl.NEB_5p--NEB_Clontech_3p.subreadset.xml
+    alz.fl.lima.report
+
+    $ isoseq refine alz.fl.NEB_5p--NEB_Clontech_3p.bam primers.fasta alz.flnc.bam
+
+    $ ls alz.flnc.*
+    alz.flnc.bam                   alz.flnc.filter_summary.json
+    alz.flnc.bam.pbi               alz.flnc.report.csv
+    alz.flnc.consensusreadset.xml
+
+    $ isoseq cluster alz.flnc.bam clustered.bam --verbose --use-qvs
+    Read BAM                 : (37648) 1s 235ms
+    Convert to reads         : 589ms 797us
+    Sort Reads               : 8ms 409us
+    Aligning Linear          : 23s 63ms
+    Read to clusters         : 861ms 287us
+    Aligning Linear          : 20s 279ms
+    Merge by mapping         : 7s 242ms
+    Consensus                : 4s 663ms
+    Merge by mapping         : 980ms 742us
+    Consensus                : 103ms 913us
+    Write output             : 1s 799ms
+
+    $ ls clustered*
+    clustered.bam                 clustered.hq.fasta.gz
+    clustered.bam.pbi             clustered.lq.bam
+    clustered.cluster             clustered.lq.bam.pbi
+    clustered.cluster_report.csv  clustered.lq.fasta.gz
+    clustered.hq.bam              clustered.transcriptset.xml
+    clustered.hq.bam.pbi
+
+### Multiplexed samples
+
+    # Download HiFi reads
+    $ wget https://downloads.pacbcloud.com/public/dataset/IsoSeq_sandbox/2020_MultiplexIsoSeq_toy/m54363_190223_194117.ccs.bam
+
+    # Download barcoded primers
+    $ wget https://downloads.pacbcloud.com/public/dataset/IsoSeq_sandbox/2020_MultiplexIsoSeq_toy/NEB_barcode16.fasta
+
+    # Demux and primer removal
+    $ lima m54363_190223_194117.ccs.bam NEB_barcode16.fasta fl.bam --isoseq --peek-guess
+
+    # Combine inputs
+    $ ls fl.bc1001_5p--bc1001_3p.bam fl.bc1002_5p--bc1002_3p.bam > all.fofn
+
+    # Remove poly(A) tails and concatemer
+    $ isoseq3 refine all.fofn NEB_barcode16.fasta flnc.bam --require-polya --log-level DEBUG
+
+    $ isoseq3 cluster flnc.bam clustered.bam --use-qvs --verbose
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"name":"","short_name":"","icons":[{"src":"/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"}