diff --git a/docs/config.rst b/docs/config.rst index 0ed9db44..7d849241 100644 --- a/docs/config.rst +++ b/docs/config.rst @@ -24,20 +24,20 @@ To add species, the following command can be used:: # path to the annotation (.gtf) file for the species # to be added -The ``spacemake config update_species`` takes the same arguments as above, while ``spacemake config delete_species`` takes only ``--name``. +The ``spacemake config update-species`` takes the same arguments as above, while ``spacemake config delete-species`` takes only ``--name``. As of version ``0.7`` you can add multiple reference sequences per species. For that, -simply execute ``add_species`` multiple times, varying ``--reference ...`` but keeping ``--name`` constant. +simply execute ``add-species`` multiple times, varying ``--reference ...`` but keeping ``--name`` constant. To list the currently available ``species``, type:: - spacemake config list_species + spacemake config list-species -Configure barcode\_flavors +Configure barcode-flavors -------------------------- -.. _configure-barcode_flavor: +.. _configure-barcode-flavor: This sample-variable describes how the cell-barcode and the UMI should be extracted from Read1 and Read2. The ``default`` value for barcode\_flavor will be dropseq: ``cell = r1[0:12]`` (cell-barcode comes from first 12nt of Read1) and @@ -45,10 +45,10 @@ The ``default`` value for barcode\_flavor will be dropseq: ``cell = r1[0:12]`` ( **If a sample has no barcode\_flavor provided, the default run\_mode will be used** -Provided barcode\_flavors +Provided barcode-flavors ^^^^^^^^^^^^^^^^^^^^^^^^^ -Spacemake provides the following barcode\_flavors out of the box: +Spacemake provides the following barcode-flavors out of the box: .. code-block:: yaml @@ -74,16 +74,16 @@ Spacemake provides the following barcode\_flavors out of the box: cell: "r1[0:16]" UMI: "r1[16:28]" -To list the currently available ``barcode_flavor``-s, type:: +To list the currently available ``barcode-flavor``-s, type:: - spacemake config list_barcode_flavors + spacemake config list_barcode-flavors -Add a new barcode\_flavor +Add a new barcode-flavor ^^^^^^^^^^^^^^^^^^^^^^^^^ .. code-block:: - spacemake config add_barcode_flavor \ + spacemake config add_barcode-flavor \ --name NAME \ # name of the barcode flavor @@ -92,26 +92,26 @@ Add a new barcode\_flavor # Example: to set UMI to 13-20 NT of Read1, use --umi r1[12:20]. # It is also possible to use the first 8nt of Read2 as UMI: --umi r2[0:8]. - --cell_barcode CELL_BARCODE + --cell-barcode CELL-BARCODE # structure of CELL BARCODE, using python's list syntax. - # Example: to set the cell_barcode to 1-12 nt of Read1, use --cell_barcode r1[0:12]. + # Example: to set the cell-barcode to 1-12 nt of Read1, use --cell-barcode r1[0:12]. # It is also possible to reverse the CELL BARCODE, for instance with r1[0:12][::-1]. -Update/delete a barcode\_flavor +Update/delete a barcode-flavor ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``spacemake config update_barcode_flavor`` takes the same arguments as above, while ``spacemake config delete_barcode_flavor`` takes only ``--name``. +The ``spacemake config update-barcode-flavor`` takes the same arguments as above, while ``spacemake config delete-barcode-flavor`` takes only ``--name``. -Configure run\_modes +Configure run-modes -------------------- -.. _configure-run_mode: +.. _configure-run-mode: Specifying a "run mode" is an essential flexibity that spacemake offers. -Through setting a ``run_mode``, a sample can be processed and analysed downstream in various fashions. +Through setting a ``run-mode``, a sample can be processed and analysed downstream in various fashions. -Each ``run_mode`` can have the following variables: +Each ``run-mode`` can have the following variables: ``n_beads`` number of cell-barcode expected @@ -137,7 +137,7 @@ Each ``run_mode`` can have the following variables: counted this way, which map to exactly one CDS or UTR segment of a gene. ``mesh_data`` (spatial only) - if ``True`` a mesh will be created when running this ``run_mode``. + if ``True`` a mesh will be created when running this ``run-mode``. ``mesh_type`` (spatial only) spacemake currently offers two types of meshes: (1) ``circle``, where circles with a given @@ -156,12 +156,12 @@ Each ``run_mode`` can have the following variables: filter out pucks from DGE creation and subsequent steps of the pipeline. If set to 0, no pucks are excluded. -``parent_run_mode`` - Each ``run_mode`` can have a parent, to which it will fall back. - If a one of the ``run_mode`` variables is missing, the variable of the parent will be used. - If parent is not provided, the ``default`` ``run_mode`` will be the parent. +``parent_run-mode`` + Each ``run-mode`` can have a parent, to which it will fall back. + If a one of the ``run-mode`` variables is missing, the variable of the parent will be used. + If parent is not provided, the ``default`` ``run-mode`` will be the parent. -Provided run\_modes +Provided run-modes ^^^^^^^^^^^^^^^^^^^^^ .. code-block:: yaml @@ -234,23 +234,23 @@ Provided run\_modes - 1000 .. note:: - If a sample has no ``run_mode`` provided, the ``default`` will be used + If a sample has no ``run-mode`` provided, the ``default`` will be used .. note:: - If a ``run_mode`` variable is not provided, the variable of the default ``run_mode`` will be used + If a ``run-mode`` variable is not provided, the variable of the default ``run-mode`` will be used -To list the currently available ``run_mode``-s, type:: +To list the currently available ``run-mode``-s, type:: - spacemake config list_run_modes + spacemake config list_run-modes Add a new run\_mode ^^^^^^^^^^^^^^^^^^^ -See the :ref:`variable descriptions ` above. +See the :ref:`variable descriptions ` above. .. code-block:: - spacemake config add_run_mode \ + spacemake config add_run-mode \ --name NAME \ --parent_run_mode PARENT_RUN_MODE \ --umi_cutoff UMI_CUTOFF [UMI_CUTOFF ...] \ @@ -265,10 +265,10 @@ See the :ref:`variable descriptions ` above. --mesh_spot_diameter_um MESH_SPOT_DIAMETER_UM \ --mesh_spot_distance_um MESH_SPOT_DISTANCE_UM -Update/delete a run\_mode +Update/delete a run-mode ^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``spacemake config update_run_mode`` takes the same arguments as above, while ``spacemake config delete_run_mode`` takes only ``--name``. +The ``spacemake config update-run-mode`` takes the same arguments as above, while ``spacemake config delete-run-mode`` takes only ``--name``. Configure pucks @@ -279,7 +279,7 @@ Configure pucks Each spatial sample is associated with a ``puck``. The ``puck`` variable defines the dimensionality of the underlying spatial structure, which spacemake uses during the automated analysis and plotting, as well as the binning (meshing) of -the data when selected in the ``run_mode``. +the data when selected in the ``run-mode``. Each puck has the following variables: diff --git a/docs/initialize.rst b/docs/initialize.rst index 99ce58b4..42b79d2e 100644 --- a/docs/initialize.rst +++ b/docs/initialize.rst @@ -11,19 +11,20 @@ Optional arguments The `spacemake init` command takes the following optional arguments: -``root_dir`` - The ``root_dir`` for the spacemake instance. Defaults to ``.``, the directory in which `spacemake init` is ran. +``root-dir`` + The ``root-dir`` for the spacemake instance. Defaults to ``.``, the directory in which `spacemake init` is ran. -``temp_dir`` +``temp-dir`` Path to the temporary directory, defaults to ``/tmp``. -``download_species`` - If set, spacemake will download the genome (.fa) and annotation (.gtf) files for mouse and human (from gencode, as specified `here `_. +``download-species`` + If set, spacemake will download the genome (.fa) and annotation (.gtf) files for mouse and + human from gencode, as specified `here `_. Hence, the complete `spacemake init` command looks like this:: spacemake init \ - --root_dir ROOT_DIR \ # optional - --temp_dir TEMP_DIR \ # optional - --download_species \ # optional - --dropseq_tools DROPSEQ_TOOLS # required + --root-dir ROOT-DIR \ # optional + --temp-dir TEMP-DIR \ # optional + --download-species \ # optional + --dropseq-tools DROPSEQ-TOOLS # required diff --git a/docs/projects/index.rst b/docs/projects/index.rst index 492745a0..60b17686 100644 --- a/docs/projects/index.rst +++ b/docs/projects/index.rst @@ -6,7 +6,7 @@ directory of the spacemake project. Each sample has exactly one row in this ``project_df.csv`` file. In the back-end, spacemake uses a ``pandas.DataFrame`` to load and save this ``.csv`` file on disk. This data frame -is indexed by key ``(project_id, sample_id)`` +is indexed by key ``(project-id, sample-id)`` The spacemake class responsible for this back-end logic is the :ref:`ProjectDF` class. @@ -18,11 +18,11 @@ Sample parameters In spacemake each sample can have the folloing variables: -``project_id`` - ``project_id`` of a sample +``project-id`` + ``project-id`` of a sample -``sample_id`` - ``sample_id`` of a sample +``sample-id`` + ``sample-id`` of a sample ``R1`` ``.fastq.gz`` file path(s) to Read1 read file(s). Can be either a single file, or a space separated list of consecutive files. If a list provided, the files will be merged together and the merged ``R1.fastq.gz`` will be processed downstream. @@ -117,8 +117,8 @@ In spacemake each sample can have the folloing variables: If not provided, a ``default`` puck will be used with ``width_um=3000``, ``spot_diameter_um=10``. -``puck_id`` (optional) - ``puck_id`` of a sample +``puck-id`` (optional) + ``puck-id`` of a sample ``puck_barcode_file`` (optional) the path to the file contining (x,y) positions of the barcodes. If the ``puck`` for this @@ -142,8 +142,8 @@ In spacemake each sample can have the folloing variables: To add a single sample, we can use the following command:: spacemake projects add_sample \ - --project_id PROJECT_ID \ # required - --sample_id SAMPLE_ID \ # required + --project-id PROJECT-ID \ # required + --sample-id SAMPLE-ID \ # required --R1 R1 [R1 R1 ...] \ # required, if no longreads --R2 R2 [R2 R2 ...] \ # required, if no longreads --longreads LONGREADS \ # required, if no R1 & R2 @@ -151,7 +151,7 @@ To add a single sample, we can use the following command:: --barcode_flavor BARCODE_FLAVOR \ # optional --species SPECIES \ # required --puck PUCK \ # optional - --puck_id PUCK_ID \ # optional + --puck-id PUCK-ID \ # optional --puck_barcode_file PUCK_BARCODE_FILE \ # optional --investigator INVESTIGATOR \ # optional --experiment EXPERIMENT \ # optional @@ -221,10 +221,10 @@ Step 4: add your sample Once everything is configured you can add your custom spatial sample with the following command:: spacemake projects add_sample \ - # your sample's project_id \ - --project_id PROJECT_ID \ - # your sample's sample_id \ - --sample_id SAMPLE_ID \ + # your sample's project-id \ + --project-id PROJECT-ID \ + # your sample's sample-id \ + --sample-id SAMPLE-ID \ # one or more R1.fastq.gz files --R1 R1 [R1 R1 ...] \ # one or more R2.fastq.gz files @@ -265,30 +265,30 @@ The ``samples.yaml`` should have the following structure: .. code-block:: yaml additional_projects: - - project_id: visium - sample_id: visium_1 + - project-id: visium + sample-id: visium_1 R1: R2: species: mouse puck: visium - barcode_flavor: visium - run_mode: [visium] - - project_id: visium - sample_id: visium_2 + barcode-flavor: visium + run-mode: [visium] + - project-id: visium + sample-id: visium_2 R1: R2: species: human puck: visium - barcode_flavor: visium - run_mode: [visium] - - project_id: slideseq - sample_id: slideseq_1 + barcode-flavor: visium + run-mode: [visium] + - project-id: slideseq + sample-id: slideseq_1 R1: R2: species: mouse puck: slideseq - barcode_flavor: slideseq_14bc - run_mode: [default, slideseq] + barcode-flavor: slideseq_14bc + run-mode: [default, slideseq] puck_barcode_file: Under ``additional_projects`` we define a list where each element will be a key:value pair, to be inserted in the ``project_df.csv`` @@ -296,7 +296,7 @@ Under ``additional_projects`` we define a list where each element will be a key: .. note:: When using the above command, if a sample is already present in the ``project_df.csv`` rather than adding it again, spacemake will update it. - If someone runs ``spacemake projects add_samples_from_yaml --samples yaml samples.yaml`` and + If someone runs ``spacemake projects add-samples-from-yaml --samples yaml samples.yaml`` and then modifies something in the ``samples.yaml``, and runs the command again, the ``project_df.csv`` will contain the updated version of the settings. @@ -307,14 +307,14 @@ You can add samples directly from an Illumina sample-sheet, assuming the sample- To use this functionality, type:: - spacemake projects add_sample_sheet \ - --sample_sheet \ - --basecalls_dir + spacemake projects add-sample-sheet \ + --sample-sheet \ + --basecalls-dir The sample-sheet columns have to obey certain conventions for spacemake to parse it properly: -* ``Sample_ID`` contains the ``sample_id`` in the project. -* ``Sample_Project`` contains the ``project_id`` in the project. +* ``Sample_ID`` contains the ``sample-id`` in the project. +* ``Sample_Project`` contains the ``project-id`` in the project. * ``Description`` must end with ``_species``, where species is the one configured for the samples in the project, e.g. ``HEK293_wt_human``. Spacemake will also parse the fields ``Investigator``, ``Date``, and ``Experiment`` from the sample-sheet and add them to the project metadata. @@ -334,11 +334,14 @@ to specify which extra variables to show. Merging samples ---------------- -Spacemake can merge samples that have been resequenced to increase the number of quantified molecules in the data. To merge samples, first configure, add, and process the individual samples as they are. Make sure that the samples belong in the same project, e.g. have the same ``project_id``. Then merge them by typing:: +Spacemake can merge samples that have been resequenced to increase the number of quantified +molecules in the data. To merge samples, first configure, add, and process the individual samples +as they are. Make sure that the samples belong in the same project, e.g. have the +same ``project-id``. Then merge them by typing:: - spacemake projects merge_samples \ - --merge_project_id \ - --merged_sample_id \ - --sample_id_list + spacemake projects merge-samples \ + --merge-project-id \ + --merged-sample-id \ + --sample-id-list The above command will merge the two samples by creating a new sample with the same variables. Spacemake performs the merging at the level of the ``bam`` files, thus properly processing the merged sample by collapsing PCR duplicates. Processing will automatically run until the creation of the ``qc_sheets`` and the automated analyses. diff --git a/docs/quick-start/index.rst b/docs/quick-start/index.rst index 6c58515b..36caedc2 100644 --- a/docs/quick-start/index.rst +++ b/docs/quick-start/index.rst @@ -53,16 +53,16 @@ To add an `Open-ST` sample, type in the terminal: .. code-block:: console - spacemake projects add_sample \ - --project_id \ - --sample_id \ + spacemake projects add-sample \ + --project-id \ + --sample-id \ --R1 \ # single R1 or several R1 files --R2 \ # single R2 or several R2 files --species \ --puck openst \ - --puck_barcode_file \ - --run_mode openst \ - --barcode_flavor openst + --puck-barcode-file \ + --run-mode openst \ + --barcode-flavor openst .. note:: @@ -72,7 +72,7 @@ To add an `Open-ST` sample, type in the terminal: ``sample_1a_R1.fastq.gz``, ``sample_1b_R1.fastq.gz``, ``sample_1a_R2.fastq.gz``, ``sample_1b_R2.fastq.gz``, meaning that R1 and R2 are both split into two, one can simply call spacemake with the following command:: - spacemake projects add_sample \ + spacemake projects add-sample \ ... --R1 sample_1a_R1.fastq.gz sample_1b_R1.fastq.gz \ --R2 sample_1a_R2.fastq.gz sample_1b_R2.fastq.gz \ @@ -83,7 +83,7 @@ To add an `Open-ST` sample, type in the terminal: With Open-ST data, each sample covers a *piece* of capture area, which contains at least one tile (``puck``). - Thus, we need to provide ``--puck_barcode_file`` (since each puck has different barcodes, unlike for visium samples). + Thus, we need to provide ``--puck-barcode-file`` (since each puck has different barcodes, unlike for visium samples). This file should be a comma or tab separated, containing column names as first row. Acceptable column names are: - ``cell_bc``, ``barcodes`` or ``barcode`` for cell-barcode @@ -93,9 +93,9 @@ To add an `Open-ST` sample, type in the terminal: These can be generated using the `openst package `_. It is typically unknown *a priori* which tiles fall under a sample, so in principle all available - ``puck_barcode_files`` would need to be specified under ``--puck_barcode_file``. To generate output files + ``puck-barcode-files`` would need to be specified under ``--puck-barcode-file``. To generate output files and reports only for the relevant tiles per sample, we provide the variable ``spatial_barcode_min_matches`` - under ``run_mode``, as the minimum proportion of spatial barcodes that a tile has in common with + under ``run-mode``, as the minimum proportion of spatial barcodes that a tile has in common with the sample reads to be considered during quantification and downstream analysis. The default threshold (0.1, i.e., at least 10% of barcodes per tile are present in the sample) was chosen @@ -104,19 +104,19 @@ To add an `Open-ST` sample, type in the terminal: because this threshold was too high, you can update the sample to add missing tiles, and then rerun spacemake using a lower ``spatial_barcode_min_matches``. -The above will add a new Open-ST project with ``barcode_flavor, run_mode, puck`` all +The above will add a new Open-ST project with ``barcode-flavor, run-mode, puck`` all set to ``openst``. -The structure in ``barcode_flavor`` assumes that sequencing of the library is paired-end, +The structure in ``barcode-flavor`` assumes that sequencing of the library is paired-end, with Read1 having the spot barcodes, and Read2 containing the UMI (first 9 nucleotides) and the sequence to be aligned to the genome. You can see the details by running -``spacemake config list_barcode_flavors``. +``spacemake config list-barcode-flavors``. -The ``run_mode`` is tailored to Open-ST samples and it +The ``run-mode`` is tailored to Open-ST samples and it (i) counts intronic reads, (ii) includes multi-mappers, (iii) bins the data into regular hexagons of a 7 um side, and (iv) performs automated analyses for three UMI thresholds [100, 250, 500]. You can see the details by running -``spacemake config list_run_modes``. +``spacemake config list-run-modes``. The ``puck`` variable is used to correctly set the size of the capture area. This is required for accurately plotting the QC sheets and the results of the automated @@ -144,17 +144,17 @@ To add a `Visium`_ sample, type in terminal: .. code-block:: console - spacemake projects add_sample \ - --project_id \ - --sample_id \ + spacemake projects add-sample \ + --project-id \ + --sample-id \ --R1 \ # single R1 or several R1 files --R2 \ # single R2 or several R2 files --species \ --puck visium \ - --run_mode visium \ - --barcode_flavor visium + --run-mode visium \ + --barcode-flavor visium -Above we add a new visium project with ``puck, run_mode, barcode_flavor`` all set to ``visium``. +Above we add a new visium project with ``puck, run-mode, barcode-flavor`` all set to ``visium``. This is possible as spacemake comes with pre-defined variables, all suited for visium. The visium ``run_mode`` will process the sample in the same way as `spaceranger `_ would: intronic reads will not be counted, multi-mappers (where the multi-mapping read maps only to one CDS or UTR region) will be counted, @@ -183,32 +183,32 @@ To add a `Slide-seq`_ sample, type in terminal: .. code-block:: console - spacemake projects add_sample \ - --project_id \ - --sample_id \ + spacemake projects add-sample \ + --project-id \ + --sample-id \ --R1 \ --R2 \ --species \ --puck slide_seq \ - --run_mode slide_seq \ - --barcode_flavor slide_seq_14bc \ - --puck_barcode_file + --run-mode slide_seq \ + --barcode-flavor slide_seq_14bc \ + --puck-barcode-file -Above we add a new `Slide-seq`_ project with the ``puck, run_mode`` will be set to ``slide_seq`` +Above we add a new `Slide-seq`_ project with the ``puck, run-mode`` will be set to ``slide_seq`` which are pre-defined settings for `Slide-seq`_ samples. .. note:: For spatial samples other than visium - such as `Slide-seq`_ - we need to provide a - ``puck_barcode_file`` (since each puck has different barcodes, unlike for visium samples). + ``puck-barcode-file`` (since each puck has different barcodes, unlike for visium samples). This file should be a comma or tab separated, containing column names as first row. Acceptable column names are: - ``cell_bc``, ``barcodes`` or ``barcode`` for cell-barcode - ``xcoord`` or ``x_pos`` for x-positions - ``ycoord`` or ``y_pos`` for y-positions -In this example ``barcode_flavor`` will be set to ``slide_seq_14bc``, -a pre-defined ``barcode_flavor`` in spacemake, where the ``cell_barcode`` comes from the first 14nt of Read1, and the ``UMI`` comes from nt 13-22 (remaining 9 nt). -The other pre-defined ``barcode_flavor`` for `Slide-seq`_ is ``slide_seq_15bc``: here ``cell_barcode`` again comes from the first 14nt of Read1, but the ``UMI`` comes from nt 14-22 (remaining 8) of Read1. +In this example ``barcode-flavor`` will be set to ``slide_seq_14bc``, +a pre-defined ``barcode-flavor`` in spacemake, where the ``cell_barcode`` comes from the first 14nt of Read1, and the ``UMI`` comes from nt 13-22 (remaining 9 nt). +The other pre-defined ``barcode-flavor`` for `Slide-seq`_ is ``slide_seq_15bc``: here ``cell_barcode`` again comes from the first 14nt of Read1, but the ``UMI`` comes from nt 14-22 (remaining 8) of Read1. To see the values of these predefined variables checkout the :ref:`configuration ` docs. @@ -234,26 +234,26 @@ is similar to Slide-seq: .. code-block:: console - spacemake projects add_sample \ - --project_id \ - --sample_id \ + spacemake projects add-sample \ + --project-id \ + --sample-id \ --R1 \ # single R1 or several R1 files --R2 \ # single R2 or several R2 files --species \ --puck seq_scope \ - --run_mode seq_scope \ - --barcode_flavor seq_scope \ - --puck_barcode_file + --run-mode seq_scope \ + --barcode-flavor seq_scope \ + --puck-barcode-file -Here we used the pre-defined variables for ``puck``, ``barcode_flavor`` and ``run_mode`` all set to ``seq_scope``. +Here we used the pre-defined variables for ``puck``, ``barcode-flavor`` and ``run-mode`` all set to ``seq_scope``. The ``seq_scope`` ``puck`` has 1000 micron width and bead size set to 1 micron. -The ``seq_scope`` ``barcode_flavor`` describes how the ``cell_barcode`` and he ``UMI`` should be extracted. +The ``seq_scope`` ``barcode-flavor`` describes how the ``cell_barcode`` and he ``UMI`` should be extracted. As described in the `Seq-scope`_ paper. ``cell_barcode`` comes from nt 1-20 of Read1, and ``UMI`` comes from 1-9nt of Read2. -The ``seq_scope`` ``run_mode`` has its settings as follows: +The ``seq_scope`` ``run-mode`` has its settings as follows: .. code-block:: yaml @@ -293,36 +293,36 @@ single-cell experiments, where there is no spatial information available. To add a scRNA-seq sample, simply type:: - spacemake projects add_sample \ - --project_id \ - --sample_id \ + spacemake projects add-sample \ + --project-id \ + --sample-id \ --R1 \ # single R1 or several R1 files --R2 \ # single R2 or several R2 files --species \ - --run_mode scRNA_seq \ - --barcode_flavor default # use other flavors for 10X Chromium + --run-mode scRNA_seq \ + --barcode-flavor default # use other flavors for 10X Chromium -As seen above, we define fewer variables as before: only ``species`` and ``run_mode`` are needed. +As seen above, we define fewer variables as before: only ``species`` and ``run-mode`` are needed. .. warning:: - As it can be seen, we set the ``barcode_flavor`` to ``default``, which will use the `Drop-seq`_ + As it can be seen, we set the ``barcode-flavor`` to ``default``, which will use the `Drop-seq`_ scRNA-seq barcoding strategy (``cell_barcode`` is 1-12nt of Read1, ``UMI`` is 13-20 nt of Read1). For 10X samples either the ``sc_10x_v2`` (10X Chromium Single Cell 3' V2) or ``visium`` (10X Chromium Single Cell 3' V3, same as visium) should be be used as - ``barcode_flavor``. Both are pre-defined in spacemake. + ``barcode-flavor``. Both are pre-defined in spacemake. - If another barcoding strategy is used, a custom ``barcode_flavor`` has to be defined. - :ref:`Here is a complete guide on how to add a custom barcode_flavor `. + If another barcoding strategy is used, a custom ``barcode-flavor`` has to be defined. + :ref:`Here is a complete guide on how to add a custom barcode-flavor `. .. note:: - By setting ``run_mode`` to ``scRNA_seq`` we used the pre-defined ``run_mode`` settings tailored for single-cell experiments: expected number of cells (or beads) will be 10k, introns will be counted, UMI cutoff will be at 500, multi-mappers will not be counted and polyA and adapter sequences will be trimmed from Read2. + By setting ``run-mode`` to ``scRNA_seq`` we used the pre-defined ``run-mode`` settings tailored for single-cell experiments: expected number of cells (or beads) will be 10k, introns will be counted, UMI cutoff will be at 500, multi-mappers will not be counted and polyA and adapter sequences will be trimmed from Read2. - Of course, running single-cell samples with other ``run_mode`` settings is also possible. + Of course, running single-cell samples with other ``run-mode`` settings is also possible. - :ref:`Here it can be learned how to add a custom run_mode `, tailored to the experimental needs. + :ref:`Here it can be learned how to add a custom run-mode `, tailored to the experimental needs. To see the values of these predefined variables checkout the :ref:`configuration ` docs. diff --git a/environment.yaml b/environment.yaml index e0f21c6d..bfbe7420 100644 --- a/environment.yaml +++ b/environment.yaml @@ -31,7 +31,7 @@ dependencies: - isal - pytest - pytest-cov - - mrfifo>=0.3.0 + - mrfifo>=0.8.1 - pandas>2 - scanpy>=1.8.1 - leidenalg>=0.8.1 diff --git a/spacemake/bin/fastq_to_uBAM.py b/spacemake/bin/fastq_to_uBAM.py index a13e2a3f..b8688de4 100644 --- a/spacemake/bin/fastq_to_uBAM.py +++ b/spacemake/bin/fastq_to_uBAM.py @@ -100,7 +100,7 @@ def quality_trim(fq_src, min_qual=20, phred_base=33): def render_to_sam(fq1, fq2, sam_out, args, **kwargs): - logger = util.setup_logging(args, "spacemake.bin.fastq_to_uBAM.worker") + logger = util.setup_logging(args, "fastq_to_uBAM.worker", rename_process=False) logger.debug( f"starting up with fq1={fq1}, fq2={fq2} sam_out={sam_out} and args={args}" ) @@ -151,41 +151,6 @@ def iter_single(fq2): sam_out.write(make_sam_record(flag=4, **attrs)) return N - # if args.paired_end: - # # if args.paired_end[0] == 'r': - # # # rev_comp R1 and reverse qual1 - # # q1 = q1[::-1] - # # r1 = util.rev_comp(r1) - - # # if args.paired_end[1] == 'r': - # # # rev_comp R2 and reverse qual2 - # # q2 = q2[::-1] - # # r2 = util.rev_comp(r2) - - # rec1 = fmt.make_bam_record( - # qname=fqid, - # r1="THISSHOULDNEVERBEREFERENCED", - # r2=r1, - # R1=r1, - # R2=r2, - # r2_qual=q1, - # r2_qname=fqid, - # flag=69, # unmapped, paired, first in pair - # ) - # rec2 = fmt.make_bam_record( - # qname=fqid, - # r1="THISSHOULDNEVERBEREFERENCED", - # r2=r2, - # R1=r1, - # R2=r2, - # r2_qual=q2, - # r2_qname=fqid, - # flag=133, # unmapped, paired, second in pair - # ) - # results.append(rec1) - # results.append(rec2) - - # else: def main(args): @@ -194,7 +159,9 @@ def main(args): # queues for communication between processes w = ( - mf.Workflow("fastq_to_uBAM", total_pipe_buffer_MB=args.pipe_buffer) + mf.Workflow( + f"[{args.sample}]fastq_to_uBAM", total_pipe_buffer_MB=args.pipe_buffer + ) # open reads2.fastq.gz .gz_reader(inputs=input_reads2, output=mf.FIFO("read2", "wb")).distribute( input=mf.FIFO("read2", "rt"), @@ -244,6 +211,9 @@ def main(args): # output="/dev/stdout" # ) output=mf.FIFO("sam_combined", "wt"), + log_rate_every_n=1000000, + log_rate_template="written {M_out:.1f} M BAM records ({MPS:.3f} M/s, overall {mps:.3f} M/s)", + log_name="fastq_to_uBAM.collect", ) # compress to BAM w.funnel( diff --git a/spacemake/snakemake/scripts/filter_mm_reads.py b/spacemake/snakemake/scripts/filter_mm_reads.py index 139a3a01..98266d04 100644 --- a/spacemake/snakemake/scripts/filter_mm_reads.py +++ b/spacemake/snakemake/scripts/filter_mm_reads.py @@ -44,19 +44,26 @@ def is_exonic(aln): bam_out = pysam.AlignmentFile(args.out_bam, 'wb', header= bam_in.header) counter = 0 start_time = datetime.datetime.now() + total_start_time = datetime.datetime.now() + time_interval = 30 multi_mappers = [] for aln in bam_in.fetch(until_eof=True): - counter = counter + 1 + counter += 1 - if counter % 1000000 == 0: - finish_time = datetime.datetime.now() - delta_seconds = (finish_time - start_time).seconds - # restart time - start_time = finish_time - print(f'Processed 1 million records in {delta_seconds} seconds, total records processed {counter}. current time: {finish_time}') + finish_time = datetime.datetime.now() + delta_seconds = (finish_time - start_time).seconds + total_elapsed_seconds = (finish_time - total_start_time).total_seconds() + + if delta_seconds >= time_interval: + formatted_time = finish_time.strftime('%Y-%m-%d %H:%M:%S') + records_per_second = counter / delta_seconds + print(f'Processed {counter:,} records in {total_elapsed_seconds:,.0f} seconds. Average processing rate: {records_per_second:,.0f} records/second. Current time: {formatted_time}') + + start_time = finish_time + mapped_number = aln.get_tag('NH') if mapped_number == 1: @@ -80,3 +87,5 @@ def is_exonic(aln): # reset multimapper list multi_mappers = [] + + print(f'Finished processing {counter:,} records in {total_elapsed_seconds:,.0f} seconds. Current time: {formatted_time}') \ No newline at end of file diff --git a/spacemake/util.py b/spacemake/util.py index 7fdedf40..f8d269d3 100644 --- a/spacemake/util.py +++ b/spacemake/util.py @@ -2,7 +2,7 @@ import os import logging -#from contextlib import ContextDecorator, contextmanager +# from contextlib import ContextDecorator, contextmanager from spacemake.errors import SpacemakeError, FileWrongExtensionError from spacemake.contrib import __version__, __license__, __author__, __email__ @@ -10,14 +10,16 @@ bool_in_str = ["True", "true", "False", "false"] -def generate_kmers(k, nts='ACGT'): + +def generate_kmers(k, nts="ACGT"): if k == 0: - yield '' + yield "" elif k > 0: for x in nts: - for mer in generate_kmers(k-1, nts=nts): + for mer in generate_kmers(k - 1, nts=nts): yield x + mer + class dotdict(dict): """dot.notation access to dictionary attributes""" @@ -50,6 +52,7 @@ def wc_fill(x, wc): data_root_type=getattr(wc, "data_root_type", "complete_data"), ) + def quiet_bam_open(*argc, **kw): """_summary_ @@ -112,6 +115,7 @@ def ensure_path(path): os.makedirs(dirname, exist_ok=True) return path + def timed_loop( src, logger, @@ -189,7 +193,13 @@ def read_fq(fname, skim=0): src = FASTQ_src(fname) # assume its a stream or file-like object already n = 0 - for record in timed_loop(src, logger, T=15, template="processed {i} reads in {dT:.1f}sec. ({rate:.3f} k rec/sec)", skim=skim): + for record in timed_loop( + src, + logger, + T=15, + template="processed {i} reads in {dT:.1f}sec. ({rate:.3f} k rec/sec)", + skim=skim, + ): yield record n += 1 @@ -396,6 +406,7 @@ def fasta_chunks(lines, strip=True, fuse=True): # except SpacemakeError as e: # print(e) + def message_aggregation(log_listen="spacemake", print_logger=False, print_success=True): from functools import wraps @@ -428,10 +439,11 @@ def handle(this, record): print(f"{LINE_SEPARATOR}SUCCESS!") return res - return wrapper + return wrapper return the_decorator + def str_to_list(value): # if list in string representation, return the list if value is None: @@ -477,11 +489,14 @@ def setup_logging( name="spacemake.main", log_file="", FORMAT="%(asctime)-20s\t{sample:30s}\t%(name)-50s\t%(levelname)s\t%(message)s", + rename_process=True, ): sample = getattr(args, "sample", "na") - import setproctitle - if name != "spacemake.main": - setproctitle.setproctitle(f"{name} {sample}") + if rename_process: + import setproctitle + + if name != "spacemake.main": + setproctitle.setproctitle(f"{name} {sample}") FORMAT = FORMAT.format(sample=sample) @@ -514,11 +529,14 @@ def setup_logging( return logger + def setup_smk_logging(name="spacemake.smk", **kw): import argparse + args = argparse.Namespace(**kw) return setup_logging(args, name=name) + default_log_level = "INFO"