From 171452844e60c13e19162deb785a2bb301c32904 Mon Sep 17 00:00:00 2001 From: nukappa Date: Wed, 4 Dec 2024 21:31:20 +0100 Subject: [PATCH 01/12] first pass changing snake to kebab in docs --- docs/config.rst | 68 +++++++++++------------ docs/initialize.rst | 19 +++---- docs/projects/index.rst | 77 +++++++++++++------------- docs/quick-start/index.rst | 108 ++++++++++++++++++------------------- 4 files changed, 138 insertions(+), 134 deletions(-) diff --git a/docs/config.rst b/docs/config.rst index 0ed9db44..7d849241 100644 --- a/docs/config.rst +++ b/docs/config.rst @@ -24,20 +24,20 @@ To add species, the following command can be used:: # path to the annotation (.gtf) file for the species # to be added -The ``spacemake config update_species`` takes the same arguments as above, while ``spacemake config delete_species`` takes only ``--name``. +The ``spacemake config update-species`` takes the same arguments as above, while ``spacemake config delete-species`` takes only ``--name``. As of version ``0.7`` you can add multiple reference sequences per species. For that, -simply execute ``add_species`` multiple times, varying ``--reference ...`` but keeping ``--name`` constant. +simply execute ``add-species`` multiple times, varying ``--reference ...`` but keeping ``--name`` constant. To list the currently available ``species``, type:: - spacemake config list_species + spacemake config list-species -Configure barcode\_flavors +Configure barcode-flavors -------------------------- -.. _configure-barcode_flavor: +.. _configure-barcode-flavor: This sample-variable describes how the cell-barcode and the UMI should be extracted from Read1 and Read2. The ``default`` value for barcode\_flavor will be dropseq: ``cell = r1[0:12]`` (cell-barcode comes from first 12nt of Read1) and @@ -45,10 +45,10 @@ The ``default`` value for barcode\_flavor will be dropseq: ``cell = r1[0:12]`` ( **If a sample has no barcode\_flavor provided, the default run\_mode will be used** -Provided barcode\_flavors +Provided barcode-flavors ^^^^^^^^^^^^^^^^^^^^^^^^^ -Spacemake provides the following barcode\_flavors out of the box: +Spacemake provides the following barcode-flavors out of the box: .. code-block:: yaml @@ -74,16 +74,16 @@ Spacemake provides the following barcode\_flavors out of the box: cell: "r1[0:16]" UMI: "r1[16:28]" -To list the currently available ``barcode_flavor``-s, type:: +To list the currently available ``barcode-flavor``-s, type:: - spacemake config list_barcode_flavors + spacemake config list_barcode-flavors -Add a new barcode\_flavor +Add a new barcode-flavor ^^^^^^^^^^^^^^^^^^^^^^^^^ .. code-block:: - spacemake config add_barcode_flavor \ + spacemake config add_barcode-flavor \ --name NAME \ # name of the barcode flavor @@ -92,26 +92,26 @@ Add a new barcode\_flavor # Example: to set UMI to 13-20 NT of Read1, use --umi r1[12:20]. # It is also possible to use the first 8nt of Read2 as UMI: --umi r2[0:8]. - --cell_barcode CELL_BARCODE + --cell-barcode CELL-BARCODE # structure of CELL BARCODE, using python's list syntax. - # Example: to set the cell_barcode to 1-12 nt of Read1, use --cell_barcode r1[0:12]. + # Example: to set the cell-barcode to 1-12 nt of Read1, use --cell-barcode r1[0:12]. # It is also possible to reverse the CELL BARCODE, for instance with r1[0:12][::-1]. -Update/delete a barcode\_flavor +Update/delete a barcode-flavor ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``spacemake config update_barcode_flavor`` takes the same arguments as above, while ``spacemake config delete_barcode_flavor`` takes only ``--name``. +The ``spacemake config update-barcode-flavor`` takes the same arguments as above, while ``spacemake config delete-barcode-flavor`` takes only ``--name``. -Configure run\_modes +Configure run-modes -------------------- -.. _configure-run_mode: +.. _configure-run-mode: Specifying a "run mode" is an essential flexibity that spacemake offers. -Through setting a ``run_mode``, a sample can be processed and analysed downstream in various fashions. +Through setting a ``run-mode``, a sample can be processed and analysed downstream in various fashions. -Each ``run_mode`` can have the following variables: +Each ``run-mode`` can have the following variables: ``n_beads`` number of cell-barcode expected @@ -137,7 +137,7 @@ Each ``run_mode`` can have the following variables: counted this way, which map to exactly one CDS or UTR segment of a gene. ``mesh_data`` (spatial only) - if ``True`` a mesh will be created when running this ``run_mode``. + if ``True`` a mesh will be created when running this ``run-mode``. ``mesh_type`` (spatial only) spacemake currently offers two types of meshes: (1) ``circle``, where circles with a given @@ -156,12 +156,12 @@ Each ``run_mode`` can have the following variables: filter out pucks from DGE creation and subsequent steps of the pipeline. If set to 0, no pucks are excluded. -``parent_run_mode`` - Each ``run_mode`` can have a parent, to which it will fall back. - If a one of the ``run_mode`` variables is missing, the variable of the parent will be used. - If parent is not provided, the ``default`` ``run_mode`` will be the parent. +``parent_run-mode`` + Each ``run-mode`` can have a parent, to which it will fall back. + If a one of the ``run-mode`` variables is missing, the variable of the parent will be used. + If parent is not provided, the ``default`` ``run-mode`` will be the parent. -Provided run\_modes +Provided run-modes ^^^^^^^^^^^^^^^^^^^^^ .. code-block:: yaml @@ -234,23 +234,23 @@ Provided run\_modes - 1000 .. note:: - If a sample has no ``run_mode`` provided, the ``default`` will be used + If a sample has no ``run-mode`` provided, the ``default`` will be used .. note:: - If a ``run_mode`` variable is not provided, the variable of the default ``run_mode`` will be used + If a ``run-mode`` variable is not provided, the variable of the default ``run-mode`` will be used -To list the currently available ``run_mode``-s, type:: +To list the currently available ``run-mode``-s, type:: - spacemake config list_run_modes + spacemake config list_run-modes Add a new run\_mode ^^^^^^^^^^^^^^^^^^^ -See the :ref:`variable descriptions ` above. +See the :ref:`variable descriptions ` above. .. code-block:: - spacemake config add_run_mode \ + spacemake config add_run-mode \ --name NAME \ --parent_run_mode PARENT_RUN_MODE \ --umi_cutoff UMI_CUTOFF [UMI_CUTOFF ...] \ @@ -265,10 +265,10 @@ See the :ref:`variable descriptions ` above. --mesh_spot_diameter_um MESH_SPOT_DIAMETER_UM \ --mesh_spot_distance_um MESH_SPOT_DISTANCE_UM -Update/delete a run\_mode +Update/delete a run-mode ^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``spacemake config update_run_mode`` takes the same arguments as above, while ``spacemake config delete_run_mode`` takes only ``--name``. +The ``spacemake config update-run-mode`` takes the same arguments as above, while ``spacemake config delete-run-mode`` takes only ``--name``. Configure pucks @@ -279,7 +279,7 @@ Configure pucks Each spatial sample is associated with a ``puck``. The ``puck`` variable defines the dimensionality of the underlying spatial structure, which spacemake uses during the automated analysis and plotting, as well as the binning (meshing) of -the data when selected in the ``run_mode``. +the data when selected in the ``run-mode``. Each puck has the following variables: diff --git a/docs/initialize.rst b/docs/initialize.rst index 99ce58b4..42b79d2e 100644 --- a/docs/initialize.rst +++ b/docs/initialize.rst @@ -11,19 +11,20 @@ Optional arguments The `spacemake init` command takes the following optional arguments: -``root_dir`` - The ``root_dir`` for the spacemake instance. Defaults to ``.``, the directory in which `spacemake init` is ran. +``root-dir`` + The ``root-dir`` for the spacemake instance. Defaults to ``.``, the directory in which `spacemake init` is ran. -``temp_dir`` +``temp-dir`` Path to the temporary directory, defaults to ``/tmp``. -``download_species`` - If set, spacemake will download the genome (.fa) and annotation (.gtf) files for mouse and human (from gencode, as specified `here `_. +``download-species`` + If set, spacemake will download the genome (.fa) and annotation (.gtf) files for mouse and + human from gencode, as specified `here `_. Hence, the complete `spacemake init` command looks like this:: spacemake init \ - --root_dir ROOT_DIR \ # optional - --temp_dir TEMP_DIR \ # optional - --download_species \ # optional - --dropseq_tools DROPSEQ_TOOLS # required + --root-dir ROOT-DIR \ # optional + --temp-dir TEMP-DIR \ # optional + --download-species \ # optional + --dropseq-tools DROPSEQ-TOOLS # required diff --git a/docs/projects/index.rst b/docs/projects/index.rst index 492745a0..60b17686 100644 --- a/docs/projects/index.rst +++ b/docs/projects/index.rst @@ -6,7 +6,7 @@ directory of the spacemake project. Each sample has exactly one row in this ``project_df.csv`` file. In the back-end, spacemake uses a ``pandas.DataFrame`` to load and save this ``.csv`` file on disk. This data frame -is indexed by key ``(project_id, sample_id)`` +is indexed by key ``(project-id, sample-id)`` The spacemake class responsible for this back-end logic is the :ref:`ProjectDF` class. @@ -18,11 +18,11 @@ Sample parameters In spacemake each sample can have the folloing variables: -``project_id`` - ``project_id`` of a sample +``project-id`` + ``project-id`` of a sample -``sample_id`` - ``sample_id`` of a sample +``sample-id`` + ``sample-id`` of a sample ``R1`` ``.fastq.gz`` file path(s) to Read1 read file(s). Can be either a single file, or a space separated list of consecutive files. If a list provided, the files will be merged together and the merged ``R1.fastq.gz`` will be processed downstream. @@ -117,8 +117,8 @@ In spacemake each sample can have the folloing variables: If not provided, a ``default`` puck will be used with ``width_um=3000``, ``spot_diameter_um=10``. -``puck_id`` (optional) - ``puck_id`` of a sample +``puck-id`` (optional) + ``puck-id`` of a sample ``puck_barcode_file`` (optional) the path to the file contining (x,y) positions of the barcodes. If the ``puck`` for this @@ -142,8 +142,8 @@ In spacemake each sample can have the folloing variables: To add a single sample, we can use the following command:: spacemake projects add_sample \ - --project_id PROJECT_ID \ # required - --sample_id SAMPLE_ID \ # required + --project-id PROJECT-ID \ # required + --sample-id SAMPLE-ID \ # required --R1 R1 [R1 R1 ...] \ # required, if no longreads --R2 R2 [R2 R2 ...] \ # required, if no longreads --longreads LONGREADS \ # required, if no R1 & R2 @@ -151,7 +151,7 @@ To add a single sample, we can use the following command:: --barcode_flavor BARCODE_FLAVOR \ # optional --species SPECIES \ # required --puck PUCK \ # optional - --puck_id PUCK_ID \ # optional + --puck-id PUCK-ID \ # optional --puck_barcode_file PUCK_BARCODE_FILE \ # optional --investigator INVESTIGATOR \ # optional --experiment EXPERIMENT \ # optional @@ -221,10 +221,10 @@ Step 4: add your sample Once everything is configured you can add your custom spatial sample with the following command:: spacemake projects add_sample \ - # your sample's project_id \ - --project_id PROJECT_ID \ - # your sample's sample_id \ - --sample_id SAMPLE_ID \ + # your sample's project-id \ + --project-id PROJECT-ID \ + # your sample's sample-id \ + --sample-id SAMPLE-ID \ # one or more R1.fastq.gz files --R1 R1 [R1 R1 ...] \ # one or more R2.fastq.gz files @@ -265,30 +265,30 @@ The ``samples.yaml`` should have the following structure: .. code-block:: yaml additional_projects: - - project_id: visium - sample_id: visium_1 + - project-id: visium + sample-id: visium_1 R1: R2: species: mouse puck: visium - barcode_flavor: visium - run_mode: [visium] - - project_id: visium - sample_id: visium_2 + barcode-flavor: visium + run-mode: [visium] + - project-id: visium + sample-id: visium_2 R1: R2: species: human puck: visium - barcode_flavor: visium - run_mode: [visium] - - project_id: slideseq - sample_id: slideseq_1 + barcode-flavor: visium + run-mode: [visium] + - project-id: slideseq + sample-id: slideseq_1 R1: R2: species: mouse puck: slideseq - barcode_flavor: slideseq_14bc - run_mode: [default, slideseq] + barcode-flavor: slideseq_14bc + run-mode: [default, slideseq] puck_barcode_file: Under ``additional_projects`` we define a list where each element will be a key:value pair, to be inserted in the ``project_df.csv`` @@ -296,7 +296,7 @@ Under ``additional_projects`` we define a list where each element will be a key: .. note:: When using the above command, if a sample is already present in the ``project_df.csv`` rather than adding it again, spacemake will update it. - If someone runs ``spacemake projects add_samples_from_yaml --samples yaml samples.yaml`` and + If someone runs ``spacemake projects add-samples-from-yaml --samples yaml samples.yaml`` and then modifies something in the ``samples.yaml``, and runs the command again, the ``project_df.csv`` will contain the updated version of the settings. @@ -307,14 +307,14 @@ You can add samples directly from an Illumina sample-sheet, assuming the sample- To use this functionality, type:: - spacemake projects add_sample_sheet \ - --sample_sheet \ - --basecalls_dir + spacemake projects add-sample-sheet \ + --sample-sheet \ + --basecalls-dir The sample-sheet columns have to obey certain conventions for spacemake to parse it properly: -* ``Sample_ID`` contains the ``sample_id`` in the project. -* ``Sample_Project`` contains the ``project_id`` in the project. +* ``Sample_ID`` contains the ``sample-id`` in the project. +* ``Sample_Project`` contains the ``project-id`` in the project. * ``Description`` must end with ``_species``, where species is the one configured for the samples in the project, e.g. ``HEK293_wt_human``. Spacemake will also parse the fields ``Investigator``, ``Date``, and ``Experiment`` from the sample-sheet and add them to the project metadata. @@ -334,11 +334,14 @@ to specify which extra variables to show. Merging samples ---------------- -Spacemake can merge samples that have been resequenced to increase the number of quantified molecules in the data. To merge samples, first configure, add, and process the individual samples as they are. Make sure that the samples belong in the same project, e.g. have the same ``project_id``. Then merge them by typing:: +Spacemake can merge samples that have been resequenced to increase the number of quantified +molecules in the data. To merge samples, first configure, add, and process the individual samples +as they are. Make sure that the samples belong in the same project, e.g. have the +same ``project-id``. Then merge them by typing:: - spacemake projects merge_samples \ - --merge_project_id \ - --merged_sample_id \ - --sample_id_list + spacemake projects merge-samples \ + --merge-project-id \ + --merged-sample-id \ + --sample-id-list The above command will merge the two samples by creating a new sample with the same variables. Spacemake performs the merging at the level of the ``bam`` files, thus properly processing the merged sample by collapsing PCR duplicates. Processing will automatically run until the creation of the ``qc_sheets`` and the automated analyses. diff --git a/docs/quick-start/index.rst b/docs/quick-start/index.rst index 6c58515b..36caedc2 100644 --- a/docs/quick-start/index.rst +++ b/docs/quick-start/index.rst @@ -53,16 +53,16 @@ To add an `Open-ST` sample, type in the terminal: .. code-block:: console - spacemake projects add_sample \ - --project_id \ - --sample_id \ + spacemake projects add-sample \ + --project-id \ + --sample-id \ --R1 \ # single R1 or several R1 files --R2 \ # single R2 or several R2 files --species \ --puck openst \ - --puck_barcode_file \ - --run_mode openst \ - --barcode_flavor openst + --puck-barcode-file \ + --run-mode openst \ + --barcode-flavor openst .. note:: @@ -72,7 +72,7 @@ To add an `Open-ST` sample, type in the terminal: ``sample_1a_R1.fastq.gz``, ``sample_1b_R1.fastq.gz``, ``sample_1a_R2.fastq.gz``, ``sample_1b_R2.fastq.gz``, meaning that R1 and R2 are both split into two, one can simply call spacemake with the following command:: - spacemake projects add_sample \ + spacemake projects add-sample \ ... --R1 sample_1a_R1.fastq.gz sample_1b_R1.fastq.gz \ --R2 sample_1a_R2.fastq.gz sample_1b_R2.fastq.gz \ @@ -83,7 +83,7 @@ To add an `Open-ST` sample, type in the terminal: With Open-ST data, each sample covers a *piece* of capture area, which contains at least one tile (``puck``). - Thus, we need to provide ``--puck_barcode_file`` (since each puck has different barcodes, unlike for visium samples). + Thus, we need to provide ``--puck-barcode-file`` (since each puck has different barcodes, unlike for visium samples). This file should be a comma or tab separated, containing column names as first row. Acceptable column names are: - ``cell_bc``, ``barcodes`` or ``barcode`` for cell-barcode @@ -93,9 +93,9 @@ To add an `Open-ST` sample, type in the terminal: These can be generated using the `openst package `_. It is typically unknown *a priori* which tiles fall under a sample, so in principle all available - ``puck_barcode_files`` would need to be specified under ``--puck_barcode_file``. To generate output files + ``puck-barcode-files`` would need to be specified under ``--puck-barcode-file``. To generate output files and reports only for the relevant tiles per sample, we provide the variable ``spatial_barcode_min_matches`` - under ``run_mode``, as the minimum proportion of spatial barcodes that a tile has in common with + under ``run-mode``, as the minimum proportion of spatial barcodes that a tile has in common with the sample reads to be considered during quantification and downstream analysis. The default threshold (0.1, i.e., at least 10% of barcodes per tile are present in the sample) was chosen @@ -104,19 +104,19 @@ To add an `Open-ST` sample, type in the terminal: because this threshold was too high, you can update the sample to add missing tiles, and then rerun spacemake using a lower ``spatial_barcode_min_matches``. -The above will add a new Open-ST project with ``barcode_flavor, run_mode, puck`` all +The above will add a new Open-ST project with ``barcode-flavor, run-mode, puck`` all set to ``openst``. -The structure in ``barcode_flavor`` assumes that sequencing of the library is paired-end, +The structure in ``barcode-flavor`` assumes that sequencing of the library is paired-end, with Read1 having the spot barcodes, and Read2 containing the UMI (first 9 nucleotides) and the sequence to be aligned to the genome. You can see the details by running -``spacemake config list_barcode_flavors``. +``spacemake config list-barcode-flavors``. -The ``run_mode`` is tailored to Open-ST samples and it +The ``run-mode`` is tailored to Open-ST samples and it (i) counts intronic reads, (ii) includes multi-mappers, (iii) bins the data into regular hexagons of a 7 um side, and (iv) performs automated analyses for three UMI thresholds [100, 250, 500]. You can see the details by running -``spacemake config list_run_modes``. +``spacemake config list-run-modes``. The ``puck`` variable is used to correctly set the size of the capture area. This is required for accurately plotting the QC sheets and the results of the automated @@ -144,17 +144,17 @@ To add a `Visium`_ sample, type in terminal: .. code-block:: console - spacemake projects add_sample \ - --project_id \ - --sample_id \ + spacemake projects add-sample \ + --project-id \ + --sample-id \ --R1 \ # single R1 or several R1 files --R2 \ # single R2 or several R2 files --species \ --puck visium \ - --run_mode visium \ - --barcode_flavor visium + --run-mode visium \ + --barcode-flavor visium -Above we add a new visium project with ``puck, run_mode, barcode_flavor`` all set to ``visium``. +Above we add a new visium project with ``puck, run-mode, barcode-flavor`` all set to ``visium``. This is possible as spacemake comes with pre-defined variables, all suited for visium. The visium ``run_mode`` will process the sample in the same way as `spaceranger `_ would: intronic reads will not be counted, multi-mappers (where the multi-mapping read maps only to one CDS or UTR region) will be counted, @@ -183,32 +183,32 @@ To add a `Slide-seq`_ sample, type in terminal: .. code-block:: console - spacemake projects add_sample \ - --project_id \ - --sample_id \ + spacemake projects add-sample \ + --project-id \ + --sample-id \ --R1 \ --R2 \ --species \ --puck slide_seq \ - --run_mode slide_seq \ - --barcode_flavor slide_seq_14bc \ - --puck_barcode_file + --run-mode slide_seq \ + --barcode-flavor slide_seq_14bc \ + --puck-barcode-file -Above we add a new `Slide-seq`_ project with the ``puck, run_mode`` will be set to ``slide_seq`` +Above we add a new `Slide-seq`_ project with the ``puck, run-mode`` will be set to ``slide_seq`` which are pre-defined settings for `Slide-seq`_ samples. .. note:: For spatial samples other than visium - such as `Slide-seq`_ - we need to provide a - ``puck_barcode_file`` (since each puck has different barcodes, unlike for visium samples). + ``puck-barcode-file`` (since each puck has different barcodes, unlike for visium samples). This file should be a comma or tab separated, containing column names as first row. Acceptable column names are: - ``cell_bc``, ``barcodes`` or ``barcode`` for cell-barcode - ``xcoord`` or ``x_pos`` for x-positions - ``ycoord`` or ``y_pos`` for y-positions -In this example ``barcode_flavor`` will be set to ``slide_seq_14bc``, -a pre-defined ``barcode_flavor`` in spacemake, where the ``cell_barcode`` comes from the first 14nt of Read1, and the ``UMI`` comes from nt 13-22 (remaining 9 nt). -The other pre-defined ``barcode_flavor`` for `Slide-seq`_ is ``slide_seq_15bc``: here ``cell_barcode`` again comes from the first 14nt of Read1, but the ``UMI`` comes from nt 14-22 (remaining 8) of Read1. +In this example ``barcode-flavor`` will be set to ``slide_seq_14bc``, +a pre-defined ``barcode-flavor`` in spacemake, where the ``cell_barcode`` comes from the first 14nt of Read1, and the ``UMI`` comes from nt 13-22 (remaining 9 nt). +The other pre-defined ``barcode-flavor`` for `Slide-seq`_ is ``slide_seq_15bc``: here ``cell_barcode`` again comes from the first 14nt of Read1, but the ``UMI`` comes from nt 14-22 (remaining 8) of Read1. To see the values of these predefined variables checkout the :ref:`configuration ` docs. @@ -234,26 +234,26 @@ is similar to Slide-seq: .. code-block:: console - spacemake projects add_sample \ - --project_id \ - --sample_id \ + spacemake projects add-sample \ + --project-id \ + --sample-id \ --R1 \ # single R1 or several R1 files --R2 \ # single R2 or several R2 files --species \ --puck seq_scope \ - --run_mode seq_scope \ - --barcode_flavor seq_scope \ - --puck_barcode_file + --run-mode seq_scope \ + --barcode-flavor seq_scope \ + --puck-barcode-file -Here we used the pre-defined variables for ``puck``, ``barcode_flavor`` and ``run_mode`` all set to ``seq_scope``. +Here we used the pre-defined variables for ``puck``, ``barcode-flavor`` and ``run-mode`` all set to ``seq_scope``. The ``seq_scope`` ``puck`` has 1000 micron width and bead size set to 1 micron. -The ``seq_scope`` ``barcode_flavor`` describes how the ``cell_barcode`` and he ``UMI`` should be extracted. +The ``seq_scope`` ``barcode-flavor`` describes how the ``cell_barcode`` and he ``UMI`` should be extracted. As described in the `Seq-scope`_ paper. ``cell_barcode`` comes from nt 1-20 of Read1, and ``UMI`` comes from 1-9nt of Read2. -The ``seq_scope`` ``run_mode`` has its settings as follows: +The ``seq_scope`` ``run-mode`` has its settings as follows: .. code-block:: yaml @@ -293,36 +293,36 @@ single-cell experiments, where there is no spatial information available. To add a scRNA-seq sample, simply type:: - spacemake projects add_sample \ - --project_id \ - --sample_id \ + spacemake projects add-sample \ + --project-id \ + --sample-id \ --R1 \ # single R1 or several R1 files --R2 \ # single R2 or several R2 files --species \ - --run_mode scRNA_seq \ - --barcode_flavor default # use other flavors for 10X Chromium + --run-mode scRNA_seq \ + --barcode-flavor default # use other flavors for 10X Chromium -As seen above, we define fewer variables as before: only ``species`` and ``run_mode`` are needed. +As seen above, we define fewer variables as before: only ``species`` and ``run-mode`` are needed. .. warning:: - As it can be seen, we set the ``barcode_flavor`` to ``default``, which will use the `Drop-seq`_ + As it can be seen, we set the ``barcode-flavor`` to ``default``, which will use the `Drop-seq`_ scRNA-seq barcoding strategy (``cell_barcode`` is 1-12nt of Read1, ``UMI`` is 13-20 nt of Read1). For 10X samples either the ``sc_10x_v2`` (10X Chromium Single Cell 3' V2) or ``visium`` (10X Chromium Single Cell 3' V3, same as visium) should be be used as - ``barcode_flavor``. Both are pre-defined in spacemake. + ``barcode-flavor``. Both are pre-defined in spacemake. - If another barcoding strategy is used, a custom ``barcode_flavor`` has to be defined. - :ref:`Here is a complete guide on how to add a custom barcode_flavor `. + If another barcoding strategy is used, a custom ``barcode-flavor`` has to be defined. + :ref:`Here is a complete guide on how to add a custom barcode-flavor `. .. note:: - By setting ``run_mode`` to ``scRNA_seq`` we used the pre-defined ``run_mode`` settings tailored for single-cell experiments: expected number of cells (or beads) will be 10k, introns will be counted, UMI cutoff will be at 500, multi-mappers will not be counted and polyA and adapter sequences will be trimmed from Read2. + By setting ``run-mode`` to ``scRNA_seq`` we used the pre-defined ``run-mode`` settings tailored for single-cell experiments: expected number of cells (or beads) will be 10k, introns will be counted, UMI cutoff will be at 500, multi-mappers will not be counted and polyA and adapter sequences will be trimmed from Read2. - Of course, running single-cell samples with other ``run_mode`` settings is also possible. + Of course, running single-cell samples with other ``run-mode`` settings is also possible. - :ref:`Here it can be learned how to add a custom run_mode `, tailored to the experimental needs. + :ref:`Here it can be learned how to add a custom run-mode `, tailored to the experimental needs. To see the values of these predefined variables checkout the :ref:`configuration ` docs. From 6ad170c0c8a611999bf8e5fd314f7db958fc0ead Mon Sep 17 00:00:00 2001 From: nukappa Date: Thu, 5 Dec 2024 08:19:53 +0100 Subject: [PATCH 02/12] cleaner output for the filter_mm_reads tool --- .../snakemake/scripts/filter_mm_reads.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/spacemake/snakemake/scripts/filter_mm_reads.py b/spacemake/snakemake/scripts/filter_mm_reads.py index 139a3a01..98266d04 100644 --- a/spacemake/snakemake/scripts/filter_mm_reads.py +++ b/spacemake/snakemake/scripts/filter_mm_reads.py @@ -44,19 +44,26 @@ def is_exonic(aln): bam_out = pysam.AlignmentFile(args.out_bam, 'wb', header= bam_in.header) counter = 0 start_time = datetime.datetime.now() + total_start_time = datetime.datetime.now() + time_interval = 30 multi_mappers = [] for aln in bam_in.fetch(until_eof=True): - counter = counter + 1 + counter += 1 - if counter % 1000000 == 0: - finish_time = datetime.datetime.now() - delta_seconds = (finish_time - start_time).seconds - # restart time - start_time = finish_time - print(f'Processed 1 million records in {delta_seconds} seconds, total records processed {counter}. current time: {finish_time}') + finish_time = datetime.datetime.now() + delta_seconds = (finish_time - start_time).seconds + total_elapsed_seconds = (finish_time - total_start_time).total_seconds() + + if delta_seconds >= time_interval: + formatted_time = finish_time.strftime('%Y-%m-%d %H:%M:%S') + records_per_second = counter / delta_seconds + print(f'Processed {counter:,} records in {total_elapsed_seconds:,.0f} seconds. Average processing rate: {records_per_second:,.0f} records/second. Current time: {formatted_time}') + + start_time = finish_time + mapped_number = aln.get_tag('NH') if mapped_number == 1: @@ -80,3 +87,5 @@ def is_exonic(aln): # reset multimapper list multi_mappers = [] + + print(f'Finished processing {counter:,} records in {total_elapsed_seconds:,.0f} seconds. Current time: {formatted_time}') \ No newline at end of file From e02d8351c394790ec2462299987762c56b23629f Mon Sep 17 00:00:00 2001 From: Marvin Jens Date: Thu, 5 Dec 2024 13:39:03 +0100 Subject: [PATCH 03/12] * add log rate output to fastq_to_uBAM.py * make names of subprocesses nicer --- environment.yaml | 2 +- spacemake/bin/fastq_to_uBAM.py | 44 ++++++---------------------------- spacemake/util.py | 36 +++++++++++++++++++++------- 3 files changed, 35 insertions(+), 47 deletions(-) diff --git a/environment.yaml b/environment.yaml index e0f21c6d..bfbe7420 100644 --- a/environment.yaml +++ b/environment.yaml @@ -31,7 +31,7 @@ dependencies: - isal - pytest - pytest-cov - - mrfifo>=0.3.0 + - mrfifo>=0.8.1 - pandas>2 - scanpy>=1.8.1 - leidenalg>=0.8.1 diff --git a/spacemake/bin/fastq_to_uBAM.py b/spacemake/bin/fastq_to_uBAM.py index a13e2a3f..b8688de4 100644 --- a/spacemake/bin/fastq_to_uBAM.py +++ b/spacemake/bin/fastq_to_uBAM.py @@ -100,7 +100,7 @@ def quality_trim(fq_src, min_qual=20, phred_base=33): def render_to_sam(fq1, fq2, sam_out, args, **kwargs): - logger = util.setup_logging(args, "spacemake.bin.fastq_to_uBAM.worker") + logger = util.setup_logging(args, "fastq_to_uBAM.worker", rename_process=False) logger.debug( f"starting up with fq1={fq1}, fq2={fq2} sam_out={sam_out} and args={args}" ) @@ -151,41 +151,6 @@ def iter_single(fq2): sam_out.write(make_sam_record(flag=4, **attrs)) return N - # if args.paired_end: - # # if args.paired_end[0] == 'r': - # # # rev_comp R1 and reverse qual1 - # # q1 = q1[::-1] - # # r1 = util.rev_comp(r1) - - # # if args.paired_end[1] == 'r': - # # # rev_comp R2 and reverse qual2 - # # q2 = q2[::-1] - # # r2 = util.rev_comp(r2) - - # rec1 = fmt.make_bam_record( - # qname=fqid, - # r1="THISSHOULDNEVERBEREFERENCED", - # r2=r1, - # R1=r1, - # R2=r2, - # r2_qual=q1, - # r2_qname=fqid, - # flag=69, # unmapped, paired, first in pair - # ) - # rec2 = fmt.make_bam_record( - # qname=fqid, - # r1="THISSHOULDNEVERBEREFERENCED", - # r2=r2, - # R1=r1, - # R2=r2, - # r2_qual=q2, - # r2_qname=fqid, - # flag=133, # unmapped, paired, second in pair - # ) - # results.append(rec1) - # results.append(rec2) - - # else: def main(args): @@ -194,7 +159,9 @@ def main(args): # queues for communication between processes w = ( - mf.Workflow("fastq_to_uBAM", total_pipe_buffer_MB=args.pipe_buffer) + mf.Workflow( + f"[{args.sample}]fastq_to_uBAM", total_pipe_buffer_MB=args.pipe_buffer + ) # open reads2.fastq.gz .gz_reader(inputs=input_reads2, output=mf.FIFO("read2", "wb")).distribute( input=mf.FIFO("read2", "rt"), @@ -244,6 +211,9 @@ def main(args): # output="/dev/stdout" # ) output=mf.FIFO("sam_combined", "wt"), + log_rate_every_n=1000000, + log_rate_template="written {M_out:.1f} M BAM records ({MPS:.3f} M/s, overall {mps:.3f} M/s)", + log_name="fastq_to_uBAM.collect", ) # compress to BAM w.funnel( diff --git a/spacemake/util.py b/spacemake/util.py index 7fdedf40..f8d269d3 100644 --- a/spacemake/util.py +++ b/spacemake/util.py @@ -2,7 +2,7 @@ import os import logging -#from contextlib import ContextDecorator, contextmanager +# from contextlib import ContextDecorator, contextmanager from spacemake.errors import SpacemakeError, FileWrongExtensionError from spacemake.contrib import __version__, __license__, __author__, __email__ @@ -10,14 +10,16 @@ bool_in_str = ["True", "true", "False", "false"] -def generate_kmers(k, nts='ACGT'): + +def generate_kmers(k, nts="ACGT"): if k == 0: - yield '' + yield "" elif k > 0: for x in nts: - for mer in generate_kmers(k-1, nts=nts): + for mer in generate_kmers(k - 1, nts=nts): yield x + mer + class dotdict(dict): """dot.notation access to dictionary attributes""" @@ -50,6 +52,7 @@ def wc_fill(x, wc): data_root_type=getattr(wc, "data_root_type", "complete_data"), ) + def quiet_bam_open(*argc, **kw): """_summary_ @@ -112,6 +115,7 @@ def ensure_path(path): os.makedirs(dirname, exist_ok=True) return path + def timed_loop( src, logger, @@ -189,7 +193,13 @@ def read_fq(fname, skim=0): src = FASTQ_src(fname) # assume its a stream or file-like object already n = 0 - for record in timed_loop(src, logger, T=15, template="processed {i} reads in {dT:.1f}sec. ({rate:.3f} k rec/sec)", skim=skim): + for record in timed_loop( + src, + logger, + T=15, + template="processed {i} reads in {dT:.1f}sec. ({rate:.3f} k rec/sec)", + skim=skim, + ): yield record n += 1 @@ -396,6 +406,7 @@ def fasta_chunks(lines, strip=True, fuse=True): # except SpacemakeError as e: # print(e) + def message_aggregation(log_listen="spacemake", print_logger=False, print_success=True): from functools import wraps @@ -428,10 +439,11 @@ def handle(this, record): print(f"{LINE_SEPARATOR}SUCCESS!") return res - return wrapper + return wrapper return the_decorator + def str_to_list(value): # if list in string representation, return the list if value is None: @@ -477,11 +489,14 @@ def setup_logging( name="spacemake.main", log_file="", FORMAT="%(asctime)-20s\t{sample:30s}\t%(name)-50s\t%(levelname)s\t%(message)s", + rename_process=True, ): sample = getattr(args, "sample", "na") - import setproctitle - if name != "spacemake.main": - setproctitle.setproctitle(f"{name} {sample}") + if rename_process: + import setproctitle + + if name != "spacemake.main": + setproctitle.setproctitle(f"{name} {sample}") FORMAT = FORMAT.format(sample=sample) @@ -514,11 +529,14 @@ def setup_logging( return logger + def setup_smk_logging(name="spacemake.smk", **kw): import argparse + args = argparse.Namespace(**kw) return setup_logging(args, name=name) + default_log_level = "INFO" From 4a15cf992f0f933397fc314900473450740775fe Mon Sep 17 00:00:00 2001 From: Marvin Jens Date: Mon, 9 Dec 2024 11:57:51 +0100 Subject: [PATCH 04/12] * added CRAM suport by means of supplying --bam-fmt=C arg. * corrected minor rate output confusion (running and total average rates were swapped) --- spacemake/bin/fastq_to_uBAM.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/spacemake/bin/fastq_to_uBAM.py b/spacemake/bin/fastq_to_uBAM.py index b8688de4..a66bf144 100644 --- a/spacemake/bin/fastq_to_uBAM.py +++ b/spacemake/bin/fastq_to_uBAM.py @@ -212,7 +212,7 @@ def main(args): # ) output=mf.FIFO("sam_combined", "wt"), log_rate_every_n=1000000, - log_rate_template="written {M_out:.1f} M BAM records ({MPS:.3f} M/s, overall {mps:.3f} M/s)", + log_rate_template="written {M_out:.1f} M BAM records ({mps:.3f} M/s, overall {MPS:.3f} M/s)", log_name="fastq_to_uBAM.collect", ) # compress to BAM @@ -221,7 +221,7 @@ def main(args): input=mf.FIFO("sam_combined", "rt"), output=args.out_bam, _manage_fifos=False, - fmt="Sbh", + fmt=f"Sh{args.bam_fmt}", threads=16, ) return w.run() @@ -357,6 +357,11 @@ def parse_args(): default="CR:{cell},CB:{cell},MI:{UMI},RG:A", help="a template of comma-separated BAM tags to generate. Variables are replaced with extracted cell barcode, UMI etc.", ) + parser.add_argument( + "--bam-fmt", default="b", choices=["b", "C"], + help="b=BAM (default), C=CRAM", + ) + args = parser.parse_args() if args.parallel < 1: raise ValueError(f"--parallel {args.parallel} is invalid. Must be >= 1") From 3c90089a069e196ba6162b766e1e317f8cb1ca51 Mon Sep 17 00:00:00 2001 From: Marvin Jens Date: Wed, 11 Dec 2024 18:23:01 +0100 Subject: [PATCH 05/12] * add experimental support for qual score quantization, and qname simplification for up to ~40% reduction in uBAM file size --- spacemake/bin/fastq_to_uBAM.py | 87 ++++++++++++++++++++++++++++++---- 1 file changed, 77 insertions(+), 10 deletions(-) diff --git a/spacemake/bin/fastq_to_uBAM.py b/spacemake/bin/fastq_to_uBAM.py index a66bf144..35a466da 100644 --- a/spacemake/bin/fastq_to_uBAM.py +++ b/spacemake/bin/fastq_to_uBAM.py @@ -84,25 +84,60 @@ def make_sam_record( def quality_trim(fq_src, min_qual=20, phred_base=33): + from cutadapt.qualtrim import quality_trim_index for name, seq, qual in fq_src: end = len(seq) - q = np.array(bytearray(qual.encode("ASCII"))) - phred_base - qtrim = q >= min_qual - new_end = end - (qtrim[::-1]).argmax() + new_start, new_end = quality_trim_index(qual, min_qual, min_qual) + # we use the cutadapt function here (which implements BWA's logic). + n_trimmed = len(qual) - (new_end - new_start) # TODO: yield A3,T3 adapter-trimming tags - # TODO: convert to cutadapt/BWA qual-trim logic - if new_end != end: - qual = qual[:new_end] - seq = seq[:new_end] + qual = qual[new_start:new_end] + seq = seq[new_start:new_end] yield (name, seq, qual) -def render_to_sam(fq1, fq2, sam_out, args, **kwargs): +#def quality_trim(fq_src, min_qual=20, phred_base=33): +# for name, seq, qual in fq_src: +# end = len(seq) +# q = np.array(bytearray(qual.encode("ASCII"))) - phred_base +# qtrim = q >= min_qual +# new_end = end - (qtrim[::-1]).argmax() +# + # TODO: yield A3,T3 adapter-trimming tags + # TODO: convert to cutadapt/BWA qual-trim logic +# if new_end != end: +# qual = qual[:new_end] +# seq = seq[:new_end] + +# yield (name, seq, qual) + + +def simplify_qname(qname, n): + return f"{int(n):d}" +# return f"{qname.split(':')[0]}_{n}" + + +QMAP = {} +for Q in range(40): + C = chr(Q + 33) + q = (Q // 5) * 5 + c = chr(q + 33) + QMAP[C] = c + + +def quantize_quality(qual): + return "".join([QMAP[q] for q in qual]) + + +def render_to_sam(fq1, fq2, sam_out, args, _extra_args={}, **kwargs): + + _n = _extra_args['n'] + logger = util.setup_logging(args, "fastq_to_uBAM.worker", rename_process=False) logger.debug( - f"starting up with fq1={fq1}, fq2={fq2} sam_out={sam_out} and args={args}" + f"starting up with fq1={fq1}, fq2={fq2} sam_out={sam_out} and args={args} _extra_wargs={_extra_args}" ) def iter_paired(fq1, fq2): @@ -123,6 +158,8 @@ def iter_single(fq2): ) for fqid, seq, qual in fq_src2: +# if args.qual_quantization: +# qual = quantize_quality(qual) yield (fqid, "NA", "NA"), (fqid, seq, qual) if fq1: @@ -147,6 +184,12 @@ def iter_single(fq2): for (fqid, r1, q1), (_, r2, q2) in ingress: N.count("total") + if args.qual_quantization: + q2 = quantize_quality(q2) + + if args.simplify_read_id: + fqid = simplify_qname(fqid, N.stats['total'] + args.chunk_size * _n) + attrs = fmt(r2_qname=fqid, r1=r1, r1_qual=q1, r2=r2, r2_qual=q2) sam_out.write(make_sam_record(flag=4, **attrs)) @@ -335,6 +378,28 @@ def parse_args(): type=int, help="phred quality base (default=33)", ) + parser.add_argument( + "--qual-quantization", + default=False, + # type=bool, + action="store_true", + help="Quantize quality scores to just 4 levels for better compression/smaller file size (default=False)", + ) + parser.add_argument( + "--simplify-read-id", + default=False, + # type=bool, + action="store_true", + help="Simplify read name (FASTQ id, QNAME) for better compression/smaller file size (default=False)", + ) + + parser.add_argument( + "--qname-simplification", + default=False, + action="store_true", + help="Truncate flow-cell tile coordinates from QNAME and replace with counter for better compression/smaller file size (default=False)", + ) + parser.add_argument( "--pipe-buffer", default=4, @@ -358,7 +423,9 @@ def parse_args(): help="a template of comma-separated BAM tags to generate. Variables are replaced with extracted cell barcode, UMI etc.", ) parser.add_argument( - "--bam-fmt", default="b", choices=["b", "C"], + "--bam-fmt", + default="b", + choices=["b", "C"], help="b=BAM (default), C=CRAM", ) From 43af4f77fe354314b0aa6703f73b5692839aa0e8 Mon Sep 17 00:00:00 2001 From: Marvin Jens Date: Fri, 17 Jan 2025 12:48:45 +0100 Subject: [PATCH 06/12] * more experiments with CRAM in fastq_to_uBAM to tweak file size --- spacemake/bin/fastq_to_uBAM.py | 56 +++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/spacemake/bin/fastq_to_uBAM.py b/spacemake/bin/fastq_to_uBAM.py index 35a466da..c6b2c336 100644 --- a/spacemake/bin/fastq_to_uBAM.py +++ b/spacemake/bin/fastq_to_uBAM.py @@ -85,6 +85,7 @@ def make_sam_record( def quality_trim(fq_src, min_qual=20, phred_base=33): from cutadapt.qualtrim import quality_trim_index + for name, seq, qual in fq_src: end = len(seq) new_start, new_end = quality_trim_index(qual, min_qual, min_qual) @@ -98,15 +99,15 @@ def quality_trim(fq_src, min_qual=20, phred_base=33): yield (name, seq, qual) -#def quality_trim(fq_src, min_qual=20, phred_base=33): +# def quality_trim(fq_src, min_qual=20, phred_base=33): # for name, seq, qual in fq_src: # end = len(seq) # q = np.array(bytearray(qual.encode("ASCII"))) - phred_base # qtrim = q >= min_qual # new_end = end - (qtrim[::-1]).argmax() # - # TODO: yield A3,T3 adapter-trimming tags - # TODO: convert to cutadapt/BWA qual-trim logic +# TODO: yield A3,T3 adapter-trimming tags +# TODO: convert to cutadapt/BWA qual-trim logic # if new_end != end: # qual = qual[:new_end] # seq = seq[:new_end] @@ -116,13 +117,15 @@ def quality_trim(fq_src, min_qual=20, phred_base=33): def simplify_qname(qname, n): return f"{int(n):d}" + + # return f"{qname.split(':')[0]}_{n}" - + QMAP = {} -for Q in range(40): +for Q in range(41): C = chr(Q + 33) - q = (Q // 5) * 5 + q = int(np.round(Q / 10, 0)) * 10 c = chr(q + 33) QMAP[C] = c @@ -133,11 +136,11 @@ def quantize_quality(qual): def render_to_sam(fq1, fq2, sam_out, args, _extra_args={}, **kwargs): - _n = _extra_args['n'] + w = _extra_args["n"] logger = util.setup_logging(args, "fastq_to_uBAM.worker", rename_process=False) logger.debug( - f"starting up with fq1={fq1}, fq2={fq2} sam_out={sam_out} and args={args} _extra_wargs={_extra_args}" + f"starting up with fq1={fq1}, fq2={fq2} sam_out={sam_out} and args={args} _extra_args={_extra_args}" ) def iter_paired(fq1, fq2): @@ -158,8 +161,8 @@ def iter_single(fq2): ) for fqid, seq, qual in fq_src2: -# if args.qual_quantization: -# qual = quantize_quality(qual) + if args.qual_quantization: + qual = quantize_quality(qual) yield (fqid, "NA", "NA"), (fqid, seq, qual) if fq1: @@ -179,17 +182,21 @@ def iter_single(fq2): # args = argparse.Namespace(**kw) N = mf.util.CountDict() + c = args.chunk_size + p = args.parallel fmt = make_formatter_from_args(args) # , **params for (fqid, r1, q1), (_, r2, q2) in ingress: - N.count("total") if args.qual_quantization: - q2 = quantize_quality(q2) + q2 = quantize_quality(q2) if args.simplify_read_id: - fqid = simplify_qname(fqid, N.stats['total'] + args.chunk_size * _n) + i = N.stats["total"] + n = c * (w + (i // c) * (p + w)) + i % c + fqid = simplify_qname(fqid, n) + N.count("total") attrs = fmt(r2_qname=fqid, r1=r1, r1_qual=q1, r2=r2, r2_qual=q2) sam_out.write(make_sam_record(flag=4, **attrs)) @@ -259,12 +266,15 @@ def main(args): log_name="fastq_to_uBAM.collect", ) # compress to BAM + fmt_opt = " ".join([f"--output-fmt-option {o}" for o in args.output_fmt_option]) + fmt = f"Sh -O {args.output_fmt} {fmt_opt}" + w.funnel( func=mf.parts.bam_writer, # mf.parts.null_writer, # input=mf.FIFO("sam_combined", "rt"), output=args.out_bam, _manage_fifos=False, - fmt=f"Sh{args.bam_fmt}", + fmt=fmt, threads=16, ) return w.run() @@ -423,10 +433,20 @@ def parse_args(): help="a template of comma-separated BAM tags to generate. Variables are replaced with extracted cell barcode, UMI etc.", ) parser.add_argument( - "--bam-fmt", - default="b", - choices=["b", "C"], - help="b=BAM (default), C=CRAM", + "--output-fmt", + default="BAM", + choices=[ + "SAM", + "BAM", + "CRAM", + ], + help="SAM, BAM (default), or CRAM", + ) + parser.add_argument( + "--output-fmt-option", + default="", + nargs="+", + help="passed options to `samtools view` as --output-fmt-options", ) args = parser.parse_args() From c7fcfc851e46e6b2e6f76f583fdbd6ccc30f5dcc Mon Sep 17 00:00:00 2001 From: Marvin Jens Date: Mon, 27 Jan 2025 08:32:41 +0100 Subject: [PATCH 07/12] * initial CRAM support for fastq_to_uBAM, with some experimental features to improve compression even further (qname simplification, qual binning) --- spacemake/bin/fastq_to_uBAM.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacemake/bin/fastq_to_uBAM.py b/spacemake/bin/fastq_to_uBAM.py index c6b2c336..fe233992 100644 --- a/spacemake/bin/fastq_to_uBAM.py +++ b/spacemake/bin/fastq_to_uBAM.py @@ -115,8 +115,9 @@ def quality_trim(fq_src, min_qual=20, phred_base=33): # yield (name, seq, qual) -def simplify_qname(qname, n): - return f"{int(n):d}" +def simplify_qname(qname, i, w, c, p): + n = c * (w + (i // c) * (p + w)) + i % c + return f"n={int(n):d} worker={w} parallel={p} internal={i} chunk_size={c}" # return f"{qname.split(':')[0]}_{n}" @@ -193,8 +194,7 @@ def iter_single(fq2): if args.simplify_read_id: i = N.stats["total"] - n = c * (w + (i // c) * (p + w)) + i % c - fqid = simplify_qname(fqid, n) + fqid = simplify_qname(fqid, i, w, c, p) N.count("total") attrs = fmt(r2_qname=fqid, r1=r1, r1_qual=q1, r2=r2, r2_qual=q2) From 59282052d6c5c625e4bef251c3c47a0dbd9c2c45 Mon Sep 17 00:00:00 2001 From: Marvin Jens Date: Wed, 29 Jan 2025 22:13:23 +0100 Subject: [PATCH 08/12] Revert "* initial CRAM support for fastq_to_uBAM, with some experimental features to improve compression even further (qname simplification, qual binning)" This reverts commit c7fcfc851e46e6b2e6f76f583fdbd6ccc30f5dcc. --- spacemake/bin/fastq_to_uBAM.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacemake/bin/fastq_to_uBAM.py b/spacemake/bin/fastq_to_uBAM.py index fe233992..c6b2c336 100644 --- a/spacemake/bin/fastq_to_uBAM.py +++ b/spacemake/bin/fastq_to_uBAM.py @@ -115,9 +115,8 @@ def quality_trim(fq_src, min_qual=20, phred_base=33): # yield (name, seq, qual) -def simplify_qname(qname, i, w, c, p): - n = c * (w + (i // c) * (p + w)) + i % c - return f"n={int(n):d} worker={w} parallel={p} internal={i} chunk_size={c}" +def simplify_qname(qname, n): + return f"{int(n):d}" # return f"{qname.split(':')[0]}_{n}" @@ -194,7 +193,8 @@ def iter_single(fq2): if args.simplify_read_id: i = N.stats["total"] - fqid = simplify_qname(fqid, i, w, c, p) + n = c * (w + (i // c) * (p + w)) + i % c + fqid = simplify_qname(fqid, n) N.count("total") attrs = fmt(r2_qname=fqid, r1=r1, r1_qual=q1, r2=r2, r2_qual=q2) From 15e24ab2f9ec2d036d588513b30b4ea3261f2964 Mon Sep 17 00:00:00 2001 From: Marvin Jens Date: Wed, 29 Jan 2025 22:13:23 +0100 Subject: [PATCH 09/12] Revert "* more experiments with CRAM in fastq_to_uBAM to tweak file size" This reverts commit 43af4f77fe354314b0aa6703f73b5692839aa0e8. --- spacemake/bin/fastq_to_uBAM.py | 56 +++++++++++----------------------- 1 file changed, 18 insertions(+), 38 deletions(-) diff --git a/spacemake/bin/fastq_to_uBAM.py b/spacemake/bin/fastq_to_uBAM.py index c6b2c336..35a466da 100644 --- a/spacemake/bin/fastq_to_uBAM.py +++ b/spacemake/bin/fastq_to_uBAM.py @@ -85,7 +85,6 @@ def make_sam_record( def quality_trim(fq_src, min_qual=20, phred_base=33): from cutadapt.qualtrim import quality_trim_index - for name, seq, qual in fq_src: end = len(seq) new_start, new_end = quality_trim_index(qual, min_qual, min_qual) @@ -99,15 +98,15 @@ def quality_trim(fq_src, min_qual=20, phred_base=33): yield (name, seq, qual) -# def quality_trim(fq_src, min_qual=20, phred_base=33): +#def quality_trim(fq_src, min_qual=20, phred_base=33): # for name, seq, qual in fq_src: # end = len(seq) # q = np.array(bytearray(qual.encode("ASCII"))) - phred_base # qtrim = q >= min_qual # new_end = end - (qtrim[::-1]).argmax() # -# TODO: yield A3,T3 adapter-trimming tags -# TODO: convert to cutadapt/BWA qual-trim logic + # TODO: yield A3,T3 adapter-trimming tags + # TODO: convert to cutadapt/BWA qual-trim logic # if new_end != end: # qual = qual[:new_end] # seq = seq[:new_end] @@ -117,15 +116,13 @@ def quality_trim(fq_src, min_qual=20, phred_base=33): def simplify_qname(qname, n): return f"{int(n):d}" - - # return f"{qname.split(':')[0]}_{n}" - + QMAP = {} -for Q in range(41): +for Q in range(40): C = chr(Q + 33) - q = int(np.round(Q / 10, 0)) * 10 + q = (Q // 5) * 5 c = chr(q + 33) QMAP[C] = c @@ -136,11 +133,11 @@ def quantize_quality(qual): def render_to_sam(fq1, fq2, sam_out, args, _extra_args={}, **kwargs): - w = _extra_args["n"] + _n = _extra_args['n'] logger = util.setup_logging(args, "fastq_to_uBAM.worker", rename_process=False) logger.debug( - f"starting up with fq1={fq1}, fq2={fq2} sam_out={sam_out} and args={args} _extra_args={_extra_args}" + f"starting up with fq1={fq1}, fq2={fq2} sam_out={sam_out} and args={args} _extra_wargs={_extra_args}" ) def iter_paired(fq1, fq2): @@ -161,8 +158,8 @@ def iter_single(fq2): ) for fqid, seq, qual in fq_src2: - if args.qual_quantization: - qual = quantize_quality(qual) +# if args.qual_quantization: +# qual = quantize_quality(qual) yield (fqid, "NA", "NA"), (fqid, seq, qual) if fq1: @@ -182,21 +179,17 @@ def iter_single(fq2): # args = argparse.Namespace(**kw) N = mf.util.CountDict() - c = args.chunk_size - p = args.parallel fmt = make_formatter_from_args(args) # , **params for (fqid, r1, q1), (_, r2, q2) in ingress: + N.count("total") if args.qual_quantization: - q2 = quantize_quality(q2) + q2 = quantize_quality(q2) if args.simplify_read_id: - i = N.stats["total"] - n = c * (w + (i // c) * (p + w)) + i % c - fqid = simplify_qname(fqid, n) + fqid = simplify_qname(fqid, N.stats['total'] + args.chunk_size * _n) - N.count("total") attrs = fmt(r2_qname=fqid, r1=r1, r1_qual=q1, r2=r2, r2_qual=q2) sam_out.write(make_sam_record(flag=4, **attrs)) @@ -266,15 +259,12 @@ def main(args): log_name="fastq_to_uBAM.collect", ) # compress to BAM - fmt_opt = " ".join([f"--output-fmt-option {o}" for o in args.output_fmt_option]) - fmt = f"Sh -O {args.output_fmt} {fmt_opt}" - w.funnel( func=mf.parts.bam_writer, # mf.parts.null_writer, # input=mf.FIFO("sam_combined", "rt"), output=args.out_bam, _manage_fifos=False, - fmt=fmt, + fmt=f"Sh{args.bam_fmt}", threads=16, ) return w.run() @@ -433,20 +423,10 @@ def parse_args(): help="a template of comma-separated BAM tags to generate. Variables are replaced with extracted cell barcode, UMI etc.", ) parser.add_argument( - "--output-fmt", - default="BAM", - choices=[ - "SAM", - "BAM", - "CRAM", - ], - help="SAM, BAM (default), or CRAM", - ) - parser.add_argument( - "--output-fmt-option", - default="", - nargs="+", - help="passed options to `samtools view` as --output-fmt-options", + "--bam-fmt", + default="b", + choices=["b", "C"], + help="b=BAM (default), C=CRAM", ) args = parser.parse_args() From a7eaaf5850eb5cf0c190661e1344cc83a1e2a2cb Mon Sep 17 00:00:00 2001 From: Marvin Jens Date: Wed, 29 Jan 2025 22:13:23 +0100 Subject: [PATCH 10/12] Revert "* add experimental support for qual score quantization, and qname simplification for up to ~40% reduction in uBAM file size" This reverts commit 3c90089a069e196ba6162b766e1e317f8cb1ca51. --- spacemake/bin/fastq_to_uBAM.py | 87 ++++------------------------------ 1 file changed, 10 insertions(+), 77 deletions(-) diff --git a/spacemake/bin/fastq_to_uBAM.py b/spacemake/bin/fastq_to_uBAM.py index 35a466da..a66bf144 100644 --- a/spacemake/bin/fastq_to_uBAM.py +++ b/spacemake/bin/fastq_to_uBAM.py @@ -84,60 +84,25 @@ def make_sam_record( def quality_trim(fq_src, min_qual=20, phred_base=33): - from cutadapt.qualtrim import quality_trim_index for name, seq, qual in fq_src: end = len(seq) - new_start, new_end = quality_trim_index(qual, min_qual, min_qual) - # we use the cutadapt function here (which implements BWA's logic). - n_trimmed = len(qual) - (new_end - new_start) + q = np.array(bytearray(qual.encode("ASCII"))) - phred_base + qtrim = q >= min_qual + new_end = end - (qtrim[::-1]).argmax() - # TODO: yield A3,T3 adapter-trimming tags - qual = qual[new_start:new_end] - seq = seq[new_start:new_end] - - yield (name, seq, qual) - - -#def quality_trim(fq_src, min_qual=20, phred_base=33): -# for name, seq, qual in fq_src: -# end = len(seq) -# q = np.array(bytearray(qual.encode("ASCII"))) - phred_base -# qtrim = q >= min_qual -# new_end = end - (qtrim[::-1]).argmax() -# # TODO: yield A3,T3 adapter-trimming tags # TODO: convert to cutadapt/BWA qual-trim logic -# if new_end != end: -# qual = qual[:new_end] -# seq = seq[:new_end] - -# yield (name, seq, qual) - - -def simplify_qname(qname, n): - return f"{int(n):d}" -# return f"{qname.split(':')[0]}_{n}" - - -QMAP = {} -for Q in range(40): - C = chr(Q + 33) - q = (Q // 5) * 5 - c = chr(q + 33) - QMAP[C] = c - - -def quantize_quality(qual): - return "".join([QMAP[q] for q in qual]) - + if new_end != end: + qual = qual[:new_end] + seq = seq[:new_end] -def render_to_sam(fq1, fq2, sam_out, args, _extra_args={}, **kwargs): + yield (name, seq, qual) - _n = _extra_args['n'] +def render_to_sam(fq1, fq2, sam_out, args, **kwargs): logger = util.setup_logging(args, "fastq_to_uBAM.worker", rename_process=False) logger.debug( - f"starting up with fq1={fq1}, fq2={fq2} sam_out={sam_out} and args={args} _extra_wargs={_extra_args}" + f"starting up with fq1={fq1}, fq2={fq2} sam_out={sam_out} and args={args}" ) def iter_paired(fq1, fq2): @@ -158,8 +123,6 @@ def iter_single(fq2): ) for fqid, seq, qual in fq_src2: -# if args.qual_quantization: -# qual = quantize_quality(qual) yield (fqid, "NA", "NA"), (fqid, seq, qual) if fq1: @@ -184,12 +147,6 @@ def iter_single(fq2): for (fqid, r1, q1), (_, r2, q2) in ingress: N.count("total") - if args.qual_quantization: - q2 = quantize_quality(q2) - - if args.simplify_read_id: - fqid = simplify_qname(fqid, N.stats['total'] + args.chunk_size * _n) - attrs = fmt(r2_qname=fqid, r1=r1, r1_qual=q1, r2=r2, r2_qual=q2) sam_out.write(make_sam_record(flag=4, **attrs)) @@ -378,28 +335,6 @@ def parse_args(): type=int, help="phred quality base (default=33)", ) - parser.add_argument( - "--qual-quantization", - default=False, - # type=bool, - action="store_true", - help="Quantize quality scores to just 4 levels for better compression/smaller file size (default=False)", - ) - parser.add_argument( - "--simplify-read-id", - default=False, - # type=bool, - action="store_true", - help="Simplify read name (FASTQ id, QNAME) for better compression/smaller file size (default=False)", - ) - - parser.add_argument( - "--qname-simplification", - default=False, - action="store_true", - help="Truncate flow-cell tile coordinates from QNAME and replace with counter for better compression/smaller file size (default=False)", - ) - parser.add_argument( "--pipe-buffer", default=4, @@ -423,9 +358,7 @@ def parse_args(): help="a template of comma-separated BAM tags to generate. Variables are replaced with extracted cell barcode, UMI etc.", ) parser.add_argument( - "--bam-fmt", - default="b", - choices=["b", "C"], + "--bam-fmt", default="b", choices=["b", "C"], help="b=BAM (default), C=CRAM", ) From 852c868307109ce3b763efd9b026060687858db6 Mon Sep 17 00:00:00 2001 From: Marvin Jens Date: Wed, 29 Jan 2025 22:13:23 +0100 Subject: [PATCH 11/12] Revert "* added CRAM suport by means of supplying --bam-fmt=C arg." This reverts commit 4a15cf992f0f933397fc314900473450740775fe. --- spacemake/bin/fastq_to_uBAM.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/spacemake/bin/fastq_to_uBAM.py b/spacemake/bin/fastq_to_uBAM.py index a66bf144..b8688de4 100644 --- a/spacemake/bin/fastq_to_uBAM.py +++ b/spacemake/bin/fastq_to_uBAM.py @@ -212,7 +212,7 @@ def main(args): # ) output=mf.FIFO("sam_combined", "wt"), log_rate_every_n=1000000, - log_rate_template="written {M_out:.1f} M BAM records ({mps:.3f} M/s, overall {MPS:.3f} M/s)", + log_rate_template="written {M_out:.1f} M BAM records ({MPS:.3f} M/s, overall {mps:.3f} M/s)", log_name="fastq_to_uBAM.collect", ) # compress to BAM @@ -221,7 +221,7 @@ def main(args): input=mf.FIFO("sam_combined", "rt"), output=args.out_bam, _manage_fifos=False, - fmt=f"Sh{args.bam_fmt}", + fmt="Sbh", threads=16, ) return w.run() @@ -357,11 +357,6 @@ def parse_args(): default="CR:{cell},CB:{cell},MI:{UMI},RG:A", help="a template of comma-separated BAM tags to generate. Variables are replaced with extracted cell barcode, UMI etc.", ) - parser.add_argument( - "--bam-fmt", default="b", choices=["b", "C"], - help="b=BAM (default), C=CRAM", - ) - args = parser.parse_args() if args.parallel < 1: raise ValueError(f"--parallel {args.parallel} is invalid. Must be >= 1") From 8f4a5902b6b81d2751f01c5a6be401d2311b7a87 Mon Sep 17 00:00:00 2001 From: Marvin Jens Date: Wed, 29 Jan 2025 22:13:23 +0100 Subject: [PATCH 12/12] Revert "* add log rate output to fastq_to_uBAM.py" This reverts commit e02d8351c394790ec2462299987762c56b23629f. --- environment.yaml | 2 +- spacemake/bin/fastq_to_uBAM.py | 44 ++++++++++++++++++++++++++++------ spacemake/util.py | 36 +++++++--------------------- 3 files changed, 47 insertions(+), 35 deletions(-) diff --git a/environment.yaml b/environment.yaml index bfbe7420..e0f21c6d 100644 --- a/environment.yaml +++ b/environment.yaml @@ -31,7 +31,7 @@ dependencies: - isal - pytest - pytest-cov - - mrfifo>=0.8.1 + - mrfifo>=0.3.0 - pandas>2 - scanpy>=1.8.1 - leidenalg>=0.8.1 diff --git a/spacemake/bin/fastq_to_uBAM.py b/spacemake/bin/fastq_to_uBAM.py index b8688de4..a13e2a3f 100644 --- a/spacemake/bin/fastq_to_uBAM.py +++ b/spacemake/bin/fastq_to_uBAM.py @@ -100,7 +100,7 @@ def quality_trim(fq_src, min_qual=20, phred_base=33): def render_to_sam(fq1, fq2, sam_out, args, **kwargs): - logger = util.setup_logging(args, "fastq_to_uBAM.worker", rename_process=False) + logger = util.setup_logging(args, "spacemake.bin.fastq_to_uBAM.worker") logger.debug( f"starting up with fq1={fq1}, fq2={fq2} sam_out={sam_out} and args={args}" ) @@ -151,6 +151,41 @@ def iter_single(fq2): sam_out.write(make_sam_record(flag=4, **attrs)) return N + # if args.paired_end: + # # if args.paired_end[0] == 'r': + # # # rev_comp R1 and reverse qual1 + # # q1 = q1[::-1] + # # r1 = util.rev_comp(r1) + + # # if args.paired_end[1] == 'r': + # # # rev_comp R2 and reverse qual2 + # # q2 = q2[::-1] + # # r2 = util.rev_comp(r2) + + # rec1 = fmt.make_bam_record( + # qname=fqid, + # r1="THISSHOULDNEVERBEREFERENCED", + # r2=r1, + # R1=r1, + # R2=r2, + # r2_qual=q1, + # r2_qname=fqid, + # flag=69, # unmapped, paired, first in pair + # ) + # rec2 = fmt.make_bam_record( + # qname=fqid, + # r1="THISSHOULDNEVERBEREFERENCED", + # r2=r2, + # R1=r1, + # R2=r2, + # r2_qual=q2, + # r2_qname=fqid, + # flag=133, # unmapped, paired, second in pair + # ) + # results.append(rec1) + # results.append(rec2) + + # else: def main(args): @@ -159,9 +194,7 @@ def main(args): # queues for communication between processes w = ( - mf.Workflow( - f"[{args.sample}]fastq_to_uBAM", total_pipe_buffer_MB=args.pipe_buffer - ) + mf.Workflow("fastq_to_uBAM", total_pipe_buffer_MB=args.pipe_buffer) # open reads2.fastq.gz .gz_reader(inputs=input_reads2, output=mf.FIFO("read2", "wb")).distribute( input=mf.FIFO("read2", "rt"), @@ -211,9 +244,6 @@ def main(args): # output="/dev/stdout" # ) output=mf.FIFO("sam_combined", "wt"), - log_rate_every_n=1000000, - log_rate_template="written {M_out:.1f} M BAM records ({MPS:.3f} M/s, overall {mps:.3f} M/s)", - log_name="fastq_to_uBAM.collect", ) # compress to BAM w.funnel( diff --git a/spacemake/util.py b/spacemake/util.py index f8d269d3..7fdedf40 100644 --- a/spacemake/util.py +++ b/spacemake/util.py @@ -2,7 +2,7 @@ import os import logging -# from contextlib import ContextDecorator, contextmanager +#from contextlib import ContextDecorator, contextmanager from spacemake.errors import SpacemakeError, FileWrongExtensionError from spacemake.contrib import __version__, __license__, __author__, __email__ @@ -10,16 +10,14 @@ bool_in_str = ["True", "true", "False", "false"] - -def generate_kmers(k, nts="ACGT"): +def generate_kmers(k, nts='ACGT'): if k == 0: - yield "" + yield '' elif k > 0: for x in nts: - for mer in generate_kmers(k - 1, nts=nts): + for mer in generate_kmers(k-1, nts=nts): yield x + mer - class dotdict(dict): """dot.notation access to dictionary attributes""" @@ -52,7 +50,6 @@ def wc_fill(x, wc): data_root_type=getattr(wc, "data_root_type", "complete_data"), ) - def quiet_bam_open(*argc, **kw): """_summary_ @@ -115,7 +112,6 @@ def ensure_path(path): os.makedirs(dirname, exist_ok=True) return path - def timed_loop( src, logger, @@ -193,13 +189,7 @@ def read_fq(fname, skim=0): src = FASTQ_src(fname) # assume its a stream or file-like object already n = 0 - for record in timed_loop( - src, - logger, - T=15, - template="processed {i} reads in {dT:.1f}sec. ({rate:.3f} k rec/sec)", - skim=skim, - ): + for record in timed_loop(src, logger, T=15, template="processed {i} reads in {dT:.1f}sec. ({rate:.3f} k rec/sec)", skim=skim): yield record n += 1 @@ -406,7 +396,6 @@ def fasta_chunks(lines, strip=True, fuse=True): # except SpacemakeError as e: # print(e) - def message_aggregation(log_listen="spacemake", print_logger=False, print_success=True): from functools import wraps @@ -439,11 +428,10 @@ def handle(this, record): print(f"{LINE_SEPARATOR}SUCCESS!") return res - return wrapper + return wrapper return the_decorator - def str_to_list(value): # if list in string representation, return the list if value is None: @@ -489,14 +477,11 @@ def setup_logging( name="spacemake.main", log_file="", FORMAT="%(asctime)-20s\t{sample:30s}\t%(name)-50s\t%(levelname)s\t%(message)s", - rename_process=True, ): sample = getattr(args, "sample", "na") - if rename_process: - import setproctitle - - if name != "spacemake.main": - setproctitle.setproctitle(f"{name} {sample}") + import setproctitle + if name != "spacemake.main": + setproctitle.setproctitle(f"{name} {sample}") FORMAT = FORMAT.format(sample=sample) @@ -529,14 +514,11 @@ def setup_logging( return logger - def setup_smk_logging(name="spacemake.smk", **kw): import argparse - args = argparse.Namespace(**kw) return setup_logging(args, name=name) - default_log_level = "INFO"