Fix merging error #91

Merged
merged 24 commits into rajewsky-lab:master from merging_issue on Apr 18, 2024

Commits (24)
6dc231e  add tiny spatial test data (danilexn, Jan 18, 2024)
4f08526  no qc and prealignment stats of merged samples (danilexn, Jan 18, 2024)
02a4473  do not require R1 and R2 in merged samples (danilexn, Jan 18, 2024)
bca4239  support filtering of merged files in prealignment file generation (danilexn, Jan 18, 2024)
da56deb  compute union of passed spatial pucks in main smk workflow (danilexn, Jan 18, 2024)
1c18a36  support lists of one element when updating pucks (danilexn, Jan 18, 2024)
eb01f8b  spatial puck width/height with max-min instead of max (danilexn, Jan 18, 2024)
5baabdb  support non-mesh puck collection in main smk (danilexn, Jan 18, 2024)
3dc82ca  throw error when adata is empty (danilexn, Jan 18, 2024)
79ffb15  exclude empty pucks when default matches: 0 (danilexn, Jan 18, 2024)
0a10d9c  add spatial tiny test data (danilexn, Jan 18, 2024)
0fd785c  consolidate pucks: fix reference to puck_ids (danilexn, Jan 18, 2024)
563aa72  fix: qc not working on puck_collection when multiple samples (danilexn, Jan 24, 2024)
dfa1fc5  Merge branch 'rajewsky-lab:master' into merging_issue (Apr 17, 2024)
89abb7e  remove too large spatial test data (danilexn, Apr 17, 2024)
c297310  preprocess/dge avoid division by zero (danilexn, Apr 17, 2024)
d38119c  dge: fix vstack for missing mt in tiny test data (danilexn, Apr 17, 2024)
aaa74b1  update test reads and spatial tiles for 1 match in 2 tiles (danilexn, Apr 17, 2024)
f216360  fixed test reads so it runs (danilexn, Apr 17, 2024)
7cf1339  dge: warning when empty, set puck metrics to 1 when nan (danilexn, Apr 17, 2024)
847c49e  project_df: more verbosity for merge validation (danilexn, Apr 17, 2024)
bf082f0  main.smk: no exception when empty DGE, just warning (danilexn, Apr 17, 2024)
9e27e30  bump version 0.7.5 to 0.7.6 (danilexn, Apr 17, 2024)
ac5709b  main.smk: generate qc reports when merging (danilexn, Apr 17, 2024)
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -6,7 +6,7 @@
copyright = '2021-2023, Rajewsky lab'
author = 'Tamas Ryszard Sztanka-Toth, Marvin Jens, Nikos Karaiskos, Nikolaus Rajewsky'

version = '0.7.5'
version = '0.7.6'
release = version

# -- General configuration
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = spacemake
version = 0.7.5
version = 0.7.6
author = Tamas Ryszard Sztanka-Toth, Marvin Jens, Nikos Karaiskos, Nikolaus Rajewsky
author_email = [email protected]
description = A bioinformatic pipeline for the analysis of spatial transcriptomic data
21 changes: 18 additions & 3 deletions spacemake/preprocess/dge.py
@@ -1,3 +1,8 @@
import logging

logger_name = "spacemake.preprocess.dge"
logger = logging.getLogger(logger_name)

def calculate_adata_metrics(adata, dge_summary_path=None, n_reads=None):
import scanpy as sc
import pandas as pd
@@ -151,7 +156,7 @@ def dge_to_sparse_adata(dge_path, dge_summary_path):
"need to add mt-missing because no mitochondrial stuff was among the genes for annotation"
)
gene_names.append("mt-missing")
X = vstack([X, np.zeros(X.shape[1])]).tocsr()
X = vstack([X, np.zeros((1, X.get_shape()[1]))]).tocsr()

# create anndata object, but we get the transpose of X, so matrix will
# be in CSC format
@@ -169,6 +174,9 @@ def dge_to_sparse_adata(dge_path, dge_summary_path):
# calculate per shannon_entropy and string_compression per bead
calculate_shannon_entropy_scompression(adata)

if adata.X.sum() == 0:
logger.warn(f"The DGE from {dge_path} is empty")

return adata


@@ -252,10 +260,17 @@ def attach_puck_variables(adata, puck_variables):
adata.uns["puck_variables"] = puck_variables

x_pos_max, y_pos_max = tuple(adata.obsm["spatial"].max(axis=0))
x_pos_min, y_pos_min = tuple(adata.obsm["spatial"].min(axis=0))

width_um = adata.uns["puck_variables"]["width_um"]
coord_by_um = x_pos_max / width_um
height_um = int(y_pos_max / coord_by_um)
coord_by_um = (x_pos_max - x_pos_min) / width_um

# this can be NaN if only one coordinate (only one cell, will fail)
if coord_by_um > 0:
height_um = int((y_pos_max - y_pos_min) / coord_by_um)
else:
height_um = 1 # avoid division by zero and error in reports
coord_by_um = 1

adata.uns["puck_variables"]["height_um"] = height_um
adata.uns["puck_variables"]["coord_by_um"] = coord_by_um
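Two of the dge.py fixes above are easy to reproduce in isolation: the zero row appended for the "mt-missing" placeholder gene needs an explicit two-dimensional shape for sparse vstack, and the puck scale factor must come from the coordinate range (max - min) rather than the raw maximum, which inflates the height whenever the puck does not start at the origin. A minimal sketch with made-up coordinates (everything below is hypothetical, not taken from the PR's test data):

```python
import numpy as np
from scipy.sparse import csr_matrix, vstack

# Zero row for the "mt-missing" placeholder gene: an explicit (1, n)
# shape keeps the row unambiguous for sparse vstack.
X = csr_matrix(np.ones((2, 3)))
X = vstack([X, np.zeros((1, X.shape[1]))]).tocsr()
assert X.shape == (3, 3)

# Scale factor from the coordinate *range*, not the raw maximum.
# Hypothetical puck spanning x in [100, 300], y in [200, 500], 100 um wide:
spatial = np.array([[100.0, 200.0], [300.0, 500.0]])
width_um = 100

x_pos_max, y_pos_max = spatial.max(axis=0)
x_pos_min, y_pos_min = spatial.min(axis=0)

coord_by_um = (x_pos_max - x_pos_min) / width_um        # 2.0 units per um
height_um = int((y_pos_max - y_pos_min) / coord_by_um)  # 150 um
assert (coord_by_um, height_um) == (2.0, 150)           # old max-based formula: 166
```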
19 changes: 19 additions & 0 deletions spacemake/project_df.py
@@ -725,6 +725,9 @@ def assert_valid(self):

# check that puck_barcode_file(s), R1 and R2 files exist
for n_col in ['puck_barcode_file', 'R1', 'R2']:
if n_col in ['R1', 'R2'] and row['is_merged']:
continue

if type(row[n_col]) is list:
for _it_row in row[n_col]:
if not os.path.exists(_it_row):
@@ -742,6 +745,22 @@ def assert_valid(self):
_valid_puck_coordinate = False
elif type(row['puck_barcode_file']) is str and row['puck_barcode_file'] == '':
_valid_puck_coordinate = False

# check that merging variables are properly specified
# otherwise, this throws 'MissingInputException' because it cannot find the bam files
if row['is_merged']:
if len(row['merged_from']) < 2:
raise SystemExit(SpacemakeError(f"At {index}, there is <2 samples under 'merged_from'"))
for merged_i in row['merged_from']:
if (not isinstance(merged_i, tuple) and not isinstance(merged_i, list)) or len(merged_i) != 2:
raise SystemExit(SpacemakeError(f"At {index}, wrong format for 'merged_from'.\n"+
"It must be something like "+
"[('project', 'sample_a'), ('project', 'sample_b')]"))
try:
self.assert_sample(merged_i[0], merged_i[1])
except ProjectSampleNotFoundError as e:
self.logger.error(f"Merging error at {index}")
raise e

if not _valid_puck_coordinate:
_puck_vars = self.get_puck_variables(project_id = index[0], sample_id = index[1])
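The added validation fails fast with a readable message instead of surfacing later as a cryptic MissingInputException from Snakemake. A standalone sketch of the same checks, with hypothetical project/sample names and plain exceptions in place of SpacemakeError:

```python
def validate_merged_from(index, merged_from):
    # a merge needs at least two parent samples, and every entry
    # must be a (project_id, sample_id) pair
    if len(merged_from) < 2:
        raise SystemExit(f"At {index}, there is <2 samples under 'merged_from'")
    for merged_i in merged_from:
        if not isinstance(merged_i, (tuple, list)) or len(merged_i) != 2:
            raise SystemExit(
                f"At {index}, wrong format for 'merged_from'.\n"
                "It must be something like "
                "[('project', 'sample_a'), ('project', 'sample_b')]"
            )

# passes silently
validate_merged_from(("proj", "merged_ab"),
                     [("proj", "sample_a"), ("proj", "sample_b")])
```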
48 changes: 43 additions & 5 deletions spacemake/smk.py
@@ -433,6 +433,34 @@ def populate_variables_from_args(pdf, args, arg_prefix=''):

return ret

def consolidate_pucks_merged_samples():
for index, row in pdf.df.iterrows():
project_id, sample_id = index
puck_ids = row['puck_barcode_file_id']

if (not row['is_merged']) or (not pdf.is_spatial(project_id, sample_id, puck_ids)):
continue

if len(puck_ids) >= 1:
puck_ids = puck_ids[0]
elif len(puck_ids):
puck_ids = pdf.project_df_default_values['puck_barcode_file_id']

merged_from = row['merged_from']
puck_id_file = set()

for sample_tuple in merged_from:
pid = pdf.df.loc[sample_tuple]['puck_barcode_file_id']
pbf = pdf.df.loc[sample_tuple]['puck_barcode_file']
_tuple = [(id, bf) for id, bf in zip(pid, pbf)]

puck_id_file.update([ tuple(t) for t in _tuple ])

pid, pbf = list(zip(*list(puck_id_file)))
pdf.df.loc[index, 'puck_barcode_file_id'] = list(pid)
pdf.df.loc[index, 'puck_barcode_file'] = list(pbf)


def update_project_df_barcode_matches(prealigned=False):
from spacemake.snakemake.variables import puck_count_barcode_matches_summary, puck_count_prealigned_barcode_matches_summary

@@ -441,9 +469,18 @@
else:
_bc_file = puck_count_barcode_matches_summary

for index, _ in pdf.df.iterrows():
for index, row in pdf.df.iterrows():
project_id, sample_id = index

puck_ids = row['puck_barcode_file_id']
if len(puck_ids) >= 1:
puck_ids = puck_ids[0]
elif len(puck_ids):
puck_ids = pdf.project_df_default_values['puck_barcode_file_id']

if (row['is_merged'] and prealigned) or (not pdf.is_spatial(project_id, sample_id, puck_ids)):
continue

# check if barcodes have been filtered
_f_barcodes_df = _bc_file.format(project_id=project_id,
sample_id=sample_id)
@@ -457,11 +494,11 @@

above_threshold_mask = barcodes_df['pass_threshold'] == 1

_puck_barcode_files = barcodes_df[above_threshold_mask]['puck_barcode_file'].values.tolist().__str__()
_puck_barcode_files_id = barcodes_df[above_threshold_mask]['puck_barcode_file_id'].values.tolist().__str__()
_puck_barcode_files = barcodes_df[above_threshold_mask]['puck_barcode_file'].values.tolist()
_puck_barcode_files_id = barcodes_df[above_threshold_mask]['puck_barcode_file_id'].values.tolist()

pdf.df.loc[index, 'puck_barcode_file'] = _puck_barcode_files
pdf.df.loc[index, 'puck_barcode_file_id'] = _puck_barcode_files_id
pdf.df.at[index, 'puck_barcode_file'] = _puck_barcode_files
pdf.df.at[index, 'puck_barcode_file_id'] = _puck_barcode_files_id

@message_aggregation(logger_name)
def spacemake_run(pdf, args):
@@ -533,6 +570,7 @@ def spacemake_run(pdf, args):
# update valid pucks (above threshold) before continuing to downstream
# this performs counting of matching barcodes after alignment
update_project_df_barcode_matches(prealigned=True)
consolidate_pucks_merged_samples()
pdf.dump()

# run snakemake after prealignment filter
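consolidate_pucks_merged_samples computes, for each merged sample, the union of the (puck_id, barcode_file) pairs of its parent samples. Note also the switch from .loc to .at in update_project_df_barcode_matches: .at stores a list as a single cell value, whereas .loc tries to broadcast it across the selection. A toy version with hypothetical sample and puck names:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "puck_barcode_file_id": [["puck_1"], ["puck_1", "puck_2"], None],
        "puck_barcode_file": [["p1.csv"], ["p1.csv", "p2.csv"], None],
    },
    index=["sample_a", "sample_b", "merged_ab"],
)

# union of (puck_id, barcode_file) pairs across the parent samples
puck_id_file = set()
for sample in ["sample_a", "sample_b"]:
    ids = df.at[sample, "puck_barcode_file_id"]
    files = df.at[sample, "puck_barcode_file"]
    puck_id_file.update(zip(ids, files))

pid, pbf = zip(*sorted(puck_id_file))

# .at assigns the list as one cell value (object dtype)
df.at["merged_ab", "puck_barcode_file_id"] = list(pid)
df.at["merged_ab", "puck_barcode_file"] = list(pbf)

print(df.at["merged_ab", "puck_barcode_file_id"])  # ['puck_1', 'puck_2']
```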
55 changes: 52 additions & 3 deletions spacemake/snakemake/main.smk
@@ -161,8 +161,9 @@ rule get_stats_prealigned_barcodes:
data_root_type='complete_data',
downsampling_percentage='',
run_on_external=False,
filter_merged=False
),
get_prealignment_files(puck_count_prealigned_barcode_matches_summary)
get_prealignment_files(puck_count_prealigned_barcode_matches_summary, filter_merged=True)

rule get_whitelist_barcodes:
input:
@@ -541,7 +542,51 @@ rule create_mesh_spatial_dge:

rule puck_collection_stitching:
input:
unpack(get_puck_collection_stitching_input),
unpack(lambda wc: get_puck_collection_stitching_input(wc, to_mesh=False)),
# the puck_barcode_files_summary is required for puck_metadata
puck_barcode_files_summary
output:
dge_spatial_collection,
dge_spatial_collection_obs
params:
puck_data = lambda wildcards: project_df.get_puck_variables(
project_id = wildcards.project_id,
sample_id = wildcards.sample_id),
puck_metadata = lambda wildcards: project_df.get_puck_barcode_ids_and_files(
project_id=wildcards.project_id, sample_id=wildcards.sample_id
),
run:
_pc = puck_collection.merge_pucks_to_collection(
# takes all input except the puck_barcode_files
input[:-1],
params['puck_metadata'][0],
params['puck_data']['coordinate_system'],
"",
"puck_id",
)

# x_pos and y_pos to be global coordinates
_pc.obs['x_pos'] = _pc.obsm['spatial'][..., 0]
_pc.obs['y_pos'] = _pc.obsm['spatial'][..., 1]

_pc.write_h5ad(output[0])

# add 'cell_bc' name to index for same format as individual pucks
# this also ensures compatibility with qc_sequencing_create_sheet.Rmd
df = _pc.obs
df.index = np.arange(len(df))
df.index.name = "cell_bc"

# only get numeric columns, to avoid problems during summarisation
# we could implement sth like df.A.str.extract('(\d+)')
# to avoid losing information from columns that are not numeric
df._get_numeric_data().to_csv(output[1])


# TODO: collapse this with previous rule so we have a single point where we create the dge_spatial_collection
rule puck_collection_stitching_meshed:
input:
unpack(lambda wc: get_puck_collection_stitching_input(wc, to_mesh=True)),
# the puck_barcode_files_summary is required for puck_metadata
puck_barcode_files_summary
output:
@@ -569,10 +614,13 @@ rule puck_collection_stitching:
_pc.obs['y_pos'] = _pc.obsm['spatial'][..., 1]

_pc.write_h5ad(output[0])

# add 'cell_bc' name to index for same format as individual pucks
# this also ensures compatibility with qc_sequencing_create_sheet.Rmd
df = _pc.obs
df.index = np.arange(len(df))
df.index.name = "cell_bc"

# only get numeric columns, to avoid problems during summarisation
# we could implement sth like df.A.str.extract('(\d+)')
# to avoid losing information from columns that are not numeric
@@ -742,7 +790,8 @@ rule count_barcode_matches:
'matching_ratio': [matching_ratio],
})], ignore_index=True, sort=False)

above_threshold_mask = out_df.matching_ratio >= params['run_mode_variables']['spatial_barcode_min_matches']
# we use > so whenever default: 0 we exclude empty pucks
above_threshold_mask = out_df.matching_ratio > params['run_mode_variables']['spatial_barcode_min_matches']
out_df['pass_threshold'] = 0
out_df['pass_threshold'][above_threshold_mask] = 1

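The threshold change in count_barcode_matches is subtle: with the default spatial_barcode_min_matches of 0, the old >= comparison let completely empty pucks (matching ratio exactly 0) pass the filter. A toy illustration with made-up ratios:

```python
import pandas as pd

out_df = pd.DataFrame({"matching_ratio": [0.0, 0.02, 0.4]})
min_matches = 0  # the default run-mode value

# strict '>' drops the empty puck; '>=' would have kept it
out_df["pass_threshold"] = (out_df.matching_ratio > min_matches).astype(int)
print(out_df["pass_threshold"].tolist())  # [0, 1, 1]
```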
52 changes: 32 additions & 20 deletions spacemake/snakemake/scripts/snakemake_helper_functions.py
@@ -62,22 +62,25 @@ def get_output_files(
project_id = project_id,
sample_id = sample_id
)

if "coordinate_system" not in puck_vars.keys():

if (len(puck_barcode_file_ids) == 0) \
or ("coordinate_system" not in puck_vars.keys()):
continue

coordinate_system = puck_vars["coordinate_system"]

if coordinate_system == '':
continue

# add the non spatial barcode by default
non_spatial_pbf_id = project_df.project_df_default_values[
"puck_barcode_file_id"
][0]
puck_barcode_file_ids = "puck_collection"

else:
# add the non spatial barcode by default
non_spatial_pbf_id = project_df.project_df_default_values[
"puck_barcode_file_id"
][0]

if non_spatial_pbf_id not in puck_barcode_file_ids:
puck_barcode_file_ids.append(non_spatial_pbf_id)
if non_spatial_pbf_id not in puck_barcode_file_ids:
puck_barcode_file_ids.append(non_spatial_pbf_id)

for run_mode in row["run_mode"]:
run_mode_variables = project_df.config.get_run_mode(run_mode).variables
Expand All @@ -88,10 +91,7 @@ def get_output_files(
polyA_adapter_trimmed = ".polyA_adapter_trimmed"
else:
polyA_adapter_trimmed = ""

if check_puck_collection:
puck_barcode_file_ids = "puck_collection"


out_files = out_files + expand(
pattern,
project_id=project_id,
@@ -144,9 +144,12 @@ def get_output_files_qc(
return _out_files


def get_prealignment_files(pattern):
def get_prealignment_files(pattern, filter_merged=False):
df = project_df.df

if filter_merged:
df = df.loc[~df.is_merged]

prealignment_files = []

for index, _ in df.iterrows():
@@ -216,17 +219,21 @@ def get_all_dges_collection(wildcards):
sample_id = sample_id
)

if "coordinate_system" not in puck_vars.keys():
puck_barcode_file_ids = project_df.get_puck_barcode_ids_and_files(
project_id, sample_id
)[0]

if (len(puck_barcode_file_ids) == 0) \
or ("coordinate_system" not in puck_vars.keys()):
continue

coordinate_system = puck_vars["coordinate_system"]

if coordinate_system == '':
continue

if not os.path.exists(coordinate_system):
raise FileNotFoundError(f"at project {project_id} sample {sample_id} "+
f"'coordinate_system' file {coordinate_system} could not be found")
f"'coordinate_system' file {coordinate_system} cannot be found")

# will consider puck_collection within all dges if a coordinate system is specified in the puck
with_puck_collection = False if not coordinate_system else True
@@ -612,6 +619,7 @@ def get_dge_from_run_mode(
downsampling_percentage,
puck_barcode_file_id,
only_spatial=False,
to_mesh=None
):
has_dge = project_df.has_dge(project_id=project_id, sample_id=sample_id)

@@ -678,9 +686,12 @@ def get_dge_from_run_mode(
if not is_spatial and not only_spatial:
dge_out_pattern = dge_out_h5ad
dge_out_summary_pattern = dge_out_h5ad_obs
elif run_mode_variables["mesh_data"]:
elif run_mode_variables["mesh_data"] or to_mesh:
dge_out_pattern = dge_spatial_mesh
dge_out_summary_pattern = dge_spatial_mesh_obs
elif to_mesh is not None and to_mesh == False:
dge_out_pattern = dge_spatial
dge_out_summary_pattern = dge_spatial_obs
else:
dge_out_pattern = dge_spatial
dge_out_summary_pattern = dge_spatial_obs
@@ -827,7 +838,7 @@ def get_all_barcode_readcounts(wildcards, prealigned=False):
"polyA_adapter_trimmed": polyA_adapter_trimmed_wildcard,
}

if prealigned:
if prealigned or is_merged:
return {"bc_readcounts": expand(barcode_readcounts, **extra_args)}
else:
return {"bc_readcounts": expand(barcode_readcounts_prealigned, **extra_args)}
@@ -929,7 +940,7 @@ def get_puck_file(wildcards):
return {"barcode_file": puck_barcode_file}


def get_puck_collection_stitching_input(wildcards):
def get_puck_collection_stitching_input(wildcards, to_mesh=False):
# there are three options:
# 1) no spatial dge
# 2) spatial dge, no mesh
@@ -948,6 +959,7 @@ def get_puck_collection_stitching_input(wildcards, to_mesh=False):
downsampling_percentage=wildcards.downsampling_percentage,
puck_barcode_file_id=puck_barcode_file_ids,
only_spatial=True,
to_mesh=to_mesh
)["dge"]][0]


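The new filter_merged flag in get_prealignment_files exists because merged samples have no FASTQ input of their own, so prealignment targets can never be produced for them. The filter is a single mask over the project table; a toy version with hypothetical sample names:

```python
import pandas as pd

df = pd.DataFrame(
    {"is_merged": [False, False, True]},
    index=["sample_a", "sample_b", "merged_ab"],
)

# merged rows are skipped when generating prealignment targets
non_merged = df.loc[~df.is_merged]
print(non_merged.index.tolist())  # ['sample_a', 'sample_b']
```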
Binary file modified test_data/test_reads.R1.fastq.gz
Binary file modified test_data/test_reads.R2.fastq.gz