Fix merging error #91

Merged
merged 24 commits into rajewsky-lab:master from merging_issue on Apr 18, 2024

Commits (24)
6dc231e  add tiny spatial test data (danilexn, Jan 18, 2024)
4f08526  no qc and prealignment stats of merged samples (danilexn, Jan 18, 2024)
02a4473  do not require R1 and R2 in merged samples (danilexn, Jan 18, 2024)
bca4239  support filtering of merged files in prealignment file generation (danilexn, Jan 18, 2024)
da56deb  compute union of passed spatial pucks in main smk workflow (danilexn, Jan 18, 2024)
1c18a36  support lists of one element when updating pucks (danilexn, Jan 18, 2024)
eb01f8b  spatial puck width/height with max-min instead of max (danilexn, Jan 18, 2024)
5baabdb  support non-mesh puck collection in main smk (danilexn, Jan 18, 2024)
3dc82ca  throw error when adata is empty (danilexn, Jan 18, 2024)
79ffb15  exclude empty pucks when default matches: 0 (danilexn, Jan 18, 2024)
0a10d9c  add spatial tiny test data (danilexn, Jan 18, 2024)
0fd785c  consolidate pucks: fix reference to puck_ids (danilexn, Jan 18, 2024)
563aa72  fix: qc not working on puck_collection when multiple samples (danilexn, Jan 24, 2024)
dfa1fc5  Merge branch 'rajewsky-lab:master' into merging_issue (Apr 17, 2024)
89abb7e  remove too large spatial test data (danilexn, Apr 17, 2024)
c297310  preprocess/dge avoid division by zero (danilexn, Apr 17, 2024)
d38119c  dge: fix vstack for missing mt in tiny test data (danilexn, Apr 17, 2024)
aaa74b1  update test reads and spatial tiles for 1 match in 2 tiles (danilexn, Apr 17, 2024)
f216360  fixed test reads so it runs (danilexn, Apr 17, 2024)
7cf1339  dge: warning when empty, set puck metrics to 1 when nan (danilexn, Apr 17, 2024)
847c49e  project_df: more verbosity for merge validation (danilexn, Apr 17, 2024)
bf082f0  main.smk: no exception when empty DGE, just warning (danilexn, Apr 17, 2024)
9e27e30  bump version 0.7.5 to 0.7.6 (danilexn, Apr 17, 2024)
ac5709b  main.smk: generate qc reports when merging (danilexn, Apr 17, 2024)
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -6,7 +6,7 @@
copyright = '2021-2023, Rajewsky lab'
author = 'Tamas Ryszard Sztanka-Toth, Marvin Jens, Nikos Karaiskos, Nikolaus Rajewsky'

version = '0.7.5'
version = '0.7.6'
release = version

# -- General configuration
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = spacemake
version = 0.7.5
version = 0.7.6
author = Tamas Ryszard Sztanka-Toth, Marvin Jens, Nikos Karaiskos, Nikolaus Rajewsky
author_email = [email protected]
description = A bioinformatic pipeline for the analysis of spatial transcriptomic data
21 changes: 18 additions & 3 deletions spacemake/preprocess/dge.py
@@ -1,3 +1,8 @@
import logging

logger_name = "spacemake.preprocess.dge"
logger = logging.getLogger(logger_name)

def calculate_adata_metrics(adata, dge_summary_path=None, n_reads=None):
import scanpy as sc
import pandas as pd
@@ -151,7 +156,7 @@ def dge_to_sparse_adata(dge_path, dge_summary_path):
"need to add mt-missing because no mitochondrial stuff was among the genes for annotation"
)
gene_names.append("mt-missing")
X = vstack([X, np.zeros(X.shape[1])]).tocsr()
X = vstack([X, np.zeros((1, X.get_shape()[1]))]).tocsr()

# create anndata object, but we get the transpose of X, so matrix will
# be in CSC format
@@ -169,6 +174,9 @@ def dge_to_sparse_adata(dge_path, dge_summary_path):
# calculate per shannon_entropy and string_compression per bead
calculate_shannon_entropy_scompression(adata)

if adata.X.sum() == 0:
logger.warn(f"The DGE from {dge_path} is empty")

return adata


@@ -252,10 +260,17 @@ def attach_puck_variables(adata, puck_variables):
adata.uns["puck_variables"] = puck_variables

x_pos_max, y_pos_max = tuple(adata.obsm["spatial"].max(axis=0))
x_pos_min, y_pos_min = tuple(adata.obsm["spatial"].min(axis=0))

width_um = adata.uns["puck_variables"]["width_um"]
coord_by_um = x_pos_max / width_um
height_um = int(y_pos_max / coord_by_um)
coord_by_um = (x_pos_max - x_pos_min) / width_um

# this can be NaN if only one coordinate (only one cell, will fail)
if coord_by_um > 0:
height_um = int((y_pos_max - y_pos_min) / coord_by_um)
else:
height_um = 1 # avoid division by zero and error in reports
coord_by_um = 1

adata.uns["puck_variables"]["height_um"] = height_um
adata.uns["puck_variables"]["coord_by_um"] = coord_by_um
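Two of the dge.py fixes above are easy to reproduce in isolation: the zero row appended for the "mt-missing" placeholder gene needs an explicit two-dimensional shape for sparse vstack, and the puck scale factor must come from the coordinate range (max - min) rather than the raw maximum, which inflates the height whenever the puck does not start at the origin. A minimal sketch with made-up coordinates (everything below is hypothetical, not taken from the PR's test data):

```python
import numpy as np
from scipy.sparse import csr_matrix, vstack

# Zero row for the "mt-missing" placeholder gene: an explicit (1, n)
# shape keeps the row unambiguous for sparse vstack.
X = csr_matrix(np.ones((2, 3)))
X = vstack([X, np.zeros((1, X.shape[1]))]).tocsr()
assert X.shape == (3, 3)

# Scale factor from the coordinate *range*, not the raw maximum.
# Hypothetical puck spanning x in [100, 300], y in [200, 500], 100 um wide:
spatial = np.array([[100.0, 200.0], [300.0, 500.0]])
width_um = 100

x_pos_max, y_pos_max = spatial.max(axis=0)
x_pos_min, y_pos_min = spatial.min(axis=0)

coord_by_um = (x_pos_max - x_pos_min) / width_um        # 2.0 units per um
height_um = int((y_pos_max - y_pos_min) / coord_by_um)  # 150 um
assert (coord_by_um, height_um) == (2.0, 150)           # old max-based formula: 166
```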
19 changes: 19 additions & 0 deletions spacemake/project_df.py
@@ -725,6 +725,9 @@ def assert_valid(self):

# check that puck_barcode_file(s), R1 and R2 files exist
for n_col in ['puck_barcode_file', 'R1', 'R2']:
if n_col in ['R1', 'R2'] and row['is_merged']:
continue

if type(row[n_col]) is list:
for _it_row in row[n_col]:
if not os.path.exists(_it_row):
@@ -742,6 +745,22 @@ def assert_valid(self):
_valid_puck_coordinate = False
elif type(row['puck_barcode_file']) is str and row['puck_barcode_file'] == '':
_valid_puck_coordinate = False

# check that merging variables are properly specified
# otherwise, this throws 'MissingInputException' because it cannot find the bam files
if row['is_merged']:
if len(row['merged_from']) < 2:
raise SystemExit(SpacemakeError(f"At {index}, there is <2 samples under 'merged_from'"))
for merged_i in row['merged_from']:
if (not isinstance(merged_i, tuple) and not isinstance(merged_i, list)) or len(merged_i) != 2:
raise SystemExit(SpacemakeError(f"At {index}, wrong format for 'merged_from'.\n"+
"It must be something like "+
"[('project', 'sample_a'), ('project', 'sample_b')]"))
try:
self.assert_sample(merged_i[0], merged_i[1])
except ProjectSampleNotFoundError as e:
self.logger.error(f"Merging error at {index}")
raise e

if not _valid_puck_coordinate:
_puck_vars = self.get_puck_variables(project_id = index[0], sample_id = index[1])
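The added validation fails fast with a readable message instead of surfacing later as a cryptic MissingInputException from Snakemake. A standalone sketch of the same checks, with hypothetical project/sample names and plain exceptions in place of SpacemakeError:

```python
def validate_merged_from(index, merged_from):
    # a merge needs at least two parent samples, and every entry
    # must be a (project_id, sample_id) pair
    if len(merged_from) < 2:
        raise SystemExit(f"At {index}, there is <2 samples under 'merged_from'")
    for merged_i in merged_from:
        if not isinstance(merged_i, (tuple, list)) or len(merged_i) != 2:
            raise SystemExit(
                f"At {index}, wrong format for 'merged_from'.\n"
                "It must be something like "
                "[('project', 'sample_a'), ('project', 'sample_b')]"
            )

# passes silently
validate_merged_from(("proj", "merged_ab"),
                     [("proj", "sample_a"), ("proj", "sample_b")])
```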
48 changes: 43 additions & 5 deletions spacemake/smk.py
@@ -433,6 +433,34 @@ def populate_variables_from_args(pdf, args, arg_prefix=''):

return ret

def consolidate_pucks_merged_samples():
for index, row in pdf.df.iterrows():
project_id, sample_id = index
puck_ids = row['puck_barcode_file_id']

if (not row['is_merged']) or (not pdf.is_spatial(project_id, sample_id, puck_ids)):
continue

if len(puck_ids) >= 1:
puck_ids = puck_ids[0]
elif len(puck_ids):
puck_ids = pdf.project_df_default_values['puck_barcode_file_id']

merged_from = row['merged_from']
puck_id_file = set()

for sample_tuple in merged_from:
pid = pdf.df.loc[sample_tuple]['puck_barcode_file_id']
pbf = pdf.df.loc[sample_tuple]['puck_barcode_file']
_tuple = [(id, bf) for id, bf in zip(pid, pbf)]

puck_id_file.update([ tuple(t) for t in _tuple ])

pid, pbf = list(zip(*list(puck_id_file)))
pdf.df.loc[index, 'puck_barcode_file_id'] = list(pid)
pdf.df.loc[index, 'puck_barcode_file'] = list(pbf)


def update_project_df_barcode_matches(prealigned=False):
from spacemake.snakemake.variables import puck_count_barcode_matches_summary, puck_count_prealigned_barcode_matches_summary

@@ -441,9 +469,18 @@
else:
_bc_file = puck_count_barcode_matches_summary

for index, _ in pdf.df.iterrows():
for index, row in pdf.df.iterrows():
project_id, sample_id = index

puck_ids = row['puck_barcode_file_id']
if len(puck_ids) >= 1:
puck_ids = puck_ids[0]
elif len(puck_ids):
puck_ids = pdf.project_df_default_values['puck_barcode_file_id']

if (row['is_merged'] and prealigned) or (not pdf.is_spatial(project_id, sample_id, puck_ids)):
continue

# check if barcodes have been filtered
_f_barcodes_df = _bc_file.format(project_id=project_id,
sample_id=sample_id)
@@ -457,11 +494,11 @@

above_threshold_mask = barcodes_df['pass_threshold'] == 1

_puck_barcode_files = barcodes_df[above_threshold_mask]['puck_barcode_file'].values.tolist().__str__()
_puck_barcode_files_id = barcodes_df[above_threshold_mask]['puck_barcode_file_id'].values.tolist().__str__()
_puck_barcode_files = barcodes_df[above_threshold_mask]['puck_barcode_file'].values.tolist()
_puck_barcode_files_id = barcodes_df[above_threshold_mask]['puck_barcode_file_id'].values.tolist()

pdf.df.loc[index, 'puck_barcode_file'] = _puck_barcode_files
pdf.df.loc[index, 'puck_barcode_file_id'] = _puck_barcode_files_id
pdf.df.at[index, 'puck_barcode_file'] = _puck_barcode_files
pdf.df.at[index, 'puck_barcode_file_id'] = _puck_barcode_files_id

@message_aggregation(logger_name)
def spacemake_run(pdf, args):
@@ -533,6 +570,7 @@ def spacemake_run(pdf, args):
# update valid pucks (above threshold) before continuing to downstream
# this performs counting of matching barcodes after alignment
update_project_df_barcode_matches(prealigned=True)
consolidate_pucks_merged_samples()
pdf.dump()

# run snakemake after prealignment filter
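consolidate_pucks_merged_samples computes, for each merged sample, the union of the (puck_id, barcode_file) pairs of its parent samples. Note also the switch from .loc to .at in update_project_df_barcode_matches: .at stores a list as a single cell value, whereas .loc tries to broadcast it across the selection. A toy version with hypothetical sample and puck names:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "puck_barcode_file_id": [["puck_1"], ["puck_1", "puck_2"], None],
        "puck_barcode_file": [["p1.csv"], ["p1.csv", "p2.csv"], None],
    },
    index=["sample_a", "sample_b", "merged_ab"],
)

# union of (puck_id, barcode_file) pairs across the parent samples
puck_id_file = set()
for sample in ["sample_a", "sample_b"]:
    ids = df.at[sample, "puck_barcode_file_id"]
    files = df.at[sample, "puck_barcode_file"]
    puck_id_file.update(zip(ids, files))

pid, pbf = zip(*sorted(puck_id_file))

# .at assigns the list as one cell value (object dtype)
df.at["merged_ab", "puck_barcode_file_id"] = list(pid)
df.at["merged_ab", "puck_barcode_file"] = list(pbf)

print(df.at["merged_ab", "puck_barcode_file_id"])  # ['puck_1', 'puck_2']
```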
55 changes: 52 additions & 3 deletions spacemake/snakemake/main.smk
@@ -161,8 +161,9 @@ rule get_stats_prealigned_barcodes:
data_root_type='complete_data',
downsampling_percentage='',
run_on_external=False,
filter_merged=False
),
get_prealignment_files(puck_count_prealigned_barcode_matches_summary)
get_prealignment_files(puck_count_prealigned_barcode_matches_summary, filter_merged=True)

rule get_whitelist_barcodes:
input:
@@ -541,7 +542,51 @@ rule create_mesh_spatial_dge:

rule puck_collection_stitching:
input:
unpack(get_puck_collection_stitching_input),
unpack(lambda wc: get_puck_collection_stitching_input(wc, to_mesh=False)),
# the puck_barcode_files_summary is required for puck_metadata
puck_barcode_files_summary
output:
dge_spatial_collection,
dge_spatial_collection_obs
params:
puck_data = lambda wildcards: project_df.get_puck_variables(
project_id = wildcards.project_id,
sample_id = wildcards.sample_id),
puck_metadata = lambda wildcards: project_df.get_puck_barcode_ids_and_files(
project_id=wildcards.project_id, sample_id=wildcards.sample_id
),
run:
_pc = puck_collection.merge_pucks_to_collection(
# takes all input except the puck_barcode_files
input[:-1],
params['puck_metadata'][0],
params['puck_data']['coordinate_system'],
"",
"puck_id",
)

# x_pos and y_pos to be global coordinates
_pc.obs['x_pos'] = _pc.obsm['spatial'][..., 0]
_pc.obs['y_pos'] = _pc.obsm['spatial'][..., 1]

_pc.write_h5ad(output[0])

# add 'cell_bc' name to index for same format as individual pucks
# this also ensures compatibility with qc_sequencing_create_sheet.Rmd
df = _pc.obs
df.index = np.arange(len(df))
df.index.name = "cell_bc"

# only get numeric columns, to avoid problems during summarisation
# we could implement sth like df.A.str.extract('(\d+)')
# to avoid losing information from columns that are not numeric
df._get_numeric_data().to_csv(output[1])


# TODO: collapse this with previous rule so we have a single point where we create the dge_spatial_collection
rule puck_collection_stitching_meshed:
input:
unpack(lambda wc: get_puck_collection_stitching_input(wc, to_mesh=True)),
# the puck_barcode_files_summary is required for puck_metadata
puck_barcode_files_summary
output:
@@ -569,10 +614,13 @@ rule puck_collection_stitching:
_pc.obs['y_pos'] = _pc.obsm['spatial'][..., 1]

_pc.write_h5ad(output[0])

# add 'cell_bc' name to index for same format as individual pucks
# this also ensures compatibility with qc_sequencing_create_sheet.Rmd
df = _pc.obs
df.index = np.arange(len(df))
df.index.name = "cell_bc"

# only get numeric columns, to avoid problems during summarisation
# we could implement sth like df.A.str.extract('(\d+)')
# to avoid losing information from columns that are not numeric
@@ -742,7 +790,8 @@ rule count_barcode_matches:
'matching_ratio': [matching_ratio],
})], ignore_index=True, sort=False)

above_threshold_mask = out_df.matching_ratio >= params['run_mode_variables']['spatial_barcode_min_matches']
# we use > so whenever default: 0 we exclude empty pucks
above_threshold_mask = out_df.matching_ratio > params['run_mode_variables']['spatial_barcode_min_matches']
out_df['pass_threshold'] = 0
out_df['pass_threshold'][above_threshold_mask] = 1

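The threshold change in count_barcode_matches is subtle: with the default spatial_barcode_min_matches of 0, the old >= comparison let completely empty pucks (matching ratio exactly 0) pass the filter. A toy illustration with made-up ratios:

```python
import pandas as pd

out_df = pd.DataFrame({"matching_ratio": [0.0, 0.02, 0.4]})
min_matches = 0  # the default run-mode value

# strict '>' drops the empty puck; '>=' would have kept it
out_df["pass_threshold"] = (out_df.matching_ratio > min_matches).astype(int)
print(out_df["pass_threshold"].tolist())  # [0, 1, 1]
```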
52 changes: 32 additions & 20 deletions spacemake/snakemake/scripts/snakemake_helper_functions.py
@@ -62,22 +62,25 @@ def get_output_files(
project_id = project_id,
sample_id = sample_id
)

if "coordinate_system" not in puck_vars.keys():

if (len(puck_barcode_file_ids) == 0) \
or ("coordinate_system" not in puck_vars.keys()):
continue

coordinate_system = puck_vars["coordinate_system"]

if coordinate_system == '':
continue

# add the non spatial barcode by default
non_spatial_pbf_id = project_df.project_df_default_values[
"puck_barcode_file_id"
][0]
puck_barcode_file_ids = "puck_collection"

else:
# add the non spatial barcode by default
non_spatial_pbf_id = project_df.project_df_default_values[
"puck_barcode_file_id"
][0]

if non_spatial_pbf_id not in puck_barcode_file_ids:
puck_barcode_file_ids.append(non_spatial_pbf_id)
if non_spatial_pbf_id not in puck_barcode_file_ids:
puck_barcode_file_ids.append(non_spatial_pbf_id)

for run_mode in row["run_mode"]:
run_mode_variables = project_df.config.get_run_mode(run_mode).variables
Expand All @@ -88,10 +91,7 @@ def get_output_files(
polyA_adapter_trimmed = ".polyA_adapter_trimmed"
else:
polyA_adapter_trimmed = ""

if check_puck_collection:
puck_barcode_file_ids = "puck_collection"


out_files = out_files + expand(
pattern,
project_id=project_id,
@@ -144,9 +144,12 @@ def get_output_files_qc(
return _out_files


def get_prealignment_files(pattern):
def get_prealignment_files(pattern, filter_merged=False):
df = project_df.df

if filter_merged:
df = df.loc[~df.is_merged]

prealignment_files = []

for index, _ in df.iterrows():
@@ -216,17 +219,21 @@ def get_all_dges_collection(wildcards):
sample_id = sample_id
)

if "coordinate_system" not in puck_vars.keys():
puck_barcode_file_ids = project_df.get_puck_barcode_ids_and_files(
project_id, sample_id
)[0]

if (len(puck_barcode_file_ids) == 0) \
or ("coordinate_system" not in puck_vars.keys()):
continue

coordinate_system = puck_vars["coordinate_system"]

if coordinate_system == '':
continue

if not os.path.exists(coordinate_system):
raise FileNotFoundError(f"at project {project_id} sample {sample_id} "+
f"'coordinate_system' file {coordinate_system} could not be found")
f"'coordinate_system' file {coordinate_system} cannot be found")

# will consider puck_collection within all dges if a coordinate system is specified in the puck
with_puck_collection = False if not coordinate_system else True
@@ -612,6 +619,7 @@ def get_dge_from_run_mode(
downsampling_percentage,
puck_barcode_file_id,
only_spatial=False,
to_mesh=None
):
has_dge = project_df.has_dge(project_id=project_id, sample_id=sample_id)

@@ -678,9 +686,12 @@ def get_dge_from_run_mode(
if not is_spatial and not only_spatial:
dge_out_pattern = dge_out_h5ad
dge_out_summary_pattern = dge_out_h5ad_obs
elif run_mode_variables["mesh_data"]:
elif run_mode_variables["mesh_data"] or to_mesh:
dge_out_pattern = dge_spatial_mesh
dge_out_summary_pattern = dge_spatial_mesh_obs
elif to_mesh is not None and to_mesh == False:
dge_out_pattern = dge_spatial
dge_out_summary_pattern = dge_spatial_obs
else:
dge_out_pattern = dge_spatial
dge_out_summary_pattern = dge_spatial_obs
@@ -827,7 +838,7 @@ def get_all_barcode_readcounts(wildcards, prealigned=False):
"polyA_adapter_trimmed": polyA_adapter_trimmed_wildcard,
}

if prealigned:
if prealigned or is_merged:
return {"bc_readcounts": expand(barcode_readcounts, **extra_args)}
else:
return {"bc_readcounts": expand(barcode_readcounts_prealigned, **extra_args)}
@@ -929,7 +940,7 @@ def get_puck_file(wildcards):
return {"barcode_file": puck_barcode_file}


def get_puck_collection_stitching_input(wildcards):
def get_puck_collection_stitching_input(wildcards, to_mesh=False):
# there are three options:
# 1) no spatial dge
# 2) spatial dge, no mesh
@@ -948,6 +959,7 @@ def get_puck_collection_stitching_input(wildcards, to_mesh=False):
downsampling_percentage=wildcards.downsampling_percentage,
puck_barcode_file_id=puck_barcode_file_ids,
only_spatial=True,
to_mesh=to_mesh
)["dge"]][0]


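The new filter_merged flag in get_prealignment_files exists because merged samples have no FASTQ input of their own, so prealignment targets can never be produced for them. The filter is a single mask over the project table; a toy version with hypothetical sample names:

```python
import pandas as pd

df = pd.DataFrame(
    {"is_merged": [False, False, True]},
    index=["sample_a", "sample_b", "merged_ab"],
)

# merged rows are skipped when generating prealignment targets
non_merged = df.loc[~df.is_merged]
print(non_merged.index.tolist())  # ['sample_a', 'sample_b']
```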
Binary file modified test_data/test_reads.R1.fastq.gz
Binary file modified test_data/test_reads.R2.fastq.gz