Skip to content

Commit

Permalink
(fix): write out 64 bit indptr for concat_on_disk when appropriate (#…
Browse files Browse the repository at this point in the history
…1493)

Co-authored-by: Philipp A. <[email protected]>
  • Loading branch information
ilan-gold and flying-sheep authored May 10, 2024
1 parent 1916d32 commit 7f483d9
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 2 deletions.
3 changes: 3 additions & 0 deletions docs/release-notes/0.10.8.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
```{rubric} Bugfix
```

* Write out `64bit` indptr when appropriate for {func}`~anndata.experimental.concat_on_disk` {pr}`1493` {user}`ilan-gold`


```{rubric} Documentation
```

Expand Down
4 changes: 3 additions & 1 deletion src/anndata/_io/specs/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,14 +498,16 @@ def write_sparse_compressed(
):
g = f.require_group(key)
g.attrs["shape"] = value.shape
dataset_kwargs = dict(dataset_kwargs)
indptr_dtype = dataset_kwargs.pop("indptr_dtype", value.indptr.dtype)

# Allow resizing for hdf5
if isinstance(f, H5Group) and "maxshape" not in dataset_kwargs:
dataset_kwargs = dict(maxshape=(None,), **dataset_kwargs)

g.create_dataset("data", data=value.data, **dataset_kwargs)
g.create_dataset("indices", data=value.indices, **dataset_kwargs)
g.create_dataset("indptr", data=value.indptr, **dataset_kwargs)
g.create_dataset("indptr", data=value.indptr, dtype=indptr_dtype, **dataset_kwargs)


write_csr = partial(write_sparse_compressed, fmt="csr")
Expand Down
9 changes: 8 additions & 1 deletion src/anndata/experimental/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,15 @@ def write_concat_sparse(
elems = _gen_slice_to_append(
datasets, reindexers, max_loaded_elems, axis, fill_value
)
number_non_zero = sum(len(d.group["indices"]) for d in datasets)
init_elem = next(elems)
write_elem(output_group, output_path, init_elem)
indptr_dtype = "int64" if number_non_zero >= np.iinfo(np.int32).max else "int32"
write_elem(
output_group,
output_path,
init_elem,
dataset_kwargs=dict(indptr_dtype=indptr_dtype),
)
del init_elem
out_dataset: BaseCompressedSparseDataset = read_as_backed(output_group[output_path])
for temp_elem in elems:
Expand Down
17 changes: 17 additions & 0 deletions tests/test_io_elementwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,23 @@ def test_dask_write_sparse(store, sparse_format):
assert store["X_dask/indices"].dtype == np.int64


@pytest.mark.parametrize("sparse_format", ["csr", "csc"])
def test_write_indptr_dtype_override(store, sparse_format):
    """Writing with ``dataset_kwargs=dict(indptr_dtype="int64")`` should widen the
    on-disk ``indptr`` array to 64 bit while leaving the in-memory matrix untouched.
    """
    rng = np.random.default_rng()
    mtx = sparse.random(100, 100, format=sparse_format, density=0.1, random_state=rng)

    write_elem(store, "X", mtx, dataset_kwargs=dict(indptr_dtype="int64"))

    # The stored indptr picked up the requested wider dtype...
    assert store["X/indptr"].dtype == np.int64
    # ...but the source matrix still carries its native 32 bit indptr,
    assert mtx.indptr.dtype == np.int32
    # and the values round-trip unchanged despite the dtype promotion.
    np.testing.assert_array_equal(store["X/indptr"][...], mtx.indptr)


def test_io_spec_raw(store):
adata = gen_adata((3, 2))
adata.raw = adata
Expand Down

0 comments on commit 7f483d9

Please sign in to comment.