Merge branch 'main' into sparse-arrays
ilan-gold authored Apr 10, 2024
2 parents 84f3c86 + a03e984 commit eea660f
Showing 3 changed files with 53 additions and 51 deletions.
2 changes: 1 addition & 1 deletion .azure-pipelines.yml
@@ -95,7 +95,7 @@ jobs:
           sys.exit(0 if len(results) > 3000 else f"Error: only {len(results)} tests run")
         displayName: "Check if enough tests ran"
 
-      - task: PublishCodeCoverageResults@1
+      - task: PublishCodeCoverageResults@2
         inputs:
           codeCoverageTool: Cobertura
           summaryFileLocation: "test-data/coverage.xml"
69 changes: 35 additions & 34 deletions benchmarks/benchmarks/readwrite.py
@@ -77,43 +77,44 @@
 
 
 class H5ADInMemorySizeSuite:
-    params = [PBMC_3K_URL]
-    param_names = ["input_url"]
+    _urls = dict(pbmc3k=PBMC_3K_URL)
+    params = _urls.keys()
+    param_names = ["input_data"]
 
-    def setup(self, input_url):
-        self.filepath = pooch.retrieve(url=input_url, known_hash=None)
+    def setup(self, input_data: str):
+        self.filepath = pooch.retrieve(url=self._urls[input_data], known_hash=None)
 
-    def track_in_memory_size(self, input_url):
+    def track_in_memory_size(self, *_):
         adata = anndata.read_h5ad(self.filepath)
         adata_size = sys.getsizeof(adata)
 
         return adata_size
 
-    def track_actual_in_memory_size(self, input_url):
+    def track_actual_in_memory_size(self, *_):
         adata = anndata.read_h5ad(self.filepath)
         adata_size = get_actualsize(adata)
 
         return adata_size
 
 
 class H5ADReadSuite:
-    # params = [PBMC_REDUCED_PATH, PBMC_3K_PATH, BM_43K_CSR_PATH]
-    params = [PBMC_3K_URL]
-    param_names = ["input_url"]
+    _urls = dict(pbmc3k=PBMC_3K_URL)
+    params = _urls.keys()
+    param_names = ["input_data"]
 
-    def setup(self, input_url):
-        self.filepath = pooch.retrieve(url=input_url, known_hash=None)
+    def setup(self, input_data: str):
+        self.filepath = pooch.retrieve(url=self._urls[input_data], known_hash=None)
 
-    def time_read_full(self, input_url):
+    def time_read_full(self, *_):
         anndata.read_h5ad(self.filepath)
 
-    def peakmem_read_full(self, input_url):
+    def peakmem_read_full(self, *_):
         anndata.read_h5ad(self.filepath)
 
-    def mem_readfull_object(self, input_url):
+    def mem_readfull_object(self, *_):
         return anndata.read_h5ad(self.filepath)
 
-    def track_read_full_memratio(self, input_url):
+    def track_read_full_memratio(self, *_):
         mem_recording = memory_usage(
             (sedate(anndata.read_h5ad, 0.005), (self.filepath,)), interval=0.001
         )
@@ -123,23 +124,23 @@ def track_read_full_memratio(self, input_url):
         print(base_size)
         return (np.max(mem_recording) - np.min(mem_recording)) / base_size
 
-    def peakmem_read_backed(self, input_url):
+    def peakmem_read_backed(self, *_):
         anndata.read_h5ad(self.filepath, backed="r")
 
-    def mem_read_backed_object(self, input_url):
+    def mem_read_backed_object(self, *_):
         return anndata.read_h5ad(self.filepath, backed="r")
 
 
 class H5ADWriteSuite:
-    # params = [PBMC_REDUCED_PATH, PBMC_3K_PATH, BM_43K_CSR_PATH]
-    params = [PBMC_3K_URL]
-    param_names = ["input_url"]
+    _urls = dict(pbmc3k=PBMC_3K_URL)
+    params = _urls.keys()
+    param_names = ["input_data"]
 
-    def setup(self, input_url):
+    def setup(self, input_data: str):
         mem_recording, adata = memory_usage(
             (
                 sedate(anndata.read_h5ad, 0.005),
-                (pooch.retrieve(input_url, known_hash=None),),
+                (pooch.retrieve(self._urls[input_data], known_hash=None),),
             ),
             retval=True,
             interval=0.001,
@@ -149,40 +150,40 @@ def setup(self, input_url):
         self.tmpdir = tempfile.TemporaryDirectory()
         self.writepth = Path(self.tmpdir.name) / "out.h5ad"
 
-    def teardown(self, input_url):
+    def teardown(self, *_):
         self.tmpdir.cleanup()
 
-    def time_write_full(self, input_url):
+    def time_write_full(self, *_):
         self.adata.write_h5ad(self.writepth, compression=None)
 
-    def peakmem_write_full(self, input_url):
+    def peakmem_write_full(self, *_):
         self.adata.write_h5ad(self.writepth)
 
-    def track_peakmem_write_full(self, input_url):
+    def track_peakmem_write_full(self, *_):
         return get_peak_mem((sedate(self.adata.write_h5ad), (self.writepth,)))
 
-    def time_write_compressed(self, input_url):
+    def time_write_compressed(self, *_):
         self.adata.write_h5ad(self.writepth, compression="gzip")
 
-    def peakmem_write_compressed(self, input_url):
+    def peakmem_write_compressed(self, *_):
         self.adata.write_h5ad(self.writepth, compression="gzip")
 
-    def track_peakmem_write_compressed(self, input_url):
+    def track_peakmem_write_compressed(self, *_):
         return get_peak_mem(
             (sedate(self.adata.write_h5ad), (self.writepth,), {"compression": "gzip"})
         )
 
 
 class H5ADBackedWriteSuite(H5ADWriteSuite):
-    # params = [PBMC_REDUCED_PATH, PBMC_3K_PATH]
-    params = [PBMC_3K_URL]
-    param_names = ["input_url"]
+    _urls = dict(pbmc3k=PBMC_3K_URL)
+    params = _urls.keys()
+    param_names = ["input_data"]
 
-    def setup(self, input_url):
+    def setup(self, input_data):
         mem_recording, adata = memory_usage(
             (
                 sedate(anndata.read_h5ad, 0.005),
-                (pooch.retrieve(input_url, known_hash=None),),
+                (pooch.retrieve(self._urls[input_data], known_hash=None),),
                 {"backed": "r"},
             ),
             retval=True,
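The readwrite.py changes above all follow one pattern: benchmark parameters are now short keys looked up in a class-level `_urls` dict rather than raw URLs, and methods that never use the parameter swallow it with `*_`. A minimal standalone sketch of that pattern, assuming asv's documented convention that `setup` and each benchmark method receive one positional argument per entry in `param_names`; the class name and URL below are placeholders, not anndata's real `PBMC_3K_URL`:

import sys

import anndata
import pooch


class ExampleH5ADSuite:
    # Short keys keep asv benchmark names readable ("pbmc3k" instead of a URL).
    _urls = dict(pbmc3k="https://example.com/pbmc3k_raw.h5ad")  # placeholder URL
    params = _urls.keys()
    param_names = ["input_data"]

    def setup(self, input_data: str):
        # asv calls setup once per parameter value; resolve the key back to a
        # URL and let pooch cache the download across benchmark runs.
        self.filepath = pooch.retrieve(url=self._urls[input_data], known_hash=None)

    def track_in_memory_size(self, *_):
        # asv still passes the parameter, but it is unused here, so it is
        # swallowed with *_ exactly as in the diff above.
        adata = anndata.read_h5ad(self.filepath)
        return sys.getsizeof(adata)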
33 changes: 17 additions & 16 deletions benchmarks/benchmarks/sparse_dataset.py
@@ -16,42 +16,43 @@ def make_alternating_mask(n):
 
 
 class SparseCSRContiguousSlice:
+    _slices = {
+        "0:1000": slice(0, 1000),
+        "0:9000": slice(0, 9000),
+        ":9000:-1": slice(None, 9000, -1),
+        "::-2": slice(None, None, 2),
+        "array": np.array([0, 5000, 9999]),
+        "arange": np.arange(0, 1000),
+        "first": 0,
+        "alternating": make_alternating_mask(10),
+    }
     params = (
         [
             (10_000, 10_000),
             # (10_000, 500)
         ],
-        [
-            slice(0, 1000),
-            slice(0, 9000),
-            slice(None, 9000, -1),
-            slice(None, None, 2),
-            np.array([0, 5000, 9999]),
-            np.arange(0, 1000),
-            0,
-            make_alternating_mask(10),
-        ],
+        _slices.keys(),
     )
     param_names = ["shape", "slice"]
 
-    def setup(self, shape, slice):
+    def setup(self, shape: tuple[int, int], slice: str):
         X = sparse.random(
             *shape, density=0.01, format="csr", random_state=np.random.default_rng(42)
         )
-        self.slice = slice
+        self.slice = self._slices[slice]
         g = zarr.group()
         write_elem(g, "X", X)
         self.x = sparse_dataset(g["X"])
         self.adata = AnnData(self.x)
 
-    def time_getitem(self, shape, slice):
+    def time_getitem(self, *_):
         self.x[self.slice]
 
-    def peakmem_getitem(self, shape, slice):
+    def peakmem_getitem(self, *_):
         self.x[self.slice]
 
-    def time_getitem_adata(self, shape, slice):
+    def time_getitem_adata(self, *_):
         self.adata[self.slice]
 
-    def peakmem_getitem_adata(self, shape, slice):
+    def peakmem_getitem_adata(self, *_):
         self.adata[self.slice]
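The same refactor applies in sparse_dataset.py: asv derives benchmark names from the repr of each parameter value, so passing raw slice objects, index arrays, and boolean masks directly produces long or hard-to-read names. Keying the indexers by short strings and resolving them in `setup` keeps names like "0:1000" or "alternating" in the results. A minimal standalone sketch of the lookup, using only numpy and scipy; the entries shown are a subset chosen for illustration:

import numpy as np
from scipy import sparse

# Human-readable keys (what asv shows in benchmark names) mapped to the
# actual indexers applied in the benchmark body.
_slices = {
    "0:1000": slice(0, 1000),
    "::2": slice(None, None, 2),
    "array": np.array([0, 5000, 9999]),
}

# Same matrix construction as the suite's setup().
X = sparse.random(
    10_000, 10_000, density=0.01, format="csr", random_state=np.random.default_rng(42)
)

for name, indexer in _slices.items():
    # setup(self, shape, slice) performs the same resolution:
    # self.slice = self._slices[slice]
    subset = X[indexer]
    print(name, subset.shape)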
