From bd9b23c572e0b3cb21335a2bf9f8cdc5023dd90d Mon Sep 17 00:00:00 2001
From: Joep Vanlier
Date: Fri, 9 Aug 2024 17:03:30 +0200
Subject: [PATCH] caching: handle maximum size in bytes

Since cached items can vary in size, it makes more sense to express the
maximum cache size in `bytes`. Ultimately, memory usage is what impacts
the user's experience the most.
---
 lumicks/pylake/channel.py                      | 17 +++++++++++++----
 lumicks/pylake/tests/test_file/test_caching.py |  4 ++++
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/lumicks/pylake/channel.py b/lumicks/pylake/channel.py
index 26106fb16..34e83ccea 100644
--- a/lumicks/pylake/channel.py
+++ b/lumicks/pylake/channel.py
@@ -2,10 +2,10 @@
 import numbers
 from typing import Union
-from functools import lru_cache
 
 import numpy as np
 import numpy.typing as npt
+from cachetools import LRUCache, cached
 
 from .detail.plotting import _annotate
 from .detail.timeindex import to_seconds, to_timestamp
@@ -13,20 +13,25 @@
 from .nb_widgets.range_selector import SliceRangeSelectorWidget
 
 
-@lru_cache(maxsize=100)
+@cached(LRUCache(maxsize=1 << 30, getsizeof=lambda x: x.nbytes), info=True)  # 1 GB of cache
 def _get_array(cache_object):
     return cache_object.read_array()
 
 
 class LazyCache:
-    def __init__(self, location, dset):
+    def __init__(self, location, dset, nbytes):
         """A lazy globally cached wrapper around an object that is convertible to a numpy array"""
         self._location = location
         self._dset = dset
+        self._nbytes = nbytes
 
     def __len__(self):
         return len(self._dset)
 
+    @property
+    def nbytes(self):
+        return self._nbytes
+
     def __hash__(self):
         return hash(self._location)
 
@@ -36,7 +41,11 @@ def from_h5py_dset(dset, field=None):
         if field:
             location = f"{location}.{field}"
             dset = dset.fields(field)
-        return LazyCache(location, dset)
+            item_size = dset.read_dtype.itemsize
+        else:
+            item_size = dset.dtype.itemsize
+
+        return LazyCache(location, dset, nbytes=item_size * len(dset))
 
     def read_array(self):
         # Note, we deliberately do _not_ allow additional arguments to asarray since we would
diff --git a/lumicks/pylake/tests/test_file/test_caching.py b/lumicks/pylake/tests/test_file/test_caching.py
index 963ca8468..ff7024649 100644
--- a/lumicks/pylake/tests/test_file/test_caching.py
+++ b/lumicks/pylake/tests/test_file/test_caching.py
@@ -15,6 +15,7 @@ def test_global_cache_continuous(h5_file):
     # These should point to the same data
     assert id(f1x1.data) == id(f1x2.data)
     assert _get_array.cache_info().hits == 1
+    assert _get_array.cache_info().currsize == 40
 
     with pytest.raises(ValueError, match="assignment destination is read-only"):
         f1x1.data[5:100] = 3
@@ -33,8 +34,10 @@ def test_global_cache_timeseries(h5_file):
     # These should point to the same data
     assert id(f1x1.data) == id(f1x2.data)
     assert _get_array.cache_info().hits == 1
+    assert _get_array.cache_info().currsize == 16
     assert id(f1x1.timestamps) == id(f1x2.timestamps)
     assert _get_array.cache_info().hits == 2
+    assert _get_array.cache_info().currsize == 32
 
     with pytest.raises(ValueError, match="assignment destination is read-only"):
         f1x1.data[5:100] = 3
@@ -53,6 +56,7 @@ def test_global_cache_timetags(h5_file):
     # These should point to the same data
     assert id(tags1.data) == id(tags2.data)
     assert _get_array.cache_info().hits == 1
+    assert _get_array.cache_info().currsize == 72
 
     with pytest.raises(ValueError, match="assignment destination is read-only"):
         tags1.data[5:100] = 3
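
Reviewer note, not part of the patch: below is a minimal, standalone sketch of the
byte-based eviction this change relies on. The `load_array` function and the 1 MiB
budget are invented for illustration; only `LRUCache`, `cached`, `getsizeof` and
`info` are the cachetools API actually used in channel.py.

import numpy as np
from cachetools import LRUCache, cached

# Illustrative example only; the real cache in channel.py wraps
# LazyCache.read_array with a 1 GB budget. `getsizeof` is evaluated once per
# cached value, so `maxsize` and `currsize` are measured in bytes rather than
# in number of entries.
_byte_cache = LRUCache(maxsize=1 << 20, getsizeof=lambda arr: arr.nbytes)


@cached(_byte_cache, info=True)  # info=True requires cachetools >= 5.3
def load_array(n):
    """Stand-in for an expensive read from disk."""
    return np.arange(n, dtype=np.float64)


load_array(1000)                # miss: 8000 bytes enter the cache
load_array(1000)                # hit: the same array object is returned
print(load_array.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=1048576, currsize=8000)

This is also why the new `currsize` assertions in test_caching.py count bytes rather
than entries. A value larger than the configured budget is simply not retained:
cachetools skips caching values that do not fit, so the call still succeeds, it just
is not memoized.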