From fe3cd6885f8af8b66cbf9dab894d9f8f35c94d03 Mon Sep 17 00:00:00 2001 From: bbean Date: Wed, 6 Nov 2024 13:42:01 -0700 Subject: [PATCH] implement changes suggested in PR #162 --- opencsp/common/lib/cv/CacheableImage.py | 107 +++++++-- .../cv/spot_analysis/SpotAnalysisOperable.py | 20 ++ .../AbstractSpotAnalysisImageProcessor.py | 68 +++++- ...test_AbstractSpotAnalysisImageProcessor.py | 210 ++++++++++++------ .../common/lib/cv/test/test_CacheableImage.py | 26 ++- 5 files changed, 337 insertions(+), 94 deletions(-) diff --git a/opencsp/common/lib/cv/CacheableImage.py b/opencsp/common/lib/cv/CacheableImage.py index fe8f6057e..3b27d8c26 100644 --- a/opencsp/common/lib/cv/CacheableImage.py +++ b/opencsp/common/lib/cv/CacheableImage.py @@ -24,11 +24,15 @@ class CacheableImage: priority order for the data that is returned from various methods: (1) in-memory array, (2) numpy cache file, (3) image source file. - Only one of the inputs (array, cache_path, source_path) are required. The - "array" parameter should be the raw image data, the cache_path should be a - string to a .npy file, and the source_path should be a string to an image - file. Note that the cache_path file doesn't need to exist yet, but can - instead be an indicator for where to cache the image to. + Parameters + ---------- + Only one of the inputs (array, cache_path, source_path) are required in the + constructor. In fact, there is a method :py:meth:`from_single_source` that + tries to guess which input is being provided. The "array" parameter should + be the raw image data, the cache_path should be a string to a .npy file, and + the source_path should be a string to an image file. Note that the + cache_path file doesn't need to exist yet, but can instead be an indicator + for where to cache the image to. The following table determines how images are retrieved based on the given parameters ([X] = given and file exists, [.] = given and file doesn't @@ -72,6 +76,29 @@ class CacheableImage: +-------+---------+----------+ | | | [X] | +-------+---------+----------+ + + + sys.getsizeof + ------------- + This class overrides the default __sizeof__ dunder method, meaning that the + size returned by sys.getsizeof(cacheable_image) is not just the size of all + variables tracked by the instance. Rather, the size of the Numpy array and + Pillow image are returned. This metric better represents the + memory-conserving use case that is intended for this class. + + __sizeof__ returns close to 0 if the array and image attributes have been + set to None (aka the cache() method has been executed). Note that this does + not depend on the state of the garbage collector, which might not actually + free the memory for some time after it is no longer being tracked by this + class. An attempt at freeing the memory can be done immediately with + `gc.collect()` but the results of this are going to be implementation + specific. + + The size of the source path, image path, and all other attributes are not + included in the return value of __sizeof__. This decision was made for + simplicity. Also the additional memory has very little impact. For example a + 256 character path uses ~0.013% as much memory as a 1920x1080 monochrome + image. """ _cacheable_images_registry: weakref.WeakKeyDictionary["CacheableImage", int] = {} @@ -81,6 +108,11 @@ class CacheableImage: # Like _cacheable_images_registry, but for instances that have been deregistered and not reregistered. 
_cacheable_images_last_access_index: int = 0 # The last value used in the _cacheable_images_registry for maintaining access order. + _expected_cached_size: int = (48 * 2) * 2 + # Upper bound on the anticipated return value from __sizeof__ after cache() + # has just been evaluated. Each python variable uses ~"48" bytes, there are + # "*2" variables included (array and image), and we don't care about + # specifics so add some buffer "*2". def __init__(self, array: np.ndarray = None, cache_path: str = None, source_path: str = None): """ @@ -425,11 +457,10 @@ def _does_source_image_match(self, nparray: np.ndarray): except Exception: return False - enforce_source_image_matches = True - # Debugging: check that the programmer didn't misuse CacheableImage - # by setting the source_path to a file that doesn't actually match - # the numpy array or cache path. - if enforce_source_image_matches and not arrays_are_equal: + # Check that the programmer didn't misuse CacheableImage by setting + # the source_path to a file that doesn't actually match the numpy + # array or cache path. + if not arrays_are_equal: try: import os import opencsp.common.lib.opencsp_path.opencsp_root_path as orp @@ -465,8 +496,8 @@ def cache(self, cache_path: str = None): Parameters ---------- cache_path : str, optional - The path to cache to, as necessary. Can be None if either cache_path - or source_path are already set. By default None. + The path/name.ext to cache to, as necessary. Can be None if either + cache_path or source_path are already set. By default None. """ # use either self.cache_path or cache_path, depending on: # 1. self.cache_path exists @@ -516,6 +547,8 @@ def cache(self, cache_path: str = None): # This instance has already been cached to disk at least once. Don't # need to cache it again. We check the size to make sure that the # file was actually cached and doesn't just exist as a placeholder. + # I chose '> 10' instead of '> 0' because I'm paranoid that + # getsize() will return a small number of bytes on some systems. pass else: # Write the numpy array to disk. @@ -533,10 +566,15 @@ def cache(self, cache_path: str = None): def save_image(self, image_path_name_ext: str): """ - Saves this image as an image file to the given file. + Saves this image as an image file to the given file. This method is best + used when an image is intended to be kept after a computation, in which + case the newly saved image file can be the on-disk reference instead of + an on-disk cache file. Note: this replaces the internal reference to source_path, if any, with - the newly given path. + the newly given path. It is therefore suggested to not use this method + unless you are using this class as part of an end-use application, in + order to avoid unintended side effects. Parameters ---------- @@ -550,8 +588,37 @@ def save_image(self, image_path_name_ext: str): def cache_images_to_disk_as_necessary( memory_limit_bytes: int, tmp_path_generator: Callable[[], str], log_level=lt.log.DEBUG ): - """Check memory usage and convert images to files (aka file path - strings) as necessary in order to reduce memory usage.""" + """ + Check memory usage and convert images to files (aka file path strings) + as necessary in order to reduce memory usage. + + Note that due to the small amount of necessary memory used by each + CacheableImage instance, all instances can be cached and still be above + the `memory_limit_bytes` threshold. 
This can happen either when + memory_limit_bytes is sufficiently small, or the number of live + CacheableImages is sufficiently large. In these cases, this method may + not be able to lower the amount of memory in use. + + Parameters + ---------- + memory_limit_bytes : int + The total number of bytes of memory that all CacheableImages are + allowed to use for their in-memory arrays and images, in sum. Note + that each CachableImage instance will still use some small amount of + memory even after it has been cached. + tmp_path_generator : Callable[[], str] + A function that returns a path/name.ext for a file that does not + exist yet. This file will be used to save the numpy array out to. + log_level : int, optional + The level to print out status messages to, including the amount of + memory in use before and after caching images. By default + lt.log.DEBUG. + """ + # By providing the memory_limit_bytes as a parameter, we're effectively + # enabling the user to choose a lower memory threshold than is the + # default. There's also the benefit of requiring the user to think about + # how much memory they want to use, which is going to be system and + # application specific. total_mem_size = CacheableImage.all_cacheable_images_size() if total_mem_size <= memory_limit_bytes: return @@ -571,12 +638,14 @@ def cache_images_to_disk_as_necessary( # free the LRU instance's memory by cacheing it to disk cacheable_image_size = sys.getsizeof(cacheable_image) - if cacheable_image_size == 0: - pass # already cached to disk + if cacheable_image_size <= cacheable_image._expected_cached_size: + continue # already cached to disk, probably if cacheable_image.cache_path is not None: cacheable_image.cache(None) else: # cache_path is None cacheable_image.cache(tmp_path_generator()) - total_mem_size -= cacheable_image_size + + bytes_cached_to_disk = cacheable_image_size - sys.getsizeof(cacheable_image) + total_mem_size -= bytes_cached_to_disk log_method(f"New total memory size after cacheing images: {int(total_mem_size / 1024 / 1024)}MB") diff --git a/opencsp/common/lib/cv/spot_analysis/SpotAnalysisOperable.py b/opencsp/common/lib/cv/spot_analysis/SpotAnalysisOperable.py index 96582ffc6..532b9c4f3 100644 --- a/opencsp/common/lib/cv/spot_analysis/SpotAnalysisOperable.py +++ b/opencsp/common/lib/cv/spot_analysis/SpotAnalysisOperable.py @@ -114,6 +114,26 @@ def get_all_images(self, primary=True, supporting=True, visualization=True, algo """ Get a list of all images tracked by this operable including all primary images, supporting images, visualization, and algorithm images. + + Parameters + ---------- + primary : bool, optional + True to include the primary image in the list of returned images. By + default True. + supporting : bool, optional + True to include the supporting images, if any, in the list of + returned images. By default True. + visualization : bool, optional + True to include the visualization images in the list of returned + images. By default True. + algorithm : bool, optional + True to include the algorithm images, if any, in the list of + returned images. By default True. + + Returns + ------- + list[CacheableImage] + The images tracked by this operable. 
""" ret: list[CacheableImage] = [] diff --git a/opencsp/common/lib/cv/spot_analysis/image_processor/AbstractSpotAnalysisImageProcessor.py b/opencsp/common/lib/cv/spot_analysis/image_processor/AbstractSpotAnalysisImageProcessor.py index c60574a82..c54da7628 100644 --- a/opencsp/common/lib/cv/spot_analysis/image_processor/AbstractSpotAnalysisImageProcessor.py +++ b/opencsp/common/lib/cv/spot_analysis/image_processor/AbstractSpotAnalysisImageProcessor.py @@ -134,7 +134,15 @@ def assign_inputs( 'AbstractSpotAnalysisImagesProcessor', list[SpotAnalysisOperable], Iterator[SpotAnalysisOperable] ], ): - """Register the input operables to be processed either with the run() method, or as an iterator.""" + """ + Register the input operables to be processed either with the run() + method, or as an iterator. + + Parameters + ---------- + operables : Union[ AbstractSpotAnalysisImagesProcessor, list[SpotAnalysisOperable], Iterator[SpotAnalysisOperable] ] + The operables to be processed. + """ # initialize the state for a new set of inputs self.input_operables = operables self._num_images_processed = 0 @@ -192,12 +200,33 @@ def _get_tmp_path(self) -> str: self._tmp_images_saved += 1 return path_name_ext - def _save_image(self, im: CacheableImage, idx_list: list[int], dir: str, name_prefix: str = None, ext="jpg"): + def _save_image(self, im: CacheableImage, idx_list: list[int], dir: str, name_prefix: str = None, ext="jpg") -> str: + # Saves the given image to the given path. + # + # Parameters + # ---------- + # im : CacheableImage + # The image to be saved. + # idx_list : list[int] + # Length-1 list where idx_list[0] is the count of images saved with + # this method. Used for naming the saved images. This value is updated + # as part of the execution of this method. + # dir : str + # The directory to save the image to. + # name_prefix : str, optional + # A prefix to prepend to the image name, by default empty string + # ext : str, optional + # The extension/type to save the image with, by default "jpg" + # + # Returns + # ------- + # str + # The path/name.ext of the newly saved image. idx = idx_list[0] image_name = ("" if name_prefix == None else f"{name_prefix}_") + f"SA_preprocess_{self.name}{idx}" image_path_name_ext = os.path.join(dir, image_name + "." + ext) lt.debug("Saving SpotAnalysis processed image to " + image_path_name_ext) - im.save_image(image_path_name_ext) + im.to_image().save(image_path_name_ext) idx_list[0] = idx + 1 return image_path_name_ext @@ -212,7 +241,26 @@ def run( | Union['AbstractSpotAnalysisImagesProcessor'] ), ) -> list[SpotAnalysisOperable]: - """Performs image processing on the input images.""" + """ + Performs image processing on the input operables and returns the results. + + This is provided as a convenience method. The more typical way to use + this class is to create a SpotAnalysis instance, assign this image + processor to that instance, and then iterate over the results. + + See also: :py:meth:`process_images` as another convenience method. + + Parameters + ---------- + operables : ImagesIterable | ImagesStream | SpotAnalysisImagesStream | list[SpotAnalysisOperable] | Iterator[SpotAnalysisOperable] | Union[AbstractSpotAnalysisImagesProcessor] + The input operables to be processed. If these are images, then they + will be wrapped in a SpotAnalysisOperablesStream. + + Returns + ------- + list[SpotAnalysisOperable] + The resulting operables after processing. 
+ """ if isinstance(operables, (ImagesIterable, ImagesStream)): operables = SpotAnalysisImagesStream(operables) if isinstance(operables, SpotAnalysisImagesStream): @@ -304,6 +352,16 @@ def process_images(self, images: list[CacheableImage | np.ndarray | Image.Image] spot_analysis = SpotAnalysis("Log Scale Images", processors) spot_analysis.set_primary_images(images) results = [result for result in spot_analysis] + + Parameters + ---------- + images : list[CacheableImage | np.ndarray | Image.Image] + The images to be processed. + + Returns + ------- + list[CacheableImage] + The resulting images after processing. """ # import here to avoid cyclic imports from opencsp.common.lib.cv.SpotAnalysis import SpotAnalysis @@ -379,7 +437,7 @@ def __iter__(self): # We must have already finished processing all input images, either # through the run() method or by simply having iterated through them # all. - raise StopIteration + return self elif self.input_iter != None: # We must be iterating through the input images already. return self diff --git a/opencsp/common/lib/cv/spot_analysis/image_processor/test/test_AbstractSpotAnalysisImageProcessor.py b/opencsp/common/lib/cv/spot_analysis/image_processor/test/test_AbstractSpotAnalysisImageProcessor.py index e0abbcc6d..e7cc15ab6 100644 --- a/opencsp/common/lib/cv/spot_analysis/image_processor/test/test_AbstractSpotAnalysisImageProcessor.py +++ b/opencsp/common/lib/cv/spot_analysis/image_processor/test/test_AbstractSpotAnalysisImageProcessor.py @@ -1,13 +1,15 @@ import copy import dataclasses +import random import unittest import numpy as np -import opencsp.common.lib.tool.file_tools as ft import opencsp.common.lib.cv.CacheableImage as ci import opencsp.common.lib.cv.spot_analysis.image_processor.AbstractSpotAnalysisImageProcessor as asaip import opencsp.common.lib.cv.spot_analysis.SpotAnalysisOperable as sao +import opencsp.common.lib.tool.file_tools as ft +import opencsp.common.lib.tool.log_tools as lt class DoNothingImageProcessor(asaip.AbstractSpotAnalysisImagesProcessor): @@ -41,109 +43,181 @@ def setUp(self) -> None: self.example_operable = sao.SpotAnalysisOperable(self.cacheable_image) self.example_operables: list[sao.SpotAnalysisOperable] = [] - for i in range(3): + self.num_example_operables = random.randint(0, 10) + for i in range(self.num_example_operables): ci_i = ci.CacheableImage(cache_path=self.example_cache_path) sao_i = sao.SpotAnalysisOperable(ci_i) self.example_operables.append(sao_i) + self.example_operables_gte2: list[sao.SpotAnalysisOperable] = [] + self.num_example_operables_gte2 = self.num_example_operables + 2 + for i in range(self.num_example_operables_gte2): + ci_i = ci.CacheableImage(cache_path=self.example_cache_path) + sao_i = sao.SpotAnalysisOperable(ci_i) + self.example_operables_gte2.append(sao_i) + def test_name(self): """ Verify that the auto-assigned name is the class name, and that it can be overwritten with a specific name. 
""" - instance = DoNothingImageProcessor() - self.assertEqual(instance.name, "DoNothingImageProcessor") - instance = DoNothingImageProcessor("Other Name") - self.assertEqual(instance.name, "Other Name") + try: + instance = DoNothingImageProcessor() + self.assertEqual(instance.name, "DoNothingImageProcessor") + instance = DoNothingImageProcessor("Other Name") + self.assertEqual(instance.name, "Other Name") + + except: + lt.error( + "Error in test_AbstractSpotAnalysisImageProcessor.test_name(): " + + f"failed for operables {self.example_operables=}" + ) + raise def test_finished(self): """ Verify that finished is True only when no images have been assigned, or when all images have been processed. """ - # 0 - instance = DoNothingImageProcessor() - self.assertTrue(instance.finished) - - # 1 - instance.assign_inputs([self.example_operable]) - self.assertFalse(instance.finished) - for result in instance: - pass - self.assertTrue(instance.finished) - - # > 1 - instance.assign_inputs(self.example_operables) - self.assertFalse(instance.finished) - for result in instance: - pass - self.assertTrue(instance.finished) - - def test_iterator_finishes_all(self): + try: + # 0 + instance = DoNothingImageProcessor() + self.assertTrue(instance.finished) + + # 1 + instance.assign_inputs([self.example_operable]) + self.assertFalse(instance.finished) + for result in instance: + pass + self.assertTrue(instance.finished) + + # > 1 + instance.assign_inputs(self.example_operables_gte2) + for result in instance: + pass + self.assertTrue(instance.finished) + + except: + lt.error( + "Error in test_AbstractSpotAnalysisImageProcessor.test_finished(): " + + f"failed for operables {self.example_operables_gte2=}" + ) + raise + + def test_0_operables(self): + """Verify that we see the expected behavior when attempting to run with 0 input.""" instance = DoNothingImageProcessor() nprocessed = 0 - # test with an assignment of a few operables - instance.assign_inputs(self.example_operables) - for result in instance: - nprocessed += 1 - self.assertEqual(nprocessed, 3) + # test assignment + instance.assign_inputs([]) - # test with an assignment of an additional "two" operables - instance.assign_inputs([self.example_operable, self.example_operable]) + # test processing for result in instance: nprocessed += 1 - self.assertEqual(nprocessed, 5) + self.assertEqual(0, nprocessed) + + # test running + results = instance.run([]) + self.assertEqual(len(results), 0) + + def test_iterator_finishes_all(self): + try: + instance = DoNothingImageProcessor() + nprocessed = 0 + + # test with an assignment of a few operables + instance.assign_inputs(self.example_operables) + for result in instance: + nprocessed += 1 + self.assertEqual(nprocessed, self.num_example_operables) + + # test with an assignment of an additional "two" operables + instance.assign_inputs([self.example_operable, self.example_operable]) + for result in instance: + nprocessed += 1 + self.assertEqual(nprocessed, self.num_example_operables + 2) + + except: + lt.error( + "Error in test_AbstractSpotAnalysisImageProcessor.test_iterator_finishes_all(): " + + f"failed for operables {self.example_operables=}" + ) + raise def test_run(self): """Verify that run() touches all the operables""" - # sanity check: no pixels are equal to 1 - for operable in self.example_operables: - self.assertTrue(np.all(operable.primary_image.nparray != 1)) + try: + # sanity check: no pixels are equal to 1 + for operable in self.example_operables: + self.assertTrue(np.all(operable.primary_image.nparray != 1)) 
- # process all images - instance = SetOnesImageProcessor() - results = instance.run(self.example_operables) + # process all images + instance = SetOnesImageProcessor() + results = instance.run(self.example_operables) + + # verify the input operables haven't been touched + for operable in self.example_operables: + self.assertTrue(np.all(operable.primary_image.nparray != 1)) - # verify the input operables haven't been touched - for operable in self.example_operables: - self.assertTrue(np.all(operable.primary_image.nparray != 1)) + # verify all pixels in the new operables are equal to 1 + for operable in results: + self.assertTrue(np.all(operable.primary_image.nparray == 1)) - # verify all pixels in the new operables are equal to 1 - for operable in results: - self.assertTrue(np.all(operable.primary_image.nparray == 1)) + except: + lt.error( + "Error in test_AbstractSpotAnalysisImageProcessor.test_run(): " + + f"failed for operables {self.example_operables=}" + ) + raise def test_process_operable(self): """Verify that process_operable() updates the pixels""" - for operable in self.example_operables: - # sanity check: no pixels are equal to 1 - self.assertTrue(np.all(operable.primary_image.nparray != 1)) + try: + for operable in self.example_operables: + # sanity check: no pixels are equal to 1 + self.assertTrue(np.all(operable.primary_image.nparray != 1)) - # process the operable - instance = SetOnesImageProcessor() - result = instance.process_operable(operable, is_last=True) + # process the operable + instance = SetOnesImageProcessor() + result = instance.process_operable(operable, is_last=True) + + # verify the input operable hasn't been touched + self.assertTrue(np.all(operable.primary_image.nparray != 1)) - # verify the input operable hasn't been touched - self.assertTrue(np.all(operable.primary_image.nparray != 1)) + # verify all pixels in the new operable are equal to 1 + self.assertTrue(np.all(result[0].primary_image.nparray == 1)) - # verify all pixels in the new operable are equal to 1 - self.assertTrue(np.all(result[0].primary_image.nparray == 1)) + except: + lt.error( + "Error in test_AbstractSpotAnalysisImageProcessor.test_process_operable(): " + + f"failed for operables {self.example_operables=}" + ) + raise def test_process_images(self): """Verify that process_images() updates the pixels""" - for operable in self.example_operables: - # sanity check: no pixels are equal to 1 - self.assertTrue(np.all(operable.primary_image.nparray != 1)) - - # process the image - instance = SetOnesImageProcessor() - result = instance.process_images([operable.primary_image]) - - # verify the input image hasn't been touched - self.assertTrue(np.all(operable.primary_image.nparray != 1)) - - # verify all pixels in the new image are equal to 1 - self.assertTrue(np.all(result[0].nparray == 1)) + try: + for operable in self.example_operables: + # sanity check: no pixels are equal to 1 + self.assertTrue(np.all(operable.primary_image.nparray != 1)) + + # process the image + instance = SetOnesImageProcessor() + result = instance.process_images([operable.primary_image]) + + # verify the input image hasn't been touched + self.assertTrue(np.all(operable.primary_image.nparray != 1)) + + # verify all pixels in the new image are equal to 1 + self.assertTrue(np.all(result[0].nparray == 1)) + + except: + lt.error( + "Error in test_AbstractSpotAnalysisImageProcessor.test_process_images(): " + + f"failed for operables {self.example_operables=}" + ) + raise if __name__ == '__main__': diff --git 
a/opencsp/common/lib/cv/test/test_CacheableImage.py b/opencsp/common/lib/cv/test/test_CacheableImage.py index de2f68903..26955c040 100644 --- a/opencsp/common/lib/cv/test/test_CacheableImage.py +++ b/opencsp/common/lib/cv/test/test_CacheableImage.py @@ -41,7 +41,7 @@ def setUp(self) -> None: self.noexist_source_path = ft.join(self.out_dir, "noexist.png") # de-register all cacheable images - while CacheableImage.lru() != None: + while CacheableImage.lru() is not None: pass def test_init_valid(self): @@ -165,6 +165,7 @@ def test_cache(self): # fmt: off valid_combinations = [ + # np array, .np cache file, .png source file, expected .np cache file [ self.example_array, None, None, default_cache_file ], [ self.example_array, self.example_cache_path, None, self.example_cache_path ], [ self.example_array, None, self.noexist_source_path, default_cache_file ], @@ -216,7 +217,7 @@ def test_cache(self): self.assertFalse(ft.file_exists(noexist_cache_file), msg=err_msg) # check memory usage - self.assertAlmostEqual(0, sys.getsizeof(cacheable), delta=100, msg=err_msg) + self.assertAlmostEqual(0, sys.getsizeof(cacheable), delta=cacheable._expected_cached_size, msg=err_msg) # verify that loading from the cache works uncached_array = cacheable.nparray @@ -330,6 +331,27 @@ def test_save_image(self): cached_size = sys.getsizeof(ci) self.assertLess(cached_size, in_memory_size) + def test_cache_memlimit0(self): + """Check that cacheing doesn't halt forever when the memory limit is 0.""" + default_cache_file = ft.join(self.out_dir, f"{self.test_name}.npy") + cache_path_gen = lambda: default_cache_file + + # create the cacheable image + ci1 = CacheableImage(cache_path=self.example_cache_path) + self.assertAlmostEqual(0, sys.getsizeof(ci1), delta=ci1._expected_cached_size) + + # verify we're not cached yet + ci1.nparray + self.assertAlmostEqual(40 * 40 * 3, sys.getsizeof(ci1), delta=ci1._expected_cached_size) + + # verify the memory limit is working + ci1.cache_images_to_disk_as_necessary(1e10, cache_path_gen) + self.assertAlmostEqual(40 * 40 * 3, sys.getsizeof(ci1), delta=ci1._expected_cached_size) + + # check that a memory limit of 0 is accepted + ci1.cache_images_to_disk_as_necessary(0, cache_path_gen) + self.assertAlmostEqual(0, sys.getsizeof(ci1), delta=ci1._expected_cached_size) + if __name__ == '__main__': unittest.main()
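
Reviewer note: a minimal usage sketch of the caching workflow documented in the
updated CacheableImage docstrings above. The array shape, temporary directory,
and memory limit below are hypothetical illustration values; the constructor,
nparray, __sizeof__, and cache_images_to_disk_as_necessary APIs are the ones
touched by this patch. This sketch is commentary only and is not part of the
applied diff.

    import itertools
    import os
    import sys
    import tempfile

    import numpy as np

    from opencsp.common.lib.cv.CacheableImage import CacheableImage

    # One of (array, cache_path, source_path) is enough for the constructor.
    image = CacheableImage(array=np.zeros((1080, 1920), dtype=np.uint8))

    # __sizeof__ is overridden to report the in-memory array/image size
    # rather than the size of the instance's attribute slots.
    print(sys.getsizeof(image))  # roughly 2 MB for the monochrome array above

    # Generator that hands out .npy cache file paths that do not exist yet.
    tmp_dir = tempfile.mkdtemp()
    counter = itertools.count()
    tmp_path_generator = lambda: os.path.join(tmp_dir, f"cache_{next(counter)}.npy")

    # Cache least-recently-used instances to disk until the total in-memory
    # footprint of all CacheableImages drops below the limit. A limit of 0
    # forces everything to be cached (as exercised by test_cache_memlimit0).
    CacheableImage.cache_images_to_disk_as_necessary(0, tmp_path_generator)
    print(sys.getsizeof(image))  # now close to _expected_cached_size

    # Accessing the pixels again transparently reloads them, following the
    # priority order: in-memory array, numpy cache file, image source file.
    print(image.nparray.shape)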