diff --git a/docs/cli.rst b/docs/cli.rst index 1adf0422d..253e654cc 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -91,6 +91,108 @@ and tracing upwards through is_a and part_of relationships: uberon viz -p i,p hand foot +Cache Control +------------- + +OAK may download data from remote sources as part of its normal operations. For +example, using the :code:`sqlite:obo:...` input selector will cause OAK to +fetch the requested Semantic-SQL database from a centralised repository. +Whenever that happens, the downloaded data will be cached in a local directory +so that subsequent commands using the same input selector do not have to +download the file again. + +By default, OAK will refresh (download again) a previously downloaded file if +it was last downloaded more than 7 days ago. + +The behavior of the cache can be controlled in two ways: with an option on the +command line and with a configuration file. + +Controlling the cache on the command line +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The global option :code:`--caching` gives the user some control over how the +cache works. + +To change the default cache expiry lifetime of 7 days, the :code:`--caching` +option accepts a value of the form :code:`ND`, where *N* is a positive integer +and *D* can be one of :code:`s`, :code:`d`, :code:`w`, :code:`m`, or :code:`y` +to indicate that *N* is a number of seconds, days, weeks, months, or years, +respectively. If the *D* part is omitted, it defaults to :code:`d`. + +For example, :code:`--caching=3w` instructs OAK to refresh a cached file if it +was last refreshed more than 21 days ago. 
+ +The :code:`--caching` option also accepts the following special values: + +- :code:`refresh` to force OAK to always refresh a file regardless of its age; +- :code:`no-refresh` to do the opposite, that is, preventing OAK from + refreshing a file regardless of its age; +- :code:`clear` to forcefully clear the cache (which will trigger a refresh as + a consequence); +- :code:`reset` is a synonym of :code:`clear`. + +Note the difference between :code:`refresh` and :code:`clear`. The former will +only cause the requested file to be refreshed, leaving any other file that may +exist in the cache untouched. The latter will delete all cached files, so that +not only the requested file will be downloaded again, but any other +previously cached file will also have to be downloaded again the next time it +is requested. + +In both cases, refreshing and clearing will only happen if the OAK command in +which the :code:`--caching` option is used attempts to look up a cached file. +Otherwise the option will have no effect. + +To forcefully clear the cache independently of any command, the +:ref:`cache-clear` command may be used. The contents of the cache may be +explored at any time with the :ref:`cache-ls` command. + +Controlling the cache with a configuration file +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Finer control of how the cache works is possible through a configuration file +that OAK will look for at the following locations: + +- under GNU/Linux: in ``$XDG_CONFIG_HOME/ontology-access-kit/cache.conf``; +- under macOS: in ``$HOME/Library/Application Support/ontology-access-kit/cache.conf``; +- under Windows: in ``%LOCALAPPDATA%\ontology-access-kit\ontology-access-kit\cache.conf``. 
+ +The file should contain lines of the form :code:`pattern = policy`, where: + +- *pattern* is a shell-type globbing pattern indicating the files that will be + concerned by the policy set forth on the line; +- *policy* is the same type of value as expected by the :code:`--caching` + option as explained in the previous section. + +Blank lines and lines starting with :code:`#` are ignored. + +If the *pattern* is :code:`default` (or :code:`*`), the corresponding policy +will be used for any cached file that does not have a matching policy. + +Here is a sample configuration file: + +.. code-block:: + + # Uberon will be refreshed if older than 1 month + uberon.db = 1m + # FBbt will be refreshed if older than 2 weeks + fbbt.db = 2w + # Other FlyBase ontologies will be refreshed if older than 2 months + fb*.db = 2m + # All other files will be refreshed if older than 3 weeks + default = 3w + +Note that when looking up the policy to apply to a given file, patterns are +tried in the order they appear in the file. This is why the :code:`fbbt.db` +pattern in the example above must be listed *before* the less specific +:code:`fb*.db` pattern, otherwise it would be ignored. (This does not apply to +the default pattern -- whether it is specified as :code:`default` or as +:code:`*` -- which is always tried after all the other patterns.) + +The :code:`--caching` option described in the previous section always takes +precedence over the configuration file. That is, all rules set forth in the +configuration will be ignored if the :code:`--caching` option is specified on +the command line. + Commands ----------- diff --git a/docs/intro/tutorial07.rst b/docs/intro/tutorial07.rst index 3ebdfb6c6..e94f184c9 100644 --- a/docs/intro/tutorial07.rst +++ b/docs/intro/tutorial07.rst @@ -64,6 +64,10 @@ This will download the pato.db sqlite file once, and cache it. PyStow is used to cache the file, and the default location is ``~/.data/oaklib``. 
+By default, a cached SQLite file will be automatically refreshed (downloaded +again) if it is older than 7 days. For details on how to alter the behavior of +the cache, see the :ref:`Cache Control` section in the CLI documentation. + Building your own SQLite files ------------------- diff --git a/src/oaklib/cli.py b/src/oaklib/cli.py index 810211c7d..8fd51aa5e 100644 --- a/src/oaklib/cli.py +++ b/src/oaklib/cli.py @@ -9,14 +9,12 @@ # See https://stackoverflow.com/questions/47972638/how-can-i-define-the-order-of-click-sub-commands-in-help import json import logging -import os import statistics as stats import sys from collections import defaultdict from enum import Enum, unique from itertools import chain from pathlib import Path -from time import time from types import ModuleType from typing import ( Any, @@ -28,7 +26,6 @@ import click import kgcl_schema.grammar.parser as kgcl_parser -import pystow import sssom.writers as sssom_writers import sssom_schema import yaml @@ -42,6 +39,7 @@ import oaklib.datamodels.taxon_constraints as tcdm from oaklib import datamodels +from oaklib.constants import FILE_CACHE from oaklib.converters.logical_definition_flattener import LogicalDefinitionFlattener from oaklib.datamodels import synonymizer_datamodel from oaklib.datamodels.association import RollupGroup @@ -149,6 +147,7 @@ generate_disjoint_class_expressions_axioms, ) from oaklib.utilities.basic_utils import pairs_as_dict +from oaklib.utilities.caching import CachePolicy from oaklib.utilities.iterator_utils import chunk from oaklib.utilities.kgcl_utilities import ( generate_change_id, @@ -568,6 +567,11 @@ def _apply_changes(impl, changes: List[kgcl.Change]): show_default=True, help="If set, will profile the command", ) +@click.option( + "--caching", + type=CachePolicy.ClickType, + help="Set the cache management policy", +) def main( verbose: int, quiet: bool, @@ -587,6 +591,7 @@ def main( prefix, profile: bool, import_depth: Optional[int], + caching: Optional[CachePolicy], 
**kwargs, ): """ @@ -635,6 +640,7 @@ def exit(): import requests_cache requests_cache.install_cache(requests_cache_db) + FILE_CACHE.force_policy(caching) resource = OntologyResource() resource.slug = input settings.autosave = autosave @@ -5454,12 +5460,14 @@ def cache_ls(): """ List the contents of the pystow oaklib cache. - TODO: this currently only works on unix-based systems. """ - directory = pystow.api.join("oaklib") - command = f"ls -al {directory}" - click.secho(f"[pystow] {command}", fg="cyan", bold=True) - os.system(command) # noqa:S605 + units = ["B", "KB", "MB", "GB", "TB"] + for path, size, mtime in FILE_CACHE.get_contents(subdirs=True): + i = 0 + while size >= 1024 and i < len(units) - 1: # >= so that exactly 1024 B reads 1.00 KB + size /= 1024 + i += 1 + click.echo(f"{path} ({size:.2f} {units[i]}, {mtime:%Y-%m-%d})") @main.command() @@ -5475,17 +5483,9 @@ def cache_clear(days_old: int): Clear the contents of the pystow oaklib cache. """ - directory = pystow.api.join("oaklib") - now = time() - for item in Path(directory).glob("*"): - if ".db" not in str(item): - continue - mtime = item.stat().st_mtime - curr_days_old = (int(now) - int(mtime)) / 86400 - logging.info(f"{item} is {curr_days_old}") - if curr_days_old > days_old: - click.echo(f"Deleting {item} which is {curr_days_old}") - item.unlink() + + for name, _, age in FILE_CACHE.clear(subdirs=False, older_than=days_old, pattern="*.db*"): + click.echo(f"Deleted {name} which was {age.days} days old") @main.command() diff --git a/src/oaklib/constants.py b/src/oaklib/constants.py index 348adb0c7..7160ae825 100644 --- a/src/oaklib/constants.py +++ b/src/oaklib/constants.py @@ -2,9 +2,13 @@ import pystow +from oaklib.utilities.caching import FileCache + __all__ = [ "OAKLIB_MODULE", + "FILE_CACHE", ] OAKLIB_MODULE = pystow.module("oaklib") +FILE_CACHE = FileCache(OAKLIB_MODULE) TIMEOUT_SECONDS = 30 diff --git a/src/oaklib/implementations/llm_implementation.py b/src/oaklib/implementations/llm_implementation.py index 6faa01006..43b7e3649 100644 --- 
a/src/oaklib/implementations/llm_implementation.py +++ b/src/oaklib/implementations/llm_implementation.py @@ -8,7 +8,6 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, Iterable, Iterator, List, Optional, Tuple -import pystow from linkml_runtime.dumpers import yaml_dumper from sssom_schema import Mapping from tenacity import ( @@ -19,6 +18,7 @@ ) from oaklib import BasicOntologyInterface +from oaklib.constants import FILE_CACHE from oaklib.datamodels.class_enrichment import ClassEnrichmentResult from oaklib.datamodels.item_list import ItemList from oaklib.datamodels.obograph import DefinitionPropertyValue @@ -148,7 +148,7 @@ def config_to_prompt(configuration: Optional[ValidationConfiguration]) -> Option for obj in configuration.documentation_objects: if obj.startswith("http:") or obj.startswith("https:"): - path = pystow.ensure("oaklib", "documents", url=obj) + path = FILE_CACHE.ensure("documents", url=obj) else: path = obj with open(path) as f: diff --git a/src/oaklib/implementations/sqldb/sql_implementation.py b/src/oaklib/implementations/sqldb/sql_implementation.py index d05be4986..7a0ee46e3 100644 --- a/src/oaklib/implementations/sqldb/sql_implementation.py +++ b/src/oaklib/implementations/sqldb/sql_implementation.py @@ -63,7 +63,7 @@ import oaklib.datamodels.ontology_metadata as om import oaklib.datamodels.validation_datamodel as vdm -from oaklib.constants import OAKLIB_MODULE +from oaklib.constants import FILE_CACHE from oaklib.datamodels import obograph, ontology_metadata from oaklib.datamodels.association import Association from oaklib.datamodels.obograph import ( @@ -342,7 +342,7 @@ def __post_init__(self): # Option 1 uses direct URL construction: url = f"https://s3.amazonaws.com/bbop-sqlite/{prefix}.db.gz" logging.info(f"Ensuring gunzipped for {url}") - db_path = OAKLIB_MODULE.ensure_gunzip(url=url, autoclean=False) + db_path = FILE_CACHE.ensure_gunzip(url=url, autoclean=False) # Option 2 uses botocore to interface with the S3 
API directly: # db_path = OAKLIB_MODULE.ensure_from_s3(s3_bucket="bbop-sqlite", s3_key=f"{prefix}.db") locator = f"sqlite:///{db_path}" diff --git a/src/oaklib/utilities/caching.py b/src/oaklib/utilities/caching.py new file mode 100644 index 000000000..c8010b30e --- /dev/null +++ b/src/oaklib/utilities/caching.py @@ -0,0 +1,408 @@ +import fnmatch +import logging +import os.path +import re +import time +from datetime import datetime, timedelta +from pathlib import Path + +from appdirs import user_config_dir +from pystow.utils import base_from_gzip_name, name_from_url + +from oaklib.datamodels.vocabulary import APP_NAME + +_durations = {'d': 1, 'w': 7, 'm': 30, 'y': 365} +_logger = logging.getLogger(__name__) + + +class _classproperty(object): + """Read-only property evaluated against the class rather than an instance. + + Stacking @classmethod on top of @property was deprecated in Python 3.11 + and removed in Python 3.13, so the class-level policies below use this + descriptor instead. + """ + + def __init__(self, fget): + self._fget = fget + self.__doc__ = fget.__doc__ + + def __get__(self, instance, owner=None): + return self._fget(type(instance) if owner is None else owner) + + +class CachePolicy(object): + """Represents the behaviour of a cache. + + Once a CachePolicy object has been created (typically using the static + constructor from_string, or one of the static properties for special + policies), use the refresh_file() method to determine whether a given file + should be refreshed: + + >>> if my_policy.refresh_file(my_cache_file): + >>> # refresh the cache file + >>> else: + >>> # no need to refresh + + Use the refresh() method to check an arbitrary timestamp against the policy + (e.g. if the cached data is not in a file): + + >>> if my_policy.refresh(timestamp_of_last_refresh): + >>> # refresh the data + """ + + def __init__(self, max_age): + """Creates a new instance. + + If positive, the max_age parameter is the number of seconds after which + cached data should be refreshed. This parameter can also accept some + special values: + + - 0 indicates refresh should always occur, regardless of the age of the + cached data; + - -1 indicates the cache should be cleared. + + It is recommended to obtain such special policies using either the + from_string static constructor or the static properties REFRESH, RESET, + rather than calling this constructor directly. 
This allows comparing a + policy against those pre-established policies as follows: + + >>> if my_policy == CachePolicy.RESET: + >>> # force reset + """ + + self._max_age = max_age + + def refresh(self, then): + """Indicates whether a refresh should occur for data last refreshed at + the indicated time. + + :param then: the time the data were last cached or refreshed, in + seconds since the Unix epoch + :return: True if the data should be refreshed, otherwise False + """ + + if self._max_age <= 0: + # Forceful refresh/reset, even if "then" is somehow in the future + return True + return time.time() - then > self._max_age + + def refresh_file(self, pathname): + """Indicates whether the specified file should be refreshed. + + This uses the last modification time of the file to determine the age + of the cached data. If the file does not exist, a refresh will + necessarily be mandated. + + :param pathname: the path to the file that maybe should be refreshed + :return: True if the file should be refreshed, otherwise False + """ + + if not os.path.exists(pathname): + return True + return self.refresh(os.path.getmtime(pathname)) + + @property + def always_refresh(self): + """Indicates whether this policy mandates a systematic refresh of the + cache.""" + + return self._max_age == 0 + + @property + def never_refresh(self): + """Indicates whether this policy mandates never refreshing the + cache.""" + + return self._max_age == timedelta.max.total_seconds() + + @property + def reset(self): + """Indicates whether this policy mandates a reset of the cache.""" + + return self._max_age == -1 + + _refresh_policy = None + _no_refresh_policy = None + _reset_policy = None + _click_type = None + + @classmethod + def from_string(cls, value): + """Creates a new instance from a string representation. + + This is the recommended way of getting a CachePolicy object. 
The value + can be either: + + - a number of seconds, followed by 's'; + - a number of days, optionally followed by 'd'; + - a number of weeks, followed by 'w'; + - a number of months, followed by 'm'; + - a number of years, followed by 'y'. + + Such a value will result in a policy mandating that cached data are + refreshed after the elapsed number of seconds, days, weeks, months, or + years since they were last cached. Note that in this context, a 'month' + is always 30 days and a 'year' is always 365 days. That is, '3m' is + merely a shortcut for '90d' (or simply '90') and '2y' is merely a + shortcut for '730d'. + + The value can also be: + + - 'refresh', to get the REFRESH policy; + - 'no-refresh', to get the NO_REFRESH policy; + - 'reset' or 'clear', to get the RESET policy. + + Any other value will cause None to be returned. + """ + + value = value.lower() + if value == 'refresh': + return cls.REFRESH + elif value == 'no-refresh': + return cls.NO_REFRESH + elif value in ['reset', 'clear']: + return cls.RESET + else: + if m := re.fullmatch('([0-9]+)([sdwmy])?', value): # no trailing garbage + num, qual = m.groups() + if not qual: + qual = 'd' + if qual == 's': + return cls(int(num)) + else: + return cls(timedelta(days=int(num) * _durations[qual]).total_seconds()) + return None + + @_classproperty + def REFRESH(cls): + """A policy that cached data should always be refreshed.""" + + if cls._refresh_policy is None: + cls._refresh_policy = cls(max_age=0) + return cls._refresh_policy + + @_classproperty + def NO_REFRESH(cls): + """A policy that cached data should never be refreshed.""" + + if cls._no_refresh_policy is None: + cls._no_refresh_policy = cls(max_age=timedelta.max.total_seconds()) + return cls._no_refresh_policy + + @_classproperty + def RESET(cls): + """A policy that cached data should be cleared and refreshed.""" + + if cls._reset_policy is None: + cls._reset_policy = cls(max_age=-1) + return cls._reset_policy + + @_classproperty + def 
ClickType(cls): + """Helper method to parse a CachePolicy with Click. + + Use that method as the 'type' of a Click option to let Click + automatically convert the value of the option into a CachePolicy + instance. + + Example: + + >>> @click.option("--caching", type=CachePolicy.ClickType, + default="1w") + """ + + if cls._click_type is None: + from click import ParamType + + class CachePolicyParamType(ParamType): + name = 'cache-policy' + + def convert(self, value, param, ctx): + if isinstance(value, cls): + return value + + if p := cls.from_string(value): + return p + else: + self.fail(f"Cannot convert '{value}' to a cache policy", param, ctx) + + cls._click_type = CachePolicyParamType() + + return cls._click_type + + +class FileCache(object): + """Represents a file-based cache. + + This is intended as a layer built on top of Pystow, to add cache management + features that are lacking in Pystow. + """ + + def __init__(self, module): + """Creates a new instance. + + :param module: a Pystow module representing the location where cached + data will be stored; all methods in this class will defer to this + object whenever a file needs to be actually refreshed + """ + + self._module = module + self._default_policy = CachePolicy.from_string('1w') + self._forced_policy = None + self._policies = [] + self._config_file = os.path.join(user_config_dir(APP_NAME), "cache.conf") + self._config_read = False + + def force_policy(self, policy): + """Forces the cache to use the specified policy, regardless of any + otherwise configured policies. + + :param policy: the policy to use; may be None to allow the use of + configured policies + """ + + self._forced_policy = policy + + def ensure_gunzip(self, url, name=None, autoclean=True): + """Looks up and maybe downloads and gunzips a file. + + This is a wrapper around Pystow's method of the same name. 
It behaves + similarly but, if the file is already present in the cache, it will + additionally check whether it needs to be downloaded again, according + to the current caching policy. + """ + + if self._forced_policy == CachePolicy.RESET: + self.clear(pattern="*.db*") + + if not name: + name = name_from_url(url) + + ungz_name = base_from_gzip_name(name) + db_path = self._module.join(name=ungz_name) + + if self._get_policy(ungz_name).refresh_file(db_path): + self._module.ensure_gunzip(url=url, name=name, autoclean=autoclean, force=True) + + return db_path + + def ensure(self, *subkeys, url, name=None): + """Looks up and maybe downloads a file.""" + + if self._forced_policy == CachePolicy.RESET: + self.clear(pattern="*.db*") + + if not name: + name = name_from_url(url) + + path = self._module.join(*subkeys, name=name) + + if self._get_policy(name).refresh_file(path): + self._module.ensure(*subkeys, url=url, name=name, force=True) + + return path + + def get_contents(self, subdirs=False): + """Gets a list of files present in the cache. + + This returns a list of (name, size, mtime) tuples, where: + + - name is the filename (relative to the cache directory); + - size is its size in bytes; + - mtime is its modification time, as a datetime object. + + If subdirs is True, the list includes files present in any subdirectory + within the cache. The default is to list only the files immediately + under the cache directory, excluding any subdirectory. + """ + + contents = [] + for path, name in self._iter_files(subdirs=subdirs): + stat = path.stat() + contents.append((name, stat.st_size, datetime.fromtimestamp(stat.st_mtime))) + return contents + + def clear(self, subdirs=False, older_than=None, pattern="*"): + """Deletes files present in the cache. 
+ + :param subdirs: if True, deletes files in subdirectories + :param older_than: if set, only deletes files that were last modified + longer ago than the specified number of days + :param pattern: only deletes files matching the specified pattern + :return: a list of tuples describing the files that were deleted; the + tuples are similar to the ones returned by get_contents, except + that the third item is the age of the deleted file (as a timedelta + object relative to current time) + """ + + now = time.time() + cleared = [] + for path, name in self._iter_files(subdirs=subdirs, pattern=pattern): + stat = path.stat() + age = now - stat.st_mtime + if older_than is not None and age <= older_than * 86400: + continue + cleared.append((name, stat.st_size, timedelta(seconds=age))) + path.unlink() + return cleared + + def _iter_files(self, subdirs=False, pattern="*"): + """Helper method to get the files present in the cache. + + :param subdirs: if True, get files in subdirectories + :param pattern: get files matching the pattern + :return: a list of (path, name) tuples where path is a Path object + pointing to a file in the cache, and name is its name relative to + the cache directory + """ + + base = self._module.join() + if subdirs: + pattern = "**/" + pattern + return [(c, str(c.relative_to(base))) for c in Path(base).glob(pattern) if c.is_file()] + + def _get_policy(self, name): + """Gets the caching policy to use for the specified name.""" + + if self._forced_policy is not None: + return self._forced_policy + + if not self._config_read: + self._get_configuration(self._config_file) + + for pattern, policy in self._policies: + if fnmatch.fnmatch(name, pattern): + return policy + + return self._default_policy + + def _get_configuration(self, pathname): + """Gets cache policies from a configuration file.""" + + if not os.path.exists(pathname): + return + + filename = os.path.basename(pathname) + with open(pathname, "r") as f: + for n, line in enumerate(f, start=1): # 1-based line numbers in warnings + if 
line.startswith("#") or line.isspace(): + continue + + items = line.split("=", maxsplit=1) + pattern = items[0].strip() + if len(items) != 2: + _logger.warning(f"{filename}({n}): Ignoring missing caching policy for {pattern}") + continue + + policy = CachePolicy.from_string(items[1].strip()) + if policy is None: + _logger.warning(f"{filename}({n}): Ignoring invalid caching policy for {pattern}") + continue + + if pattern in ["default", "*"]: + self._default_policy = policy + else: + self._policies.append((pattern, policy)) + + self._config_read = True diff --git a/tests/input/cache.conf b/tests/input/cache.conf new file mode 100644 index 000000000..408e6757d --- /dev/null +++ b/tests/input/cache.conf @@ -0,0 +1,16 @@ +# Test file for the file cache configuration + +# Default policy: refresh after 1 week +default = 1w + +# Refresh Uberon after 2 weeks +uberon.db = 2w + +# Refresh FlyBase ontologies after 1 month +fb*.db = 1m + +# Warning: pattern without associated policy +missing_policy.db + +# Warning: invalid policy +invalid_policy.db = invalid diff --git a/tests/test_utilities/test_caching.py b/tests/test_utilities/test_caching.py new file mode 100644 index 000000000..e8dff2467 --- /dev/null +++ b/tests/test_utilities/test_caching.py @@ -0,0 +1,136 @@ +import os +import time +import unittest + +from oaklib.utilities.caching import CachePolicy, FileCache + + +class TestCachePolicy(unittest.TestCase): + + def test_refresh_policy(self): + policy = CachePolicy.from_string("refresh") + + self.assertTrue(policy.always_refresh) + self.assertFalse(policy.never_refresh) + self.assertFalse(policy.reset) + + self.assertEqual(CachePolicy.REFRESH, policy) + + now = time.time() + self.assertTrue(policy.refresh(now)) + self.assertTrue(policy.refresh(now + 86400)) # 1 day in the future + self.assertTrue(policy.refresh(now - 86400)) # 1 day in the past + + def test_never_refresh_policy(self): + policy = CachePolicy.from_string("no-refresh") + + 
self.assertTrue(policy.never_refresh) + self.assertFalse(policy.always_refresh) + self.assertFalse(policy.reset) + + self.assertEqual(CachePolicy.NO_REFRESH, policy) + + now = time.time() + self.assertFalse(policy.refresh(now)) + self.assertFalse(policy.refresh(now + 86400)) + self.assertFalse(policy.refresh(now - 86400)) + + # inexistent file is always refreshed even under "no-refresh" + self.assertTrue(policy.refresh_file("inexistent-file")) + + def test_reset_policy(self): + policy = CachePolicy.from_string("reset") + self.assertEqual(policy, CachePolicy.from_string("clear")) + + self.assertTrue(policy.reset) + self.assertFalse(policy.always_refresh) + self.assertFalse(policy.never_refresh) + + self.assertEqual(CachePolicy.RESET, policy) + + now = time.time() + self.assertTrue(policy.refresh(now)) + self.assertTrue(policy.refresh(now + 86400)) + self.assertTrue(policy.refresh(now - 86400)) + + def test_refresh_after_1day_policy(self): + policy = CachePolicy.from_string('1d') + + self.assertFalse(policy.always_refresh) + self.assertFalse(policy.never_refresh) + self.assertFalse(policy.reset) + + now = time.time() + self.assertTrue(policy.refresh(now - 90000)) # 25 hours in the past + self.assertFalse(policy.refresh(now - 82800)) # 23 hours in the past + + def test_refresh_file(self): + now = time.time() + + # Create dummy file with known mtime 3 days in the past + path = "tests/output/dummy-cache" + with open(path, "w"): + pass + os.utime(path, (now - 259200, now - 259200)) + + self.assertTrue(CachePolicy.REFRESH.refresh_file(path)) + self.assertTrue(CachePolicy.RESET.refresh_file(path)) + self.assertFalse(CachePolicy.NO_REFRESH.refresh_file(path)) + self.assertTrue(CachePolicy.from_string('2d').refresh_file(path)) + self.assertFalse(CachePolicy.from_string('4d').refresh_file(path)) + + os.unlink(path) + + # Inexistent file gets refreshed even under no-refresh + self.assertTrue(CachePolicy.NO_REFRESH.refresh_file(path)) + + def test_parsing_durations(self): + 
self.assertEqual(CachePolicy.from_string("1")._max_age, 86400) + self.assertEqual(CachePolicy.from_string("1d")._max_age, 86400) + self.assertEqual(CachePolicy.from_string("86400s")._max_age, 86400) + self.assertEqual(CachePolicy.from_string("1w")._max_age, 86400 * 7) + self.assertEqual(CachePolicy.from_string("1m")._max_age, 86400 * 30) + self.assertEqual(CachePolicy.from_string("1y")._max_age, 86400 * 365) + + self.assertIsNone(CachePolicy.from_string("bogus")) + +class TestFileCache(unittest.TestCase): + + def test_parse_cache_configuration(self): + cache = FileCache(None) # we don't need a Pystow module here + + with self.assertLogs() as log: + cache._get_configuration("tests/input/cache.conf") + self.assertTrue("missing caching policy" in log.output[0]) + self.assertTrue("invalid caching policy" in log.output[1]) + + self.assertEqual(cache._default_policy._max_age, 86400 * 7) + self.assertEqual(cache._policies[0][0], "uberon.db") + self.assertEqual(cache._policies[0][1]._max_age, 86400 * 7 * 2) + self.assertEqual(cache._policies[1][0], "fb*.db") + self.assertEqual(cache._policies[1][1]._max_age, 86400 * 30) + + def test_policy_selector(self): + cache = FileCache(None) + cache._policies.append(("uberon.db", CachePolicy.from_string("2w"))) + cache._policies.append(("fbbt.db", CachePolicy.from_string("3w"))) + cache._policies.append(("fb*.db", CachePolicy.from_string("1m"))) + cache._policies.append(("fbcv.db", CachePolicy.from_string("1y"))) + + # Prevent a configuration file from messing with the test + cache._config_read = True + + # Check the right policy is selected + self.assertEqual(cache._get_policy("uberon.db")._max_age, 86400 * 7 * 2) + self.assertEqual(cache._get_policy("fbbt.db")._max_age, 86400 * 7 * 3) + self.assertEqual(cache._get_policy("fbdv.db")._max_age, 86400 * 30) + self.assertEqual(cache._get_policy("fbcv.db")._max_age, 86400 * 30) + self.assertEqual(cache._get_policy("other.db")._max_age, 86400 * 7) + + # Check that "forced policy" takes 
precedence + cache.force_policy(CachePolicy.from_string("2d")) + self.assertEqual(cache._get_policy("uberon.db")._max_age, 86400 * 2) + self.assertEqual(cache._get_policy("fbbt.db")._max_age, 86400 * 2) + self.assertEqual(cache._get_policy("fbdv.db")._max_age, 86400 * 2) + self.assertEqual(cache._get_policy("fbcv.db")._max_age, 86400 * 2) + self.assertEqual(cache._get_policy("other.db")._max_age, 86400 * 2)