diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml index 924d3b33..46b9e238 100644 --- a/.github/actions/test/action.yml +++ b/.github/actions/test/action.yml @@ -4,7 +4,6 @@ inputs: python-version: description: "Which Python version to run on" required: true - default: "3.9" runs: using: "composite" diff --git a/.github/actions/test_tutorials/action.yml b/.github/actions/test_tutorials/action.yml index 9190c23f..21c98b16 100644 --- a/.github/actions/test_tutorials/action.yml +++ b/.github/actions/test_tutorials/action.yml @@ -4,7 +4,6 @@ inputs: python-version: description: "Which Python version to run on" required: true - default: "3.9" runs: using: "composite" steps: diff --git a/.github/workflows/main_test_and_release.yml b/.github/workflows/main_test_and_release.yml index 848d7644..ee3365f7 100644 --- a/.github/workflows/main_test_and_release.yml +++ b/.github/workflows/main_test_and_release.yml @@ -20,7 +20,7 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: [ "3.10", "3.11", "3.12"] env: python-version: ${{ matrix.python-version }} @@ -37,7 +37,7 @@ jobs: jupyter-tutorials: runs-on: ${{ matrix.os }} env: - python-version: "3.9" + python-version: "3.10" strategy: matrix: diff --git a/.github/workflows/static_type_checks.yml b/.github/workflows/static_type_checks.yml index cefcbb36..61a30a36 100644 --- a/.github/workflows/static_type_checks.yml +++ b/.github/workflows/static_type_checks.yml @@ -29,7 +29,7 @@ jobs: uses: actions/setup-python@v4 id: setup_python with: - python-version: "3.9" + python-version: "3.10" - name: Install dependencies shell: bash diff --git a/.gitignore b/.gitignore index 1d99cb95..fdeafadb 100644 --- a/.gitignore +++ b/.gitignore @@ -167,4 +167,5 @@ poetry.lock **/tmp/ **/.tmp/ -*nbconvert* \ No newline at end of file +*nbconvert* +.testmondata diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 81bc3913..6bcc9fd9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,13 +1,8 @@ default_stages: [commit] repos: - - repo: https://github.com/psf/black - rev: 22.8.0 - hooks: - - id: black - - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.0.254 + rev: v0.2.1 hooks: - id: ruff args: @@ -19,6 +14,9 @@ repos: "--fix", "--exit-non-zero-on-fix", ] + types_or: [ python, pyi, jupyter ] + - id: ruff-format + types_or: [ python, pyi, jupyter ] - repo: https://github.com/pre-commit/pre-commit-hooks rev: v2.3.0 diff --git a/Dockerfile b/Dockerfile index 16b347ae..2c00698b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.9 +FROM python:3.10 RUN apt-get update && apt-get install -y curl diff --git a/docs/conf.py b/docs/conf.py index fa4221ca..14d32005 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -68,10 +68,7 @@ html_show_sourcelink = True -source_suffix = { - ".rst": "restructuredtext", - ".md": "markdown", -} +source_suffix = {".rst": "restructuredtext", ".md": "markdown"} html_static_path = ["_static"] html_favicon = "_static/favicon.ico" @@ -98,21 +95,15 @@ """, "class": "", - }, + } ], "source_repository": repo_url, "source_branch": "main", "source_directory": "docs/", "light_logo": "icon.png", "dark_logo": "icon_dark.png", - "light_css_variables": { - "color-brand-primary": "#ff5454", - "color-brand-content": "#ff7575", - }, - "dark_css_variables": { - "color-brand-primary": "#ff8f8f", - "color-brand-content": "#ff8f8f", - }, + "light_css_variables": {"color-brand-primary": "#ff5454", 
"color-brand-content": "#ff7575"}, + "dark_css_variables": {"color-brand-primary": "#ff8f8f", "color-brand-content": "#ff8f8f"}, "sidebar_hide_name": False, "navigation_with_keys": True, } diff --git a/docs/tutorials/01_basic.ipynb b/docs/tutorials/01_basic.ipynb index a354fae2..6d4da214 100644 --- a/docs/tutorials/01_basic.ipynb +++ b/docs/tutorials/01_basic.ipynb @@ -890,9 +890,9 @@ "metadata": {}, "outputs": [], "source": [ - "from timeseriesflattener.feature_specs.single_specs import OutcomeSpec\n", - "from timeseriesflattener.aggregation_fns import maximum\n", "import pandas as pd\n", + "from timeseriesflattener.aggregation_fns import maximum\n", + "from timeseriesflattener.feature_specs.single_specs import OutcomeSpec\n", "\n", "test_df = pd.DataFrame()\n", "\n", @@ -944,9 +944,9 @@ "metadata": {}, "outputs": [], "source": [ - "from timeseriesflattener.feature_specs.single_specs import PredictorSpec, StaticSpec\n", - "from timeseriesflattener.aggregation_fns import mean\n", "import numpy as np\n", + "from timeseriesflattener.aggregation_fns import mean\n", + "from timeseriesflattener.feature_specs.single_specs import PredictorSpec, StaticSpec\n", "\n", "temporal_predictor_spec = PredictorSpec(\n", " timeseries_df=df_synth_predictors,\n", @@ -1116,9 +1116,7 @@ ], "source": [ "sex_predictor_spec = StaticSpec(\n", - " timeseries_df=df_synth_sex,\n", - " feature_base_name=\"female\",\n", - " prefix=\"pred\",\n", + " timeseries_df=df_synth_sex, feature_base_name=\"female\", prefix=\"pred\"\n", ")\n", "\n", "df_synth_sex" @@ -1185,7 +1183,9 @@ "metadata": {}, "outputs": [], "source": [ - "ts_flattener.add_spec([sex_predictor_spec, temporal_predictor_spec, temporal_interval_predictor_spec, outcome_spec])" + "ts_flattener.add_spec(\n", + " [sex_predictor_spec, temporal_predictor_spec, temporal_interval_predictor_spec, outcome_spec]\n", + ")" ] }, { diff --git a/docs/tutorials/02_advanced.ipynb b/docs/tutorials/02_advanced.ipynb index 1147e119..c697817c 100644 --- a/docs/tutorials/02_advanced.ipynb +++ b/docs/tutorials/02_advanced.ipynb @@ -1,553 +1,547 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Advanced Tutorial" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the basic tutorial we covered how to add static features, predictors and outcomes.\n", - "In this tutorial, we'll expand on that, covering how to effectively add many features by:\n", - "1. Creating feature combinations from specifications,\n", - "2. Using caching, so you can iterate on your datasets without having to complete full computations every time\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating feature combinations\n", - "Manually specifying a handful of features one at a time is rather straightforward, but what if you want to generate hundreds of features? Or want to have multiple different lookbehind windows, e.g. a month, 6 months and a year? Then the amount of code you'll have to write will grow quite substantially and becomes time consuming and hard to navigate.\n", - "\n", - "To solve this problem, we implemented feature group specifications. They allow you to combinatorially create features. 
Let's look at an example:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from timeseriesflattener.feature_specs.group_specs import PredictorGroupSpec\n", - "from timeseriesflattener.aggregation_fns import maximum\n", - "from timeseriesflattener.testing.load_synth_data import load_synth_predictor_float\n", - "from timeseriesflattener.feature_specs.group_specs import NamedDataframe\n", - "import numpy as np\n", - "from timeseriesflattener.aggregation_fns import mean, maximum\n", - "from pprint import pprint as pprint" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "pred_spec_batch = PredictorGroupSpec(\n", - " named_dataframes=[\n", - " NamedDataframe(df=load_synth_predictor_float(), name=\"synth_predictor_float\")\n", - " ],\n", - " lookbehind_days=[(0, 365), (365, 730), 1095],\n", - " fallback=[np.nan],\n", - " aggregation_fns=[mean, maximum],\n", - ").create_combinations()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You'll note that:\n", - "\n", - "1. All attributes are now required to be lists. This makes iteration easier when creating the combinations.\n", - "2. We require a named_dataframes sequence. A namedataframe is exactly that; a dataframe and a name. This is used when we create the features in the output, e.g. for a predictor, the output feature using load_synth_predictor_flaot will be called pred_synth_predictor_float_ because that's the name attributed in the NamedDataframe.\n", - "\n", - "Let's check that the results look good." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "––––––––– We created 6 combinations of predictors. ––––––––––\n", - "[{'aggregation_fn': 'mean',\n", - " 'feature_name': 'synth_predictor_float',\n", - " 'lookbehind_days': LookPeriod(min_days=0.0, max_days=365.0)},\n", - " {'aggregation_fn': 'maximum',\n", - " 'feature_name': 'synth_predictor_float',\n", - " 'lookbehind_days': LookPeriod(min_days=0.0, max_days=365.0)},\n", - " {'aggregation_fn': 'mean',\n", - " 'feature_name': 'synth_predictor_float',\n", - " 'lookbehind_days': LookPeriod(min_days=365.0, max_days=730.0)},\n", - " {'aggregation_fn': 'maximum',\n", - " 'feature_name': 'synth_predictor_float',\n", - " 'lookbehind_days': LookPeriod(min_days=365.0, max_days=730.0)},\n", - " {'aggregation_fn': 'mean',\n", - " 'feature_name': 'synth_predictor_float',\n", - " 'lookbehind_days': LookPeriod(min_days=0, max_days=1095.0)},\n", - " {'aggregation_fn': 'maximum',\n", - " 'feature_name': 'synth_predictor_float',\n", - " 'lookbehind_days': LookPeriod(min_days=0, max_days=1095.0)}]\n" - ] - } - ], - "source": [ - "# Create a small summary to highlight the generated predictors\n", - "pred_spec_batch_summary = [\n", - " {\n", - " \"feature_name\": pred_spec.feature_base_name,\n", - " \"lookbehind_days\": pred_spec.lookbehind_period,\n", - " \"aggregation_fn\": pred_spec.aggregation_fn.__name__,\n", - " }\n", - " for pred_spec in pred_spec_batch\n", - "]\n", - "print(\n", - " f\"––––––––– We created {len(pred_spec_batch)} combinations of predictors. ––––––––––\"\n", - ")\n", - "pprint(pred_spec_batch_summary)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we know how to create a bunch of feature specifications quickly! 
But with more features comes more computation. Let's look at caching next, so we can iterate on our datasets more quickly." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Caching" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Timeseriesflattener ships with a class that allows for caching to disk. Let's look at an example of that:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "from skimpy import skim\n", - "from timeseriesflattener.testing.load_synth_data import load_synth_prediction_times\n", - "from timeseriesflattener.feature_cache.cache_to_disk import DiskCache\n", - "from timeseriesflattener.flattened_dataset import TimeseriesFlattener\n", - "from pathlib import Path" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-01-18 11:38:02 [INFO] Overriding pred_time_uuid_col_name in cache with pred_time_uuid_col_name passed to init of flattened dataset\n" - ] - } - ], - "source": [ - "ts_flattener = TimeseriesFlattener(\n", - " prediction_times_df=load_synth_prediction_times(),\n", - " entity_id_col_name=\"entity_id\",\n", - " timestamp_col_name=\"timestamp\",\n", - " n_workers=4,\n", - " cache=DiskCache(\n", - " feature_cache_dir=Path(\".tmp\") / \"feature_cache\",\n", - " ),\n", - " drop_pred_times_with_insufficient_look_distance=True,\n", - ")\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "All we need to specify is that we use the DiskCache class, and which directory to save the feature cache to.\n", - "\n", - "The first time we create features, this will just save them to disk and won't make any difference to performance. But say we want to add two more features - then it'll load the features that it has already computed from disk, and then only compute the two new features.\n", - "\n", - "Note that DiskCache is an instance of the abstract class FeatureCache. If you want to implement your own cache, for example using REDIS or SQL, all you'll need is to implement the 3 methods in that class. Now, let's compute a dataframe to check that everything works." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "ts_flattener.add_spec(pred_spec_batch)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-01-18 11:38:03 [INFO] There were unprocessed specs, computing...\n", - "2024-01-18 11:38:03 [INFO] _drop_pred_time_if_insufficient_look_distance: Dropped 6053 (60.53%) rows\n", - "2024-01-18 11:38:03 [INFO] Processing 6 temporal features in parallel with 4 workers. Chunksize is 2. If this is above 1, it may take some time for the progress bar to move, as processing is batched. 
However, this makes for much faster total performance.\n", - " 0%| | 0/6 [00:00╭──────────────────────────────────────────────── skimpy summary ─────────────────────────────────────────────────╮\n", - "│ Data Summary Data Types │\n", - "│ ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓ ┏━━━━━━━━━━━━━┳━━━━━━━┓ │\n", - "│ ┃ dataframe Values ┃ ┃ Column Type Count ┃ │\n", - "│ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩ ┡━━━━━━━━━━━━━╇━━━━━━━┩ │\n", - "│ │ Number of rows │ 3947 │ │ float64 │ 6 │ │\n", - "│ │ Number of columns │ 9 │ │ int64 │ 1 │ │\n", - "│ └───────────────────┴────────┘ │ datetime64 │ 1 │ │\n", - "│ │ string │ 1 │ │\n", - "│ └─────────────┴───────┘ │\n", - "│ number │\n", - "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┓ │\n", - "│ ┃ column_name NA NA % mean sd p0 p25 p75 p100 hist ┃ │\n", - "│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━┩ │\n", - "│ │ entity_id 0 0 5000 2900 0 2600 7400 10000█████▇ │ │\n", - "│ │ pred_synth_predictor 7 0.18 5 1.3 0.29 4.1 5.8 9.9 ▂█▇▁ │ │\n", - "│ │ pred_synth_predictor 510 13 6.6 2.6 0.024 4.8 8.8 10▂▂▃▄▆█ │ │\n", - "│ │ pred_synth_predictor 530 14 6.6 2.6 0.0084 4.8 8.8 10▁▂▃▄▆█ │ │\n", - "│ │ pred_synth_predictor 7 0.18 8.4 1.5 0.29 7.8 9.5 10 ▁▃█ │ │\n", - "│ │ pred_synth_predictor 510 13 5.1 2.2 0.024 3.6 6.5 10▂▄██▅▂ │ │\n", - "│ │ pred_synth_predictor 530 14 5 2.1 0.0084 3.6 6.4 9.9▂▄██▄▂ │ │\n", - "│ └────────────────────────────┴───────┴────────┴────────┴────────┴──────────┴───────┴───────┴────────┴────────┘ │\n", - "│ datetime │\n", - "│ ┏━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ │\n", - "│ ┃ column_name NA NA % first last frequency ┃ │\n", - "│ ┡━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩ │\n", - "│ │ timestamp 0 0 1968-01-02 05:12:00 1969-12-31 21:42:00 None │ │\n", - "│ └──────────────────┴──────┴─────────┴────────────────────────────┴────────────────────────────┴──────────────┘ │\n", - "│ string │\n", - "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ │\n", - "│ ┃ column_name NA NA % words per row total words ┃ │\n", - "│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ │\n", - "│ │ prediction_time_uuid 0 0 1 3900 │ │\n", - "│ └───────────────────────────────────────┴───────┴───────────┴──────────────────────────┴─────────────────────┘ │\n", - "╰────────────────────────────────────────────────────── End ──────────────────────────────────────────────────────╯\n", - "\n" - ], - "text/plain": [ - "╭──────────────────────────────────────────────── skimpy summary ─────────────────────────────────────────────────╮\n", - "│ \u001b[3m Data Summary \u001b[0m \u001b[3m Data Types \u001b[0m │\n", - "│ ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓ ┏━━━━━━━━━━━━━┳━━━━━━━┓ │\n", - "│ ┃\u001b[1;36m \u001b[0m\u001b[1;36mdataframe \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mValues\u001b[0m\u001b[1;36m \u001b[0m┃ ┃\u001b[1;36m \u001b[0m\u001b[1;36mColumn Type\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mCount\u001b[0m\u001b[1;36m \u001b[0m┃ │\n", - "│ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩ ┡━━━━━━━━━━━━━╇━━━━━━━┩ │\n", - "│ │ Number of rows │ 3947 │ │ float64 │ 6 │ │\n", - "│ │ Number of columns │ 9 │ │ int64 │ 1 │ │\n", - "│ └───────────────────┴────────┘ │ datetime64 │ 1 
│ │\n", - "│ │ string │ 1 │ │\n", - "│ └─────────────┴───────┘ │\n", - "│ \u001b[3m number \u001b[0m │\n", - "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┓ │\n", - "│ ┃\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmean \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msd \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp0 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp25 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp75 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp100 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mhist \u001b[0m\u001b[1m \u001b[0m┃ │\n", - "│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━┩ │\n", - "│ │ \u001b[38;5;141mentity_id \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 5000\u001b[0m │ \u001b[36m 2900\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 2600\u001b[0m │ \u001b[36m 7400\u001b[0m │ \u001b[36m 10000\u001b[0m │ \u001b[32m█████▇\u001b[0m │ │\n", - "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 7\u001b[0m │ \u001b[36m 0.18\u001b[0m │ \u001b[36m 5\u001b[0m │ \u001b[36m 1.3\u001b[0m │ \u001b[36m 0.29\u001b[0m │ \u001b[36m 4.1\u001b[0m │ \u001b[36m 5.8\u001b[0m │ \u001b[36m 9.9\u001b[0m │ \u001b[32m ▂█▇▁ \u001b[0m │ │\n", - "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 510\u001b[0m │ \u001b[36m 13\u001b[0m │ \u001b[36m 6.6\u001b[0m │ \u001b[36m 2.6\u001b[0m │ \u001b[36m 0.024\u001b[0m │ \u001b[36m 4.8\u001b[0m │ \u001b[36m 8.8\u001b[0m │ \u001b[36m 10\u001b[0m │ \u001b[32m▂▂▃▄▆█\u001b[0m │ │\n", - "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 530\u001b[0m │ \u001b[36m 14\u001b[0m │ \u001b[36m 6.6\u001b[0m │ \u001b[36m 2.6\u001b[0m │ \u001b[36m 0.0084\u001b[0m │ \u001b[36m 4.8\u001b[0m │ \u001b[36m 8.8\u001b[0m │ \u001b[36m 10\u001b[0m │ \u001b[32m▁▂▃▄▆█\u001b[0m │ │\n", - "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 7\u001b[0m │ \u001b[36m 0.18\u001b[0m │ \u001b[36m 8.4\u001b[0m │ \u001b[36m 1.5\u001b[0m │ \u001b[36m 0.29\u001b[0m │ \u001b[36m 7.8\u001b[0m │ \u001b[36m 9.5\u001b[0m │ \u001b[36m 10\u001b[0m │ \u001b[32m ▁▃█\u001b[0m │ │\n", - "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 510\u001b[0m │ \u001b[36m 13\u001b[0m │ \u001b[36m 5.1\u001b[0m │ \u001b[36m 2.2\u001b[0m │ \u001b[36m 0.024\u001b[0m │ \u001b[36m 3.6\u001b[0m │ \u001b[36m 6.5\u001b[0m │ \u001b[36m 10\u001b[0m │ \u001b[32m▂▄██▅▂\u001b[0m │ │\n", - "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 530\u001b[0m │ \u001b[36m 14\u001b[0m │ \u001b[36m 5\u001b[0m │ \u001b[36m 2.1\u001b[0m │ \u001b[36m 0.0084\u001b[0m │ \u001b[36m 3.6\u001b[0m │ \u001b[36m 6.4\u001b[0m │ \u001b[36m 9.9\u001b[0m │ \u001b[32m▂▄██▄▂\u001b[0m │ │\n", - "│ └────────────────────────────┴───────┴────────┴────────┴────────┴──────────┴───────┴───────┴────────┴────────┘ │\n", - "│ \u001b[3m datetime \u001b[0m │\n", - "│ ┏━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ │\n", - "│ ┃\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mfirst \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mlast \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfrequency \u001b[0m\u001b[1m \u001b[0m┃ │\n", - "│ ┡━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩ │\n", - "│ │ \u001b[38;5;141mtimestamp \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[31m 1968-01-02 05:12:00 \u001b[0m │ \u001b[31m 1969-12-31 21:42:00 \u001b[0m │ \u001b[38;5;141mNone \u001b[0m │ │\n", - "│ └──────────────────┴──────┴─────────┴────────────────────────────┴────────────────────────────┴──────────────┘ │\n", - "│ \u001b[3m string \u001b[0m │\n", - "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ │\n", - "│ ┃\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mwords per row \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtotal words \u001b[0m\u001b[1m \u001b[0m┃ │\n", - "│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ │\n", - "│ │ \u001b[38;5;141mprediction_time_uuid \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 1\u001b[0m │ \u001b[36m 3900\u001b[0m │ │\n", - "│ └───────────────────────────────────────┴───────┴───────────┴──────────────────────────┴─────────────────────┘ │\n", - "╰────────────────────────────────────────────────────── End ──────────────────────────────────────────────────────╯\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "['entity_id',\n", - " 'timestamp',\n", - " 'prediction_time_uuid',\n", - " 'pred_synth_predictor_float_within_0_to_1095_days_mean_fallback_nan',\n", - " 'pred_synth_predictor_float_within_365_to_730_days_maximum_fallback_nan',\n", - " 'pred_synth_predictor_float_within_0_to_365_days_maximum_fallback_nan',\n", - " 'pred_synth_predictor_float_within_0_to_1095_days_maximum_fallback_nan',\n", - " 'pred_synth_predictor_float_within_365_to_730_days_mean_fallback_nan',\n", - " 'pred_synth_predictor_float_within_0_to_365_days_mean_fallback_nan']" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "skim(df)\n", - "\n", - "list(df.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
   │ entity_id │ timestamp           │ prediction_time_uuid     │ pred_1   │ pred_2   │ pred_3   │ pred_4   │ pred_5   │ pred_6
 0 │ 9903      │ 1968-05-09 21:24:00 │ 9903-1968-05-09-21-24-00 │ 2.864626 │ 2.194319 │ 0.154981 │ 5.931553 │ 1.408655 │ 0.154981
 1 │ 4927      │ 1968-06-30 12:13:00 │ 4927-1968-06-30-12-13-00 │ 4.466599 │ nan      │ 6.730694 │ 8.630901 │ nan      │ 4.957251
 2 │ 3157      │ 1969-10-07 05:01:00 │ 3157-1969-10-07-05-01-00 │ 4.168456 │ nan      │ 5.243176 │ 5.243176 │ nan      │ 5.068323
 3 │ 9793      │ 1968-12-15 12:59:00 │ 9793-1968-12-15-12-59-00 │ 7.144959 │ 8.293266 │ 9.708976 │ 9.727182 │ 6.230417 │ 8.091755
 4 │ 9861      │ 1969-01-22 17:34:00 │ 9861-1969-01-22-17-34-00 │ 3.669635 │ 5.491415 │ 3.130283 │ 6.217161 │ 3.309197 │ 3.130283
 5 │ 657       │ 1969-04-14 15:47:00 │ 657-1969-04-14-15-47-00  │ 7.391514 │ 7.903614 │ nan      │ 7.903614 │ 7.903614 │ nan
 6 │ 7916      │ 1968-12-20 03:38:00 │ 7916-1968-12-20-03-38-00 │ 4.251704 │ 6.084523 │ 4.318586 │ 6.979156 │ 6.084523 │ 3.901992
 7 │ 2883      │ 1968-01-28 21:50:00 │ 2883-1968-01-28-21-50-00 │ 4.712403 │ nan      │ 8.257742 │ 8.257742 │ nan      │ 8.257742
 8 │ 1515      │ 1968-07-18 08:28:00 │ 1515-1968-07-18-08-28-00 │ 3.112700 │ 3.684614 │ 8.654839 │ 8.654839 │ 3.104674 │ 2.907289
 9 │ 6754      │ 1968-09-21 01:27:00 │ 6754-1968-09-21-01-27-00 │ 5.082918 │ 3.102132 │ 2.346644 │ 9.657755 │ 2.324913 │ 2.346644
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# For displayability, shorten col names\n", - "pred_cols = [c for c in df.columns if c.startswith(\"pred_\")]\n", - "rename_dict = {c: f\"pred_{i+1}\" for i, c in enumerate(pred_cols)}\n", - "df_renamed = df.rename(rename_dict, axis=1)\n", - "\n", - "# Print a dataframe\n", - "base_cols = [\"entity_id\", \"timestamp\", \"prediction_time_uuid\"]\n", - "renamed_cols = list(rename_dict.values())\n", - "\n", - "df_renamed[0:10][base_cols + renamed_cols].style.set_table_attributes(\n", - " 'style=\"font-size: 14px\"'\n", - ")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.10.7 ('.venv': poetry)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "d2b49c0af2d95979144de75823f7cfbb268839811992fdd0cb17fc1bb54ce815" - } - } + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Advanced Tutorial" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the basic tutorial we covered how to add static features, predictors and outcomes.\n", + "In this tutorial, we'll expand on that, covering how to effectively add many features by:\n", + "1. Creating feature combinations from specifications,\n", + "2. Using caching, so you can iterate on your datasets without having to complete full computations every time\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating feature combinations\n", + "Manually specifying a handful of features one at a time is rather straightforward, but what if you want to generate hundreds of features? Or want to have multiple different lookbehind windows, e.g. a month, 6 months and a year? Then the amount of code you'll have to write will grow quite substantially and becomes time consuming and hard to navigate.\n", + "\n", + "To solve this problem, we implemented feature group specifications. They allow you to combinatorially create features. Let's look at an example:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from pprint import pprint as pprint\n", + "\n", + "import numpy as np\n", + "from timeseriesflattener.aggregation_fns import maximum, mean\n", + "from timeseriesflattener.feature_specs.group_specs import NamedDataframe, PredictorGroupSpec\n", + "from timeseriesflattener.testing.load_synth_data import load_synth_predictor_float" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "pred_spec_batch = PredictorGroupSpec(\n", + " named_dataframes=[\n", + " NamedDataframe(df=load_synth_predictor_float(), name=\"synth_predictor_float\")\n", + " ],\n", + " lookbehind_days=[(0, 365), (365, 730), 1095],\n", + " fallback=[np.nan],\n", + " aggregation_fns=[mean, maximum],\n", + ").create_combinations()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You'll note that:\n", + "\n", + "1. All attributes are now required to be lists. 
This makes iteration easier when creating the combinations.\n", + "2. We require a named_dataframes sequence. A namedataframe is exactly that; a dataframe and a name. This is used when we create the features in the output, e.g. for a predictor, the output feature using load_synth_predictor_flaot will be called pred_synth_predictor_float_ because that's the name attributed in the NamedDataframe.\n", + "\n", + "Let's check that the results look good." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "––––––––– We created 6 combinations of predictors. ––––––––––\n", + "[{'aggregation_fn': 'mean',\n", + " 'feature_name': 'synth_predictor_float',\n", + " 'lookbehind_days': LookPeriod(min_days=0.0, max_days=365.0)},\n", + " {'aggregation_fn': 'maximum',\n", + " 'feature_name': 'synth_predictor_float',\n", + " 'lookbehind_days': LookPeriod(min_days=0.0, max_days=365.0)},\n", + " {'aggregation_fn': 'mean',\n", + " 'feature_name': 'synth_predictor_float',\n", + " 'lookbehind_days': LookPeriod(min_days=365.0, max_days=730.0)},\n", + " {'aggregation_fn': 'maximum',\n", + " 'feature_name': 'synth_predictor_float',\n", + " 'lookbehind_days': LookPeriod(min_days=365.0, max_days=730.0)},\n", + " {'aggregation_fn': 'mean',\n", + " 'feature_name': 'synth_predictor_float',\n", + " 'lookbehind_days': LookPeriod(min_days=0, max_days=1095.0)},\n", + " {'aggregation_fn': 'maximum',\n", + " 'feature_name': 'synth_predictor_float',\n", + " 'lookbehind_days': LookPeriod(min_days=0, max_days=1095.0)}]\n" + ] + } + ], + "source": [ + "# Create a small summary to highlight the generated predictors\n", + "pred_spec_batch_summary = [\n", + " {\n", + " \"feature_name\": pred_spec.feature_base_name,\n", + " \"lookbehind_days\": pred_spec.lookbehind_period,\n", + " \"aggregation_fn\": pred_spec.aggregation_fn.__name__,\n", + " }\n", + " for pred_spec in pred_spec_batch\n", + "]\n", + "print(f\"––––––––– We created {len(pred_spec_batch)} combinations of predictors. ––––––––––\")\n", + "pprint(pred_spec_batch_summary)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we know how to create a bunch of feature specifications quickly! But with more features comes more computation. Let's look at caching next, so we can iterate on our datasets more quickly." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Caching" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Timeseriesflattener ships with a class that allows for caching to disk. 
Let's look at an example of that:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "from skimpy import skim\n", + "from timeseriesflattener.feature_cache.cache_to_disk import DiskCache\n", + "from timeseriesflattener.flattened_dataset import TimeseriesFlattener\n", + "from timeseriesflattener.testing.load_synth_data import load_synth_prediction_times" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-01-18 11:38:02 [INFO] Overriding pred_time_uuid_col_name in cache with pred_time_uuid_col_name passed to init of flattened dataset\n" + ] + } + ], + "source": [ + "ts_flattener = TimeseriesFlattener(\n", + " prediction_times_df=load_synth_prediction_times(),\n", + " entity_id_col_name=\"entity_id\",\n", + " timestamp_col_name=\"timestamp\",\n", + " n_workers=4,\n", + " cache=DiskCache(feature_cache_dir=Path(\".tmp\") / \"feature_cache\"),\n", + " drop_pred_times_with_insufficient_look_distance=True,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All we need to specify is that we use the DiskCache class, and which directory to save the feature cache to.\n", + "\n", + "The first time we create features, this will just save them to disk and won't make any difference to performance. But say we want to add two more features - then it'll load the features that it has already computed from disk, and then only compute the two new features.\n", + "\n", + "Note that DiskCache is an instance of the abstract class FeatureCache. If you want to implement your own cache, for example using REDIS or SQL, all you'll need is to implement the 3 methods in that class. Now, let's compute a dataframe to check that everything works." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "ts_flattener.add_spec(pred_spec_batch)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-01-18 11:38:03 [INFO] There were unprocessed specs, computing...\n", + "2024-01-18 11:38:03 [INFO] _drop_pred_time_if_insufficient_look_distance: Dropped 6053 (60.53%) rows\n", + "2024-01-18 11:38:03 [INFO] Processing 6 temporal features in parallel with 4 workers. Chunksize is 2. If this is above 1, it may take some time for the progress bar to move, as processing is batched. 
However, this makes for much faster total performance.\n", + " 0%| | 0/6 [00:00╭──────────────────────────────────────────────── skimpy summary ─────────────────────────────────────────────────╮\n", + "│ Data Summary Data Types │\n", + "│ ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓ ┏━━━━━━━━━━━━━┳━━━━━━━┓ │\n", + "│ ┃ dataframe Values ┃ ┃ Column Type Count ┃ │\n", + "│ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩ ┡━━━━━━━━━━━━━╇━━━━━━━┩ │\n", + "│ │ Number of rows │ 3947 │ │ float64 │ 6 │ │\n", + "│ │ Number of columns │ 9 │ │ int64 │ 1 │ │\n", + "│ └───────────────────┴────────┘ │ datetime64 │ 1 │ │\n", + "│ │ string │ 1 │ │\n", + "│ └─────────────┴───────┘ │\n", + "│ number │\n", + "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┓ │\n", + "│ ┃ column_name NA NA % mean sd p0 p25 p75 p100 hist ┃ │\n", + "│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━┩ │\n", + "│ │ entity_id 0 0 5000 2900 0 2600 7400 10000█████▇ │ │\n", + "│ │ pred_synth_predictor 7 0.18 5 1.3 0.29 4.1 5.8 9.9 ▂█▇▁ │ │\n", + "│ │ pred_synth_predictor 510 13 6.6 2.6 0.024 4.8 8.8 10▂▂▃▄▆█ │ │\n", + "│ │ pred_synth_predictor 530 14 6.6 2.6 0.0084 4.8 8.8 10▁▂▃▄▆█ │ │\n", + "│ │ pred_synth_predictor 7 0.18 8.4 1.5 0.29 7.8 9.5 10 ▁▃█ │ │\n", + "│ │ pred_synth_predictor 510 13 5.1 2.2 0.024 3.6 6.5 10▂▄██▅▂ │ │\n", + "│ │ pred_synth_predictor 530 14 5 2.1 0.0084 3.6 6.4 9.9▂▄██▄▂ │ │\n", + "│ └────────────────────────────┴───────┴────────┴────────┴────────┴──────────┴───────┴───────┴────────┴────────┘ │\n", + "│ datetime │\n", + "│ ┏━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ │\n", + "│ ┃ column_name NA NA % first last frequency ┃ │\n", + "│ ┡━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩ │\n", + "│ │ timestamp 0 0 1968-01-02 05:12:00 1969-12-31 21:42:00 None │ │\n", + "│ └──────────────────┴──────┴─────────┴────────────────────────────┴────────────────────────────┴──────────────┘ │\n", + "│ string │\n", + "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ │\n", + "│ ┃ column_name NA NA % words per row total words ┃ │\n", + "│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ │\n", + "│ │ prediction_time_uuid 0 0 1 3900 │ │\n", + "│ └───────────────────────────────────────┴───────┴───────────┴──────────────────────────┴─────────────────────┘ │\n", + "╰────────────────────────────────────────────────────── End ──────────────────────────────────────────────────────╯\n", + "\n" + ], + "text/plain": [ + "╭──────────────────────────────────────────────── skimpy summary ─────────────────────────────────────────────────╮\n", + "│ \u001b[3m Data Summary \u001b[0m \u001b[3m Data Types \u001b[0m │\n", + "│ ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓ ┏━━━━━━━━━━━━━┳━━━━━━━┓ │\n", + "│ ┃\u001b[1;36m \u001b[0m\u001b[1;36mdataframe \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mValues\u001b[0m\u001b[1;36m \u001b[0m┃ ┃\u001b[1;36m \u001b[0m\u001b[1;36mColumn Type\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mCount\u001b[0m\u001b[1;36m \u001b[0m┃ │\n", + "│ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩ ┡━━━━━━━━━━━━━╇━━━━━━━┩ │\n", + "│ │ Number of rows │ 3947 │ │ float64 │ 6 │ │\n", + "│ │ Number of columns │ 9 │ │ int64 │ 1 │ │\n", + "│ └───────────────────┴────────┘ │ datetime64 │ 1 
│ │\n", + "│ │ string │ 1 │ │\n", + "│ └─────────────┴───────┘ │\n", + "│ \u001b[3m number \u001b[0m │\n", + "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┓ │\n", + "│ ┃\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmean \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msd \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp0 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp25 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp75 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp100 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mhist \u001b[0m\u001b[1m \u001b[0m┃ │\n", + "│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━┩ │\n", + "│ │ \u001b[38;5;141mentity_id \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 5000\u001b[0m │ \u001b[36m 2900\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 2600\u001b[0m │ \u001b[36m 7400\u001b[0m │ \u001b[36m 10000\u001b[0m │ \u001b[32m█████▇\u001b[0m │ │\n", + "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 7\u001b[0m │ \u001b[36m 0.18\u001b[0m │ \u001b[36m 5\u001b[0m │ \u001b[36m 1.3\u001b[0m │ \u001b[36m 0.29\u001b[0m │ \u001b[36m 4.1\u001b[0m │ \u001b[36m 5.8\u001b[0m │ \u001b[36m 9.9\u001b[0m │ \u001b[32m ▂█▇▁ \u001b[0m │ │\n", + "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 510\u001b[0m │ \u001b[36m 13\u001b[0m │ \u001b[36m 6.6\u001b[0m │ \u001b[36m 2.6\u001b[0m │ \u001b[36m 0.024\u001b[0m │ \u001b[36m 4.8\u001b[0m │ \u001b[36m 8.8\u001b[0m │ \u001b[36m 10\u001b[0m │ \u001b[32m▂▂▃▄▆█\u001b[0m │ │\n", + "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 530\u001b[0m │ \u001b[36m 14\u001b[0m │ \u001b[36m 6.6\u001b[0m │ \u001b[36m 2.6\u001b[0m │ \u001b[36m 0.0084\u001b[0m │ \u001b[36m 4.8\u001b[0m │ \u001b[36m 8.8\u001b[0m │ \u001b[36m 10\u001b[0m │ \u001b[32m▁▂▃▄▆█\u001b[0m │ │\n", + "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 7\u001b[0m │ \u001b[36m 0.18\u001b[0m │ \u001b[36m 8.4\u001b[0m │ \u001b[36m 1.5\u001b[0m │ \u001b[36m 0.29\u001b[0m │ \u001b[36m 7.8\u001b[0m │ \u001b[36m 9.5\u001b[0m │ \u001b[36m 10\u001b[0m │ \u001b[32m ▁▃█\u001b[0m │ │\n", + "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 510\u001b[0m │ \u001b[36m 13\u001b[0m │ \u001b[36m 5.1\u001b[0m │ \u001b[36m 2.2\u001b[0m │ \u001b[36m 0.024\u001b[0m │ \u001b[36m 3.6\u001b[0m │ \u001b[36m 6.5\u001b[0m │ \u001b[36m 10\u001b[0m │ \u001b[32m▂▄██▅▂\u001b[0m │ │\n", + "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 530\u001b[0m │ \u001b[36m 14\u001b[0m │ \u001b[36m 5\u001b[0m │ \u001b[36m 2.1\u001b[0m │ \u001b[36m 0.0084\u001b[0m │ \u001b[36m 3.6\u001b[0m │ \u001b[36m 6.4\u001b[0m │ \u001b[36m 9.9\u001b[0m │ \u001b[32m▂▄██▄▂\u001b[0m │ │\n", + "│ └────────────────────────────┴───────┴────────┴────────┴────────┴──────────┴───────┴───────┴────────┴────────┘ │\n", + "│ \u001b[3m datetime \u001b[0m │\n", + "│ ┏━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ │\n", + "│ ┃\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mfirst \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mlast \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfrequency \u001b[0m\u001b[1m \u001b[0m┃ │\n", + "│ ┡━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩ │\n", + "│ │ \u001b[38;5;141mtimestamp \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[31m 1968-01-02 05:12:00 \u001b[0m │ \u001b[31m 1969-12-31 21:42:00 \u001b[0m │ \u001b[38;5;141mNone \u001b[0m │ │\n", + "│ └──────────────────┴──────┴─────────┴────────────────────────────┴────────────────────────────┴──────────────┘ │\n", + "│ \u001b[3m string \u001b[0m │\n", + "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ │\n", + "│ ┃\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mwords per row \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtotal words \u001b[0m\u001b[1m \u001b[0m┃ │\n", + "│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ │\n", + "│ │ \u001b[38;5;141mprediction_time_uuid \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 1\u001b[0m │ \u001b[36m 3900\u001b[0m │ │\n", + "│ └───────────────────────────────────────┴───────┴───────────┴──────────────────────────┴─────────────────────┘ │\n", + "╰────────────────────────────────────────────────────── End ──────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" }, - "nbformat": 4, - "nbformat_minor": 2 + { + "data": { + "text/plain": [ + "['entity_id',\n", + " 'timestamp',\n", + " 'prediction_time_uuid',\n", + " 'pred_synth_predictor_float_within_0_to_1095_days_mean_fallback_nan',\n", + " 'pred_synth_predictor_float_within_365_to_730_days_maximum_fallback_nan',\n", + " 'pred_synth_predictor_float_within_0_to_365_days_maximum_fallback_nan',\n", + " 'pred_synth_predictor_float_within_0_to_1095_days_maximum_fallback_nan',\n", + " 'pred_synth_predictor_float_within_365_to_730_days_mean_fallback_nan',\n", + " 'pred_synth_predictor_float_within_0_to_365_days_mean_fallback_nan']" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "skim(df)\n", + "\n", + "list(df.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
   │ entity_id │ timestamp           │ prediction_time_uuid     │ pred_1   │ pred_2   │ pred_3   │ pred_4   │ pred_5   │ pred_6
 0 │ 9903      │ 1968-05-09 21:24:00 │ 9903-1968-05-09-21-24-00 │ 2.864626 │ 2.194319 │ 0.154981 │ 5.931553 │ 1.408655 │ 0.154981
 1 │ 4927      │ 1968-06-30 12:13:00 │ 4927-1968-06-30-12-13-00 │ 4.466599 │ nan      │ 6.730694 │ 8.630901 │ nan      │ 4.957251
 2 │ 3157      │ 1969-10-07 05:01:00 │ 3157-1969-10-07-05-01-00 │ 4.168456 │ nan      │ 5.243176 │ 5.243176 │ nan      │ 5.068323
 3 │ 9793      │ 1968-12-15 12:59:00 │ 9793-1968-12-15-12-59-00 │ 7.144959 │ 8.293266 │ 9.708976 │ 9.727182 │ 6.230417 │ 8.091755
 4 │ 9861      │ 1969-01-22 17:34:00 │ 9861-1969-01-22-17-34-00 │ 3.669635 │ 5.491415 │ 3.130283 │ 6.217161 │ 3.309197 │ 3.130283
 5 │ 657       │ 1969-04-14 15:47:00 │ 657-1969-04-14-15-47-00  │ 7.391514 │ 7.903614 │ nan      │ 7.903614 │ 7.903614 │ nan
 6 │ 7916      │ 1968-12-20 03:38:00 │ 7916-1968-12-20-03-38-00 │ 4.251704 │ 6.084523 │ 4.318586 │ 6.979156 │ 6.084523 │ 3.901992
 7 │ 2883      │ 1968-01-28 21:50:00 │ 2883-1968-01-28-21-50-00 │ 4.712403 │ nan      │ 8.257742 │ 8.257742 │ nan      │ 8.257742
 8 │ 1515      │ 1968-07-18 08:28:00 │ 1515-1968-07-18-08-28-00 │ 3.112700 │ 3.684614 │ 8.654839 │ 8.654839 │ 3.104674 │ 2.907289
 9 │ 6754      │ 1968-09-21 01:27:00 │ 6754-1968-09-21-01-27-00 │ 5.082918 │ 3.102132 │ 2.346644 │ 9.657755 │ 2.324913 │ 2.346644
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# For displayability, shorten col names\n", + "pred_cols = [c for c in df.columns if c.startswith(\"pred_\")]\n", + "rename_dict = {c: f\"pred_{i+1}\" for i, c in enumerate(pred_cols)}\n", + "df_renamed = df.rename(rename_dict, axis=1)\n", + "\n", + "# Print a dataframe\n", + "base_cols = [\"entity_id\", \"timestamp\", \"prediction_time_uuid\"]\n", + "renamed_cols = list(rename_dict.values())\n", + "\n", + "df_renamed[0:10][base_cols + renamed_cols].style.set_table_attributes('style=\"font-size: 14px\"')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.7 ('.venv': poetry)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "d2b49c0af2d95979144de75823f7cfbb268839811992fdd0cb17fc1bb54ce815" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/docs/tutorials/03_text.ipynb b/docs/tutorials/03_text.ipynb index c96b22be..2da169cc 100644 --- a/docs/tutorials/03_text.ipynb +++ b/docs/tutorials/03_text.ipynb @@ -153,8 +153,8 @@ "outputs": [], "source": [ "%%capture\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", "import pandas as pd\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "\n", "# define function to embed text and return a dataframe\n", @@ -163,12 +163,13 @@ " embeddings = tfidf_model.fit_transform(text)\n", " return pd.DataFrame(embeddings.toarray(), columns=tfidf_model.get_feature_names_out())\n", "\n", + "\n", "# embed text\n", "embedded_text = embed_text_to_df(text=synth_text[\"value\"].tolist())\n", "# drop the text column from the original dataframe\n", "metadata_only = synth_text.drop(columns=[\"value\"])\n", "# concatenate the metadata and the embedded text\n", - "embedded_text_with_metadata = pd.concat([metadata_only, embedded_text], axis=1)\n" + "embedded_text_with_metadata = pd.concat([metadata_only, embedded_text], axis=1)" ] }, { @@ -313,7 +314,7 @@ } ], "source": [ - "embedded_text_with_metadata.head()\n" + "embedded_text_with_metadata.head() # type: ignore" ] }, { @@ -404,20 +405,18 @@ } ], "source": [ - "from timeseriesflattener.df_transforms import (\n", - " df_with_multiple_values_to_named_dataframes,\n", - ")\n", + "from timeseriesflattener.df_transforms import df_with_multiple_values_to_named_dataframes\n", "\n", "# split the dataframe into a list of named dataframes with one value each\n", "embedded_dfs = df_with_multiple_values_to_named_dataframes(\n", - " df=embedded_text_with_metadata,\n", + " df=embedded_text_with_metadata, # type: ignore\n", " entity_id_col_name=\"entity_id\",\n", " timestamp_col_name=\"timestamp\",\n", " name_prefix=\"tfidf_\",\n", ")\n", "\n", "# check the first dataframe\n", - "embedded_dfs[0].df.head()\n" + "embedded_dfs[0].df.head()" ] }, { @@ -438,7 +437,7 @@ ], "source": [ "# check the number of embeddings/dataframes\n", - "len(embedded_dfs)\n" + "len(embedded_dfs)" ] }, { @@ -465,7 +464,7 @@ } ], "source": [ - "embedded_dfs[0].name\n" + "embedded_dfs[0].name" ] }, { @@ -489,9 +488,9 @@ } ], "source": [ + "import numpy as np\n", "from timeseriesflattener.aggregation_fns import mean\n", "from 
timeseriesflattener.feature_specs.group_specs import PredictorGroupSpec\n", - "import numpy as np\n", "\n", "# create a group spec for the embedded text that will take the mean of each embedding on the column axis\n", "# for the last 365 and 730 days\n", @@ -503,7 +502,7 @@ ").create_combinations()\n", "\n", "# print the number of features we will create\n", - "print(len(emb_spec_batch))\n" + "print(len(emb_spec_batch))" ] }, { @@ -548,7 +547,7 @@ " drop_pred_times_with_insufficient_look_distance=False,\n", ")\n", "ts_flattener.add_spec(emb_spec_batch)\n", - "df = ts_flattener.get_df()\n" + "df = ts_flattener.get_df()" ] }, { @@ -870,7 +869,7 @@ ], "source": [ "# dropping na values (no embeddings within the lookbehind period) for the sake of this example\n", - "df.dropna().head()\n" + "df.dropna().head()" ] } ], diff --git a/pyproject.toml b/pyproject.toml index f009d19e..a372a351 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", ] -requires-python = ">=3.8.0,<3.12.0" +requires-python = ">=3.8.0,<3.13.0" dependencies = [ "scipy>=1.8.0", "scikit-learn>=1.1.2", @@ -34,17 +34,18 @@ dependencies = [ "frozendict>=2.3.4", "coloredlogs>14.0.0", "tqdm>4.1.0", + "polars==0.20.7", + "iterpy==0.22.1", ] [project.license] file = "LICENSE" -name = "MIT" + [project.optional-dependencies] dev = [ "pyright==1.1.330.post0", "pre-commit==3.4.0", - "ruff==0.0.292", # important that these match the pre-commit hooks - "black[jupyter]==23.9.1", # important that these match the pre-commit hooks + "ruff==0.2.1", # important that these match the pre-commit hooks "pandas-stubs", # type stubs for pandas "invoke==2.1.1", "tox", @@ -59,11 +60,11 @@ test = [ ] docs = [ "sphinx==5.3.0", - "furo==2023.3.27", - "sphinx-copybutton==0.5.2", - "sphinxext-opengraph==0.8.2", - "myst-nb==0.17.2", - "sphinx_design==0.3.0", + "furo==2023.3.27", + "sphinx-copybutton==0.5.2", + "sphinxext-opengraph==0.8.2", + "myst-nb==0.17.2", + "sphinx_design==0.3.0", ] tutorials = ["jupyter>=1.0.0,<1.1.0", "skimpy>=0.0.7,<0.1.0"] @@ -80,7 +81,14 @@ documentation = "https://aarhus-psychiatry-research.github.io/timeseriesflattene exclude = [".*venv*/", ".venv38/", ".tox"] [tool.ruff] +line-length = 100 +target-version = "py38" # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. + +[tool.ruff.format] +skip-magic-trailing-comma = true + +[tool.ruff.lint] select = [ "A", "ANN", @@ -109,7 +117,17 @@ select = [ "SIM", "W", ] -ignore = ["ANN101", "ANN401", "E402", "E501", "F401", "F841", "RET504"] +ignore = [ + "ANN101", + "ANN401", + "E402", + "E501", + "F401", + "F841", + "RET504", + "RUF001", + "COM812", +] ignore-init-module-imports = true # Allow autofix for all enabled rules (when `--fix`) is provided. unfixable = ["ERA"] @@ -140,16 +158,16 @@ exclude = [ ] # Allow unused variables when underscore-prefixed. dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" -target-version = "py38" -[tool.ruff.flake8-annotations] + +[tool.ruff.lint.flake8-annotations] mypy-init-return = true suppress-none-returning = true -[tool.ruff.isort] +[tool.ruff.lint.isort] known-third-party = ["wandb"] -[tool.ruff.mccabe] +[tool.ruff.lint.mccabe] # Unlike Flake8, default to a complexity level of 10. 
max-complexity = 10 @@ -166,7 +184,7 @@ include-package-data = true [tool.tox] legacy_tox_ini = """ [tox] -envlist = py39 +envlist = py310 [testenv] description: run unit tests @@ -178,7 +196,7 @@ commands = [testenv:type] description: run static type checking extras = test, dev -basepython = py39 +basepython = py310 use_develop = true allowlist_externals = ls commands = diff --git a/src/conftest.py b/src/conftest.py index bc233d82..e2f874e4 100644 --- a/src/conftest.py +++ b/src/conftest.py @@ -6,9 +6,7 @@ load_synth_prediction_times, load_synth_text, ) -from timeseriesflattener.testing.utils_for_testing import ( - load_long_df_with_multiple_values, -) +from timeseriesflattener.testing.utils_for_testing import load_long_df_with_multiple_values def pytest_addoption(parser): # noqa @@ -21,10 +19,7 @@ def pytest_addoption(parser): # noqa def pytest_configure(config): # noqa - config.addinivalue_line( - "markers", - "huggingface: mark test as using huggingface models", - ) + config.addinivalue_line("markers", "huggingface: mark test as using huggingface models") def pytest_collection_modifyitems(config, items): # noqa diff --git a/src/timeseriesflattener/__init__.py b/src/timeseriesflattener/__init__.py index 205cc147..2c4eb43e 100644 --- a/src/timeseriesflattener/__init__.py +++ b/src/timeseriesflattener/__init__.py @@ -1,10 +1,5 @@ """Init timeseriesflattener.""" from .feature_specs.group_specs import PredictorGroupSpec -from .feature_specs.single_specs import ( - OutcomeSpec, - PredictorSpec, -) -from .feature_specs.group_specs import ( - OutcomeGroupSpec, -) +from .feature_specs.single_specs import OutcomeSpec, PredictorSpec +from .feature_specs.group_specs import OutcomeGroupSpec from .flattened_dataset import TimeseriesFlattener diff --git a/src/timeseriesflattener/aggregation_fns.py b/src/timeseriesflattener/aggregation_fns.py index a7ee015c..3b460926 100644 --- a/src/timeseriesflattener/aggregation_fns.py +++ b/src/timeseriesflattener/aggregation_fns.py @@ -73,11 +73,7 @@ def boolean(grouped_df: DataFrameGroupBy) -> DataFrame: Returns: DataFrame: Dataframe with value column containing only 0 or 1s. """ - df = ( - grouped_df["timestamp_val"] - .apply(lambda x: (~x.isna()).sum()) - .reset_index(name="value") - ) + df = grouped_df["timestamp_val"].apply(lambda x: (~x.isna()).sum()).reset_index(name="value") df.loc[df["value"] > 0, "value"] = 1 @@ -95,17 +91,11 @@ def change_per_day(grouped_df: DataFrameGroupBy) -> DataFrame: """ # Check if some patients have multiple values but only one timestamp - if any( - grouped_df.timestamp_val.apply( - lambda x: len(set(x)) == 1 and len(x) > 1, - ).values, - ): + if any(grouped_df.timestamp_val.apply(lambda x: len(set(x)) == 1 and len(x) > 1).values): raise ValueError( - "One or more patients only have values with identical timestamps. There may be an error in the data.", + "One or more patients only have values with identical timestamps. There may be an error in the data." 
) return grouped_df.apply( - lambda x: Series( - {"value": stats.linregress(x.timestamp_val, x.value)[0]}, - ), + lambda x: Series({"value": stats.linregress(x.timestamp_val, x.value)[0]}) ) diff --git a/src/timeseriesflattener/df_transforms.py b/src/timeseriesflattener/df_transforms.py index e9d6ffee..d9931809 100644 --- a/src/timeseriesflattener/df_transforms.py +++ b/src/timeseriesflattener/df_transforms.py @@ -15,15 +15,13 @@ def df_with_multiple_values_to_named_dataframes( mandatory_columns = [entity_id_col_name, timestamp_col_name] if any(col not in list(df.columns) for col in mandatory_columns): raise ValueError( - f"entity_id_col_name and timestamp_col_name must be columns in the dataframe. Available columns are {df.columns}.", + f"entity_id_col_name and timestamp_col_name must be columns in the dataframe. Available columns are {df.columns}." ) value_cols = [col for col in list(df.columns) if col not in mandatory_columns] return [ NamedDataframe( - df=df[[*mandatory_columns, value_col_name]].rename( - columns={value_col_name: "value"}, - ), + df=df[[*mandatory_columns, value_col_name]].rename(columns={value_col_name: "value"}), name=name_prefix + str(value_col_name), ) for value_col_name in value_cols diff --git a/src/timeseriesflattener/feature_cache/cache_to_disk.py b/src/timeseriesflattener/feature_cache/cache_to_disk.py index 6f9a66dd..1be18de7 100644 --- a/src/timeseriesflattener/feature_cache/cache_to_disk.py +++ b/src/timeseriesflattener/feature_cache/cache_to_disk.py @@ -49,10 +49,7 @@ def __init__( self.entity_entity_id_col_name = entity_id_col_name self.timestamp_col_name = timestamp_col_name - def _load_most_recent_df_matching_pattern( - self, - file_pattern: str, - ) -> pd.DataFrame: + def _load_most_recent_df_matching_pattern(self, file_pattern: str) -> pd.DataFrame: """Load most recent df matching pattern. Args: @@ -72,14 +69,9 @@ def _load_most_recent_df_matching_pattern( path_of_most_recent_file = max(files_with_suffix, key=os.path.getctime) - return load_dataset_from_file( - file_path=path_of_most_recent_file, - ) + return load_dataset_from_file(file_path=path_of_most_recent_file) - def _get_file_name( - self, - feature_spec: TemporalSpec, - ) -> str: + def _get_file_name(self, feature_spec: TemporalSpec) -> str: """Get file name for feature spec. Args: @@ -92,10 +84,7 @@ def _get_file_name( return f"{feature_spec.get_output_col_name()}_{n_rows}_rows_in_values_df" - def _get_file_pattern( - self, - feature_spec: TemporalSpec, - ) -> str: + def _get_file_pattern(self, feature_spec: TemporalSpec) -> str: """Get file pattern for feature spec. 
Args: @@ -118,7 +107,7 @@ def read_feature(self, feature_spec: TemporalSpec) -> pd.DataFrame: DataFrame: DataFrame with fallback column expanded """ df = self._load_most_recent_df_matching_pattern( - file_pattern=self._get_file_pattern(feature_spec=feature_spec), + file_pattern=self._get_file_pattern(feature_spec=feature_spec) ) # Expand fallback column @@ -133,19 +122,13 @@ def read_feature(self, feature_spec: TemporalSpec) -> pd.DataFrame: fallback = np.nan if feature_spec.fallback == "nan" else feature_spec.fallback # Replace NaNs with fallback - df[feature_spec.get_output_col_name()] = df[ - feature_spec.get_output_col_name() - ].fillna( - fallback, # type: ignore + df[feature_spec.get_output_col_name()] = df[feature_spec.get_output_col_name()].fillna( + fallback # type: ignore ) return df - def write_feature( - self, - feature_spec: TemporalSpec, - df: pd.DataFrame, - ): + def write_feature(self, feature_spec: TemporalSpec, df: pd.DataFrame): """Write feature to cache.""" file_name = self._get_file_name(feature_spec=feature_spec) @@ -161,14 +144,10 @@ def write_feature( timestamp = dt.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") write_df_to_file( df=df, - file_path=self.feature_cache_dir - / f"{file_name}_{timestamp}.{self.cache_file_suffix}", + file_path=self.feature_cache_dir / f"{file_name}_{timestamp}.{self.cache_file_suffix}", ) - def feature_exists( - self, - feature_spec: TemporalSpec, - ) -> bool: + def feature_exists(self, feature_spec: TemporalSpec) -> bool: """Check if cache is hit. Args: @@ -180,9 +159,7 @@ def feature_exists( file_pattern = self._get_file_pattern(feature_spec=feature_spec) # Check that file exists - file_pattern_hits = list( - self.feature_cache_dir.glob(file_pattern), - ) + file_pattern_hits = list(self.feature_cache_dir.glob(file_pattern)) if len(file_pattern_hits) == 0: return False diff --git a/src/timeseriesflattener/feature_specs/group_specs.py b/src/timeseriesflattener/feature_specs/group_specs.py index 39360ae7..9b72cbbc 100644 --- a/src/timeseriesflattener/feature_specs/group_specs.py +++ b/src/timeseriesflattener/feature_specs/group_specs.py @@ -4,11 +4,7 @@ import pandas as pd from timeseriesflattener.aggregation_fns import AggregationFunType -from timeseriesflattener.feature_specs.single_specs import ( - AnySpec, - OutcomeSpec, - PredictorSpec, -) +from timeseriesflattener.feature_specs.single_specs import AnySpec, OutcomeSpec, PredictorSpec from timeseriesflattener.utils.pydantic_basemodel import BaseModel @@ -40,9 +36,7 @@ class PredictorGroupSpec(BaseModel): def create_combinations(self) -> List[PredictorSpec]: """Create all combinations from the group spec.""" - combination_dict = create_feature_combinations_from_dict( - dictionary=self.__dict__, - ) + combination_dict = create_feature_combinations_from_dict(dictionary=self.__dict__) return [ PredictorSpec( @@ -84,9 +78,7 @@ class OutcomeGroupSpec(BaseModel): def create_combinations(self) -> List[OutcomeSpec]: """Create all combinations from the group spec.""" - combination_dict = create_feature_combinations_from_dict( - dictionary=self.__dict__, - ) + combination_dict = create_feature_combinations_from_dict(dictionary=self.__dict__) return [ OutcomeSpec( @@ -117,9 +109,7 @@ def create_feature_combinations_from_dict( List[Dict[str]]: list of all possible combinations of the arguments. 
""" # Make all elements iterable - dictionary = { - k: v if isinstance(v, (list, tuple)) else [v] for k, v in dictionary.items() - } + dictionary = {k: v if isinstance(v, (list, tuple)) else [v] for k, v in dictionary.items()} keys, values = zip(*dictionary.items()) # Create all combinations of top level elements @@ -128,18 +118,13 @@ def create_feature_combinations_from_dict( return permutations_dicts # type: ignore -def create_specs_from_group( - feature_group_spec: GroupSpec, - output_class: AnySpec, -) -> List[AnySpec]: +def create_specs_from_group(feature_group_spec: GroupSpec, output_class: AnySpec) -> List[AnySpec]: """Create a list of specs from a GroupSpec.""" # Create all combinations of top level elements # For each attribute in the FeatureGroupSpec feature_group_spec_dict = feature_group_spec.__dict__ - permuted_dicts = create_feature_combinations_from_dict( - dictionary=feature_group_spec_dict, - ) + permuted_dicts = create_feature_combinations_from_dict(dictionary=feature_group_spec_dict) return [output_class(**d) for d in permuted_dicts] # type: ignore diff --git a/src/timeseriesflattener/feature_specs/single_specs.py b/src/timeseriesflattener/feature_specs/single_specs.py index 5831f245..d5e12a1e 100644 --- a/src/timeseriesflattener/feature_specs/single_specs.py +++ b/src/timeseriesflattener/feature_specs/single_specs.py @@ -14,7 +14,7 @@ class LookPeriod: def __post_init__(self): if self.min_days > self.max_days: raise ValueError( - f"Invalid LookPeriod. The min_days ({self.min_days}) must be smaller than the max_days {self.max_days}.", + f"Invalid LookPeriod. The min_days ({self.min_days}) must be smaller than the max_days {self.max_days}." ) @@ -46,9 +46,7 @@ def coerce_floats(lookperiod: LookPeriod, fallback: float) -> CoercedFloats: coerced_lookperiod = LookPeriod(min_days=min_days, max_days=max_days) - fallback = ( - fallback if not can_be_coerced_losslessly_to_int(fallback) else int(fallback) - ) + fallback = fallback if not can_be_coerced_losslessly_to_int(fallback) else int(fallback) return CoercedFloats(lookperiod=coerced_lookperiod, fallback=fallback) @@ -128,10 +126,7 @@ class OutcomeSpec(BaseModel): def lookahead_period(self) -> LookPeriod: if isinstance(self.lookahead_days, (float, int)): return LookPeriod(min_days=0, max_days=self.lookahead_days) - return LookPeriod( - min_days=self.lookahead_days[0], - max_days=self.lookahead_days[1], - ) + return LookPeriod(min_days=self.lookahead_days[0], max_days=self.lookahead_days[1]) def get_output_col_name(self) -> str: """Get the column name for the output column.""" @@ -183,10 +178,7 @@ class PredictorSpec(BaseModel): def lookbehind_period(self) -> LookPeriod: if isinstance(self.lookbehind_days, (float, int)): return LookPeriod(min_days=0, max_days=self.lookbehind_days) - return LookPeriod( - min_days=self.lookbehind_days[0], - max_days=self.lookbehind_days[1], - ) + return LookPeriod(min_days=self.lookbehind_days[0], max_days=self.lookbehind_days[1]) def get_output_col_name(self) -> str: """Generate the column name for the output column.""" diff --git a/src/timeseriesflattener/flattened_dataset.py b/src/timeseriesflattener/flattened_dataset.py index 12185424..150ee470 100644 --- a/src/timeseriesflattener/flattened_dataset.py +++ b/src/timeseriesflattener/flattened_dataset.py @@ -43,18 +43,13 @@ class SpecCollection(PydanticBaseModel): def __len__(self) -> int: """Return number of specs in collection.""" - return ( - len(self.outcome_specs) + len(self.predictor_specs) + len(self.static_specs) - ) + return 
len(self.outcome_specs) + len(self.predictor_specs) + len(self.static_specs) class TimeseriesFlattener: """Turn a set of time-series into tabular prediction-time data.""" - def _override_cache_attributes_with_self_attributes( - self, - prediction_times_df: DataFrame, - ): + def _override_cache_attributes_with_self_attributes(self, prediction_times_df: DataFrame): """Make cache inherit attributes from flattened dataset. Avoids duplicate specification. @@ -62,29 +57,20 @@ def _override_cache_attributes_with_self_attributes( if self.cache is None: raise ValueError("Cache is None, cannot override attributes") - if ( - not hasattr(self.cache, "prediction_times_df") - or self.cache.prediction_times_df is None - ): + if not hasattr(self.cache, "prediction_times_df") or self.cache.prediction_times_df is None: self.cache.prediction_times_df = prediction_times_df elif not self.cache.prediction_times_df.equals(prediction_times_df): - log.info( - "Overriding prediction_times_df cache with the one passed to init", - ) + log.info("Overriding prediction_times_df cache with the one passed to init") self.cache.prediction_times_df = prediction_times_df - for attr in ( - "pred_time_uuid_col_name", - "timestamp_col_name", - "entity_id_col_name", - ): + for attr in ("pred_time_uuid_col_name", "timestamp_col_name", "entity_id_col_name"): if ( hasattr(self.cache, attr) and getattr(self.cache, attr) is not None and getattr(self.cache, attr) != getattr(self, attr) ): log.info( - f"Overriding {attr} in cache with {attr} passed to init of flattened dataset", + f"Overriding {attr} in cache with {attr} passed to init of flattened dataset" ) setattr(self.cache, attr, getattr(self, attr)) @@ -161,7 +147,7 @@ def __init__( if "value" in prediction_times_df.columns: raise ValueError( - "Column 'value' should not occur in prediction_times_df, only timestamps and ids.", + "Column 'value' should not occur in prediction_times_df, only timestamps and ids." ) self._df = prediction_times_df @@ -173,24 +159,17 @@ def __init__( ).validate_dataset() # Create pred_time_uuid_columne - self._df[self.pred_time_uuid_col_name] = self._df[ - self.entity_id_col_name - ].astype(str) + pd.to_datetime(self._df[self.timestamp_col_name]).dt.strftime( - "-%Y-%m-%d-%H-%M-%S", - ) + self._df[self.pred_time_uuid_col_name] = self._df[self.entity_id_col_name].astype( + str + ) + pd.to_datetime(self._df[self.timestamp_col_name]).dt.strftime("-%Y-%m-%d-%H-%M-%S") if log_to_stdout: # Setup logging to stdout by default - coloredlogs.install( - level=logging.INFO, - fmt="%(asctime)s [%(levelname)s] %(message)s", - ) + coloredlogs.install(level=logging.INFO, fmt="%(asctime)s [%(levelname)s] %(message)s") @staticmethod def _add_back_prediction_times_without_value( - df: DataFrame, - pred_times_with_uuid: DataFrame, - pred_time_uuid_colname: str, + df: DataFrame, pred_times_with_uuid: DataFrame, pred_time_uuid_colname: str ) -> DataFrame: """Ensure all prediction times are represented in the returned @@ -205,11 +184,7 @@ def _add_back_prediction_times_without_value( DataFrame: A merged dataframe with all prediction times. 
""" return pd.merge( - pred_times_with_uuid, - df, - how="left", - on=pred_time_uuid_colname, - suffixes=("", "_temp"), + pred_times_with_uuid, df, how="left", on=pred_time_uuid_colname, suffixes=("", "_temp") ) @staticmethod @@ -242,9 +217,7 @@ def _aggregate_values_within_interval_days( log.info("All values are NaT, returning empty dataframe") # Sort by timestamp_pred in case aggregation needs dates - grouped_df = df.sort_values(by=val_timestamp_col_name).groupby( - pred_time_uuid_colname, - ) + grouped_df = df.sort_values(by=val_timestamp_col_name).groupby(pred_time_uuid_colname) if callable(aggregation): df = aggregation(grouped_df).reset_index() @@ -286,19 +259,18 @@ def _drop_records_outside_interval_days( # Divide by 86.400 seconds/day if direction == "ahead": - df["is_in_interval"] = ( - df["time_from_pred_to_val_in_days"] <= lookperiod.max_days - ) & (df["time_from_pred_to_val_in_days"] > lookperiod.min_days) + df["is_in_interval"] = (df["time_from_pred_to_val_in_days"] <= lookperiod.max_days) & ( + df["time_from_pred_to_val_in_days"] > lookperiod.min_days + ) elif direction == "behind": - df["is_in_interval"] = ( - df["time_from_pred_to_val_in_days"] >= -lookperiod.max_days - ) & (df["time_from_pred_to_val_in_days"] < -lookperiod.min_days) + df["is_in_interval"] = (df["time_from_pred_to_val_in_days"] >= -lookperiod.max_days) & ( + df["time_from_pred_to_val_in_days"] < -lookperiod.min_days + ) else: raise ValueError("direction can only be 'ahead' or 'behind'") return df[df["is_in_interval"]].drop( - ["is_in_interval", "time_from_pred_to_val_in_days"], - axis=1, + ["is_in_interval", "time_from_pred_to_val_in_days"], axis=1 ) @staticmethod @@ -387,15 +359,13 @@ def _flatten_temporal_values_to_df( # Find value_cols and add fallback to them value_col_str_name = output_spec.get_output_col_name() - df[output_spec.get_output_col_name()] = df[ - output_spec.get_output_col_name() - ].fillna( - output_spec.fallback, + df[output_spec.get_output_col_name()] = df[output_spec.get_output_col_name()].fillna( + output_spec.fallback ) if verbose: log.info( - f"Returning {df.shape[0]} rows of flattened dataframe for {output_spec.get_output_col_name()}", + f"Returning {df.shape[0]} rows of flattened dataframe for {output_spec.get_output_col_name()}" ) # Add back prediction times that don't have a value, and fill them with fallback @@ -404,15 +374,12 @@ def _flatten_temporal_values_to_df( pred_times_with_uuid=prediction_times_with_uuid_df, pred_time_uuid_colname=pred_time_uuid_col_name, ).fillna( - output_spec.fallback, # type: ignore + output_spec.fallback # type: ignore ) return df[[value_col_str_name, pred_time_uuid_col_name]] - def _get_temporal_feature( - self, - feature_spec: TemporalSpec, - ) -> pd.DataFrame: + def _get_temporal_feature(self, feature_spec: TemporalSpec) -> pd.DataFrame: """Get feature. Either load from cache, or generate if necessary. 
Args: @@ -423,24 +390,16 @@ def _get_temporal_feature( """ if self.cache: if self.cache.feature_exists(feature_spec=feature_spec): - log.debug( - f"Cache hit for {feature_spec.get_output_col_name()}, loading from cache", - ) + log.debug(f"Cache hit for {feature_spec.get_output_col_name()}, loading from cache") df = self.cache.read_feature(feature_spec=feature_spec) return df.set_index(keys=self.pred_time_uuid_col_name).sort_index() - log.debug( - f"Cache miss for {feature_spec.get_output_col_name()}, generating", - ) + log.debug(f"Cache miss for {feature_spec.get_output_col_name()}, generating") elif not self.cache: log.debug("No cache specified, not attempting load") df = self._flatten_temporal_values_to_df( prediction_times_with_uuid_df=self._df[ - [ - self.pred_time_uuid_col_name, - self.entity_id_col_name, - self.timestamp_col_name, - ] + [self.pred_time_uuid_col_name, self.entity_id_col_name, self.timestamp_col_name] ], entity_id_col_name=self.entity_id_col_name, pred_time_uuid_col_name=self.pred_time_uuid_col_name, @@ -450,10 +409,7 @@ def _get_temporal_feature( # Write df to cache if exists if self.cache: - self.cache.write_feature( - feature_spec=feature_spec, - df=df, - ) + self.cache.write_feature(feature_spec=feature_spec, df=df) return df.set_index(keys=self.pred_time_uuid_col_name).sort_index() @@ -469,11 +425,11 @@ def _check_dfs_are_ready_for_concat(dfs: List[pd.DataFrame]): n_dfs = len(dfs) log.info( - "Checking alignment of dataframes - this might take a little while (~2 minutes for 1.000 dataframes with 2.000.000 rows).", + "Checking alignment of dataframes - this might take a little while (~2 minutes for 1.000 dataframes with 2.000.000 rows)." ) log.debug( - "Checking that dataframes are ready for concatenation - namely that their indices are aligned. This is a sanity check, and should not be necessary if the dataframes were correctly aligned before concatenation. However, any errors here will completely break predictions, so rather safe than sorry. Can take a while for a large number of dataframes, e.g. 2 minutes for 1_000 dataframes with 2_000_000 rows.", + "Checking that dataframes are ready for concatenation - namely that their indices are aligned. This is a sanity check, and should not be necessary if the dataframes were correctly aligned before concatenation. However, any errors here will completely break predictions, so rather safe than sorry. Can take a while for a large number of dataframes, e.g. 2 minutes for 1_000 dataframes with 2_000_000 rows." ) for i, feature_df in enumerate(dfs[1:]): @@ -484,27 +440,20 @@ def _check_dfs_are_ready_for_concat(dfs: List[pd.DataFrame]): # Check that dataframes are of equal length log.debug("Checking that dataframes are of equal length") if len(feature_df) != base_length: - errors.append( - "Dataframes are not of equal length. ", - ) + errors.append("Dataframes are not of equal length. ") log.debug("Checking that indices are aligned") - if not all( - feature_df.index == base_df.index, - ): - errors.append( - "Dataframes are not aligned. ", - ) + if not all(feature_df.index == base_df.index): + errors.append("Dataframes are not aligned. ") if errors: debug_info = f"Columns in dataframes: 0_df: {dfs[0].columns}, feature_df: {feature_df.columns}. Were they correctly aligned before concatenation?" raise ValueError( - f"Dataframes are not ready for concatenation. {errors}, {debug_info}", + f"Dataframes are not ready for concatenation. 
{errors}, {debug_info}" ) def _concatenate_flattened_timeseries( - self, - flattened_predictor_dfs: List[pd.DataFrame], + self, flattened_predictor_dfs: List[pd.DataFrame] ) -> None: """Concatenate flattened predictor dfs.""" @@ -517,12 +466,9 @@ def _concatenate_flattened_timeseries( # If so, ready for concatenation. Reset index to be ready for the merge at the end. log.info( - "Starting concatenation. Will take some time on performant systems, e.g. 30s for 100 features and 2_000_000 prediction times. This is normal.", + "Starting concatenation. Will take some time on performant systems, e.g. 30s for 100 features and 2_000_000 prediction times. This is normal." ) - new_features = pd.concat( - objs=flattened_predictor_dfs, - axis=1, - ).reset_index() + new_features = pd.concat(objs=flattened_predictor_dfs, axis=1).reset_index() end_time = time.time() @@ -531,10 +477,7 @@ def _concatenate_flattened_timeseries( log.info("Merging with original df") self._df = self._df.merge(right=new_features, on=self.pred_time_uuid_col_name) - def _add_temporal_batch( - self, - temporal_batch: List[TemporalSpec], - ): + def _add_temporal_batch(self, temporal_batch: List[TemporalSpec]): """Add predictors to the flattened dataframe from a list.""" # Shuffle predictor specs to avoid IO contention random.shuffle(temporal_batch) @@ -546,7 +489,7 @@ def _add_temporal_batch( chunksize = max(1, round(len(temporal_batch) / (n_workers))) log.info( - f"Processing {len(temporal_batch)} temporal features in parallel with {n_workers} workers. Chunksize is {chunksize}. If this is above 1, it may take some time for the progress bar to move, as processing is batched. However, this makes for much faster total performance.", + f"Processing {len(temporal_batch)} temporal features in parallel with {n_workers} workers. Chunksize is {chunksize}. If this is above 1, it may take some time for the progress bar to move, as processing is batched. However, this makes for much faster total performance." ) with Pool(n_workers) as p: @@ -558,17 +501,12 @@ def _add_temporal_batch( chunksize=chunksize, ), total=len(temporal_batch), - ), + ) ) - self._concatenate_flattened_timeseries( - flattened_predictor_dfs=flattened_predictor_dfs, - ) + self._concatenate_flattened_timeseries(flattened_predictor_dfs=flattened_predictor_dfs) - def _add_static_info( - self, - static_spec: AnySpec, - ): + def _add_static_info(self, static_spec: AnySpec): """Add static info to each prediction time, e.g. age, sex etc. 
Args: @@ -588,46 +526,32 @@ def _add_static_info( value_col_name = possible_value_cols[0] elif len(possible_value_cols) > 1: raise ValueError( - f"Only one value column can be added to static info, found multiple: {possible_value_cols}", + f"Only one value column can be added to static info, found multiple: {possible_value_cols}" ) elif len(possible_value_cols) == 0: - raise ValueError( - "No value column found in spec.df, please check.", - ) + raise ValueError("No value column found in spec.df, please check.") output_col_name = static_spec.get_output_col_name() df = pd.DataFrame( { - self.entity_id_col_name: static_spec.timeseries_df[ - self.entity_id_col_name - ], # type: ignore + self.entity_id_col_name: static_spec.timeseries_df[self.entity_id_col_name], # type: ignore output_col_name: static_spec.timeseries_df[value_col_name], # type: ignore - }, + } ) self._df = pd.merge( - self._df, - df, - how="left", - on=self.entity_id_col_name, - suffixes=("", ""), - validate="m:1", + self._df, df, how="left", on=self.entity_id_col_name, suffixes=("", ""), validate="m:1" ) def _process_static_specs(self): """Process static specs.""" for spec in self.unprocessed_specs.static_specs: - self._add_static_info( - static_spec=spec, - ) + self._add_static_info(static_spec=spec) self.unprocessed_specs.static_specs = [] - def _add_incident_outcome( - self, - outcome_spec: OutcomeSpec, - ): + def _add_incident_outcome(self, outcome_spec: OutcomeSpec): """Add incident outcomes. Can be done vectorized, hence the separate function. @@ -637,7 +561,7 @@ def _add_incident_outcome( if not outcome_spec.timeseries_df[self.entity_id_col_name].is_unique: raise ValueError( - f"""Since incident = True, we expect only one outcome timestamp per entity id. This is not the case in {outcome_spec.feature_base_name}""", + f"""Since incident = True, we expect only one outcome timestamp per entity id. This is not the case in {outcome_spec.feature_base_name}""" ) df = pd.merge( @@ -649,11 +573,7 @@ def _add_incident_outcome( validate="m:1", ) - df = df.drop( - df[ - df[outcome_timestamp_col_name] < df[prediction_timestamp_col_name] - ].index, - ) + df = df.drop(df[df[outcome_timestamp_col_name] < df[prediction_timestamp_col_name]].index) if outcome_spec.is_dichotomous(): outcome_is_within_lookahead = ( @@ -666,14 +586,9 @@ def _add_incident_outcome( <= df[outcome_timestamp_col_name] ) - df[outcome_spec.get_output_col_name()] = outcome_is_within_lookahead.astype( - int, - ) + df[outcome_spec.get_output_col_name()] = outcome_is_within_lookahead.astype(int) - df = df.rename( - {prediction_timestamp_col_name: "timestamp"}, - axis=1, - ) + df = df.rename({prediction_timestamp_col_name: "timestamp"}, axis=1) df = df.drop([outcome_timestamp_col_name], axis=1) df = df.drop(["value"], axis=1) @@ -703,10 +618,7 @@ def _get_cutoff_date_from_spec(self, spec: TemporalSpec) -> pd.Timestamp: raise ValueError(f"Spec type {type(spec)} not recognised.") @print_df_dimensions_diff - def _drop_pred_time_if_insufficient_look_distance( - self, - df: pd.DataFrame, - ) -> pd.DataFrame: + def _drop_pred_time_if_insufficient_look_distance(self, df: pd.DataFrame) -> pd.DataFrame: """Drop prediction times if there is insufficient look distance. A prediction time has insufficient look distance if the feature spec @@ -717,10 +629,7 @@ def _drop_pred_time_if_insufficient_look_distance( Takes a dataframe as input to conform to a standard filtering interface, which we can easily decorate. 
""" - spec_batch = ( - self.unprocessed_specs.outcome_specs - + self.unprocessed_specs.predictor_specs - ) + spec_batch = self.unprocessed_specs.outcome_specs + self.unprocessed_specs.predictor_specs # Find the latest cutoff date for predictors cutoff_date_behind = pd.Timestamp("1700-01-01") @@ -743,9 +652,7 @@ def _drop_pred_time_if_insufficient_look_distance( ] if output_df.shape[0] == 0: - raise ValueError( - "No records left after dropping records outside look distance", - ) + raise ValueError("No records left after dropping records outside look distance") return output_df @@ -759,9 +666,7 @@ def _process_temporal_specs(self): # Handle incident specs separately, since their operations can be vectorised, # making them much faster if hasattr(spec, "incident") and spec.incident: - self._add_incident_outcome( - outcome_spec=spec, - ) + self._add_incident_outcome(outcome_spec=spec) # Remove processed specs. Beware of using .remove on a list of specs, as it causes errors. self.unprocessed_specs.outcome_specs = [ @@ -792,34 +697,26 @@ def _check_that_spec_df_has_required_columns(self, spec: AnySpec): if col not in spec.timeseries_df.columns: # type: ignore raise KeyError(f"Missing required column: {col}") - def _check_that_spec_df_timestamp_col_is_correctly_formatted( - self, - spec: TemporalSpec, - ): + def _check_that_spec_df_timestamp_col_is_correctly_formatted(self, spec: TemporalSpec): """Check that timestamp column is correctly formatted. Attempt to coerce if possible.""" timestamp_col_type = spec.timeseries_df[self.timestamp_col_name].dtype # type: ignore if timestamp_col_type not in ("Timestamp", "datetime64[ns]"): # Convert column dtype to datetime64[ns] if it isn't already - log.info( - f"{spec.feature_base_name}: Converting timestamp column to datetime64[ns]", - ) + log.info(f"{spec.feature_base_name}: Converting timestamp column to datetime64[ns]") spec.timeseries_df[self.timestamp_col_name] = pd.to_datetime( # type: ignore - spec.timeseries_df[self.timestamp_col_name], # type: ignore + spec.timeseries_df[self.timestamp_col_name] # type: ignore ) min_timestamp = min(spec.timeseries_df[self.timestamp_col_name]) # type: ignore if min_timestamp < pd.Timestamp("1971-01-01"): # type: ignore log.warning( - f"{spec.feature_base_name}: Minimum timestamp is {min_timestamp} - perhaps ints were coerced to timestamps?", + f"{spec.feature_base_name}: Minimum timestamp is {min_timestamp} - perhaps ints were coerced to timestamps?" ) - def add_spec( - self, - spec: Union[Sequence[AnySpec], AnySpec], - ): + def add_spec(self, spec: Union[Sequence[AnySpec], AnySpec]): """Add a specification to the flattened dataset. This adds it to a queue of unprocessed specs, which are not processed @@ -832,22 +729,16 @@ def add_spec( specs_to_process = [spec] if not isinstance(spec, Sequence) else spec for spec_i in specs_to_process: - allowed_spec_types = ( - OutcomeSpec, - PredictorSpec, - StaticSpec, - ) + allowed_spec_types = (OutcomeSpec, PredictorSpec, StaticSpec) if not isinstance(spec_i, allowed_spec_types): - raise ValueError( - f"Input is not allowed. Must be one of: {allowed_spec_types}", - ) + raise ValueError(f"Input is not allowed. 
Must be one of: {allowed_spec_types}") self._check_that_spec_df_has_required_columns(spec=spec_i) if isinstance(spec_i, (PredictorSpec, OutcomeSpec)): self._check_that_spec_df_timestamp_col_is_correctly_formatted( - spec=spec_i, # type: ignore + spec=spec_i # type: ignore ) if isinstance(spec_i, OutcomeSpec): @@ -876,12 +767,12 @@ def add_age( if date_of_birth_df[date_of_birth_col_name].dtype != "<M8[ns]": ... if len(missing_columns) > 0: raise ValueError( - f"The following required column(s) is/are missing from the input dataframe: {missing_columns}. Available columns are {df.columns}.", + f"The following required column(s) is/are missing from the input dataframe: {missing_columns}. Available columns are {df.columns}." ) # Get the unique value names from the dataframe @@ -89,10 +84,7 @@ def format_dict_for_printing(d: dict) -> str: ) -def load_dataset_from_file( - file_path: Path, - nrows: Union[int, None] = None, -) -> pd.DataFrame: +def load_dataset_from_file(file_path: Path, nrows: Union[int, None] = None) -> pd.DataFrame: """Load dataset from file. Handles csv and parquet files based on suffix. Args: @@ -117,9 +109,7 @@ def load_dataset_from_file( def load_most_recent_file_matching_pattern_as_df( - dir_path: Path, - file_pattern: str, - file_suffix: str, + dir_path: Path, file_pattern: str, file_suffix: str ) -> pd.DataFrame: """Load most recent df matching pattern. Args: @@ -157,10 +147,7 @@ def df_contains_duplicates(df: pd.DataFrame, col_subset: List[str]) -> bool: return df.duplicated(subset=col_subset).any() -def write_df_to_file( - df: pd.DataFrame, - file_path: Path, -): +def write_df_to_file(df: pd.DataFrame, file_path: Path): """Write dataset to file. Handles csv and parquet files based on suffix. Args: @@ -204,9 +191,7 @@ def assert_no_duplicate_dicts_in_list(predictor_spec_list: List[Dict[str, Any]]) def print_df_dimensions_diff( # noqa - func: Callable, - print_when_starting: bool = False, - print_when_no_diff: bool = False, + func: Callable, print_when_starting: bool = False, print_when_no_diff: bool = False ): """Print the difference in rows between the input and output dataframes.""" @@ -232,9 +217,7 @@ def wrapper(*args, **kwargs): # noqa n_in_dim_before_func = df.shape[dim_int] if print_when_no_diff: - log.info( - f"{func.__name__}: {n_in_dim_before_func} {dim} before function", - ) + log.info(f"{func.__name__}: {n_in_dim_before_func} {dim} before function") result = func(*args, **kwargs) @@ -242,10 +225,7 @@ def wrapper(*args, **kwargs): # noqa if diff != 0: percent_diff = round( - (n_in_dim_before_func - result.shape[dim_int]) - / n_in_dim_before_func - * 100, - 2, + (n_in_dim_before_func - result.shape[dim_int]) / n_in_dim_before_func * 100, 2 ) log.info(f"{func.__name__}: Dropped {diff} ({percent_diff}%) {dim}") diff --git a/src/timeseriesflattener/testing/load_synth_data.py b/src/timeseriesflattener/testing/load_synth_data.py index 744ac0ab..4deb492d 100644 --- a/src/timeseriesflattener/testing/load_synth_data.py +++ b/src/timeseriesflattener/testing/load_synth_data.py @@ -18,10 +18,7 @@ def load_raw_test_csv(filename: str, n_rows: Optional[int] = None) -> pd.DataFra filename (str): Name of the file to load. n_rows (int, optional): Number of rows to load. Defaults to None. 
""" - df = pd.read_csv( - TEST_DATA_PATH / "raw" / filename, - nrows=n_rows, - ) + df = pd.read_csv(TEST_DATA_PATH / "raw" / filename, nrows=n_rows) # Convert timestamp col to datetime if "timestamp" in df.columns: @@ -30,9 +27,7 @@ def load_raw_test_csv(filename: str, n_rows: Optional[int] = None) -> pd.DataFra return df -def load_synth_predictor_float( - n_rows: Optional[int] = None, -) -> pd.DataFrame: +def load_synth_predictor_float(n_rows: Optional[int] = None) -> pd.DataFrame: """Load synth predictor data.". Args: @@ -44,9 +39,7 @@ def load_synth_predictor_float( return load_raw_test_csv("synth_raw_float_1.csv", n_rows=n_rows) -def load_synth_sex( - n_rows: Optional[int] = None, -) -> pd.DataFrame: +def load_synth_sex(n_rows: Optional[int] = None) -> pd.DataFrame: """Load synth sex data.". Args: @@ -58,9 +51,7 @@ def load_synth_sex( return load_raw_test_csv("synth_sex.csv", n_rows=n_rows) -def synth_predictor_binary( - n_rows: Optional[int] = None, -) -> pd.DataFrame: +def synth_predictor_binary(n_rows: Optional[int] = None) -> pd.DataFrame: """Load synth predictor data.". Args: @@ -72,9 +63,7 @@ def synth_predictor_binary( return load_raw_test_csv("synth_raw_binary_1.csv", n_rows=n_rows) -def load_synth_outcome( - n_rows: Optional[int] = None, -) -> pd.DataFrame: +def load_synth_outcome(n_rows: Optional[int] = None) -> pd.DataFrame: """Load synth predictor data.". Args: @@ -92,9 +81,7 @@ def load_synth_outcome( return df -def load_synth_prediction_times( - n_rows: Optional[int] = None, -) -> pd.DataFrame: +def load_synth_prediction_times(n_rows: Optional[int] = None) -> pd.DataFrame: """Load synth predictor data.". Args: @@ -106,9 +93,7 @@ def load_synth_prediction_times( return load_raw_test_csv("synth_prediction_times.csv", n_rows=n_rows) -def load_synth_text( - n_rows: Optional[int] = None, -) -> pd.DataFrame: +def load_synth_text(n_rows: Optional[int] = None) -> pd.DataFrame: """Load synth text data.". Args: diff --git a/src/timeseriesflattener/testing/synth_data_generator/synth_col_generators.py b/src/timeseriesflattener/testing/synth_data_generator/synth_col_generators.py index 886ba0fe..ecbce7ab 100644 --- a/src/timeseriesflattener/testing/synth_data_generator/synth_col_generators.py +++ b/src/timeseriesflattener/testing/synth_data_generator/synth_col_generators.py @@ -33,11 +33,7 @@ def create_outcome_values( effect, col = var.split("*") _y = float(effect) * df[col] + _y - noise = np.random.normal( - loc=noise_mean_sd[0], - scale=noise_mean_sd[1], - size=n_samples, - ) + noise = np.random.normal(loc=noise_mean_sd[0], scale=noise_mean_sd[1], size=n_samples) # Z-score normalise and add noise _y = stats.zscore(_y) + noise @@ -46,11 +42,7 @@ def create_outcome_values( return out # type: ignore -def generate_col_from_specs( - column_type: str, - n_samples: int, - col_specs: dict, -) -> Iterable: +def generate_col_from_specs(column_type: str, n_samples: int, col_specs: dict) -> Iterable: """Generate a column of data. 
Args: @@ -68,36 +60,22 @@ def generate_col_from_specs( return -np.arange(n_samples) if column_type == "uniform_int": - return np.random.randint( - low=col_specs["min"], - high=col_specs["max"], - size=n_samples, - ) + return np.random.randint(low=col_specs["min"], high=col_specs["max"], size=n_samples) if column_type == "uniform_float": - return np.random.uniform( - low=col_specs["min"], - high=col_specs["max"], - size=n_samples, - ) + return np.random.uniform(low=col_specs["min"], high=col_specs["max"], size=n_samples) if column_type == "normal": - return np.random.normal( - loc=col_specs["mean"], - scale=col_specs["sd"], - size=n_samples, - ) + return np.random.normal(loc=col_specs["mean"], scale=col_specs["sd"], size=n_samples) if column_type == "datetime_uniform": return pd.to_datetime( np.random.uniform( # type: ignore - low=col_specs["min"], - high=col_specs["max"], - size=n_samples, + low=col_specs["min"], high=col_specs["max"], size=n_samples ), unit="D", ).round( # type: ignore - "min", + "min" ) raise ValueError(f"Unknown distribution: {column_type}") @@ -148,9 +126,7 @@ def generate_data_columns( column_type = col_props["column_type"] df[col_name] = generate_col_from_specs( - column_type=column_type, - n_samples=n_samples, - col_specs=col_props, + column_type=column_type, n_samples=n_samples, col_specs=col_props ) # If column has min and/or max, floor and ceil appropriately @@ -167,14 +143,9 @@ def generate_data_columns( # Get project root directory column_specs = [ { - "dw_ek_borger": { - "column_type": "id", - }, + "dw_ek_borger": {"column_type": "id"}, "raw_predictor": {"column_type": "uniform_float", "min": 0, "max": 10}, - }, + } ] - df = generate_data_columns( - predictors=column_specs, - n_samples=10_000, - ) + df = generate_data_columns(predictors=column_specs, n_samples=10_000) diff --git a/src/timeseriesflattener/testing/synth_data_generator/synth_prediction_times_generator.py b/src/timeseriesflattener/testing/synth_data_generator/synth_prediction_times_generator.py index 58c015f0..6c48a4ae 100644 --- a/src/timeseriesflattener/testing/synth_data_generator/synth_prediction_times_generator.py +++ b/src/timeseriesflattener/testing/synth_data_generator/synth_prediction_times_generator.py @@ -8,9 +8,7 @@ create_outcome_values, generate_data_columns, ) -from timeseriesflattener.testing.synth_data_generator.utils import ( - replace_vals_with_na, -) +from timeseriesflattener.testing.synth_data_generator.utils import replace_vals_with_na def generate_synth_data( diff --git a/src/timeseriesflattener/testing/synth_data_generator/synth_txt_data_generator.py b/src/timeseriesflattener/testing/synth_data_generator/synth_txt_data_generator.py index ead4d31a..c30df96e 100644 --- a/src/timeseriesflattener/testing/synth_data_generator/synth_txt_data_generator.py +++ b/src/timeseriesflattener/testing/synth_data_generator/synth_txt_data_generator.py @@ -35,11 +35,7 @@ def generate_synth_txt_data( df = pd.DataFrame(columns=list(predictors.keys())) # Generate data - df = generate_data_columns( - predictors=predictors, - n_samples=n_samples, - df=df, - ) + df = generate_data_columns(predictors=predictors, n_samples=n_samples, df=df) # randomly replace predictors with NAs if na_prob: @@ -55,10 +51,7 @@ def generate_synth_txt_data( "text": {"column_type": "text"}, } - out_df = generate_synth_txt_data( - predictors=column_specifications, - n_samples=100, - ) + out_df = generate_synth_txt_data(predictors=column_specifications, n_samples=100) save_path = Path(__file__).parent.parent.parent.parent 
out_df.to_csv(save_path / "tests" / "test_data" / "synth_txt_data.csv") diff --git a/src/timeseriesflattener/testing/synth_data_generator/utils.py b/src/timeseriesflattener/testing/synth_data_generator/utils.py index 773edd71..e63ff08a 100644 --- a/src/timeseriesflattener/testing/synth_data_generator/utils.py +++ b/src/timeseriesflattener/testing/synth_data_generator/utils.py @@ -7,9 +7,7 @@ def replace_vals_with_na( - df: pd.DataFrame, - na_prob: float, - na_ignore_cols: Optional[list[str]] = None, + df: pd.DataFrame, na_prob: float, na_ignore_cols: Optional[list[str]] = None ) -> pd.DataFrame: """Replace values with NAs. diff --git a/src/timeseriesflattener/testing/test_data/flattened/generated_with_outcome/create_synth_flattened_with_outcome.py b/src/timeseriesflattener/testing/test_data/flattened/generated_with_outcome/create_synth_flattened_with_outcome.py index a8016a4f..a092b962 100644 --- a/src/timeseriesflattener/testing/test_data/flattened/generated_with_outcome/create_synth_flattened_with_outcome.py +++ b/src/timeseriesflattener/testing/test_data/flattened/generated_with_outcome/create_synth_flattened_with_outcome.py @@ -10,11 +10,7 @@ column_specifications = { "citizen_ids": {"column_type": "uniform_int", "min": 0, "max": 1_200_001}, "timestamp": {"column_type": "datetime_uniform", "min": 0, "max": 5 * 365}, - "timestamp_outcome": { - "column_type": "datetime_uniform", - "min": 1 * 365, - "max": 6 * 365, - }, + "timestamp_outcome": {"column_type": "datetime_uniform", "min": 1 * 365, "max": 6 * 365}, "pred_hba1c_within_100_days_max_fallback_np.nan": { "column_type": "normal", "mean": 48, diff --git a/src/timeseriesflattener/testing/test_data/models/create_bow_and_pca_model.py b/src/timeseriesflattener/testing/test_data/models/create_bow_and_pca_model.py index a6a6ae56..b3b5b61e 100644 --- a/src/timeseriesflattener/testing/test_data/models/create_bow_and_pca_model.py +++ b/src/timeseriesflattener/testing/test_data/models/create_bow_and_pca_model.py @@ -42,10 +42,7 @@ def train_pca_model(embedding: np.ndarray) -> PCA: return model -def save_model_to_test_dir( - model: Any, - filename: str, -): +def save_model_to_test_dir(model: Any, filename: str): """ Saves the model to a pickle file diff --git a/src/timeseriesflattener/testing/test_data/raw/create_synth_prediction_times.py b/src/timeseriesflattener/testing/test_data/raw/create_synth_prediction_times.py index 981c64d9..a778914d 100644 --- a/src/timeseriesflattener/testing/test_data/raw/create_synth_prediction_times.py +++ b/src/timeseriesflattener/testing/test_data/raw/create_synth_prediction_times.py @@ -9,16 +9,8 @@ # Get project root directory column_specs = { - "entity_id": { - "column_type": "uniform_int", - "min": 0, - "max": 10_000, - }, - "timestamp": { - "column_type": "datetime_uniform", - "min": -5 * 365, - "max": 0 * 365, - }, + "entity_id": {"column_type": "uniform_int", "min": 0, "max": 10_000}, + "timestamp": {"column_type": "datetime_uniform", "min": -5 * 365, "max": 0 * 365}, } df = generate_data_columns( @@ -27,6 +19,5 @@ ) df.to_csv( - PROJECT_ROOT / "tests" / "test_data" / "raw" / "synth_prediction_times.csv", - index=False, + PROJECT_ROOT / "tests" / "test_data" / "raw" / "synth_prediction_times.csv", index=False ) diff --git a/src/timeseriesflattener/testing/test_data/raw/create_synth_raw_binary.py b/src/timeseriesflattener/testing/test_data/raw/create_synth_raw_binary.py index 40dafadc..bcb82669 100644 --- a/src/timeseriesflattener/testing/test_data/raw/create_synth_raw_binary.py +++ 
b/src/timeseriesflattener/testing/test_data/raw/create_synth_raw_binary.py @@ -11,30 +11,14 @@ project_root = Path(__file__).resolve().parents[3] column_specs = [ - { - "entity_id": { - "column_type": "uniform_int", - "min": 0, - "max": 10_000, - }, - }, - { - "timestamp": { - "column_type": "datetime_uniform", - "min": -5 * 365, - "max": 0 * 365, - }, - }, + {"entity_id": {"column_type": "uniform_int", "min": 0, "max": 10_000}}, + {"timestamp": {"column_type": "datetime_uniform", "min": -5 * 365, "max": 0 * 365}}, {"value": {"column_type": "uniform_int", "min": 0, "max": 2}}, ] for i in (1, 2): - df = generate_data_columns( - predictors=column_specs, - n_samples=10_000, - ) + df = generate_data_columns(predictors=column_specs, n_samples=10_000) df.to_csv( - project_root / "tests" / "test_data" / "raw" / f"synth_raw_binary_{i}.csv", - index=False, + project_root / "tests" / "test_data" / "raw" / f"synth_raw_binary_{i}.csv", index=False ) diff --git a/src/timeseriesflattener/testing/test_data/raw/create_synth_raw_float.py b/src/timeseriesflattener/testing/test_data/raw/create_synth_raw_float.py index 9e1d0399..d7a533b1 100644 --- a/src/timeseriesflattener/testing/test_data/raw/create_synth_raw_float.py +++ b/src/timeseriesflattener/testing/test_data/raw/create_synth_raw_float.py @@ -9,30 +9,14 @@ # Get project root directory column_specs = [ - { - "entity_id": { - "column_type": "uniform_int", - "min": 0, - "max": 10_000, - }, - }, - { - "timestamp": { - "column_type": "datetime_uniform", - "min": -5 * 365, - "max": 0 * 365, - }, - }, + {"entity_id": {"column_type": "uniform_int", "min": 0, "max": 10_000}}, + {"timestamp": {"column_type": "datetime_uniform", "min": -5 * 365, "max": 0 * 365}}, {"value": {"column_type": "uniform_float", "min": 0, "max": 10}}, ] for i in (1, 2): - df = generate_data_columns( - predictors=column_specs, - n_samples=100_000, - ) + df = generate_data_columns(predictors=column_specs, n_samples=100_000) df.to_csv( - PROJECT_ROOT / "tests" / "test_data" / "raw" / f"synth_raw_float_{i}.csv", - index=False, + PROJECT_ROOT / "tests" / "test_data" / "raw" / f"synth_raw_float_{i}.csv", index=False ) diff --git a/src/timeseriesflattener/testing/test_data/raw/create_synth_sex.py b/src/timeseriesflattener/testing/test_data/raw/create_synth_sex.py index d45f3fbd..1d818c54 100644 --- a/src/timeseriesflattener/testing/test_data/raw/create_synth_sex.py +++ b/src/timeseriesflattener/testing/test_data/raw/create_synth_sex.py @@ -11,24 +11,12 @@ project_root = Path(__file__).resolve().parents[3] column_specs = [ - { - "entity_id": { - "column_type": "uniform_int", - "min": 0, - "max": 10_000, - }, - }, + {"entity_id": {"column_type": "uniform_int", "min": 0, "max": 10_000}}, {"female": {"column_type": "uniform_int", "min": 0, "max": 2}}, ] - df = generate_data_columns( - predictors=column_specs, - n_samples=100_000, - ) + df = generate_data_columns(predictors=column_specs, n_samples=100_000) df = df.groupby("entity_id").last().reset_index() - df.to_csv( - project_root / "tests" / "test_data" / "raw" / "synth_sex.csv", - index=False, - ) + df.to_csv(project_root / "tests" / "test_data" / "raw" / "synth_sex.csv", index=False) diff --git a/src/timeseriesflattener/testing/utils_for_testing.py b/src/timeseriesflattener/testing/utils_for_testing.py index 8a214939..77888b1f 100644 --- a/src/timeseriesflattener/testing/utils_for_testing.py +++ b/src/timeseriesflattener/testing/utils_for_testing.py @@ -9,14 +9,11 @@ from pandas.testing import assert_series_equal from timeseriesflattener 
import TimeseriesFlattener from timeseriesflattener.feature_specs.single_specs import AnySpec -from timeseriesflattener.testing.load_synth_data import ( - synth_predictor_binary, -) +from timeseriesflattener.testing.load_synth_data import synth_predictor_binary def convert_cols_with_matching_colnames_to_datetime( - df: DataFrame, - colname_substr: str, + df: DataFrame, colname_substr: str ) -> DataFrame: """Convert columns that contain colname_substr in their name to datetimes. @@ -28,8 +25,7 @@ def convert_cols_with_matching_colnames_to_datetime( DataFrame: The converted df """ df.loc[:, df.columns.str.contains(colname_substr)] = df.loc[ - :, - df.columns.str.contains(colname_substr), + :, df.columns.str.contains(colname_substr) ].apply(pd.to_datetime) return df @@ -70,10 +66,7 @@ def str_to_df( return df.loc[:, ~df.columns.str.contains("^Unnamed")] -def _get_value_cols_based_on_spec( - df: pd.DataFrame, - spec: AnySpec, -) -> Union[str, List[str]]: +def _get_value_cols_based_on_spec(df: pd.DataFrame, spec: AnySpec) -> Union[str, List[str]]: """Get value columns based on spec. Checks if multiple value columns are present.""" feature_name = spec.feature_base_name value_cols = df.columns[df.columns.str.contains(feature_name)].tolist() @@ -101,16 +94,12 @@ def assert_flattened_data_as_expected( drop_pred_times_with_insufficient_look_distance=drop_pred_times_with_insufficient_look_distance, ) - flattened_ds.add_spec( - spec=output_spec, - ) + flattened_ds.add_spec(spec=output_spec) if expected_df: for col in expected_df.columns: assert_series_equal( - left=flattened_ds.get_df()[col], - right=expected_df[col], - check_dtype=False, + left=flattened_ds.get_df()[col], right=expected_df[col], check_dtype=False ) elif expected_values: output_df = flattened_ds.get_df() diff --git a/src/timeseriesflattener/tests/test_feature_cache/test_cache_to_disk.py b/src/timeseriesflattener/tests/test_feature_cache/test_cache_to_disk.py index bae11bd3..051816e2 100644 --- a/src/timeseriesflattener/tests/test_feature_cache/test_cache_to_disk.py +++ b/src/timeseriesflattener/tests/test_feature_cache/test_cache_to_disk.py @@ -10,9 +10,7 @@ from timeseriesflattener.feature_specs.single_specs import PredictorSpec -def test_write_and_check_feature( - tmp_path: Path, -): +def test_write_and_check_feature(tmp_path: Path): """Test that write_feature writes a feature to disk.""" cache = DiskCache( @@ -20,9 +18,7 @@ def test_write_and_check_feature( pred_time_uuid_col_name="pred_time_uuid", entity_id_col_name="entity_id", cache_file_suffix="csv", - prediction_times_df=pd.DataFrame( - {"uuid": [1, 2, 3], "pred_time_uuid": [1, 2, 3]}, - ), + prediction_times_df=pd.DataFrame({"uuid": [1, 2, 3], "pred_time_uuid": [1, 2, 3]}), ) values_df = pd.DataFrame( @@ -31,7 +27,7 @@ def test_write_and_check_feature( "pred_time_uuid": [1, 2, 3], "timestamp": [1, 2, 3], "value": [1, 2, 3], - }, + } ) test_spec = PredictorSpec( @@ -48,7 +44,7 @@ def test_write_and_check_feature( "pred_time_uuid": [1, 2, 3], "timestamp": [1, 2, 3], f"{test_spec.get_output_col_name()}": [1, 2, 3], - }, + } ) assert cache.feature_exists(feature_spec=test_spec) is False @@ -75,17 +71,11 @@ def test_read_feature(tmp_path: Path): entity_id_col_name="entity_id", timestamp_col_name="timestamp", cache_file_suffix="csv", - prediction_times_df=pd.DataFrame( - {"pred_time_uuid": [1, 2, 3], "entity_id": [1, 2, 3]}, - ), + prediction_times_df=pd.DataFrame({"pred_time_uuid": [1, 2, 3], "entity_id": [1, 2, 3]}), ) values_df = pd.DataFrame( - { - "entity_id": [1, 2, 3, 4, 5], 
- "timestamp": [1, 2, 3, 4, 5], - "value": [1, 2, 3, 4, 5], - }, + {"entity_id": [1, 2, 3, 4, 5], "timestamp": [1, 2, 3, 4, 5], "value": [1, 2, 3, 4, 5]} ) test_spec = PredictorSpec( @@ -98,15 +88,11 @@ def test_read_feature(tmp_path: Path): generated_df = pd.DataFrame( { - "entity_id": [ - 1, - 2, - 3, - ], + "entity_id": [1, 2, 3], "pred_time_uuid": [1, 2, 3], "timestamp": [1, 2, 3], f"{test_spec.get_output_col_name()}": [1, 2, np.nan], - }, + } ) cache.write_feature(feature_spec=test_spec, df=generated_df) @@ -115,7 +101,4 @@ def test_read_feature(tmp_path: Path): # For each column in df, check that the values are equal to generated_df for col in df.columns: - assert_frame_equal( - df[col].to_frame(), - generated_df[col].to_frame(), - ) + assert_frame_equal(df[col].to_frame(), generated_df[col].to_frame()) diff --git a/src/timeseriesflattener/tests/test_timeseriesflattener/test_aggregation_fns.py b/src/timeseriesflattener/tests/test_timeseriesflattener/test_aggregation_fns.py index ed2c5b64..59b3e0fe 100644 --- a/src/timeseriesflattener/tests/test_timeseriesflattener/test_aggregation_fns.py +++ b/src/timeseriesflattener/tests/test_timeseriesflattener/test_aggregation_fns.py @@ -15,10 +15,7 @@ summed, variance, ) -from timeseriesflattener.feature_specs.single_specs import ( - OutcomeSpec, - PredictorSpec, -) +from timeseriesflattener.feature_specs.single_specs import OutcomeSpec, PredictorSpec from timeseriesflattener.testing.utils_for_testing import ( assert_flattened_data_as_expected, str_to_df, diff --git a/src/timeseriesflattener/tests/test_timeseriesflattener/test_df_transforms.py b/src/timeseriesflattener/tests/test_timeseriesflattener/test_df_transforms.py index 3bfc9764..a94681fb 100644 --- a/src/timeseriesflattener/tests/test_timeseriesflattener/test_df_transforms.py +++ b/src/timeseriesflattener/tests/test_timeseriesflattener/test_df_transforms.py @@ -1,9 +1,7 @@ import pytest from pandas import DataFrame -from timeseriesflattener.df_transforms import ( - df_with_multiple_values_to_named_dataframes, -) +from timeseriesflattener.df_transforms import df_with_multiple_values_to_named_dataframes from timeseriesflattener.testing.utils_for_testing import str_to_df @@ -15,9 +13,7 @@ def df_with_multiple_values() -> DataFrame: return str_to_df(df_str) -def test_df_with_multiple_values_to_named_dataframes( - df_with_multiple_values: DataFrame, -) -> None: +def test_df_with_multiple_values_to_named_dataframes(df_with_multiple_values: DataFrame) -> None: dfs = df_with_multiple_values_to_named_dataframes( df=df_with_multiple_values, entity_id_col_name="entity_id", @@ -34,13 +30,13 @@ def test_df_with_multiple_values_to_named_dataframes( str_to_df( """entity_id,timestamp,value, 1,2021-12-30 00:00:01, 1 - 1,2021-12-29 00:00:02, 2""", - ), + 1,2021-12-29 00:00:02, 2""" + ) ) assert dfs[1].df.equals( str_to_df( """entity_id,timestamp,value, 1,2021-12-30 00:00:01, 2 - 1,2021-12-29 00:00:02, 3""", - ), + 1,2021-12-29 00:00:02, 3""" + ) ) diff --git a/src/timeseriesflattener/tests/test_timeseriesflattener/test_feature_spec_objects.py b/src/timeseriesflattener/tests/test_timeseriesflattener/test_feature_spec_objects.py index 4d0f4b3f..a5050219 100644 --- a/src/timeseriesflattener/tests/test_timeseriesflattener/test_feature_spec_objects.py +++ b/src/timeseriesflattener/tests/test_timeseriesflattener/test_feature_spec_objects.py @@ -25,7 +25,7 @@ def test_skip_all_if_no_need_to_process(empty_named_df: NamedDataframe): lookbehind_days=[1], aggregation_fns=[maximum], fallback=[0], - 
).create_combinations(), + ).create_combinations() ) == 1 ) diff --git a/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_add_values.py b/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_add_values.py index 167fd284..136d77dd 100644 --- a/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_add_values.py +++ b/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_add_values.py @@ -7,11 +7,7 @@ from timeseriesflattener import TimeseriesFlattener from timeseriesflattener.aggregation_fns import latest, maximum, minimum -from timeseriesflattener.feature_specs.single_specs import ( - OutcomeSpec, - PredictorSpec, - StaticSpec, -) +from timeseriesflattener.feature_specs.single_specs import OutcomeSpec, PredictorSpec, StaticSpec from timeseriesflattener.testing.utils_for_testing import ( assert_flattened_data_as_expected, str_to_df, @@ -23,12 +19,12 @@ def test_predictor_after_prediction_time(): prediction_times_df = str_to_df( """entity_id,timestamp, 1,2021-12-31 00:00:00 - """, + """ ) predictor_df = str_to_df( """entity_id,timestamp,value, 1,2022-01-01 00:00:01, 1.0 - """, + """ ) assert_flattened_data_as_expected( @@ -273,20 +269,12 @@ def test_static_predictor(): dataset.add_spec( StaticSpec( # type: ignore - timeseries_df=str_to_df(static_predictor), - feature_base_name=feature_name, - prefix=prefix, - ), + timeseries_df=str_to_df(static_predictor), feature_base_name=feature_name, prefix=prefix + ) ) expected_values = pd.DataFrame( - { - output_col_name: [ - "1994-12-31 00:00:01", - "1994-12-31 00:00:01", - "1994-12-31 00:00:01", - ], - }, + {output_col_name: ["1994-12-31 00:00:01", "1994-12-31 00:00:01", "1994-12-31 00:00:01"]} ) pd.testing.assert_series_equal( @@ -319,15 +307,7 @@ def test_add_age(): output_prefix=output_prefix, ) - expected_values = pd.DataFrame( - { - f"{output_prefix}_age_in_years": [ - 0.0, - 27.0, - 27.0, - ], - }, - ) + expected_values = pd.DataFrame({f"{output_prefix}_age_in_years": [0.0, 27.0, 27.0]}) pd.testing.assert_series_equal( left=dataset.get_df()["eval_age_in_years"].reset_index(drop=True), @@ -353,8 +333,7 @@ def test_add_age_error(): with pytest.raises(ValueError, match=".*Recommend converting.*"): dataset.add_age( - date_of_birth_df=str_to_df(static_predictor), - date_of_birth_col_name="date_of_birth", + date_of_birth_df=str_to_df(static_predictor), date_of_birth_col_name="date_of_birth" ) @@ -387,7 +366,7 @@ def test_incident_addition_with_multiple_timestamps_raises_meaningful_error(): fallback=np.NaN, feature_base_name="value", aggregation_fn=maximum, - ), + ) ) with pytest.raises(ValueError, match="Since incident = True"): @@ -434,17 +413,13 @@ def test_incident_outcome_removing_prediction_times(): fallback=np.NaN, feature_base_name="value", aggregation_fn=maximum, - ), + ) ) outcome_df = flattened_dataset.get_df().reset_index(drop=True) for col in expected_df.columns: - pd.testing.assert_series_equal( - outcome_df[col], - expected_df[col], - check_dtype=False, - ) + pd.testing.assert_series_equal(outcome_df[col], expected_df[col], check_dtype=False) def test_add_multiple_static_predictors(): @@ -504,16 +479,13 @@ def test_add_multiple_static_predictors(): spec=[ output_spec, StaticSpec( # type: ignore - timeseries_df=male_df, - feature_base_name="male", - prefix="pred", + timeseries_df=male_df, feature_base_name="male", prefix="pred" ), - ], + ] ) flattened_dataset.add_age( - date_of_birth_col_name="date_of_birth", - 
date_of_birth_df=birthdates_df,
+        date_of_birth_col_name="date_of_birth", date_of_birth_df=birthdates_df
     )

     outcome_df = flattened_dataset.get_df()
@@ -584,7 +556,7 @@ def test_add_temporal_predictors_then_temporal_outcome():
                 incident=True,
                 feature_base_name="value",
             ),
-        ],
+        ]
     )

     outcome_df = flattened_dataset.get_df().set_index("entity_id").sort_index()
@@ -592,10 +564,7 @@

     for col in expected_df.columns:
         pd.testing.assert_series_equal(
-            outcome_df[col],
-            expected_df[col],
-            check_index=False,
-            check_dtype=False,
+            outcome_df[col], expected_df[col], check_index=False, check_dtype=False
         )


@@ -634,7 +603,7 @@ def test_add_temporal_incident_binary_outcome():
             fallback=np.NaN,
             feature_base_name="value",
             aggregation_fn=maximum,
-        ),
+        )
     )

     outcome_df = flattened_dataset.get_df()
@@ -692,7 +661,7 @@ def test_add_outcome_timestamps():
             fallback=np.NaN,
             feature_base_name="timestamp",
             aggregation_fn=latest,
-        ),
+        )
     )

     outcome_df = flattened_dataset.get_df()
diff --git a/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_cache.py b/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_cache.py
index cb15a8b6..62a9fbb5 100644
--- a/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_cache.py
+++ b/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_cache.py
@@ -10,10 +10,7 @@

 from timeseriesflattener.aggregation_fns import maximum, mean
 from timeseriesflattener.feature_cache.cache_to_disk import DiskCache
-from timeseriesflattener.feature_specs.group_specs import (
-    NamedDataframe,
-    PredictorGroupSpec,
-)
+from timeseriesflattener.feature_specs.group_specs import NamedDataframe, PredictorGroupSpec
 from timeseriesflattener.feature_specs.single_specs import PredictorSpec
 from timeseriesflattener.testing.load_synth_data import (
     load_synth_prediction_times,
@@ -27,7 +24,7 @@

 base_float_predictor_combinations = PredictorGroupSpec(
     named_dataframes=[
-        NamedDataframe(df=load_synth_predictor_float(), name="synth_predictor_float"),
+        NamedDataframe(df=load_synth_predictor_float(), name="synth_predictor_float")
     ],
     lookbehind_days=[365, 730],
     aggregation_fns=[mean],
@@ -35,9 +32,7 @@
 ).create_combinations()

 base_binary_predictor_combinations = PredictorGroupSpec(
-    named_dataframes=[
-        NamedDataframe(df=synth_predictor_binary(), name="synth_predictor_binary"),
-    ],
+    named_dataframes=[NamedDataframe(df=synth_predictor_binary(), name="synth_predictor_binary")],
     lookbehind_days=[365, 730],
     aggregation_fns=[maximum],
     fallback=[np.NaN],
@@ -45,13 +40,10 @@


 @pytest.mark.parametrize(
-    "predictor_specs",
-    [base_float_predictor_combinations, base_binary_predictor_combinations],
+    "predictor_specs", [base_float_predictor_combinations, base_binary_predictor_combinations]
 )
 def test_cache_hitting(
-    tmp_path: Path,
-    predictor_specs: List[PredictorSpec],
-    synth_prediction_times: pd.DataFrame,
+    tmp_path: Path, predictor_specs: List[PredictorSpec], synth_prediction_times: pd.DataFrame
 ):
     """Test that cache hits."""

diff --git a/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_concatenation.py b/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_concatenation.py
index efa8fe86..52e2689d 100644
--- a/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_concatenation.py
+++ b/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_concatenation.py
@@ -51,10 +51,7 @@ def test_benchmark_full_index_comparison_before_concatenate():
     # 0.004 seconds for 9 dfs when sampling 5_000 rows
     # 0.033 seconds for 9 dfs when sampling 100_000 rows
     # 7.622 seconds for 100 dfs when sampling 2_000_000 rows
-    compute_seconds = benchmark(
-        TimeseriesFlattener._check_dfs_are_ready_for_concat,
-        dfs,
-    )
+    compute_seconds = benchmark(TimeseriesFlattener._check_dfs_are_ready_for_concat, dfs)

     assert compute_seconds < 4
diff --git a/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_errors.py b/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_errors.py
index 7d81cc2a..90a956d0 100644
--- a/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_errors.py
+++ b/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_errors.py
@@ -52,7 +52,7 @@ def test_col_does_not_exist():
             aggregation_fn=maximum,
             fallback=2,
             feature_base_name="value",
-        ),
+        )
     )

diff --git a/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_flattened_dataset.py b/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_flattened_dataset.py
index 7c009889..eb662956 100644
--- a/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_flattened_dataset.py
+++ b/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/test_flattened_dataset.py
@@ -4,11 +4,7 @@
 import pytest

 from timeseriesflattener.aggregation_fns import latest, mean
-from timeseriesflattener.feature_specs.single_specs import (
-    OutcomeSpec,
-    PredictorSpec,
-    StaticSpec,
-)
+from timeseriesflattener.feature_specs.single_specs import OutcomeSpec, PredictorSpec, StaticSpec
 from timeseriesflattener.flattened_dataset import TimeseriesFlattener


@@ -35,11 +31,7 @@ def test_add_spec(synth_prediction_times: pd.DataFrame, synth_outcome: pd.DataFr
         aggregation_fn=mean,
         fallback=np.nan,
     )
-    static_spec = StaticSpec(
-        timeseries_df=synth_outcome,
-        feature_base_name="static",
-        prefix="pred",
-    )
+    static_spec = StaticSpec(timeseries_df=synth_outcome, feature_base_name="static", prefix="pred")

     # Test adding a single spec
     dataset.add_spec(outcome_spec)
@@ -55,10 +47,7 @@ def test_add_spec(synth_prediction_times: pd.DataFrame, synth_outcome: pd.DataFr
         dataset.add_spec("invalid spec")  # type: ignore


-def test_compute_specs(
-    synth_prediction_times: pd.DataFrame,
-    synth_outcome: pd.DataFrame,
-):
+def test_compute_specs(synth_prediction_times: pd.DataFrame, synth_outcome: pd.DataFrame):
     # Create an instance of the class that contains the `add_spec` method
     dataset = TimeseriesFlattener(
         prediction_times_df=synth_prediction_times,
@@ -102,7 +91,7 @@ def test_drop_pred_time_if_insufficient_look_distance():
         {
             "entity_id": [1, 1, 1, 1],
             "datetime": ["2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04"],
-        },
+        }
     )

     ts_flattener = TimeseriesFlattener(
@@ -111,13 +100,7 @@
         timestamp_col_name="datetime",
     )

-    pred_val_df = pd.DataFrame(
-        {
-            "entity_id": [1],
-            "datetime": ["2022-01-01"],
-            "value": [1],
-        },
-    )
+    pred_val_df = pd.DataFrame({"entity_id": [1], "datetime": ["2022-01-01"], "value": [1]})

     # Create a sample set of specs
     predictor_spec = PredictorSpec(
@@ -128,13 +111,7 @@
         feature_base_name="test_feature",
     )

-    out_val_df = pd.DataFrame(
-        {
-            "entity_id": [1],
-            "datetime": ["2022-01-05"],
-            "value": [4],
-        },
-    )
+    out_val_df = pd.DataFrame({"entity_id": [1], "datetime": ["2022-01-05"], "value": [4]})

     outcome_spec = OutcomeSpec(
         timeseries_df=out_val_df,
@@ -162,7 +139,7 @@ def test_double_compute_doesn_not_duplicate_columns():
         {
             "entity_id": [1, 1, 2, 2],
             "date": ["2020-01-01", "2020-02-01", "2020-02-01", "2020-03-01"],
-        },
+        }
     )
     # Load a dataframe with raw values you wish to aggregate as predictors
     predictor_df = pd.DataFrame(
@@ -178,7 +155,7 @@
                 "2020-03-16",
             ],
             "value": [1, 2, 3, 4, 4, 5, 6],
-        },
+        }
     )

     predictor_spec = PredictorSpec(
diff --git a/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/utils.py b/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/utils.py
index 2c6b0fc7..dfac41b9 100644
--- a/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/utils.py
+++ b/src/timeseriesflattener/tests/test_timeseriesflattener/test_flattened_dataset/utils.py
@@ -70,7 +70,7 @@ def create_flattened_df(
     )

     flat_ds.add_spec(
-        spec=predictor_specs,  # type: ignore
+        spec=predictor_specs  # type: ignore
     )

     return flat_ds.get_df()
diff --git a/src/timeseriesflattenerv2/__init__.py b/src/timeseriesflattenerv2/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/timeseriesflattenerv2/feature_specs.py b/src/timeseriesflattenerv2/feature_specs.py
new file mode 100644
index 00000000..df93da01
--- /dev/null
+++ b/src/timeseriesflattenerv2/feature_specs.py
@@ -0,0 +1,98 @@
+import datetime as dt
+from dataclasses import dataclass
+from typing import Protocol, Sequence, Union
+
+import polars as pl
+
+Fallback = Union[int, float, str]
+LookDistance = dt.timedelta
+
+# TODO: Add validation that all entity_id and timestamp columns are the same
+
+default_entity_id_col_name = "entity_id"
+default_pred_time_uuid_col_name = "pred_time_uuid"
+default_pred_time_col_name = "pred_timestamp"
+
+
+@dataclass
+class PredictionTimeFrame:
+    df: pl.LazyFrame
+    entity_id_col_name: str = default_entity_id_col_name
+    timestamp_col_name: str = default_pred_time_col_name
+    pred_time_uuid_col_name: str = default_pred_time_uuid_col_name
+
+    def __post_init__(self):
+        self.df = self.df.with_columns(
+            pl.concat_str(
+                pl.col(self.entity_id_col_name), pl.lit("-"), pl.col(self.timestamp_col_name)
+            ).alias(self.pred_time_uuid_col_name)
+        )
+
+    def to_lazyframe_with_uuid(self) -> pl.LazyFrame:
+        return self.df
+
+
+@dataclass(frozen=True)
+class ValueFrame:
+    """A frame that contains the values of a time series."""
+
+    df: pl.LazyFrame
+    value_type: str
+    entity_id_col_name: str = default_entity_id_col_name
+    value_timestamp_col_name: str = "value_timestamp"
+
+
+@dataclass(frozen=True)
+class SlicedFrame:
+    """A frame that has been sliced by a lookdirection."""
+
+    df: pl.LazyFrame
+    pred_time_uuid_col_name: str = default_pred_time_uuid_col_name
+    value_col_name: str = "value"
+
+
+@dataclass(frozen=True)
+class AggregatedValueFrame:
+    df: pl.LazyFrame
+    pred_time_uuid_col_name: str = default_pred_time_uuid_col_name
+    value_col_name: str = "value"
+
+
+class Aggregator(Protocol):
+    name: str
+
+    def apply(self, value_frame: SlicedFrame, column_name: str) -> AggregatedValueFrame:
+        ...
+
+
+@dataclass(frozen=True)
+class PredictorSpec:
+    value_frame: ValueFrame
+    lookbehind_distances: Sequence[LookDistance]
+    aggregators: Sequence[Aggregator]
+    fallbacks: Sequence[Fallback]
+
+
+@dataclass(frozen=True)
+class OutcomeSpec:
+    value_frame: ValueFrame
+    lookahead_distances: Sequence[LookDistance]
+    aggregators: Sequence[Aggregator]
+    fallbacks: Sequence[Fallback]
+
+
+@dataclass(frozen=True)
+class TimedeltaFrame:
+    df: pl.LazyFrame
+    pred_time_uuid_col_name: str = default_pred_time_uuid_col_name
+    timedelta_col_name: str = "time_from_prediction_to_value"
+    value_col_name: str = "value"
+
+
+ValueSpecification = Union[PredictorSpec, OutcomeSpec]
+
+
+@dataclass(frozen=True)
+class AggregatedFrame:
+    pred_time_uuid_col_name: str
+    timestamp_col_name: str
diff --git a/src/timeseriesflattenerv2/flattener.py b/src/timeseriesflattenerv2/flattener.py
new file mode 100644
index 00000000..4c503d25
--- /dev/null
+++ b/src/timeseriesflattenerv2/flattener.py
@@ -0,0 +1,114 @@
+from dataclasses import dataclass
+from typing import Sequence
+
+import polars as pl
+from iterpy._iter import Iter
+
+from .feature_specs import (
+    AggregatedValueFrame,
+    Aggregator,
+    LookDistance,
+    OutcomeSpec,
+    PredictionTimeFrame,
+    PredictorSpec,
+    SlicedFrame,
+    TimedeltaFrame,
+    ValueFrame,
+    ValueSpecification,
+)
+
+
+def _aggregate_within_slice(
+    sliced_frame: SlicedFrame, aggregators: Sequence[Aggregator]
+) -> Iter[AggregatedValueFrame]:
+    aggregated_value_frames = [
+        aggregator.apply(SlicedFrame(sliced_frame.df), column_name=sliced_frame.value_col_name)
+        for aggregator in aggregators
+    ]
+
+    return Iter(
+        AggregatedValueFrame(
+            df=frame.df,
+            pred_time_uuid_col_name=sliced_frame.pred_time_uuid_col_name,
+            value_col_name=sliced_frame.value_col_name,
+        )
+        for frame in aggregated_value_frames
+    )
+
+
+def _slice_frame(timedelta_frame: TimedeltaFrame, distance: LookDistance) -> SlicedFrame:
+    sliced_frame = timedelta_frame.df.filter(pl.col(timedelta_frame.timedelta_col_name) <= distance)
+
+    return SlicedFrame(
+        df=sliced_frame,
+        pred_time_uuid_col_name=timedelta_frame.pred_time_uuid_col_name,
+        value_col_name=timedelta_frame.value_col_name,
+    )
+
+
+def _slice_and_aggregate_spec(
+    timedelta_frame: TimedeltaFrame, distance: LookDistance, aggregators: Sequence[Aggregator]
+) -> Iter[AggregatedValueFrame]:
+    sliced_frame = _slice_frame(timedelta_frame, distance)
+    return _aggregate_within_slice(sliced_frame, aggregators)
+
+
+def _normalise_lookdistances(spec: ValueSpecification) -> Sequence[LookDistance]:
+    if isinstance(spec, PredictorSpec):
+        lookdistances = [-distance for distance in spec.lookbehind_distances]
+    elif isinstance(spec, OutcomeSpec):
+        lookdistances = spec.lookahead_distances
+    else:
+        raise ValueError("Unknown spec type")
+    return lookdistances
+
+
+def _horizontally_concatenate_dfs(dfs: Sequence[pl.LazyFrame]) -> pl.LazyFrame:
+    # Run some checks on the dfs
+    return pl.concat(dfs, how="horizontal")
+
+
+@dataclass
+class Flattener:
+    predictiontime_frame: PredictionTimeFrame
+
+    def _get_timedelta_frame(self, spec: ValueSpecification) -> TimedeltaFrame:
+        # Join the prediction time dataframe
+        joined_frame = self.predictiontime_frame.to_lazyframe_with_uuid().join(
+            spec.value_frame.df, on=self.predictiontime_frame.entity_id_col_name
+        )
+
+        # Get timedelta
+        timedelta_frame = joined_frame.with_columns(
+            (
+                pl.col(spec.value_frame.value_timestamp_col_name)
+                - pl.col(self.predictiontime_frame.timestamp_col_name)
+            ).alias("time_from_prediction_to_value")
+        )
+
+        return TimedeltaFrame(timedelta_frame)
+
+    def _process_spec(self, spec: ValueSpecification) -> ValueFrame:
+        lookdistances = _normalise_lookdistances(spec)
+        timedelta_frame = self._get_timedelta_frame(spec)
+
+        aggregated_value_frames = (
+            Iter(lookdistances)
+            .map(
+                lambda distance: _slice_and_aggregate_spec(
+                    timedelta_frame=timedelta_frame, distance=distance, aggregators=spec.aggregators
+                )
+            )
+            .flatten()
+        )
+
+        return ValueFrame(
+            df=_horizontally_concatenate_dfs([f.df for f in aggregated_value_frames.to_list()]),
+            value_type=spec.value_frame.value_type,
+            entity_id_col_name=spec.value_frame.entity_id_col_name,
+            value_timestamp_col_name=spec.value_frame.value_timestamp_col_name,
+        )
+
+    def aggregate_timeseries(self, specs: Sequence[ValueSpecification]) -> AggregatedValueFrame:
+        dfs = Iter(specs).map(self._process_spec).map(lambda x: x.df).to_list()
+        return AggregatedValueFrame(df=_horizontally_concatenate_dfs(dfs))
diff --git a/src/timeseriesflattenerv2/test_flattener.py b/src/timeseriesflattenerv2/test_flattener.py
new file mode 100644
index 00000000..ae8b2b3e
--- /dev/null
+++ b/src/timeseriesflattenerv2/test_flattener.py
@@ -0,0 +1,60 @@
+import datetime as dt
+from dataclasses import dataclass
+
+import polars as pl
+from timeseriesflattener.testing.utils_for_testing import str_to_df
+
+from timeseriesflattenerv2.flattener import Flattener
+
+from .feature_specs import (
+    AggregatedValueFrame,
+    Aggregator,
+    PredictionTimeFrame,
+    PredictorSpec,
+    SlicedFrame,
+    ValueFrame,
+)
+
+
+@dataclass
+class MeanAggregator(Aggregator):
+    name: str = "mean"
+
+    def apply(self, sliced_frame: SlicedFrame, column_name: str) -> AggregatedValueFrame:
+        df = sliced_frame.df.group_by(pl.col(sliced_frame.pred_time_uuid_col_name)).agg(
+            pl.col(column_name).mean().alias(column_name)
+        )
+        # TODO: Figure out how to standardise the output column names
+
+        return AggregatedValueFrame(df=df)
+
+
+def test_flattener():
+    pred_frame = str_to_df(
+        """entity_id,pred_timestamp
+        1,2021-01-03"""
+    )
+
+    value_frame = str_to_df(
+        """entity_id,value,value_timestamp
+        1,1,2021-01-01
+        1,2,2021-01-02
+        1,3,2021-01-03"""
+    )
+
+    result = Flattener(
+        predictiontime_frame=PredictionTimeFrame(df=pl.from_pandas(pred_frame).lazy())
+    ).aggregate_timeseries(
+        specs=[
+            PredictorSpec(
+                value_frame=ValueFrame(
+                    df=pl.from_pandas(value_frame).lazy(), value_type="test_value"
+                ),
+                lookbehind_distances=[dt.timedelta(days=1)],
+                aggregators=[MeanAggregator()],
+                fallbacks=["NaN"],
+            )
+        ]
+    )
+
+    assert isinstance(result, AggregatedValueFrame)
diff --git a/tasks.py b/tasks.py
index e484224d..f1303aae 100644
--- a/tasks.py
+++ b/tasks.py
@@ -56,16 +56,11 @@ def git_init(c: Context, branch: str = "main"):
         print(f"{Emo.GOOD} Git repository already initialized")


-def setup_venv(
-    c: Context,
-    python_version: str,
-) -> str:
+def setup_venv(c: Context, python_version: str) -> str:
     venv_name = f'.venv{python_version.replace(".", "")}'

     if not Path(venv_name).exists():
-        echo_header(
-            f"{Emo.DO} Creating virtual environment for {python_version}{Emo.PY}",
-        )
+        echo_header(f"{Emo.DO} Creating virtual environment for {python_version}{Emo.PY}")
         c.run(f"python{python_version} -m venv {venv_name}")
         print(f"{Emo.GOOD} Virtual environment created")
     else:
@@ -88,11 +83,7 @@ def _add_commit(c: Context, msg: Optional[str] = None):


 def is_uncommitted_changes(c: Context) -> bool:
-    git_status_result: Result = c.run(
-        "git status --porcelain",
-        pty=True,
-        hide=True,
-    )
+    git_status_result: Result = c.run("git status --porcelain", pty=True, hide=True)

     uncommitted_changes = git_status_result.stdout != ""
     return uncommitted_changes
@@ -101,15 +92,9 @@ def is_uncommitted_changes(c: Context) -> bool:
 def add_and_commit(c: Context, msg: Optional[str] = None):
     """Add and commit all changes."""
     if is_uncommitted_changes(c):
-        uncommitted_changes_descr = c.run(
-            "git status --porcelain",
-            pty=True,
-            hide=True,
-        ).stdout
+        uncommitted_changes_descr = c.run("git status --porcelain", pty=True, hide=True).stdout

-        echo_header(
-            f"{Emo.WARN} Uncommitted changes detected",
-        )
+        echo_header(f"{Emo.WARN} Uncommitted changes detected")

         for line in uncommitted_changes_descr.splitlines():
             print(f" {line.strip()}")
@@ -120,10 +105,7 @@ def add_and_commit(c: Context, msg: Optional[str] = None):


 def branch_exists_on_remote(c: Context) -> bool:
     branch_name = Path(".git/HEAD").read_text().split("/")[-1].strip()
-    branch_exists_result: Result = c.run(
-        f"git ls-remote --heads origin {branch_name}",
-        hide=True,
-    )
+    branch_exists_result: Result = c.run(f"git ls-remote --heads origin {branch_name}", hide=True)

     return branch_name in branch_exists_result.stdout
@@ -141,21 +123,14 @@ def update_branch(c: Context):


 def create_pr(c: Context):
-    c.run(
-        "gh pr create --web",
-        pty=True,
-    )
+    c.run("gh pr create --web", pty=True)


 def update_pr(c: Context):
     echo_header(f"{Emo.COMMUNICATE} Syncing PR")
     # Get current branch name
     branch_name = Path(".git/HEAD").read_text().split("/")[-1].strip()
-    pr_result: Result = c.run(
-        "gh pr list --state OPEN",
-        pty=False,
-        hide=True,
-    )
+    pr_result: Result = c.run("gh pr list --state OPEN", pty=False, hide=True)

     if branch_name not in pr_result.stdout:
         create_pr(c)
@@ -169,11 +144,12 @@ def exit_if_error_in_stdout(result: Result):

     # Find N remaining using regex
     if "error" in result.stdout:
-        errors_remaining = re.findall(r"\d+(?=( remaining))", result.stdout)[
-            0
-        ]  # testing
-        if errors_remaining != "0":
-            exit(0)
+        try:
+            errors_remaining = re.findall(r"\d+(?=( remaining))", result.stdout)[0]  # testing
+            if errors_remaining != "0":
+                exit(0)
+        except IndexError:
+            pass


 def pre_commit(c: Context):
@@ -207,7 +183,7 @@ def setup(c: Context, python_version: str = "3.9"):
     git_init(c)
     venv_name = setup_venv(c, python_version=python_version)
     print(
-        f"{Emo.DO} Activate your virtual environment by running: \n\n\t\t source {venv_name}/bin/activate \n",
+        f"{Emo.DO} Activate your virtual environment by running: \n\n\t\t source {venv_name}/bin/activate \n"
     )
     print(f"{Emo.DO} Then install the project by running: \n\n\t\t inv install\n")

@@ -222,9 +198,9 @@ def update(c: Context):
 @task(iterable="pytest_args")
 def test(
     c: Context,
-    python_versions: str = "3.9",
+    python_versions: str = "3.10",
     pytest_args: List[str] = [],  # noqa
-    testmon: bool = False,
+    testmon: bool = True,
 ):
     """Run tests"""
     echo_header(f"{Emo.TEST} Running tests")
@@ -249,9 +225,7 @@ def test(
     python_version_arg_string = ",".join(python_version_strings)

     test_result: Result = c.run(
-        f"tox -e {python_version_arg_string} -- {pytest_arg_str}",
-        warn=True,
-        pty=True,
+        f"tox -e {python_version_arg_string} -- {pytest_arg_str}", warn=True, pty=True
     )

     # If "failed" in the pytest results
@@ -260,9 +234,7 @@ def test(
         echo_header("Failed tests")

         # Get lines with "FAILED" in them from the .pytest_results file
-        failed_tests = [
-            line for line in test_result.stdout if line.startswith("FAILED")
-        ]
+        failed_tests = [line for line in test_result.stdout if line.startswith("FAILED")]

         for line in failed_tests:
             # Remove from start of line until /test_
@@ -298,7 +270,7 @@ def lint(c: Context):
 @task
 def test_tutorials(c: Context):
     c.run(
-        "find docs/tutorials -name '*.ipynb' | grep -v 'nbconvert' | xargs jupyter nbconvert --to notebook --execute",
+        "find docs/tutorials -name '*.ipynb' | grep -v 'nbconvert' | xargs jupyter nbconvert --to notebook --execute"
    )
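
Usage sketch (not part of the diff): a minimal example of how the new timeseriesflattenerv2 API introduced above fits together, assuming polars and iterpy are installed. It mirrors test_flattener.py, but builds the frames directly with polars instead of the str_to_df test helper; MeanAggregator is copied from the test because the diff ships no concrete aggregator in the library itself, and the value_type string is illustrative.

import datetime as dt
from dataclasses import dataclass

import polars as pl

from timeseriesflattenerv2.feature_specs import (
    AggregatedValueFrame,
    Aggregator,
    PredictionTimeFrame,
    PredictorSpec,
    SlicedFrame,
    ValueFrame,
)
from timeseriesflattenerv2.flattener import Flattener


@dataclass
class MeanAggregator(Aggregator):
    # Copied from test_flattener.py; the library only defines the Aggregator Protocol.
    name: str = "mean"

    def apply(self, sliced_frame: SlicedFrame, column_name: str) -> AggregatedValueFrame:
        # Mean of the value column per prediction-time uuid.
        df = sliced_frame.df.group_by(pl.col(sliced_frame.pred_time_uuid_col_name)).agg(
            pl.col(column_name).mean().alias(column_name)
        )
        return AggregatedValueFrame(df=df)


# Column names follow the defaults declared in feature_specs.py:
# "entity_id", "pred_timestamp", "value" and "value_timestamp".
pred_frame = pl.DataFrame(
    {"entity_id": [1], "pred_timestamp": [dt.datetime(2021, 1, 3)]}
).lazy()
value_frame = pl.DataFrame(
    {
        "entity_id": [1, 1, 1],
        "value": [1, 2, 3],
        "value_timestamp": [
            dt.datetime(2021, 1, 1),
            dt.datetime(2021, 1, 2),
            dt.datetime(2021, 1, 3),
        ],
    }
).lazy()

result = Flattener(
    predictiontime_frame=PredictionTimeFrame(df=pred_frame)
).aggregate_timeseries(
    specs=[
        PredictorSpec(
            value_frame=ValueFrame(df=value_frame, value_type="example_value"),
            lookbehind_distances=[dt.timedelta(days=1)],
            aggregators=[MeanAggregator()],
            fallbacks=["NaN"],
        )
    ]
)

# result is an AggregatedValueFrame wrapping a LazyFrame; collect to materialise it.
print(result.df.collect())

As in the test, only the returned type is meaningful so far: fallbacks are declared on the spec but are not yet applied anywhere in flattener.py.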
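Because Aggregator is a typing.Protocol, any object with a name attribute and a matching apply method can be passed in aggregators. Reusing the imports from the sketch above, a hypothetical max-aggregator (not part of the diff) would look like this:

@dataclass
class MaxAggregator(Aggregator):
    # Hypothetical example; structure mirrors MeanAggregator above.
    name: str = "max"

    def apply(self, sliced_frame: SlicedFrame, column_name: str) -> AggregatedValueFrame:
        # Maximum of the value column per prediction-time uuid.
        df = sliced_frame.df.group_by(pl.col(sliced_frame.pred_time_uuid_col_name)).agg(
            pl.col(column_name).max().alias(column_name)
        )
        return AggregatedValueFrame(df=df)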