From 9f32e9524df147cd5fb39dfcff0ef31c5375570d Mon Sep 17 00:00:00 2001
From: Sean Shahkarami <sean.shahkarami@gmail.com>
Date: Thu, 21 Sep 2023 16:46:16 -0500
Subject: [PATCH 01/12] use scheduled tasks to decide whether to check image
 and audio data

---
 rollup_health_and_sanity_metrics.py | 35 +++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/rollup_health_and_sanity_metrics.py b/rollup_health_and_sanity_metrics.py
index 81ecb9d..048dae3 100644
--- a/rollup_health_and_sanity_metrics.py
+++ b/rollup_health_and_sanity_metrics.py
@@ -3,6 +3,7 @@
 import pandas as pd
 import logging
 import sage_data_client
+import requests
 from typing import NamedTuple
 from utils import (
     load_node_table,
@@ -196,6 +197,32 @@
 }
 
 
+def get_scheduled_tasks_by_node():
+    """
+    Queries the cloud scheduler and returns a map of VSN -> [Plugin names across all running jobs for VSN]
+    """
+    r = requests.get("https://es.sagecontinuum.org/api/v1/jobs/list")
+    r.raise_for_status()
+    jobs = list(r.json().values())
+
+    # filter only running jobs
+    jobs = [job for job in jobs if job["state"]["last_state"] == "Running"]
+
+    tasks_by_node = {}
+
+    for job in jobs:
+        plugins = job.get("plugins") or []
+        nodes = job.get("nodes") or {}
+
+        for plugin in plugins:
+            for vsn in nodes.keys():
+                if vsn not in tasks_by_node:
+                    tasks_by_node[vsn] = []
+                tasks_by_node[vsn].append(plugin["name"])
+
+    return tasks_by_node
+
+
 def get_health_records_for_window(nodes, start, end, window):
     records = []
 
@@ -242,6 +269,8 @@ def add_device_health_check_record(vsn, device, value):
 
     vsn_groups = df.groupby(["meta.vsn"])
 
+    scheduled_tasks_by_node = get_scheduled_tasks_by_node()
+
     for node in nodes:
         try:
             df_vsn = vsn_groups.get_group(node.vsn)
@@ -261,9 +290,15 @@ def check_publishing_frequency_for_device(device, window):
                 except KeyError:
                     yield task, name, 0.0
 
+        scheduled_tasks = scheduled_tasks_by_node.get(node.vsn, [])
+
         def check_publishing_sla_for_device(device, window, sla):
             healthy = True
+
             for task, name, f in check_publishing_frequency_for_device(device, window):
+                # skip image and audio sampler tasks which are not scheduled
+                if "sampler" in task and task not in scheduled_tasks:
+                    continue
                 if f < sla:
                     healthy = False
                     logging.info(

From 018878590312ed7b063c2fb31ce73bc07601872c Mon Sep 17 00:00:00 2001
From: Sean Shahkarami <sean.shahkarami@gmail.com>
Date: Thu, 21 Sep 2023 16:49:36 -0500
Subject: [PATCH 02/12] fixed formatting

---
 rollup_health_and_sanity_metrics.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rollup_health_and_sanity_metrics.py b/rollup_health_and_sanity_metrics.py
index 048dae3..992aa29 100644
--- a/rollup_health_and_sanity_metrics.py
+++ b/rollup_health_and_sanity_metrics.py
@@ -311,6 +311,7 @@ def check_publishing_sla_for_device(device, window, sla):
                         name,
                         f,
                     )
+
             return healthy
 
         node_healthy = True

From e324d5e282999b7b4e0a6e0a30cfc8ba9622e620 Mon Sep 17 00:00:00 2001
From: Sean Shahkarami <sean.shahkarami@gmail.com>
Date: Thu, 21 Sep 2023 22:17:01 -0500
Subject: [PATCH 03/12] ignoring sys.cooling sys.freq sys.gps metrics until we
 can finish debugging collection

---
 rollup_health_and_sanity_metrics.py | 68 ++++++++++++++++-------------
 1 file changed, 37 insertions(+), 31 deletions(-)

diff --git a/rollup_health_and_sanity_metrics.py b/rollup_health_and_sanity_metrics.py
index 992aa29..a2d0391 100644
--- a/rollup_health_and_sanity_metrics.py
+++ b/rollup_health_and_sanity_metrics.py
@@ -15,24 +15,30 @@
 )
 
 
+# these metrics are coming in inconsistently. we should debug later
+# but to make the health report less red, we'll comment them out.
+# sys.cooling*
+# sys.freq*
+# sys.gps*
+
 sys_from_nxcore = {
     "sys.boot_time",
-    "sys.cooling",
-    "sys.cooling_max",
+    # "sys.cooling",
+    # "sys.cooling_max",
     "sys.cpu_seconds",
-    "sys.freq.ape",
-    "sys.freq.cpu",
-    "sys.freq.cpu_max",
-    "sys.freq.cpu_min",
-    "sys.freq.cpu_perc",
-    "sys.freq.emc",
-    "sys.freq.emc_max",
-    "sys.freq.emc_min",
-    "sys.freq.emc_perc",
-    "sys.freq.gpu",
-    "sys.freq.gpu_max",
-    "sys.freq.gpu_min",
-    "sys.freq.gpu_perc",
+    # "sys.freq.ape",
+    # "sys.freq.cpu",
+    # "sys.freq.cpu_max",
+    # "sys.freq.cpu_min",
+    # "sys.freq.cpu_perc",
+    # "sys.freq.emc",
+    # "sys.freq.emc_max",
+    # "sys.freq.emc_min",
+    # "sys.freq.emc_perc",
+    # "sys.freq.gpu",
+    # "sys.freq.gpu_max",
+    # "sys.freq.gpu_min",
+    # "sys.freq.gpu_perc",
     "sys.fs.avail",
     "sys.fs.size",
     "sys.hwmon",
@@ -59,7 +65,7 @@
     # "sys.gps.epy", # not sent with no GPS fix
     # "sys.gps.epv", # not sent with no GPS fix
     # "sys.gps.satellites", # not sent with no GPS fix
-    "sys.gps.mode",
+    # "sys.gps.mode",
 }
 
 sys_from_dellblade = {
@@ -103,22 +109,22 @@
 
 sys_from_nxagent = {
     "sys.boot_time",
-    "sys.cooling",
-    "sys.cooling_max",
+    # "sys.cooling",
+    # "sys.cooling_max",
     "sys.cpu_seconds",
-    "sys.freq.ape",
-    "sys.freq.cpu",
-    "sys.freq.cpu_max",
-    "sys.freq.cpu_min",
-    "sys.freq.cpu_perc",
-    "sys.freq.emc",
-    "sys.freq.emc_max",
-    "sys.freq.emc_min",
-    "sys.freq.emc_perc",
-    "sys.freq.gpu",
-    "sys.freq.gpu_max",
-    "sys.freq.gpu_min",
-    "sys.freq.gpu_perc",
+    # "sys.freq.ape",
+    # "sys.freq.cpu",
+    # "sys.freq.cpu_max",
+    # "sys.freq.cpu_min",
+    # "sys.freq.cpu_perc",
+    # "sys.freq.emc",
+    # "sys.freq.emc_max",
+    # "sys.freq.emc_min",
+    # "sys.freq.emc_perc",
+    # "sys.freq.gpu",
+    # "sys.freq.gpu_max",
+    # "sys.freq.gpu_min",
+    # "sys.freq.gpu_perc",
     "sys.fs.avail",
     "sys.fs.size",
     "sys.hwmon",

From cb28142e0f6506d793f8e89ae39e26ad946335a1 Mon Sep 17 00:00:00 2001
From: Sean Shahkarami <sean.shahkarami@gmail.com>
Date: Tue, 26 Sep 2023 11:48:07 -0500
Subject: [PATCH 04/12] bug fix: split sys tasks per device

---
 rollup_health_and_sanity_metrics.py | 20 ++++++++++++--------
 utils.py                            | 27 +++++++++++++++++----------
 2 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/rollup_health_and_sanity_metrics.py b/rollup_health_and_sanity_metrics.py
index a2d0391..6935936 100644
--- a/rollup_health_and_sanity_metrics.py
+++ b/rollup_health_and_sanity_metrics.py
@@ -4,7 +4,6 @@
 import logging
 import sage_data_client
 import requests
-from typing import NamedTuple
 from utils import (
     load_node_table,
     parse_time,
@@ -188,10 +187,10 @@
 # the possible devices on a node. the frequency is the minimum expected
 # publishing frequency
 device_output_table = {
-    "nxcore": [("sys", name, "120s") for name in sys_from_nxcore],
-    "nxagent": [("sys", name, "120s") for name in sys_from_nxagent],
-    "rpi": [("sys", name, "120s") for name in sys_from_rpi],
-    "dell": [("sys", name, "60s") for name in sys_from_dellblade],
+    "nxcore": [("nxcore", name, "120s") for name in sys_from_nxcore],
+    "nxagent": [("nxagent", name, "120s") for name in sys_from_nxagent],
+    "rpi": [("rpi", name, "120s") for name in sys_from_rpi],
+    "dell": [("dell", name, "60s") for name in sys_from_dellblade],
     "bme280": [("wes-iio-bme280", name, "30s") for name in outputs_from_bme],
     "bme680": [("wes-iio-bme680", name, "30s") for name in outputs_from_bme],
     "raingauge": [("wes-raingauge", name, "30s") for name in outputs_from_raingauge],
@@ -269,9 +268,14 @@ def add_device_health_check_record(vsn, device, value):
             }
         )
 
-    # NOTE metrics agent doesn't add a task name, so we set task name
-    # to system for system metrics.
-    df.loc[df["name"].str.startswith("sys."), "meta.task"] = "sys"
+    # NOTE derive task name from sys metrics using host
+    is_sys = df["name"].str.startswith("sys.")
+    df.loc[is_sys & df["meta.host"].str.endswith("nxcore"), "meta.task"] = "nxcore"
+    df.loc[is_sys & df["meta.host"].str.endswith("nxagent"), "meta.task"] = "nxagent"
+    # NOTE this will not really work for nodes with multiple rpis. we need to rethink this a bit
+    # in the future. for now, we want to fix the urgent problem of differentiating most sys metrics.
+    df.loc[is_sys & df["meta.host"].str.endswith("rpi"), "meta.task"] = "rpi"
+    df.loc[is_sys & df["meta.host"].str.endswith("sbcore"), "meta.task"] = "dell"
 
     vsn_groups = df.groupby(["meta.vsn"])
 
diff --git a/utils.py b/utils.py
index f60d3ab..c006de9 100644
--- a/utils.py
+++ b/utils.py
@@ -1,9 +1,13 @@
-
 import influxdb_client
-from influxdb_client.client.write_api import WriteOptions, WritePrecision, Point, WriteType
+from influxdb_client.client.write_api import (
+    WriteOptions,
+    WritePrecision,
+    Point,
+    WriteType,
+)
 import pandas as pd
 import requests
-from typing import NamedTuple
+from dataclasses import dataclass
 
 
 def write_results_to_influxdb(url, token, org, bucket, records):
@@ -18,9 +22,14 @@ def write_results_to_influxdb(url, token, org, bucket, records):
         p = p.time(int(r["timestamp"].timestamp()), write_precision=WritePrecision.S)
         data.append(p)
 
-    with influxdb_client.InfluxDBClient(url=url, token=token, org=org) as client, \
-         client.write_api(write_options=WriteOptions(batch_size=10000)) as write_api:
-        write_api.write(bucket=bucket, org=org, record=data, write_precision=WritePrecision.S)
+    with influxdb_client.InfluxDBClient(
+        url=url, token=token, org=org
+    ) as client, client.write_api(
+        write_options=WriteOptions(batch_size=10000)
+    ) as write_api:
+        write_api.write(
+            bucket=bucket, org=org, record=data, write_precision=WritePrecision.S
+        )
 
 
 def check_publishing_frequency(df, freq, window):
@@ -29,7 +38,8 @@ def check_publishing_frequency(df, freq, window):
     return total_samples / expected_samples
 
 
-class Node(NamedTuple):
+@dataclass
+class Node:
     id: str
     vsn: str
     type: str
@@ -54,11 +64,8 @@ def load_node_table_item(item):
         devices.add("nxagent")
     if item["shield"] is True:
         devices.add("rpi")
-    if item["shield"] is True:
         devices.add("raingauge")
-    if item["shield"] is True:
         devices.add("bme680")
-    if item["shield"] is True:
         devices.add("microphone")
 
     # add cameras

From 5e1e4906eca02f645c9f9a35e90a080543fadf5e Mon Sep 17 00:00:00 2001
From: Sean Shahkarami <sean.shahkarami@gmail.com>
Date: Thu, 28 Sep 2023 17:07:26 -0500
Subject: [PATCH 05/12] bumped sage-data-client to 0.7.1 to address loading
 large / small numbers

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index e4b399b..b088a0f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-sage-data-client==0.6.0
+sage-data-client==0.7.1
 slackclient
 influxdb-client[ciso]
 requests

From 00d949d9085b2ba13dd2435f48100dda313fd34f Mon Sep 17 00:00:00 2001
From: Sean Shahkarami <sean.shahkarami@gmail.com>
Date: Thu, 28 Sep 2023 17:13:53 -0500
Subject: [PATCH 06/12] upped version python 3.11

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 2305cac..76602e3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.8
+FROM python:3.11
 WORKDIR /app
 COPY requirements.txt .
 RUN pip3 install --no-cache-dir -r requirements.txt

From 2664a415e553a0f3c04139c5fa4d3819d287a060 Mon Sep 17 00:00:00 2001
From: Sean Shahkarami <sean.shahkarami@gmail.com>
Date: Wed, 17 Jul 2024 13:30:20 -0500
Subject: [PATCH 07/12] bumped sage-data-client version

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index b088a0f..783fbab 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-sage-data-client==0.7.1
+sage-data-client==0.8.0
 slackclient
 influxdb-client[ciso]
 requests

From 3cfccda3ac9580ffd4994e658f128df9dbbefd05 Mon Sep 17 00:00:00 2001
From: Sean Shahkarami <sean.shahkarami@gmail.com>
Date: Wed, 17 Jul 2024 13:32:17 -0500
Subject: [PATCH 08/12] INFLUXDB_ vars can use env vars. rename time_windows to
 get_time_windows and return list. added reverse flag. using experimental
 count func for plugin stats.

---
 rollup_health_and_sanity_metrics.py |  13 +++-
 rollup_plugin_counts.py             | 110 ++++++++++++++++++----------
 test_utils.py                       |  16 ++--
 utils.py                            |   4 +-
 4 files changed, 95 insertions(+), 48 deletions(-)

diff --git a/rollup_health_and_sanity_metrics.py b/rollup_health_and_sanity_metrics.py
index 6935936..70dc312 100644
--- a/rollup_health_and_sanity_metrics.py
+++ b/rollup_health_and_sanity_metrics.py
@@ -1,5 +1,6 @@
 import argparse
 import os
+from os import getenv
 import pandas as pd
 import logging
 import sage_data_client
@@ -8,7 +9,7 @@
     load_node_table,
     parse_time,
     get_rollup_range,
-    time_windows,
+    get_time_windows,
     write_results_to_influxdb,
     check_publishing_frequency,
 )
@@ -183,6 +184,10 @@
     "env.raingauge.total_acc",
 }
 
+# add a stuct here which either has count or interval so we can check this
+
+"nxcore": Check("nxcore", name, mean_publish_interval="3min")
+
 # device_output_table describes the output publishing policy for each of
 # the possible devices on a node. the frequency is the minimum expected
 # publishing frequency
@@ -426,8 +431,8 @@ def time_arg(s):
     )
 
     if not args.dry_run:
-        INFLUXDB_URL = "https://influxdb.sagecontinuum.org"
-        INFLUXDB_ORG = "waggle"
+        INFLUXDB_URL = getenv("INFLUXDB_URL", "https://influxdb.sagecontinuum.org")
+        INFLUXDB_ORG = getenv("INFLUXDB_ORG", "waggle")
         INFLUXDB_TOKEN = os.environ["INFLUXDB_TOKEN"]
 
     nodes = load_node_table()
@@ -436,7 +441,7 @@ def time_arg(s):
 
     logging.info("current time is %s", now)
 
-    for start, end in time_windows(start, end, window):
+    for start, end in get_time_windows(start, end, window):
         logging.info("getting health records in %s %s", start, end)
         health_records = get_health_records_for_window(nodes, start, end, window)
 
diff --git a/rollup_plugin_counts.py b/rollup_plugin_counts.py
index ccaa7f5..f0d6cef 100644
--- a/rollup_plugin_counts.py
+++ b/rollup_plugin_counts.py
@@ -1,70 +1,100 @@
 import argparse
 import os
+from os import getenv
 import pandas as pd
 import logging
 import sage_data_client
-import json
 
-from utils import load_node_table, parse_time, get_rollup_range, time_windows, write_results_to_influxdb
+from utils import (
+    load_node_table,
+    parse_time,
+    get_rollup_range,
+    get_time_windows,
+    write_results_to_influxdb,
+)
 
 
 def get_plugin_counts_for_window(nodes, start, end, convert_timestamps=False):
-    df = sage_data_client.query(start=start, end=end, filter={
-        "plugin": ".*"
-    })
+    df = sage_data_client.query(
+        start=start,
+        end=end,
+        filter={"plugin": ".*"},
+        experimental_func="count",
+    )
 
     df["timestamp"] = df["timestamp"].dt.round("1h")
-    df["total"] = 1
+    # experimental_func count returns the total counts as the value field
+    df["total"] = df["value"]
 
     # ignore records with "plugin.duration" for now
     df = df[df["name"].str.contains("plugin.duration") == False]
 
-    table = df.groupby(["meta.node", "meta.vsn", 'meta.plugin'])[["total"]].sum()
+    table = df.groupby(["meta.node", "meta.vsn", "meta.plugin"])[["total"]].sum()
 
     records = []
     for node in nodes:
         try:
             r = table.loc[(node.id, node.vsn)]
-            plugin_totals =  r["total"]
+            plugin_totals = r["total"]
         except KeyError:
-            continue # ignore if id & vsn combination not found
+            continue  # ignore if id & vsn combination not found
 
         for plugin_name, total in plugin_totals.items():
-            records.append({
-                "measurement": "total",
-                "tags": {
-                    "vsn": node.vsn,
-                    "node": node.id,
-                    "plugin": plugin_name
-                },
-                "fields": {
-                    "value": int(total), # is this necessary?
-                },
-                "timestamp": start.isoformat()+'Z' if convert_timestamps else start,
-            })
+            records.append(
+                {
+                    "measurement": "total",
+                    "tags": {"vsn": node.vsn, "node": node.id, "plugin": plugin_name},
+                    "fields": {
+                        "value": int(total),  # is this necessary?
+                    },
+                    "timestamp": (
+                        start.isoformat() + "Z" if convert_timestamps else start
+                    ),
+                }
+            )
 
     return records
 
 
 def main():
-    INFLUXDB_URL = "https://influxdb.sagecontinuum.org"
-    INFLUXDB_ORG = "waggle"
-    INFLUXDB_TOKEN = os.environ["INFLUXDB_TOKEN"]
-
     now = pd.to_datetime("now", utc=True)
+
     def time_arg(s):
         return parse_time(s, now=now)
 
     parser = argparse.ArgumentParser()
-    parser.add_argument("--start", default="-1h", type=time_arg, help="relative start time")
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="perform dry run to view logs. will skip writing results to influxdb.",
+    )
+    parser.add_argument(
+        "--start", default="-1h", type=time_arg, help="relative start time"
+    )
     parser.add_argument("--end", default="now", type=time_arg, help="relative end time")
-    parser.add_argument("--window", default="1h", type=pd.Timedelta, help="window duration to aggreagate over")
+    parser.add_argument(
+        "--window",
+        default="1h",
+        type=pd.Timedelta,
+        help="window duration to aggreagate over",
+    )
+    parser.add_argument(
+        "--reverse",
+        action="store_true",
+        help="reverse the rollup starting so it works from most recent to least recent",
+    )
     args = parser.parse_args()
 
     logging.basicConfig(
         level=logging.INFO,
         format="%(asctime)s %(message)s",
-        datefmt="%Y/%m/%d %H:%M:%S")
+        datefmt="%Y/%m/%d %H:%M:%S",
+    )
+
+    if not args.dry_run:
+        INFLUXDB_URL = getenv("INFLUXDB_URL", "https://influxdb.sagecontinuum.org")
+        INFLUXDB_ORG = getenv("INFLUXDB_ORG", "waggle")
+        INFLUXDB_TOKEN = os.environ["INFLUXDB_TOKEN"]
 
     nodes = load_node_table()
     start, end = get_rollup_range(args.start, args.end)
@@ -72,19 +102,25 @@ def time_arg(s):
 
     logging.info("current time is %s", now)
 
-    for start, end in time_windows(start, end, window):
+    time_windows = get_time_windows(start, end, window)
+
+    if args.reverse:
+        time_windows = reversed(time_windows)
+
+    for start, end in time_windows:
         logging.info("getting plugin counts for %s %s", start, end)
         records = get_plugin_counts_for_window(nodes, start, end)
-
         # print(json.dumps(records))
 
-        logging.info("writing %d sanity health records...", len(records))
-        write_results_to_influxdb(
-            url=INFLUXDB_URL,
-            org=INFLUXDB_ORG,
-            token=INFLUXDB_TOKEN,
-            bucket="plugin-stats",
-            records=records)
+        if not args.dry_run:
+            logging.info("writing %d plugin stats records...", len(records))
+            write_results_to_influxdb(
+                url=INFLUXDB_URL,
+                org=INFLUXDB_ORG,
+                token=INFLUXDB_TOKEN,
+                bucket="plugin-stats",
+                records=records,
+            )
 
     logging.info("done!")
 
diff --git a/test_utils.py b/test_utils.py
index 5c61765..15c5b53 100644
--- a/test_utils.py
+++ b/test_utils.py
@@ -1,4 +1,4 @@
-from utils import load_node_table, parse_time, get_rollup_range, time_windows
+from utils import load_node_table, parse_time, get_rollup_range, get_time_windows
 import pandas as pd
 import unittest
 
@@ -18,18 +18,24 @@ def test_parse_time(self):
         now = datetime("2021-10-11 10:34:23")
         self.assertEqual(parse_time("-3h", now=now), datetime("2021-10-11 07:34:23"))
         # check absolute time
-        self.assertEqual(parse_time("2021-10-11 11:22:33", now=now), datetime("2021-10-11 11:22:33"))
+        self.assertEqual(
+            parse_time("2021-10-11 11:22:33", now=now), datetime("2021-10-11 11:22:33")
+        )
 
     def test_get_rollup_range(self):
         now = datetime("2021-10-11 10:34:23")
-        start, end = get_rollup_range(parse_time("-3h", now=now), parse_time("-1h", now=now))
+        start, end = get_rollup_range(
+            parse_time("-3h", now=now), parse_time("-1h", now=now)
+        )
         self.assertEqual(start, datetime("2021-10-11 07:00:00"))
         self.assertEqual(end, datetime("2021-10-11 09:00:00"))
 
     def test_time_windows(self):
         now = datetime("2021-10-11 10:01:23")
-        start, end = get_rollup_range(parse_time("-3h", now=now), parse_time("-1h", now=now))
-        windows = list(time_windows(start, end, "1h"))
+        start, end = get_rollup_range(
+            parse_time("-3h", now=now), parse_time("-1h", now=now)
+        )
+        windows = list(get_time_windows(start, end, "1h"))
         expect = [
             (datetime("2021-10-11 07:00:00"), datetime("2021-10-11 08:00:00")),
             (datetime("2021-10-11 08:00:00"), datetime("2021-10-11 09:00:00")),
diff --git a/utils.py b/utils.py
index c006de9..402d000 100644
--- a/utils.py
+++ b/utils.py
@@ -82,9 +82,9 @@ def load_node_table_item(item):
     )
 
 
-def time_windows(start, end, freq):
+def get_time_windows(start, end, freq):
     windows = pd.date_range(start, end, freq=freq)
-    return zip(windows[:-1], windows[1:])
+    return list(zip(windows[:-1], windows[1:]))
 
 
 def parse_time(s, now=None):

From a6776a827cdc7e0a50f8ef183a4b4418639087a2 Mon Sep 17 00:00:00 2001
From: Sean Shahkarami <sean.shahkarami@gmail.com>
Date: Wed, 17 Jul 2024 13:37:32 -0500
Subject: [PATCH 09/12] added bucket env vars (with same defaults as before).
 added --reverse flag to health / sanity rollup

---
 rollup_health_and_sanity_metrics.py | 20 ++++++++++++++++----
 rollup_plugin_counts.py             |  3 ++-
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/rollup_health_and_sanity_metrics.py b/rollup_health_and_sanity_metrics.py
index 70dc312..19468fd 100644
--- a/rollup_health_and_sanity_metrics.py
+++ b/rollup_health_and_sanity_metrics.py
@@ -422,6 +422,11 @@ def time_arg(s):
         type=pd.Timedelta,
         help="window duration to aggreagate over",
     )
+    parser.add_argument(
+        "--reverse",
+        action="store_true",
+        help="reverse the rollup starting so it works from most recent to least recent",
+    )
     args = parser.parse_args()
 
     logging.basicConfig(
@@ -434,6 +439,8 @@ def time_arg(s):
         INFLUXDB_URL = getenv("INFLUXDB_URL", "https://influxdb.sagecontinuum.org")
         INFLUXDB_ORG = getenv("INFLUXDB_ORG", "waggle")
         INFLUXDB_TOKEN = os.environ["INFLUXDB_TOKEN"]
+        INFLUXDB_BUCKET_HEALTH = getenv("INFLUXDB_BUCKET_HEALTH", "health-check-test")
+        INFLUXDB_BUCKET_SANITY = getenv("INFLUXDB_BUCKET_SANITY", "downsampled-test")
 
     nodes = load_node_table()
     start, end = get_rollup_range(args.start, args.end)
@@ -441,7 +448,12 @@ def time_arg(s):
 
     logging.info("current time is %s", now)
 
-    for start, end in get_time_windows(start, end, window):
+    time_windows = get_time_windows(start, end, window)
+
+    if args.reverse:
+        time_windows = reversed(time_windows)
+
+    for start, end in time_windows:
         logging.info("getting health records in %s %s", start, end)
         health_records = get_health_records_for_window(nodes, start, end, window)
 
@@ -451,7 +463,7 @@ def time_arg(s):
                 url=INFLUXDB_URL,
                 org=INFLUXDB_ORG,
                 token=INFLUXDB_TOKEN,
-                bucket="health-check-test",
+                bucket=INFLUXDB_BUCKET_HEALTH,
                 records=health_records,
             )
 
@@ -459,12 +471,12 @@ def time_arg(s):
         sanity_records = get_sanity_records_for_window(nodes, start, end)
 
         if not args.dry_run:
-            logging.info("writing %d sanity health records...", len(sanity_records))
+            logging.info("writing %d sanity records...", len(sanity_records))
             write_results_to_influxdb(
                 url=INFLUXDB_URL,
                 org=INFLUXDB_ORG,
                 token=INFLUXDB_TOKEN,
-                bucket="downsampled-test",
+                bucket=INFLUXDB_BUCKET_SANITY,
                 records=sanity_records,
             )
 
diff --git a/rollup_plugin_counts.py b/rollup_plugin_counts.py
index f0d6cef..c5c38a6 100644
--- a/rollup_plugin_counts.py
+++ b/rollup_plugin_counts.py
@@ -95,6 +95,7 @@ def time_arg(s):
         INFLUXDB_URL = getenv("INFLUXDB_URL", "https://influxdb.sagecontinuum.org")
         INFLUXDB_ORG = getenv("INFLUXDB_ORG", "waggle")
         INFLUXDB_TOKEN = os.environ["INFLUXDB_TOKEN"]
+        INFLUXDB_BUCKET = getenv("INFLUXDB_BUCKET", "plugin-stats")
 
     nodes = load_node_table()
     start, end = get_rollup_range(args.start, args.end)
@@ -118,7 +119,7 @@ def time_arg(s):
                 url=INFLUXDB_URL,
                 org=INFLUXDB_ORG,
                 token=INFLUXDB_TOKEN,
-                bucket="plugin-stats",
+                bucket=INFLUXDB_BUCKET,
                 records=records,
             )
 

From a85c40c90461349951b076eccbf13e7afdd33995 Mon Sep 17 00:00:00 2001
From: Sean Shahkarami <sean.shahkarami@gmail.com>
Date: Wed, 17 Jul 2024 13:47:43 -0500
Subject: [PATCH 10/12] fixed line which was supposed to be a comment

---
 rollup_health_and_sanity_metrics.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/rollup_health_and_sanity_metrics.py b/rollup_health_and_sanity_metrics.py
index 19468fd..932dda0 100644
--- a/rollup_health_and_sanity_metrics.py
+++ b/rollup_health_and_sanity_metrics.py
@@ -185,8 +185,7 @@
 }
 
 # add a stuct here which either has count or interval so we can check this
-
-"nxcore": Check("nxcore", name, mean_publish_interval="3min")
+# "nxcore": Check("nxcore", name, mean_publish_interval="3min")
 
 # device_output_table describes the output publishing policy for each of
 # the possible devices on a node. the frequency is the minimum expected

From c0f0aa3b2605625e8ad72c7cfae22b0cf2692ab4 Mon Sep 17 00:00:00 2001
From: Sean Shahkarami <sean.shahkarami@gmail.com>
Date: Wed, 17 Jul 2024 13:48:54 -0500
Subject: [PATCH 11/12] added gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f7275bb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+venv/

From fd2148e08584e9815edb3072f99b5f4ca3d3fe25 Mon Sep 17 00:00:00 2001
From: Sean Shahkarami <sean.shahkarami@gmail.com>
Date: Wed, 17 Jul 2024 13:49:26 -0500
Subject: [PATCH 12/12] update .gitignore

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index f7275bb..d0ee3b1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
 venv/
+__pycache__/
+*.pyc