feat: better overhead power integration (#95)

Helmholtz-AI-Energy · Oct 25, 2023 · 4a0c984 · 4a0c984
1 parent e14b8a8
commit 4a0c984
Show file tree

Hide file tree

Showing 12 changed files with 132 additions and 44 deletions.
diff --git a/docs/configuration.rst b/docs/configuration.rst
@@ -13,16 +13,17 @@ Options
 .. csv-table:: Configuration Options
     :header: "Name", "Default", "Description"
 
-    "pue", 1.58, "Power Usage Effectiveness: A measure of a data centers efficiency, calculated as
-    PUE = Total facilitty energy / IT equipment energy"
-    "emissions_factor", 417.80, "Average carbon intensity of electricity (gCO2e/kWh). Source: https://ourworldindata.org/grapher/carbon-intensity-electricity"
-    "price_factor", 0.3251, "Power to Currency conversion factor (Currency/kWh). Source : https://www.stromauskunft.de/strompreise/"
+    "power_overhead", 0.0, "Estimated power consumption of non-measured hardware components in Watts. Will be added to the power draw and energy consumed of individual nodes. Defaults to 0 Watts"
+    "pue", 1.0, "Power Usage Effectiveness: A measure of a data center's efficiency, calculated as
+    PUE = Total facility energy / IT equipment energy. Calculated for each run."
+    "emissions_factor", 417.80, "Average carbon intensity of electricity (gCO2e/kWh). Calculated for each run. Source: https://ourworldindata.org/grapher/carbon-intensity-electricity"
+    "price_factor", 0.3251, "Power to Currency conversion factor (Currency/kWh). Calculated for each run. Source: https://www.stromauskunft.de/strompreise/"
     "price_unit", €, "Currency Icon"
     "sampling_rate", 1, "Seconds between measurements"
     "app_name", None, "Name to identify the app. If **None**, name will be based on the file or function name."
     "run_id", None, "ID of the current run. If **None**, the current date and time will be used. If **SLURM**, perun will look for the environmental variable **SLURM_JOB_ID** and use that."
     "format", "text", "Output report format [text, pickle, csv, hdf5, json, bench]"
     "data_out", "./perun_results", "perun output location"
-    "rounds", 5, "Number of times a the application is run"
-    "warmup_rounds", 1, "Number of warmup rounds to run before starting the benchmarks."
+    "rounds", 1, "Number of times the application is run"
+    "warmup_rounds", 0, "Number of warmup rounds to run before starting the benchmarks."
     "log_lvl", "WARNING", "Change logging output [DEBUG, INFO, WARNING, ERROR, CRITICAL]"
diff --git a/example.perun.ini b/example.perun.ini
@@ -1,7 +1,9 @@
 [post-processing]
-pue = 1.58
+pue = 1
+power_overhead = 0
 emissions_factor = 417.8
 price_factor = 0.3251
+price_unit=€
 
 [monitor]
 sampling_rate = 1

diff --git a/perun/api/cli.py b/perun/api/cli.py
@@ -102,23 +102,40 @@ def _get_arg_parser() -> argparse.ArgumentParser:
         help="Directory where output files are saved. Defaults to ./perun_results",
     )
     monitor_parser.add_argument(
-        "--sampling_rate", type=float, help="Sampling rate in seconds"
+        "--sampling_rate",
+        type=float,
+        help="Sampling rate in seconds. Defaults to 1 second.",
+    )
+    monitor_parser.add_argument(
+        "--power_overhead",
+        type=float,
+        help="Estimated power consumption of non-measured hardware components in Watts. Will be added to measured power consumption on the text report summary. Defaults to 0 Watts",
     )
     monitor_parser.add_argument(
-        "--pue", type=float, help="Data center Power Usage Effectiveness"
+        "--pue", type=float, help="Data center Power Usage Effectiveness. Defaults to 1"
     )
     monitor_parser.add_argument(
         "--price_factor",
         type=float,
-        help="Electricity to Currency convertion factor in the form of Currency/kWh",
+        help="Electricity to Currency convertion factor in the form of Currency/kWh. Defaults to 0.3251 €/kWh",
+    )
+    monitor_parser.add_argument(
+        "--price_unit",
+        type=str,
+        help="Currency character to use on the text report summary. Defaults to €",
+    )
+    monitor_parser.add_argument(
+        "--emission_factor",
+        type=float,
+        help="Average carbon intensity of electricity (gCO2e/kWh). Defaults to 417.80 gC02e/kWh",
     )
     monitor_parser.add_argument(
-        "--rounds", type=int, help="Number of warmup rounds to run app."
+        "--rounds", type=int, help="Number of warmup rounds to run app. Defaults to 1"
     )
     monitor_parser.add_argument(
         "--warmup_rounds",
         type=int,
-        help="Number of warmup rounds to run the app. A warmup round is a full run of the application without gathering performance data.",
+        help="Number of warmup rounds to run the app. A warmup round is a full run of the application without gathering performance data. Defaults to 0",
     )
     monitor_parser.add_argument("script", type=str)
     monitor_parser.add_argument("script_args", nargs=argparse.REMAINDER)
@@ -157,20 +174,24 @@ def showconf(args: argparse.Namespace):
     """Print current perun configuration in INI format."""
     from perun.configuration import _default_config
 
-    if args.showconf_default:
-        config.read_dict(_default_config)
-        config.write(sys.stdout)
-    else:
-        config.write(sys.stdout)
+    perun = Perun(config)
+    if perun.comm.Get_rank() == 0:
+        if args.showconf_default:
+            config.read_dict(_default_config)
+            config.write(sys.stdout)
+        else:
+            config.write(sys.stdout)
 
 
 def sensors(args: argparse.Namespace):
     """Print sensors assigned to each rank by perun."""
     perun = Perun(config)
+    log.debug(f"Rank {perun.comm.Get_rank()}: Sensors initialized perun object")
+    sensor_config = perun.sensors_config
+    host_rank = perun.host_rank
+    log.debug(f"Rank {perun.comm.Get_rank()}: Sensors gather global configuration")
     if perun.comm.Get_rank() == 0:
-        printableConfig = printableSensorConfiguration(
-            perun.sensors_config, perun.host_rank
-        )
+        printableConfig = printableSensorConfiguration(sensor_config, host_rank)
         print(printableConfig)
 
 

diff --git a/perun/configuration.py b/perun/configuration.py
@@ -8,7 +8,8 @@
 
 _default_config: Mapping[str, Mapping[str, Any]] = {
     "post-processing": {
-        "pue": 1.58,
+        "power_overhead": 0,  # Watt
+        "pue": 1.0,  # Power Usage Effectiveness; 1.0 adds no data center overhead to the measured energy
         "emissions_factor": 417.80,  # gCO2eq/kWh
         "price_factor": 0.3251,  # Currency/kWh
         "price_unit": "€",

diff --git a/perun/data_model/data.py b/perun/data_model/data.py
@@ -49,6 +49,8 @@ class MetricType(str, enum.Enum):
     OTHER_ENERGY = "other_energy"
     OTHER_MEM = "other_memory"
     N_RUNS = "n_runs"
+    MONEY = "money"
+    CO2 = "co2"
 
 
 class AggregateType(str, enum.Enum):

diff --git a/perun/data_model/measurement_type.py b/perun/data_model/measurement_type.py
@@ -18,6 +18,7 @@ class Unit(str, enum.Enum):
     SECOND = "s"
     PERCENT = "%"
     SCALAR = ""
+    GRAM = "g"
 
     @property
     def symbol(self) -> str:

diff --git a/perun/io/text_report.py b/perun/io/text_report.py
@@ -8,6 +8,7 @@
 tableMetrics = [
     MetricType.RUNTIME,
     MetricType.ENERGY,
+    MetricType.POWER,
     MetricType.CPU_POWER,
     MetricType.CPU_UTIL,
     MetricType.GPU_POWER,
@@ -101,20 +102,17 @@ def textReport(dataNode: DataNode, mr_id: str) -> str:
     else:
         region_report_str = ""
 
-    # Summary
     n_runs = len(dataNode.nodes)
     if MetricType.ENERGY in dataNode.metrics:
+        # Application Summary
         total_energy = dataNode.metrics[MetricType.ENERGY].sum  # type: ignore
-        e_pue = total_energy * config.getfloat("post-processing", "pue")
-        e_kWh = e_pue / (3600 * 1e3)
-        kgCO2 = e_kWh * config.getfloat("post-processing", "emissions_factor") / 1e3
-        money = e_kWh * config.getfloat(
-            "post-processing", "price_factor"
-        )  # Currency / kWh
+        e_kWh = total_energy / (3600 * 1e3)
+        kgCO2 = dataNode.metrics[MetricType.CO2].sum  # type: ignore
+        money = dataNode.metrics[MetricType.MONEY].sum  # type: ignore
         money_icon = config.get("post-processing", "price_unit")
 
-        summary_str = f"The application has been run {n_runs} times. Throughout its runtime, it has used {e_kWh:.3f} kWh, released a total of {kgCO2:.3f} kgCO2e into the atmosphere, and you paid {money:.2f} {money_icon} in electricity for it.\n"
+        app_summary_str = f"Application Summary\n\nThe application has been run {n_runs} times. Throughout its runtime, it has used {e_kWh:.3f} kWh, released a total of {kgCO2:.3f} kgCO2e into the atmosphere, and you paid {money:.2f} {money_icon} in electricity for it."
     else:
-        summary_str = f"The application has been run {n_runs} times."
+        app_summary_str = f"The application has been run {n_runs} times."
 
-    return report_header + mr_report_str + region_report_str + summary_str
+    return report_header + mr_report_str + region_report_str + app_summary_str
diff --git a/perun/perun.py b/perun/perun.py
@@ -249,7 +249,7 @@ def monitor_application(
                 nodes=multirun_nodes,
                 processed=False,
             )
-            multirun_node = processDataNode(multirun_node)
+            multirun_node = processDataNode(multirun_node, self.config)
 
             app_data_file = data_out / f"{app_name}.{IOFormat.HDF5.suffix}"
             app_data = None
@@ -273,7 +273,7 @@ def monitor_application(
                     nodes={multirun_id: multirun_node},
                     processed=False,
                 )
-            app_data = processDataNode(app_data)
+            app_data = processDataNode(app_data, self.config)
 
             self.export_to(data_out, app_data, IOFormat.HDF5)
             if out_format != IOFormat.HDF5:
@@ -308,6 +308,7 @@ def _run_application(
                         self.comm.Get_rank(),
                         self.backends,
                         self.l_sensors_config,
+                        self.config,
                         sp_ready_event,
                         start_event,
                         stop_event,
@@ -389,7 +390,7 @@ def _run_application(
                     nodes={node.id: node for node in dataNodes if node},
                 )
                 runNode.addRegionData(globalRegions, starttime_ns)
-                runNode = processDataNode(runNode)
+                runNode = processDataNode(runNode, self.config)
 
                 return runNode
             return None

diff --git a/perun/processing.py b/perun/processing.py
@@ -1,6 +1,7 @@
 """Processing Module."""
 import copy
 import logging
+from configparser import ConfigParser
 from datetime import datetime
 from itertools import chain
 from typing import Any, Dict, List, Optional, Tuple
@@ -306,13 +307,17 @@ def processSensorData(sensorData: DataNode) -> DataNode:
     return sensorData
 
 
-def processDataNode(dataNode: DataNode, force_process=False) -> DataNode:
+def processDataNode(
+    dataNode: DataNode, perunConfig: ConfigParser, force_process=False
+) -> DataNode:
     """Recursively calculate metrics on the dataNode tree.
 
     Parameters
     ----------
     dataNode : DataNode
         Root data node tree.
+    perunConfig: ConfigParser
+        Perun configuration
     force_process : bool, optional
         Force recomputation of child node metrics, by default False
 
@@ -342,7 +347,9 @@ def processDataNode(dataNode: DataNode, force_process=False) -> DataNode:
             if subNode.type == NodeType.SENSOR:
                 subNode = processSensorData(subNode)
             else:
-                subNode = processDataNode(subNode, force_process=force_process)
+                subNode = processDataNode(
+                    subNode, perunConfig=perunConfig, force_process=force_process
+                )
 
         if dataNode.type == NodeType.APP:
             for subSubNode in subNode.nodes.values():
@@ -380,6 +387,52 @@ def processDataNode(dataNode: DataNode, force_process=False) -> DataNode:
                 metricType, aggregatedValue, metric_md, aggType
             )
 
+    # Apply power overhead to each computational node if there is power data available.
+    if dataNode.type == NodeType.NODE and MetricType.POWER in dataNode.metrics:
+        power_overhead = perunConfig.getfloat("post-processing", "power_overhead")
+        dataNode.metrics[MetricType.POWER].value += power_overhead  # type: ignore
+        runtime = dataNode.metrics[MetricType.RUNTIME].value
+        dataNode.metrics[MetricType.ENERGY].value += runtime * power_overhead  # type: ignore
+
+    # If there is energy data, apply PUE, and convert to currency and CO2 emissions.
+    if dataNode.type == NodeType.RUN and MetricType.ENERGY in dataNode.metrics:
+        pue = perunConfig.getfloat("post-processing", "pue")
+        emissions_factor = perunConfig.getfloat("post-processing", "emissions_factor")
+        price_factor = perunConfig.getfloat("post-processing", "price_factor")
+        total_energy = dataNode.metrics[MetricType.ENERGY].value * pue
+        dataNode.metrics[MetricType.ENERGY].value = total_energy  # type: ignore
+        e_kWh = total_energy / (3600 * 1e3)
+
+        costMetric = Metric(
+            MetricType.MONEY,
+            e_kWh * price_factor,
+            MetricMetaData(
+                Unit.SCALAR,
+                Magnitude.ONE,
+                np.dtype("float32"),
+                np.float32(0),
+                np.finfo("float32").max,
+                np.float32(0),
+            ),
+            AggregateType.SUM,
+        )
+
+        co2Emissions = Metric(
+            MetricType.CO2,
+            e_kWh * emissions_factor,
+            MetricMetaData(
+                Unit.GRAM,
+                Magnitude.ONE,
+                np.dtype("float32"),
+                np.float32(0),
+                np.finfo("float32").max,
+                np.float32(0),
+            ),
+            AggregateType.SUM,
+        )
+        dataNode.metrics[MetricType.MONEY] = costMetric
+        dataNode.metrics[MetricType.CO2] = co2Emissions
+
     dataNode.processed = True
     return dataNode
 

diff --git a/perun/subprocess.py b/perun/subprocess.py
@@ -2,6 +2,7 @@
 import logging
 import platform
 import time
+from configparser import ConfigParser
 from multiprocessing import Queue
 from typing import Dict, List, Set
 
@@ -21,6 +22,7 @@ def perunSubprocess(
     rank: int,
     backends: Dict[str, Backend],
     l_sensors_config: Dict[str, Set[str]],
+    perunConfig: ConfigParser,
     sp_ready_event,
     start_event,
     stop_event,
@@ -123,7 +125,7 @@ def perunSubprocess(
                 deviceType=deviceType,
             )
 
-            dn = processDataNode(dn)
+            dn = processDataNode(dn, perunConfig)
             deviceGroupNodes.append(dn)
         else:
             deviceGroupNodes.extend(sensorNodes)
@@ -136,7 +138,7 @@ def perunSubprocess(
         metadata={},
         nodes={node.id: node for node in deviceGroupNodes},
     )
-    processDataNode(hostNode)
+    processDataNode(hostNode, perunConfig)
 
     # This should send a single processed node for the current computational node
     queue.put(hostNode, block=True)

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -7,13 +7,13 @@
 from perun.perun import Perun
 
 
-@pytest.fixture(scope="package")
+@pytest.fixture()
 def defaultConfig():
     defaultConfig = configparser.ConfigParser(allow_no_value=True)
     defaultConfig.read_dict(_default_config)
     return defaultConfig
 
 
-@pytest.fixture(scope="package")
+@pytest.fixture()
 def perun(defaultConfig):
     return Perun(defaultConfig)
diff --git a/tests/perun/api/test_cli.py b/tests/perun/api/test_cli.py
@@ -77,22 +77,25 @@ def test_showconf_command_with_default(
         defaultConfig.write(configFile)
 
     processorOut = subprocess.run(
-        ["perun", "--log_lvl", "INFO", "--configuration", str(confPath), "showconf"],
+        ["perun", "--log_lvl", "ERROR", "--configuration", str(confPath), "showconf"],
         capture_output=True,
         text=True,
     ).stdout
+    print(processorOut)
     parser = configparser.ConfigParser(allow_no_value=True)
     parser.read_string(processorOut)
-    assert parser.get("debug", "log_lvl") == "INFO"
+    assert defaultConfig.get("monitor", "sampling_rate") == "2"
+    assert defaultConfig.get("debug", "log_lvl") == "WARNING"
     assert parser.get("monitor", "sampling_rate") == "2"
+    assert parser.get("debug", "log_lvl") == "ERROR"
     assert parser != defaultConfig
 
     defaultConfig.set("monitor", "sampling_rate", "1")
     processorOut = subprocess.run(
         [
             "perun",
             "--log_lvl",
-            "INFO",
+            "ERROR",
             "--configuration",
             str(confPath),
             "showconf",
@@ -101,12 +104,15 @@ def test_showconf_command_with_default(
         capture_output=True,
         text=True,
     ).stdout
+    print(processorOut)
     parser = configparser.ConfigParser(allow_no_value=True)
     parser.read_string(processorOut)
 
+    assert defaultConfig.get("monitor", "sampling_rate") == "1"
+    assert defaultConfig.get("debug", "log_lvl") == "WARNING"
     assert parser.get("debug", "log_lvl") == "WARNING"
     assert parser.get("monitor", "sampling_rate") == "1"
-    assert parser != defaultConfig
+    assert parser == defaultConfig
 
 
 def test_metadata_command(perun: Perun):