Commit

Small benchmark changes
Kobzol committed Feb 9, 2024
1 parent: 9e0fb7f, commit: 615833d
Showing 2 changed files with 21 additions and 13 deletions.
benchmarks/dask_benchmarks.py (11 changes: 7 additions & 4 deletions)
@@ -31,7 +31,8 @@ def generate_descriptors(self) -> Iterable[BenchmarkDescriptor]:
         hq_env = single_node_hq_cluster(hq_path, worker_threads=worker_threads)
         dask_env = single_node_dask_cluster(worker_threads=worker_threads)

-        task_counts = [100, 1000, 5000, 10000, 25000, 50000]
+        task_counts = [50000] # [100, 1000, 5000, 10000, 25000, 50000]
+        repeat_count = 1

         types = [
             (hq_env, SleepHQ),
@@ -42,7 +43,9 @@ def generate_descriptors(self) -> Iterable[BenchmarkDescriptor]:
             for task_count in task_counts:
                 sleep_duration = total_duration_single_thread / task_count
                 workload = workload_cls(task_count=task_count, sleep_duration=sleep_duration)
-                yield BenchmarkDescriptor(env_descriptor=env, workload=workload, timeout=timeout, repeat_count=2)
+                yield BenchmarkDescriptor(
+                    env_descriptor=env, workload=workload, timeout=timeout, repeat_count=repeat_count
+                )

     def postprocess(self, workdir: Path, database: Database):
         import seaborn as sns
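
Note on the hunk above: the per-task sleep duration is derived so that the total sequential work stays constant as the task count changes. A minimal sketch of that arithmetic, assuming a total_duration_single_thread of 30 seconds (the real value is defined elsewhere in the benchmark suite):

```python
# Illustrative sketch only (not part of the commit).
# total_duration_single_thread = 30.0 is an assumed value; the real
# constant lives elsewhere in the benchmark code.
total_duration_single_thread = 30.0

for task_count in [100, 1000, 5000, 10000, 25000, 50000]:
    # Each task sleeps for a fraction of the fixed total, so the amount of
    # sequential "work" is identical across task counts.
    sleep_duration = total_duration_single_thread / task_count
    print(f"{task_count:>6} tasks -> each sleeps {sleep_duration * 1000:.2f} ms")
# e.g. 50000 tasks -> each sleeps 0.60 ms
```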
@@ -61,7 +64,7 @@ def parse_env(record: DatabaseRecord) -> str:
         )

         ax = sns.scatterplot(df, x="task-count", y="duration", hue="environment", marker="o")
-        ax.set(ylabel="Duration [s]", xlabel="Task count")
+        ax.set(ylabel="Duration [s]", xlabel="Task count", ylim=(0, None))
         # ax.set(yscale="log")
         render_chart_to_png(workdir / "dask-vs-hq-sleep.png")

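For context on the ylim change (here and in the lineplot hunk below): passing ylim=(0, None) through Axes.set pins the lower bound of the y-axis at zero and leaves the current upper bound untouched, so duration differences are read against a zero baseline. A standalone sketch with made-up data, assuming seaborn, pandas, and matplotlib are installed:

```python
# Illustrative sketch only; the DataFrame contents are invented.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

df = pd.DataFrame(
    {
        "task-count": [100, 1000, 5000, 100, 1000, 5000],
        "duration": [1.2, 3.4, 9.8, 2.1, 5.6, 15.3],
        "environment": ["hq", "hq", "hq", "dask", "dask", "dask"],
    }
)
ax = sns.scatterplot(data=df, x="task-count", y="duration", hue="environment", marker="o")
# ylim=(0, None) fixes the lower bound at 0; None keeps the existing upper bound.
ax.set(ylabel="Duration [s]", xlabel="Task count", ylim=(0, None))
plt.savefig("dask-vs-hq-sleep-example.png")
```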
@@ -106,7 +109,7 @@ def postprocess(self, workdir: Path, database: Database):
         )

         ax = sns.lineplot(df, x="task-count", y="duration", hue="environment", marker="o")
-        ax.set(ylabel="Duration [s]", xlabel="Task count")
+        ax.set(ylabel="Duration [s]", xlabel="Task count", ylim=(0, None))
         # ax.set(yscale="log")
         render_chart_to_png(workdir / "dask-vs-hq-empty.png")

benchmarks/ligen_benchmarks.py (23 changes: 14 additions & 9 deletions)
@@ -6,6 +6,7 @@
 from typing import Dict, Any, List, Iterable

 import dask
+import dataclasses
 import distributed
 import numpy as np
 import pandas as pd
@@ -190,15 +191,17 @@ class DaskVsHqLigen(TestCase):
     """

     def generate_descriptors(self) -> Iterable[BenchmarkDescriptor]:
-        hq_path = get_hq_binary()
+        hq_path = get_hq_binary(debug_symbols=True)

         worker_threads = min(multiprocessing.cpu_count(), 64)
-        hq_env = single_node_hq_cluster(hq_path, worker_threads=worker_threads)
+        hq_env = dataclasses.replace(
+            single_node_hq_cluster(hq_path, worker_threads=worker_threads), generate_event_log=False
+        )
         dask_env = single_node_dask_cluster(worker_threads=worker_threads)
         timeout = datetime.timedelta(minutes=10)

-        input_smi = CURRENT_DIR / "datasets/ligen/artif-200.smi"
-        variants = [(1, 1), (4, 4), (8, 8), (32, 4)]  # One molecule per task, one thread per task
+        input_smi = CURRENT_DIR / "datasets/ligen/artif-32.smi"
+        variants = [(1, 1), (4, 4)]  # , (4, 4), (8, 8), (32, 4)]  # One molecule per task, one thread per task

         def gen_descriptions(env: EnvironmentDescriptor, workload_cls) -> List[BenchmarkDescriptor]:
             for max_molecules, threads in variants:
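
For context on the dataclasses.replace call above: it returns a copy of a dataclass instance with selected fields overridden, which lets the benchmark tweak the descriptor returned by single_node_hq_cluster (here, turning off generate_event_log) without mutating the original. A minimal, self-contained sketch using a stand-in dataclass rather than the real environment descriptor type:

```python
# Illustrative sketch only; ClusterInfo is a stand-in, not the real type.
import dataclasses


@dataclasses.dataclass(frozen=True)
class ClusterInfo:
    worker_threads: int
    generate_event_log: bool = True


base = ClusterInfo(worker_threads=64)
tweaked = dataclasses.replace(base, generate_event_log=False)

assert base.generate_event_log is True       # original is untouched
assert tweaked.generate_event_log is False   # copy carries the override
assert tweaked.worker_threads == 64          # other fields are preserved
```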
@@ -214,9 +217,11 @@ def postprocess(self, workdir: Path, database: Database):
         import seaborn as sns

         df = analyze_results_utilization(database)
-        print(f"""UTILIZATION
+        print(
+            f"""UTILIZATION
 {df}
-""")
+"""
+        )

         df = (
             DataFrameExtractor(database)
@@ -279,10 +284,10 @@ def benchmark_aggregated_vs_separate_tasks():
     per input ligand, vs. when we use 4/8/16 ligands for each task.
     """
     hq_path = get_hq_binary()
-    env = single_node_hq_cluster(hq_path, worker_threads=min(multiprocessing.cpu_count(), 64))
-    input_smi = get_dataset_path(Path("ligen/artif-200.smi"))
+    env = single_node_hq_cluster(hq_path, worker_threads=min(multiprocessing.cpu_count(), 64), version="base")
+    input_smi = get_dataset_path(Path("ligen/artif-2.smi"))

-    variants = [(1, 1), (4, 4), (8, 8)]
+    variants = [(1, 1)]  # , (4, 4), (8, 8)]
     descriptions = []
     for max_molecules, num_threads in variants:
         workload = LigenHQWorkload(smi_path=input_smi, max_molecules=max_molecules, screening_threads=num_threads)