CrabNet benchmarking #439

Open · wants to merge 8 commits into base: main
308 changes: 308 additions & 0 deletions benchmarks/domains/CrabNet_AdvOpt.py
@@ -0,0 +1,308 @@
"""
@Time : 2024/11/06 16:21:18
@Author : Rita Lyu
@Version : 1.0
@Contact : [email protected]
@Desc : Crabnet hyperparameter benchmarking, a minimization task on Crabnet hyperparameters.

Crabnet hyperparameter function with 20 continuous (treat as discrete for simplicity) and 3 categorical input.
This code interacts with an external API hosted on Hugging Face Spaces:
https://huggingface.co/spaces/AccelerationConsortium/crabnet-hyperparameter

The external space might be asleep, and this code includes logic to wake it up
and retry until it becomes available or a retry limit is reached.
"""

from __future__ import annotations

import os
import time

import numpy as np
import pandas as pd
from gradio_client import Client
from pandas import DataFrame

from baybe.campaign import Campaign
from baybe.parameters import (
    CategoricalParameter,
    NumericalDiscreteParameter,
    TaskParameter,
)
from baybe.recommenders.pure.nonpredictive.sampling import RandomRecommender
from baybe.searchspace import SearchSpace
from baybe.simulation import simulate_scenarios
from baybe.targets import NumericalTarget, TargetMode
from benchmarks.definition import (
    Benchmark,
    ConvergenceExperimentSettings,
)

# Create the client for the external Hugging Face space (woken up below)
client = Client("AccelerationConsortium/crabnet-hyperparameter")

def wake_up_hfspace(client, max_retries=2, wait_time=150):
    """Ensure the external Hugging Face space is awake before making predictions.

    Args:
        client: The Gradio Client instance.
        max_retries: Maximum number of retries to wake up the space.
        wait_time: Seconds to wait between retries.

    Raises:
        RuntimeError: If the space does not wake up after max_retries.
    """
    for attempt in range(max_retries):
        try:
            # Attempt a simple request to check whether the space is awake
            client.predict(
                0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0.1, 0.5,
                "c1_0", "c2_0", "c3_0",
                0.5,
                api_name="/predict",
            )
            print("Hugging Face space is awake.")
            return
        except Exception as e:
            print(f"Attempt {attempt + 1}: Space is asleep ({e}). Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
    raise RuntimeError("Hugging Face space is still asleep after maximum retries.")

wake_up_hfspace(client)

def adv_opt(c1, c2, c3, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
            x11, x12, x13, x14, x15, x16, x17, x18, x19, x20):
    """Optimization function with 20 continuous variables and 3 categorical parameters."""
    result = client.predict(
        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
        x11, x12, x13, x14, x15, x16, x17, x18, x19, x20,  # continuous variables
        c1, c2, c3,  # categorical variables
        0.5,  # fidelity
        api_name="/predict",
    )
    return result['data'][0][0]  # return the y1 value only

#%%
# Run this cell and above if "CrabNet_lookup_testing_y1.csv" and "CrabNet_lookup_training_y2.csv" are not provided
# or if you want to generate data for the benchmark
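# Note: the commented-out cell below calls adv_opt_y1 and adv_opt_y2, which are
# not defined in this file. A minimal sketch of what they could look like,
# assuming the space returns y1 and y2 as the first two entries of
# result['data'][0] (the y2 index is an assumption, not confirmed here):
#
# def adv_opt_y1(c1, c2, c3, **xs):
#     """Return only y1 (RMSE); thin wrapper around adv_opt."""
#     return adv_opt(c1, c2, c3, **xs)
#
# def adv_opt_y2(c1, c2, c3, **xs):
#     """Return only y2, assumed to be the second output of the space."""
#     result = client.predict(
#         *(xs[f"x{i}"] for i in range(1, 21)),  # x1..x20 in order
#         c1, c2, c3,  # categorical variables
#         0.5,  # fidelity
#         api_name="/predict",
#     )
#     return result['data'][0][1]  # assumed y2 index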

# def generate_parameters():
# while True:
# # Random float values for x1 to x20 between 0.0 and 1.0
# params = {f"x{i}": np.random.uniform(0.0, 1.0) for i in range(1, 21)}

# # Random categorical values for c1, c2, c3
# params["c1"] = np.random.choice(["c1_0", "c1_1"])
# params["c2"] = np.random.choice(["c2_0", "c2_1"])
# params["c3"] = np.random.choice(["c3_0", "c3_1", "c3_2"])

# # Check constraints
# if params["x19"] < params["x20"] and params["x6"] + params["x15"] <= 1.0:
# return params

# # Create a DataFrame with 20 initial points (the initial data size)
# data = [generate_parameters() for _ in range(20)]
# initial_points = pd.DataFrame(data)
# # make sure c1, c2, c3 are str type
# initial_points['c1'] = initial_points['c1'].apply(str)
# initial_points['c2'] = initial_points['c2'].apply(str)
# initial_points['c3'] = initial_points['c3'].apply(str)

# # Create dataframes holding the initial points for y1 and y2, with a Target
# # column for the y1/y2 value and a Function column for the function used
# lookup_training_y2 = initial_points.copy()
# lookup_training_y2['Target'] = lookup_training_y2.apply(lambda x: adv_opt_y2(**x), axis=1)
# lookup_training_y2['Function'] = "TrainingY2"

# lookup_testing_y1 = initial_points.copy()
# lookup_testing_y1['Target'] = lookup_testing_y1.apply(lambda x: adv_opt_y1(**x), axis=1)
# lookup_testing_y1['Function'] = "TestingY1"

# # save lookup_training_y2 and lookup_testing_y1 to csv
# lookup_testing_y1.to_csv("CrabNet_lookup_testing_y1.csv", index=False)
# lookup_training_y2.to_csv("CrabNet_lookup_training_y2.csv", index=False)
#%%
# Import the lookup data
strHomeDir = os.getcwd()
df_trainingY2 = pd.read_csv(
    os.path.join(strHomeDir, "benchmarks", "domains", "CrabNet_lookup_training_y2.csv")
)
df_testingY1 = pd.read_csv(
    os.path.join(strHomeDir, "benchmarks", "domains", "CrabNet_lookup_testing_y1.csv")
)

# Concatenate the two dataframes and drop the "Target" column
dfSearchSpace = pd.concat([df_testingY1, df_trainingY2], ignore_index=True)
dfSearchSpace = dfSearchSpace.drop(columns=["Target"])
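# dfSearchSpace now holds the numerical columns x1-x20, the categorical columns
# c1-c3, and the "Function" task column ("TestingY1" rows from df_testingY1,
# "TrainingY2" rows from df_trainingY2).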

def advopt(settings: ConvergenceExperimentSettings) -> DataFrame:
    """CrabNet hyperparameter optimization with 20 discrete and 3 categorical inputs.

    Compares the random, default, and no-task-parameter setups.

    Inputs:
        x1-x20  discrete     0 ≤ xi ≤ 1 for i ∈ {1, ..., 20}
            (x1-x20 are continuous hyperparameters, treated as discrete for simplicity)
        c1      categorical  c1 ∈ {c1_0, c1_1}
        c2      categorical  c2 ∈ {c2_0, c2_1}
        c3      categorical  c3 ∈ {c3_0, c3_1, c3_2}
    Output: continuous
    Objective: Minimization
    """
# Define and create the search space
lstParameters = []
lstParameters_noTask = []
for col in dfSearchSpace.columns[:-4]:
        # Create a NumericalDiscreteParameter for each numerical column x1-x20
        temp_Parameter = NumericalDiscreteParameter(
            name=col,
            values=np.unique(dfSearchSpace[col]),
            tolerance=0.0,
        )
# append the parameter to the list
lstParameters.append(temp_Parameter)
lstParameters_noTask.append(temp_Parameter)

categorical_parameters = [
CategoricalParameter(name='c1', values=['c1_0', 'c1_1'], encoding="OHE"),
CategoricalParameter(name='c2', values=['c2_0', 'c2_1'], encoding="OHE"),
CategoricalParameter(name='c3', values=['c3_0', 'c3_1', 'c3_2'], encoding="OHE"),
]
lstParameters.extend(categorical_parameters)
lstParameters_noTask.extend(categorical_parameters)

task_parameter = TaskParameter(
name="Function",
values=["TrainingY2", "TestingY1"],
active_values=["TestingY1"],
)
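    # With active_values=["TestingY1"], recommendations are restricted to the
    # TestingY1 task, while TrainingY2 measurements can still inform the surrogate model.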

# append the task parameter to the list
lstParameters.append(task_parameter)

    search_space = SearchSpace.from_dataframe(dfSearchSpace, parameters=lstParameters)
    searchspace_noTask = SearchSpace.from_dataframe(
        df_testingY1.drop(columns=["Target", "Function"]),
        parameters=lstParameters_noTask,
    )

# define objective
objective = NumericalTarget(name="Target", mode=TargetMode.MIN).to_objective()

# Define the scenarios
    scenarios: dict[str, Campaign] = {
        "Random Recommender": Campaign(
            searchspace=SearchSpace.from_dataframe(
                df_testingY1.drop(columns=["Target", "Function"]),
                parameters=lstParameters_noTask,
            ),
            recommender=RandomRecommender(),
            objective=objective,
        ),
        "Default Recommender": Campaign(
            searchspace=search_space,
            objective=objective,
        ),
        "noTask": Campaign(
            searchspace=searchspace_noTask,
            objective=objective,
        ),
    }

# Simulate the scenarios
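    # impute_mode="error" raises an error if a recommended point is missing from the lookup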
return simulate_scenarios(
scenarios,
df_testingY1,
batch_size=settings.batch_size,
n_doe_iterations=settings.n_doe_iterations,
n_mc_iterations=settings.n_mc_iterations,
impute_mode="error",
)


def advopt_transfer_learning(settings: ConvergenceExperimentSettings) -> DataFrame:
    """CrabNet hyperparameter optimization with 20 discrete and 3 categorical inputs.

    Transfer learning with different initial data sizes.

    Inputs:
        x1-x20  discrete     0 ≤ xi ≤ 1 for i ∈ {1, ..., 20}
            (x1-x20 are continuous hyperparameters, treated as discrete for simplicity)
        c1      categorical  c1 ∈ {c1_0, c1_1}
        c2      categorical  c2 ∈ {c2_0, c2_1}
        c3      categorical  c3 ∈ {c3_0, c3_1, c3_2}
    Output: continuous
    Objective: Minimization
    """
# Define and create the search space
lstParameters = []
for col in dfSearchSpace.columns[:-4]:
        # Create a NumericalDiscreteParameter for each numerical column x1-x20
        temp_Parameter = NumericalDiscreteParameter(
            name=col,
            values=np.unique(dfSearchSpace[col]),
            tolerance=0.0,
        )
# append the parameter to the list
lstParameters.append(temp_Parameter)

categorical_parameters = [
CategoricalParameter(name='c1', values=['c1_0', 'c1_1'], encoding="OHE"),
CategoricalParameter(name='c2', values=['c2_0', 'c2_1'], encoding="OHE"),
CategoricalParameter(name='c3', values=['c3_0', 'c3_1', 'c3_2'], encoding="OHE"),
]
lstParameters.extend(categorical_parameters)

task_parameter = TaskParameter(
name="Function",
values=["TrainingY2", "TestingY1"],
active_values=["TestingY1"],
)

# append the task parameter to the list
lstParameters.append(task_parameter)

# define objective
objective = NumericalTarget(name="Target", mode=TargetMode.MIN).to_objective()

    # Simulate each initial data size as its own scenario and concatenate the results
    results = []
    for n in (50, 100, 500, 700, 1000):
        search_space = SearchSpace.from_dataframe(dfSearchSpace, parameters=lstParameters)

        campaign_temp = Campaign(searchspace=search_space, objective=objective)
        initial_data_temp = [df_trainingY2.sample(n) for _ in range(settings.n_mc_iterations)]

        results.append(
            simulate_scenarios(
                {f"{n} Initial Data": campaign_temp},
                df_testingY1,
                initial_data=initial_data_temp,
                batch_size=settings.batch_size,
                n_doe_iterations=settings.n_doe_iterations,
                impute_mode="error",
            )
        )
    return pd.concat(results, ignore_index=True)

#%%
benchmark_config = ConvergenceExperimentSettings(
batch_size=1,
n_doe_iterations=30,
n_mc_iterations=5,
)
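# 30 sequential recommendations (batch size 1), repeated over 5 Monte Carlo runs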

# Define the benchmark
crabnet_benchmark = Benchmark(
function=advopt,
best_possible_result=None,
settings=benchmark_config,
optimal_function_inputs=None,
)

crabnet_transfer_learning_benchmark = Benchmark(
function=advopt_transfer_learning,
best_possible_result=None,
settings=benchmark_config,
optimal_function_inputs=None,
)


if __name__ == "__main__":
    # Describe the benchmark task
    print("CrabNet optimization is a minimization task that tunes 20 numerical and 3 categorical hyperparameters.")
    print("The numerical hyperparameters include the number of attention layers, the learning rate, the epoch step size, etc.")
    print("Details can be found in Table 1 of Baird, S. G.; Liu, M.; Sparks, T. D. High-Dimensional Bayesian Optimization of 23 Hyperparameters over 100 Iterations for an Attention-Based Network to Predict Materials Property: A Case Study on CrabNet Using Ax Platform and SAASBO. Computational Materials Science 2022, 211, 111505.")
    print("The categorical hyperparameters include boolean values for the residual network bias, the loss function, and the elemental feature vector.")
    print("The numerical hyperparameters are normalized to the range [0.0, 1.0], while the categorical hyperparameters are one-hot encoded.")
    print("")
    print("The objective is to minimize y1, the RMSE of the CrabNet hyperparameter function. If y1 is greater than 0.2, the result is considered bad.")
    print("")
    print("The CrabNet benchmark compares the random, default, and no-task-parameter setups.")
    print("")
    print("The CrabNet transfer learning benchmark compares different initial data sizes.")