
Added Dynamic Discretisation Support Based on Occupancy Requests #72

Merged · 36 commits · Dec 6, 2023

Commits
60be215
Added plot files
alindkhare Nov 14, 2023
8b2f02c
Merge branch 'main' into alind/dynamic_discretization_heuristic
alindkhare Nov 19, 2023
6be5cb2
Added dynamic Discretization Algo
alindkhare Nov 20, 2023
0d0709e
Initial setup for osdi experiments
AdityaAS Nov 23, 2023
88589f1
Investigating non-deterministism with --release_taskgraphs
AdityaAS Nov 24, 2023
e8c0aea
Commit Ray's OSDI experiment ran on 11/23 night
AdityaAS Nov 25, 2023
5dac556
Update run_alibaba_experiments_osdi.sh to use NUM_INVOCATIONS=400 and…
AdityaAS Nov 25, 2023
06e007e
Remove np.clip from get_release_times gamma generation
AdityaAS Nov 25, 2023
50ec59c
Add a "legit fix" for construct strl when --release_taskgraphs is False
AdityaAS Nov 25, 2023
63924e1
Refactor osdi analysis.ipynb with
AdityaAS Nov 25, 2023
4c57618
Clean stuff up before mergin new main in with experiment run time err…
AdityaAS Nov 26, 2023
f34b0cf
Fix STRL generation bug in DAG-Aware TetriSched.
sukritkalra Nov 26, 2023
371c122
Allow generation of TaskGraph DOT representations in dry-run mode.
sukritkalra Nov 26, 2023
e502cdc
Update analysis.ipynb to optionally remove failed experiment folder
AdityaAS Nov 26, 2023
ef93622
Include previously_placed_tasks in non-DAG-aware TetriSched
sukritkalra Nov 27, 2023
50c7ddf
Added dynamic discretization optimization pass
alindkhare Nov 27, 2023
7263583
Modified STRL cutting
alindkhare Nov 28, 2023
cf607f0
- Update OSDI experiment script
AdityaAS Nov 28, 2023
dac7238
Fix gamma policy inter_arrival_times generation using arrival_rate * …
AdityaAS Nov 29, 2023
6c15fdd
Fix release time generation for gamma arrival pattern
AdityaAS Nov 30, 2023
fcb1ee6
Round instead of clip in gamma release time generation
AdityaAS Nov 30, 2023
647c42f
Merged Ray's branch
alindkhare Nov 30, 2023
f673210
Added temporary print statement
alindkhare Nov 30, 2023
b02b334
Make resource utilization plotting work in osdi analysis.ipynb
AdityaAS Nov 30, 2023
9f95d6f
Use POISSON_ARRIVAL_RATES=(0.2 0.5 1 2) in osdi experiment script
AdityaAS Nov 30, 2023
a2b020d
Commented out Opt pass stdcout
alindkhare Nov 30, 2023
b2ea0eb
Added config that throws error in simulator
alindkhare Nov 30, 2023
a353f0b
Try increasing max deadline var and decreasing arrival rate to achiev…
AdityaAS Nov 30, 2023
710b5f0
Fix a logging error in simulator __handle_task_cancellation
AdityaAS Nov 30, 2023
47ae583
Add analysis_util.py for osdi experiment
AdityaAS Nov 30, 2023
ff8bf20
Update analysis.ipynb based on newly implemented analysis_utils.py
AdityaAS Nov 30, 2023
2fe3dcf
Fix floating point comparisons in Gurobi
sukritkalra Nov 30, 2023
e6867b1
Merge branch 'ray-osdi-experiment' into alind/experiments_alibaba
alindkhare Dec 1, 2023
02d5b28
Fixed capacity constraint map registration of usgae
alindkhare Dec 1, 2023
7ffc94b
Added correct way to calculate occupancy map and occupancy threshold …
alindkhare Dec 3, 2023
9213517
Merge branch 'main' into alind/experiments_alibaba
sukritkalra Dec 4, 2023
2 changes: 2 additions & 0 deletions .gitignore
@@ -25,3 +25,5 @@ env

# Ignore build of tetrisched
schedulers/tetrisched/build/*

experiments/*
20 changes: 20 additions & 0 deletions configs/alibaba_edf_fixed.conf
@@ -0,0 +1,20 @@
--log_file_name=./alibaba_scheduler_EDF_num_invocation_50.log
--csv_file_name=./alibaba_scheduler_EDF_num_invocation_50.csv
--log_level=debug
--execution_mode=replay
--replay_trace=alibaba
--max_deadline_variance=100
--min_deadline_variance=50
--workload_profile_path=./traces/alibaba-cluster-trace-v2018/alibaba_random_50_dags.pkl
--override_num_invocations=50
--override_arrival_period=10
--randomize_start_time_max=100
--worker_profile_path=profiles/workers/alibaba_cluster.yaml
--scheduler_runtime=0
--scheduler=EDF
# --enforce_deadlines
# --retract_schedules
# --drop_skipped_tasks
# # --release_taskgraphs
# --scheduler_log_times=10
# --scheduler_time_discretization=1
2 changes: 1 addition & 1 deletion configs/alibaba_tetrisched_adaptive_discretization.conf
@@ -6,7 +6,7 @@
--max_deadline_variance=100
--min_deadline_variance=50
--workload_profile_path=./traces/alibaba-cluster-trace-v2018/alibaba_random_50_dags.pkl
--override_num_invocations=1
--override_num_invocations=50
--override_arrival_period=10
--randomize_start_time_max=100
--worker_profile_path=profiles/workers/alibaba_cluster.yaml
9 changes: 7 additions & 2 deletions configs/alibaba_tetrisched_discrete_1.conf
@@ -6,15 +6,20 @@
--max_deadline_variance=100
--min_deadline_variance=50
--workload_profile_path=./traces/alibaba-cluster-trace-v2018/alibaba_random_50_dags.pkl
--override_num_invocations=1
--override_num_invocations=50
--override_arrival_period=10
--randomize_start_time_max=100
--worker_profile_path=profiles/workers/alibaba_cluster.yaml
--scheduler_runtime=0
--scheduler=TetriSched
# --scheduler=EDF
--enforce_deadlines
--retract_schedules
--drop_skipped_tasks
--release_taskgraphs
--scheduler_log_times=10
--scheduler_time_discretization=1
--scheduler_time_discretization=5

# --override_release_policy=gamma
# --override_poisson_arrival_rate=10
# --override_gamma_coefficient=3
20 changes: 20 additions & 0 deletions configs/alibaba_tetrisched_discrete_1_no_dag_awareness.conf
@@ -0,0 +1,20 @@
--log_file_name=./alibaba_scheduler_TetriSched_release_policy_fixed_deadline_var_100_scheduler_discretization_1_no_dag_awareness.log
--csv_file_name=./alibaba_scheduler_TetriSched_release_policy_fixed_deadline_var_100_scheduler_discretization_1_no_dag_awareness.csv
--log_level=debug
--execution_mode=replay
--replay_trace=alibaba
--max_deadline_variance=100
--min_deadline_variance=50
--workload_profile_path=./traces/alibaba-cluster-trace-v2018/alibaba_random_50_dags.pkl
--override_num_invocations=50
--override_arrival_period=10
--randomize_start_time_max=100
--worker_profile_path=profiles/workers/alibaba_cluster.yaml
--scheduler_runtime=0
--scheduler=TetriSched
--enforce_deadlines
--retract_schedules
--drop_skipped_tasks
# --release_taskgraphs
--scheduler_log_times=10
--scheduler_time_discretization=1
23 changes: 23 additions & 0 deletions configs/alibaba_tetrisched_dynamic_discretization_1_5.conf
@@ -0,0 +1,23 @@
--log_file_name=./alibaba_scheduler_TetriSched_release_policy_fixed_deadline_var_100_scheduler_dynamic_discretization_1_5_auto_occupancy_0.7.log
--csv_file_name=./alibaba_scheduler_TetriSched_release_policy_fixed_deadline_var_100_scheduler_dynamic_discretization_1_5_auto_occupancy_0.7.csv
--log_level=debug
--execution_mode=replay
--replay_trace=alibaba
--max_deadline_variance=100
--min_deadline_variance=50
--workload_profile_path=./traces/alibaba-cluster-trace-v2018/alibaba_random_50_dags.pkl
--override_num_invocations=50
--override_arrival_period=10
--randomize_start_time_max=100
--worker_profile_path=profiles/workers/alibaba_cluster.yaml
--scheduler_runtime=0
--scheduler=TetriSched
--enforce_deadlines
--retract_schedules
--drop_skipped_tasks
--release_taskgraphs
--scheduler_log_times=10
--scheduler_time_discretization=1
--scheduler_dynamic_discretization
--scheduler_max_time_discretization=5
--scheduler_max_occupancy_threshold=0.7
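The optimization pass itself is not shown in this excerpt, but the three flags above (`--scheduler_time_discretization=1`, `--scheduler_max_time_discretization=5`, `--scheduler_max_occupancy_threshold=0.7`) suggest the shape of the policy: heavily occupied time slices get the finest granularity, lightly occupied slices are coarsened. A minimal sketch of that idea — the helper and its linear coarsening rule are invented for illustration, not taken from the PR:

```python
def choose_discretization(occupancy: float,
                          max_occupancy_threshold: float = 0.7,
                          min_discretization: int = 1,
                          max_discretization: int = 5) -> int:
    # Hypothetical sketch: a slice whose occupancy (a 0-1 fraction) meets or
    # exceeds the threshold gets the finest granularity; emptier slices are
    # coarsened linearly up to the maximum discretization.
    if occupancy >= max_occupancy_threshold:
        return min_discretization
    scale = 1.0 - occupancy / max_occupancy_threshold
    return min(max_discretization,
               min_discretization
               + round(scale * (max_discretization - min_discretization)))
```

With the config above, a slice at 90% occupancy would be discretized at 1 µs while an idle slice would fall back to the 5 µs maximum.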
@@ -0,0 +1,23 @@
--log_file_name=./alibaba_scheduler_TetriSched_release_policy_fixed_deadline_var_100_scheduler_dynamic_discretization_1_5_max_occ_1100.log
--csv_file_name=./alibaba_scheduler_TetriSched_release_policy_fixed_deadline_var_100_scheduler_dynamic_discretization_1_5_max_occ_1100.csv
--log_level=debug
--execution_mode=replay
--replay_trace=alibaba
--max_deadline_variance=100
--min_deadline_variance=50
--workload_profile_path=./traces/alibaba-cluster-trace-v2018/alibaba_random_50_dags.pkl
--override_num_invocations=50
--override_arrival_period=10
--randomize_start_time_max=100
--worker_profile_path=profiles/workers/alibaba_cluster.yaml
--scheduler_runtime=0
--scheduler=TetriSched
--enforce_deadlines
--retract_schedules
--drop_skipped_tasks
--release_taskgraphs
--scheduler_log_times=10
--scheduler_time_discretization=1
--scheduler_dynamic_discretization
--scheduler_max_time_discretization=5
--scheduler_max_occupancy_threshold=1100
2 changes: 1 addition & 1 deletion configs/alibaba_trace.conf
Expand Up @@ -7,7 +7,7 @@
--execution_mode=replay
--replay_trace=alibaba
--workload_profile_path=./traces/alibaba-cluster-trace-v2018/alibaba_random_50_dags.pkl
--batch_size_job_loading=25
# --batch_size_job_loading=25
--override_num_invocations=1
--override_arrival_period=10
--randomize_start_time_max=50
1 change: 1 addition & 0 deletions data/alibaba_loader.py
@@ -1,3 +1,4 @@
import json
import math
import os
import pathlib
2 changes: 1 addition & 1 deletion data/csv_reader.py
@@ -53,11 +53,11 @@ def parse_events(self, readings: Mapping[str, Sequence[str]]):
schedulers = []
for reading in csv_readings:
try:
# TODO: This
if reading[1] == "SIMULATOR_START":
simulator = Simulator(
csv_path=csv_path,
start_time=int(reading[0]),
total_tasks=reading[2],
)
elif reading[1] == "UPDATE_WORKLOAD":
simulator.total_tasks += int(reading[2])
622,740 changes: 622,740 additions & 0 deletions experiments/analysis.ipynb

Large diffs are not rendered by default.

173 changes: 173 additions & 0 deletions experiments/analysis_utils.py
@@ -0,0 +1,173 @@
import os
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

def calculate_arrival_rate_and_cv2(release_time: list[int]):
release_time.sort()
inter_arrival_times = np.diff(release_time)
avg_inter_arrival_time = np.mean(inter_arrival_times)
std_inter_arrival_time = np.std(inter_arrival_times)
cv2 = (std_inter_arrival_time/avg_inter_arrival_time) ** 2
return 1/avg_inter_arrival_time, cv2

def find_all_file_paths(path, ends_with=".csv"):
csv_file_paths = []
if os.path.isdir(path):
for filename in os.listdir(path):
if filename.endswith(ends_with):
csv_file_paths.append(os.path.join(path, filename))
else:
csv_file_paths += find_all_file_paths(os.path.join(path, filename), ends_with)
return csv_file_paths

def extract_variables_from_filename(filename):
# Split the filename by underscores
parts = filename.split('_')

# Extract the variables based on your format
replay_trace = parts[0]
scheduler = parts[2]
release_policy = parts[5]
deadline_var = int(parts[9])
dag_aware = parts[12] == "1"

try:
arrival_rate = float(parts[16])
cv2 = int(parts[19].split('.')[0]) # Assuming the file extension is .csv
except:
# Before 11/28 afternoon, I used a different format for the filename and didn't include the arrival rate and CV2
arrival_rate = 10
cv2 = 2

if scheduler == "TetriSched":
scheduler_time_discretization = int(parts[-1].split('.')[0])
scheduler = f"TetriSched_time_dis_{scheduler_time_discretization}" + ("_DAG_aware" if dag_aware else "")
else:
scheduler_time_discretization = None

# Create a dictionary to store the extracted variables
variables = {
'trace': replay_trace,
'release_policy': release_policy,
'max_deadline_variance': deadline_var,
'scheduler': scheduler,
'DAG_aware': dag_aware,
'scheduler_time_discretization': scheduler_time_discretization,
"arrival_rate": arrival_rate,
"cv2": cv2,
}

return variables


def extract_experiments_result(base_dir: str) -> pd.DataFrame:
rows = []
# Loop through each folder and process the CSV file
for csv_file_path in find_all_file_paths(base_dir):
file_name = csv_file_path.split(os.sep)[-1]
try:
# Open the CSV file and read the last line
with open(csv_file_path, 'r') as file:
lines = file.readlines()
last_line = lines[-1]

end_time, _, finished_tasks, cancelled_tasks, missed_task_deadlines, finished_task_graphs, cancelled_task_graphs, missed_task_graph_deadlines = last_line.split(",")
row = extract_variables_from_filename(file_name)
# Analyze SLO attainment and goodput
slo_attainment = (int(finished_task_graphs) - int(missed_task_graph_deadlines)) / (int(cancelled_task_graphs) + int(finished_task_graphs))
row["slo_attainment"] = slo_attainment
row["goodput"] = int(finished_tasks)
row["csv_file_path"] = csv_file_path

# Calculate the arrival rate and cv2
release_times = []
for line in lines:
if "TASK_RELEASE" not in line:
continue
# event_time should be the actual release time
event_time, _, task_name, _, task_intended_release_time, task_release_time, task_deadline, task_id, task_graph = line.strip().split(",")
release_times.append(int(task_release_time))

actual_arrival_rate, actual_cv2 = calculate_arrival_rate_and_cv2(release_times)
row["actual_arrival_rate"] = actual_arrival_rate
row["actual_cv2"] = actual_cv2

rows.append(row)
except FileNotFoundError:
print(f"File not found: {csv_file_path}")
except Exception as e:
print(f"An error occurred while processing {csv_file_path}: {str(e)}")
# I want to remove the parent folder of the CSV file
# print(f"Removing {os.path.dirname(csv_file_path)}")
# shutil.rmtree(os.path.dirname(csv_file_path))

return pd.DataFrame(rows)


def plot_slo_attainments(data: pd.DataFrame):
# Define your unique values for the grid
cv2_values = sorted(data["cv2"].unique())
arrival_rate_values = sorted(data["arrival_rate"].unique())
scheduler_values = ["TetriSched_time_dis_20", "TetriSched_time_dis_20_DAG_aware", "TetriSched_time_dis_10",
"TetriSched_time_dis_10_DAG_aware", "TetriSched_time_dis_1", "TetriSched_time_dis_1_DAG_aware", "EDF"]

# Number of schedulers
n_schedulers = len(scheduler_values)

# Create a subplot grid
fig, axes = plt.subplots(len(arrival_rate_values), len(cv2_values), figsize=(20, 15), sharey=True)

# Define the width of each bar and the spacing between them
bar_width = 0.20
spacing = 0.05
group_width_factor = 2 # Increase this factor to widen the distance between groups

# Collect handles and labels for the legend
handles, labels = [], []

# Iterate over each subplot and plot the data
for i, arrival_rate in enumerate(arrival_rate_values):
for j, cv2 in enumerate(cv2_values):
ax = axes[i][j]
subset = data[(data['arrival_rate'] == arrival_rate) & (data['cv2'] == cv2)]

# Get unique deadline variances
deadline_vars = sorted(subset['max_deadline_variance'].unique())
x = np.arange(len(deadline_vars)) * group_width_factor # Adjust x positions

for k, scheduler in enumerate(scheduler_values):
scheduler_data = subset[subset['scheduler'] == scheduler]
# Calculate the position of each bar
bar_positions = x - (n_schedulers * bar_width / 2) + (k * bar_width) + (spacing * k)
# Some bars may not exist for some schedulers
slo_attainments = []
for deadline_var in deadline_vars:
if len(scheduler_data[scheduler_data['max_deadline_variance'] == deadline_var]['slo_attainment']) == 0:
slo_attainments.append(0)
else:
slo_attainments.append(scheduler_data[scheduler_data['max_deadline_variance'] == deadline_var]['slo_attainment'].item())

ax.bar(bar_positions, slo_attainments, width=bar_width, label=scheduler)

for c in ax.containers:
labels = [f'{(v.get_height() * 100):.1f}' for v in c]
ax.bar_label(c, labels=labels, label_type='edge', rotation=45, size=8)

ax.set_xticks(x)
ax.set_xticklabels(deadline_vars)
ax.set_title(f"Arrival Rate: {subset['actual_arrival_rate'].mean():.2f}, CV2: {subset['actual_cv2'].mean():.2f}")
ax.set_xlabel('Max Deadline Variance')
ax.set_ylabel('SLO Attainment')

# Adjust layout and add a super title
plt.tight_layout()
plt.subplots_adjust(top=0.9) # Adjust the bottom parameter to make space for the legend

handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 0.95), ncol=len(labels))

plt.suptitle('SLO Attainment Comparison (min_deadline_var=10, num_invocation=400) 11_29_2023', size=16)

# Show the plot
plt.show()
14 changes: 14 additions & 0 deletions main.py
@@ -242,12 +242,24 @@
"If `True`, the scheduler creates space-time matrix non-uniformly. "
"The discretization is finer initially, and coarser at the end. (default: False)",
)
flags.DEFINE_bool(
"scheduler_dynamic_discretization",
False,
"If `True`, the scheduler creates space-time matrix non-uniformly. "
"The discretization is dynamically decided based on the occupancy request for each time slice. (default: False)",
)
flags.DEFINE_integer(
"scheduler_max_time_discretization",
5,
"The maximum discretization that the scheduler can have (in µs). "
"Only used when scheduler_adaptive_discretization flag is enabled. (default: 5)",
)
flags.DEFINE_float(
"scheduler_max_occupancy_threshold",
0.8,
"The percentage b/w 0 and 1 of maximum occupancy beyond which the discretization would always be 1 incase of dynamic discretization. "
"This flag is only used when dynamic discretization is enabled (default: 0.8)",
)
flags.DEFINE_integer(
"scheduler_delay",
0,
@@ -623,6 +635,8 @@ def main(args):
max_time_discretization=EventTime(
FLAGS.scheduler_max_time_discretization, EventTime.Unit.US
),
dynamic_discretization=FLAGS.scheduler_dynamic_discretization,
max_occupancy_threshold=FLAGS.scheduler_max_occupancy_threshold,
)
else:
raise ValueError(
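Putting the new flags together, a run matching the `alibaba_tetrisched_dynamic_discretization_1_5.conf` file earlier in this diff would look roughly like the following (replay-mode, trace, and logging flags omitted for brevity):

```shell
python main.py \
  --scheduler=TetriSched \
  --enforce_deadlines \
  --retract_schedules \
  --release_taskgraphs \
  --scheduler_time_discretization=1 \
  --scheduler_dynamic_discretization \
  --scheduler_max_time_discretization=5 \
  --scheduler_max_occupancy_threshold=0.7
```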