From 4686e7d6510f0074971a6ab5027e480f980161c3 Mon Sep 17 00:00:00 2001 From: Lisa Dunlap Date: Tue, 10 Dec 2024 02:35:26 +0000 Subject: [PATCH] reduced repeat Style Control leaderboard code --- fastchat/serve/monitor/monitor.py | 64 ++++++++++++++-------------- fastchat/serve/monitor/monitor_md.py | 13 +++--- 2 files changed, 39 insertions(+), 38 deletions(-) diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index c07ee4669..b21d668a1 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -442,8 +442,11 @@ def build_arena_tab( for k in key_to_category_name.keys(): if k not in elo_results: continue - arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"] - category_elo_results[key_to_category_name[k]] = elo_results[k] + category_name = key_to_category_name[k.replace("_style_control", "")] + if "_style_control" in k: + category_name = f"{category_name} w/ Style Control" + arena_dfs[category_name] = elo_results[k]["leaderboard_table_df"] + category_elo_results[category_name] = elo_results[k] arena_df = arena_dfs["Overall"] @@ -791,7 +794,7 @@ def highlight_top_3(s): style = style.background_gradient( cmap="Blues", subset=category_names, - vmin=1150, + vmin=category_df[category_names].max().max() - 250, vmax=category_df[category_names].max().max(), ) @@ -814,10 +817,6 @@ def build_category_leaderboard_tab( combined_elo_df, categories, "rating" ) sort_ranking = lambda _: get_arena_category_table(combined_elo_df, categories) - with gr.Row(): - gr.Markdown( - f"""  Chatbot Arena Overview""" - ) overall_ranking_leaderboard = gr.Dataframe( headers=["Model"] + [key_to_category_name[k] for k in categories], @@ -852,6 +851,20 @@ def build_category_leaderboard_tab( ] selected_categories_width = [110, 110, 110, 110, 80, 80, 80, 110, 80, 80] +vision_categories = [ + "full", + "full_style_control", + "captioning", + "captioning_style_control", + "entity_recognition", + "ocr", + "creative_writing_vision", + "homework", + "diagram", + "no_refusal", +] +vision_categories_width = [110, 110, 100, 110, 110, 60, 80, 80, 80, 80] + language_categories = [ "english", "chinese", @@ -963,16 +976,26 @@ def build_leaderboard_tab( combined_table = get_combined_table(elo_results_text, model_table_df) build_category_leaderboard_tab( combined_table, - "Task", + "LLM Task", selected_categories, selected_categories_width, ) build_category_leaderboard_tab( combined_table, - "Language", + "LLM Language", language_categories, language_categories_width, ) + if elo_results_vision is not None: + vision_combined_table = get_combined_table( + elo_results_vision, model_table_df + ) + build_category_leaderboard_tab( + vision_combined_table, + "VLM Task", + vision_categories, + vision_categories_width, + ) gr.Markdown( f""" ***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model. @@ -1074,31 +1097,10 @@ def build_demo(elo_results_file, leaderboard_table_file, arena_hard_leaderboard) from fastchat.serve.gradio_web_server import block_css text_size = gr.themes.sizes.text_lg - # load theme from theme.json - theme = gr.themes.Default.load("theme.json") - # set text size to large - theme.text_size = text_size - theme.set( - button_large_text_size="20px", - button_small_text_size="20px", - button_large_text_weight="100", - button_small_text_weight="100", - button_shadow="*shadow_drop_lg", - button_shadow_hover="*shadow_drop_lg", - checkbox_label_shadow="*shadow_drop_lg", - button_shadow_active="*shadow_inset", - button_secondary_background_fill="*primary_300", - button_secondary_background_fill_dark="*primary_700", - button_secondary_background_fill_hover="*primary_200", - button_secondary_background_fill_hover_dark="*primary_500", - button_secondary_text_color="*primary_800", - button_secondary_text_color_dark="white", - ) with gr.Blocks( title="Chatbot Arena Leaderboard", - # theme=gr.themes.Default(text_size=text_size), - theme=theme, + theme=gr.themes.Default(text_size=text_size), css=block_css, ) as demo: with gr.Tabs() as tabs: diff --git a/fastchat/serve/monitor/monitor_md.py b/fastchat/serve/monitor/monitor_md.py index 76db0f14b..c7f1f0fd3 100644 --- a/fastchat/serve/monitor/monitor_md.py +++ b/fastchat/serve/monitor/monitor_md.py @@ -14,17 +14,15 @@ key_to_category_name = { "full": "Overall", - "full_style_control": "Overall w/ Style Control", "dedup": "De-duplicate Top Redundant Queries (soon to be default)", "math": "Math", "if": "Instruction Following", "multiturn": "Multi-Turn", "creative_writing": "Creative Writing", + "creative_writing_vision": "Creative Writing", "coding": "Coding", - "coding_style_control": "Coding w/ Style Control", "hard_6": "Hard Prompts", "hard_english_6": "Hard Prompts (English)", - "hard_6_style_control": "Hard Prompts w/ Style Control", "long_user": "Longer Query", "english": "English", "chinese": "Chinese", @@ -49,15 +47,12 @@ } cat_name_to_explanation = { "Overall": "Overall Questions", - "Overall w/ Style Control": "Overall Leaderboard with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).", "De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).", "Math": "Math", "Instruction Following": "Instruction Following", "Multi-Turn": "Multi-Turn Conversation (>= 2 turns)", "Coding": "Coding: whether conversation contains code snippets", - "Coding w/ Style Control": "Coding with Style Control", "Hard Prompts": "Hard Prompts: details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)", - "Hard Prompts w/ Style Control": "Hard Prompts with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).", "Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)", "Longer Query": "Longer Query (>= 500 tokens)", "English": "English Prompts", @@ -140,7 +135,11 @@ def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="Overall" space = "   " total_subset_votes = sum(arena_subset_df["num_battles"]) // 2 total_subset_models = len(arena_subset_df) - leaderboard_md = f"""### {cat_name_to_explanation[name]} + if "w/ Style Control" in name: + explanation = cat_name_to_explanation[name.replace(" w/ Style Control", "")] + " with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/)." + else: + explanation = cat_name_to_explanation[name] + leaderboard_md = f"""### {explanation} #### {space} #models: **{total_subset_models} ({round(total_subset_models/total_models *100)}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({round(total_subset_votes/total_votes * 100)}%)**{space} """ return leaderboard_md