From 2669ec386abd0e6565d811e60a51f8bee7d3d841 Mon Sep 17 00:00:00 2001
From: Wei-Lin Chiang
Date: Sat, 18 Jan 2025 00:29:50 +0000
Subject: [PATCH] format

---
 fastchat/serve/monitor/basic_stats.py       |  3 +-
 fastchat/serve/monitor/clean_battle_data.py | 92 +++++++++++++------
 fastchat/serve/monitor/clean_chat_data.py   | 24 +++--
 .../lmsys_chat_1m/compute_stats.py          |  2 +
 fastchat/serve/monitor/elo_analysis.py      | 60 ++++++++----
 fastchat/serve/monitor/filter_pkl.py        | 53 ++++++-----
 fastchat/serve/monitor/merge.py             | 10 +-
 fastchat/serve/monitor/monitor.py           | 27 ++++--
 fastchat/serve/monitor/process.py           | 51 ++++++----
 fastchat/serve/monitor/rating_systems.py    | 23 ++++-
 fastchat/serve/monitor/read_data.py         | 20 ++--
 fastchat/serve/monitor/utils.py             | 30 ++++--
 12 files changed, 267 insertions(+), 128 deletions(-)

diff --git a/fastchat/serve/monitor/basic_stats.py b/fastchat/serve/monitor/basic_stats.py
index f21f3afdd..09633fe17 100644
--- a/fastchat/serve/monitor/basic_stats.py
+++ b/fastchat/serve/monitor/basic_stats.py
@@ -142,7 +142,7 @@ def report_basic_stats(log_files):
         height=300,
         width=1200,
     )
-
+
     # calculate conversion rate for each day (vote / chat)
     conversion_rate = {}
     for date in chat_dates_counts.index:
@@ -163,7 +163,6 @@ def report_basic_stats(log_files):
         width=1200,
     )

-
     # Model call counts
     model_hist_all = df_all[df_all["type"] == "chat"]["model"].value_counts()
     model_hist_1_day = df_1_day[df_1_day["type"] == "chat"]["model"].value_counts()
diff --git a/fastchat/serve/monitor/clean_battle_data.py b/fastchat/serve/monitor/clean_battle_data.py
index 664c84f12..4a3dd6dc3 100644
--- a/fastchat/serve/monitor/clean_battle_data.py
+++ b/fastchat/serve/monitor/clean_battle_data.py
@@ -146,7 +146,7 @@ def replace_model_name(old_name, tstamp):
         "dumbledore-v3": "llama-3.2-vision-90b-instruct",
         "potter-v1": "llama-3.2-vision-11b-instruct",
         "sharp-game-player-v1": "llama-3.2-3b-instruct",
-        "zeus-flare-thunder-v1": "llama-3.2-1b-instruct",
+        "zeus-flare-thunder-v1": "llama-3.2-1b-instruct",
         "qwen-vl-max-0809": "qwen2-vl-72b",
         "gemini-1.5-pro-002-test-sp": "gemini-1.5-pro-002",
         "gemini-1.5-flash-test-5": "gemini-1.5-flash-002",
@@ -185,7 +185,7 @@ def replace_model_name(old_name, tstamp):
             return old_name
         else:
             return old_name + "-old"
-    if old_name == "eureka-chatbot":
+    if old_name == "eureka-chatbot":
         if tstamp > 1721651521:
             return "eureka-chatbot-v2"
         else:
@@ -293,12 +293,17 @@ def process_data_txt2img(
         if row["models"][0] is None or row["models"][1] is None:
             count_dict["invalid"] += 1
             continue
-        if not isinstance(row["models"][0], str) or not isinstance(row["models"][1], str):
+        if not isinstance(row["models"][0], str) or not isinstance(
+            row["models"][1], str
+        ):
             count_dict["invalid"] += 1
             continue

         # Resolve model names
-        models_public = [remove_html(row["models"][0]), remove_html(row["models"][1])]
+        models_public = [
+            remove_html(row["models"][0]),
+            remove_html(row["models"][1]),
+        ]
         if "model_name" in row["states"][0]:
             models_hidden = [
                 row["states"][0]["model_name"],
@@ -356,12 +361,12 @@ def process_data_txt2img(
                 "anony_bothbad_vote": 0,
                 "anony_leftvote": 0,
                 "anony_rightvote": 0,
-                "sanitized_id": shortuuid.uuid()
+                "sanitized_id": shortuuid.uuid(),
             }
         all_ips[ip]["count"] += 1
         if flag_anony:
             all_ips[ip]["anony_count"] += 1
-            all_ips[ip]["anony_"+row["type"]] += 1
+            all_ips[ip]["anony_" + row["type"]] += 1

         if sanitize_ip:
             user_id = f"{all_ips[ip]['sanitized_id']}"
@@ -389,6 +394,7 @@ def process_data_txt2img(
     )
     return battles, count_dict, count_leak, all_ips

+
 def process_data(
     data,
     exclude_model_names,
@@ -433,12 +439,17 @@ def process_data(
         if row["models"][0] is None or row["models"][1] is None:
             count_dict["invalid"] += 1
             continue
-        if not isinstance(row["models"][0], str) or not isinstance(row["models"][1], str):
+        if not isinstance(row["models"][0], str) or not isinstance(
+            row["models"][1], str
+        ):
             count_dict["invalid"] += 1
             continue

         # Resolve model names
-        models_public = [remove_html(row["models"][0]), remove_html(row["models"][1])]
+        models_public = [
+            remove_html(row["models"][0]),
+            remove_html(row["models"][1]),
+        ]
         if "model_name" in row["states"][0]:
             models_hidden = [
                 row["states"][0]["model_name"],
@@ -484,7 +495,6 @@ def process_data(
                 print(state["messages"][state["offset"]][1])
                 raise ValueError

-
         # Drop conversations if the model names are leaked
         messages = ""
         for i in range(2):
@@ -576,12 +586,12 @@ def process_data(
                 "anony_bothbad_vote": 0,
                 "anony_leftvote": 0,
                 "anony_rightvote": 0,
-                "sanitized_id": shortuuid.uuid()
+                "sanitized_id": shortuuid.uuid(),
             }
         all_ips[ip]["count"] += 1
         if flag_anony:
             all_ips[ip]["anony_count"] += 1
-            all_ips[ip]["anony_"+row["type"]] += 1
+            all_ips[ip]["anony_" + row["type"]] += 1

         if sanitize_ip:
             user_id = f"{all_ips[ip]['sanitized_id']}"
@@ -607,13 +617,25 @@ def process_data(
         )

         user_tokens = sum(
-            [conv["num_tokens"] for conv in conversation_a if conv["role"] == "user"]
+            [
+                conv["num_tokens"]
+                for conv in conversation_a
+                if conv["role"] == "user"
+            ]
         )
         assistant_a_tokens = sum(
-            [conv["num_tokens"] for conv in conversation_a if conv["role"] == "assistant"]
+            [
+                conv["num_tokens"]
+                for conv in conversation_a
+                if conv["role"] == "assistant"
+            ]
         )
         assistant_b_tokens = sum(
-            [conv["num_tokens"] for conv in conversation_b if conv["role"] == "assistant"]
+            [
+                conv["num_tokens"]
+                for conv in conversation_b
+                if conv["role"] == "assistant"
+            ]
         )
         context_tokens_a = sum([conv["num_tokens"] for conv in conversation_a[:-1]])
         context_tokens_b = sum([conv["num_tokens"] for conv in conversation_b[:-1]])
@@ -702,22 +724,26 @@ def clean_battle_data(
                 all_ips[ip]["count"] += sub_all_ips[ip]["count"]
                 all_ips[ip]["anony_count"] += sub_all_ips[ip]["anony_count"]
                 all_ips[ip]["anony_tievote"] += sub_all_ips[ip]["anony_tievote"]
-                all_ips[ip]["anony_bothbad_vote"] += sub_all_ips[ip]["anony_bothbad_vote"]
+                all_ips[ip]["anony_bothbad_vote"] += sub_all_ips[ip][
+                    "anony_bothbad_vote"
+                ]
                 all_ips[ip]["anony_leftvote"] += sub_all_ips[ip]["anony_leftvote"]
                 all_ips[ip]["anony_rightvote"] += sub_all_ips[ip]["anony_rightvote"]

     battles.sort(key=lambda x: x["tstamp"])
     last_updated_tstamp = battles[-1]["tstamp"]
-
+
     battles = pd.DataFrame(battles)
-
+
     # drop rows with same question_id
     print(f"before drop dups #battles: {len(battles)}")
     battles = battles.drop_duplicates(subset=["question_id"], keep="first")
     battles = battles.reset_index(drop=True)
     print(f"#battles: {len(battles)}")

-    battles = battles[battles["anony"]].reset_index(drop=True) if anony_only else battles
+    battles = (
+        battles[battles["anony"]].reset_index(drop=True) if anony_only else battles
+    )
     if run_dedup and not (vision or txt2img):
         print("Running deduplication...")
         battles = utils.dedup_process(battles)
@@ -725,7 +751,9 @@ def clean_battle_data(
         print(f"#dedup_battles: {num_dedup_battles}")
     else:
         print("Skip deduplication...")
-        dedup_tags = np.array([{"high_freq": False, "sampled": True} for _ in range(len(battles))])
+        dedup_tags = np.array(
+            [{"high_freq": False, "sampled": True} for _ in range(len(battles))]
+        )
         battles["dedup_tag"] = dedup_tags

     last_updated_datetime = datetime.datetime.fromtimestamp(
@@ -746,7 +774,9 @@ def clean_battle_data(
     for votetype in ["tievote", "bothbad_vote", "leftvote", "rightvote"]:
         vote_key = "anony_" + votetype
         userid_key = "sanitized_id" if sanitize_ip else "ip"
-        top_30_users = sorted(all_ips.values(), key=lambda x: x[vote_key], reverse=True)[:30]
+        top_30_users = sorted(
+            all_ips.values(), key=lambda x: x[vote_key], reverse=True
+        )[:30]
         top_30_ip_id = ["arena_user_" + ip[userid_key] for ip in top_30_users]
         battles_top_30_ips = battles[battles["judge"].isin(top_30_ip_id)]
         print(f"Top 30 IPs #battles: {len(battles_top_30_ips)}")
@@ -755,13 +785,15 @@ def clean_battle_data(
         for user in top_30_users:
             user_ip = user["ip"]
             user_id = "arena_user_" + user[userid_key]
-
+
             ip_battles = battles_top_30_ips[battles_top_30_ips["judge"] == user_id]
             win_count = len(ip_battles[ip_battles["winner"] == "model_a"])
             tie_count = len(ip_battles[ip_battles["winner"] == "tie"])
             loss_count = len(ip_battles[ip_battles["winner"] == "model_b"])
-            print(f"{user_id}: model_a {win_count}, tie {tie_count}, mobel_b {loss_count}, {user_ip}")
-
+            print(
+                f"{user_id}: model_a {win_count}, tie {tie_count}, model_b {loss_count}, {user_ip}"
+            )
+
     return battles


@@ -784,8 +816,14 @@ def clean_battle_data(
     ban_ip_list = json.load(open(args.ban_ip_file)) if args.ban_ip_file else None

     battles = clean_battle_data(
-        log_files, args.exclude_model_names or [], ban_ip_list, args.sanitize_ip, anony_only=args.anony_only,
-        run_dedup=args.run_dedup, vision=args.vision, txt2img=args.txt2img
+        log_files,
+        args.exclude_model_names or [],
+        ban_ip_list,
+        args.sanitize_ip,
+        anony_only=args.anony_only,
+        run_dedup=args.run_dedup,
+        vision=args.vision,
+        txt2img=args.txt2img,
     )
     last_updated_tstamp = battles.iloc[-1]["tstamp"]
     cutoff_date = datetime.datetime.fromtimestamp(
@@ -801,7 +839,9 @@ def clean_battle_data(
     print(f"Write cleaned data to {output}")

     if not args.txt2img:
-        battles = battles.drop(columns=["conversation_a", "conversation_b", "question_id"])
+        battles = battles.drop(
+            columns=["conversation_a", "conversation_b", "question_id"]
+        )
     print("Samples:")
     print(battles[:5])

diff --git a/fastchat/serve/monitor/clean_chat_data.py b/fastchat/serve/monitor/clean_chat_data.py
index 7a999babd..7ec82ad50 100644
--- a/fastchat/serve/monitor/clean_chat_data.py
+++ b/fastchat/serve/monitor/clean_chat_data.py
@@ -41,7 +41,9 @@ def get_log_files(max_num_files=None, is_vision=False):
             prefix = ""
             if is_vision:
                 prefix = "vision-tmp-"
-            name = os.path.expanduser(f"~/fastchat_logs/server{i}/{prefix}{d}-conv.json")
+            name = os.path.expanduser(
+                f"~/fastchat_logs/server{i}/{prefix}{d}-conv.json"
+            )
             if os.path.exists(name):
                 filenames.append(name)
     max_num_files = max_num_files or len(filenames)
@@ -120,9 +122,7 @@ def clean_chat_data(log_files, action_type, remove_prompt=False, exclude_models=
                 msg = x["content"]
                 if isinstance(x["content"], list):
                     msg = x["content"][0]
-                x["num_tokens"] = len(
-                    encoding.encode(msg, allowed_special="all")
-                )
+                x["num_tokens"] = len(encoding.encode(msg, allowed_special="all"))
                 messages_concat += msg.lower()

         if remove_prompt:
@@ -185,21 +185,27 @@ def clean_chat_data(log_files, action_type, remove_prompt=False, exclude_models=
     parser.add_argument("--action-type", type=str, default="chat")
     parser.add_argument("--max-num-files", type=int)
     parser.add_argument("--vision", action="store_true")
-    parser.add_argument("--start-time", type=str) # example: 2024-08-01
-    parser.add_argument("--end-time", type=str) # example: 2024-08-01
+    parser.add_argument("--start-time", type=str)  # example: 2024-08-01
+    parser.add_argument("--end-time", type=str)  # example: 2024-08-01
     parser.add_argument("--remove-prompt", action="store_true")
     parser.add_argument("--exclude-models", type=str, nargs="+", default=[])
     args = parser.parse_args()

     log_files = get_log_files(args.max_num_files, args.vision)
     # print(log_files)
-    chats = clean_chat_data(log_files, args.action_type, args.remove_prompt, args.exclude_models)
+    chats = clean_chat_data(
+        log_files, args.action_type, args.remove_prompt, args.exclude_models
+    )
     print(len(chats))

     # convert to dataframe
     chats = pd.DataFrame(chats)
     if args.start_time is not None:
-        chats = chats[pd.to_datetime(chats["tstamp"], unit="s") >= pd.to_datetime(args.start_time)]
-        chats = chats[pd.to_datetime(chats["tstamp"], unit='s') < pd.to_datetime(args.end_time)]
+        chats = chats[
+            pd.to_datetime(chats["tstamp"], unit="s") >= pd.to_datetime(args.start_time)
+        ]
+        chats = chats[
+            pd.to_datetime(chats["tstamp"], unit="s") < pd.to_datetime(args.end_time)
+        ]
         print(len(chats))

     last_updated_tstamp = chats.iloc[-1]["tstamp"]
diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/compute_stats.py b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/compute_stats.py
index be7b1f0a0..27601fcd9 100644
--- a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/compute_stats.py
+++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/compute_stats.py
@@ -98,11 +98,13 @@ def to_remove(x):
 #     "lmsys/vicuna-7b-v1.5", use_fast=False
 # )

+
 def num_tokens_from_string(string: str) -> int:
     encoding = tiktoken.encoding_for_model("gpt-4")
     num_tokens = len(encoding.encode(string))
     return num_tokens

+
 prompts = []
 responses = []
 for conv in df["conversation"]:
diff --git a/fastchat/serve/monitor/elo_analysis.py b/fastchat/serve/monitor/elo_analysis.py
index 93b884659..b7d309cb9 100644
--- a/fastchat/serve/monitor/elo_analysis.py
+++ b/fastchat/serve/monitor/elo_analysis.py
@@ -101,7 +101,7 @@
     },
     "no_refusal": {
         "filter_func": NoRefusalFilter(),
-    },
+    },
     "hard_english_5": {
         "filter_func": HardFilter(threshold=5),
     },
@@ -145,9 +145,10 @@
     "creative_writing_style_control": {
         "filter_func": CreativeWritingFilter(),
         "style_control": True,
-    }
+    },
 }

+
 def get_median_elo_from_bootstrap(bootstrap_df):
     median = dict(bootstrap_df.quantile(0.5))
     median = {k: int(v + 0.5) for k, v in median.items()}
@@ -428,6 +429,7 @@ def outlier_detect(
     battles = battles[~battles["judge"].isin(bad_user_id_list)]
     return battles

+
 def filter_long_conv(row):
     threshold = 768
     for conversation_type in ["conversation_a", "conversation_b"]:
@@ -486,15 +488,22 @@ def report_elo_analysis_results(
     # Only use anonymous votes
     battles = battles[battles["anony"]].reset_index(drop=True)
     battles_no_ties = battles[~battles["winner"].str.contains("tie")]
-
+
     # remove model with less than 20 votes
-    model_vote_cnt = battles["model_a"].value_counts() + battles["model_b"].value_counts()
+    model_vote_cnt = (
+        battles["model_a"].value_counts() + battles["model_b"].value_counts()
+    )
     print(f"Number of models: {len(model_vote_cnt)}")
     model_vote_cnt = model_vote_cnt[model_vote_cnt >= 20]
     print(f"Number of models with at least 20 votes: {len(model_vote_cnt)}")
-    battles = battles[battles["model_a"].isin(model_vote_cnt.index) & battles["model_b"].isin(model_vote_cnt.index)]
-    print(f"Number of battles after removing models with less than 20 votes: {len(battles)}")
-
+    battles = battles[
battles["model_a"].isin(model_vote_cnt.index) + & battles["model_b"].isin(model_vote_cnt.index) + ] + print( + f"Number of battles after removing models with less than 20 votes: {len(battles)}" + ) + if exclude_tie: battles = battles_no_ties if exclude_bothbad: @@ -515,19 +524,30 @@ def report_elo_analysis_results( if rating_system == "bt": if style_control: bootstrap_df, boostrap_coef = compute_bootstrap_style_control( - battles, num_round=num_bootstrap, anchor_model_and_rating=anchor_model_and_rating, + battles, + num_round=num_bootstrap, + anchor_model_and_rating=anchor_model_and_rating, + ) + elo_rating_final, coef_final = compute_style_control( + battles, anchor_model_and_rating=anchor_model_and_rating ) - elo_rating_final, coef_final = compute_style_control(battles, anchor_model_and_rating=anchor_model_and_rating) else: bootstrap_df = compute_bootstrap_bt( - battles, num_round=num_bootstrap, num_cpu=num_cpu, anchor_model_and_rating=anchor_model_and_rating + battles, + num_round=num_bootstrap, + num_cpu=num_cpu, + anchor_model_and_rating=anchor_model_and_rating, + ) + elo_rating_final = compute_bt( + battles, anchor_model_and_rating=anchor_model_and_rating ) - elo_rating_final = compute_bt(battles, anchor_model_and_rating=anchor_model_and_rating) elif rating_system == "new_bt": anchor_model, anchor_rating = anchor_model_and_rating if style_control: bt_ratings, coef_final = compute_style_control(battles) - offset_score = anchor_rating - bt_ratings[bt_ratings.index == anchor_model].values[0] + offset_score = ( + anchor_rating - bt_ratings[bt_ratings.index == anchor_model].values[0] + ) bootstrap_df, boostrap_coef = compute_bootstrap_style_control( battles, num_round=num_bootstrap, offset=offset_score @@ -535,8 +555,10 @@ def report_elo_analysis_results( elo_rating_final = bt_ratings + offset_score else: bt_ratings = compute_bt(battles) - offset_score = anchor_rating - bt_ratings[bt_ratings.index == anchor_model].values[0] - + offset_score = ( + anchor_rating - bt_ratings[bt_ratings.index == anchor_model].values[0] + ) + bootstrap_df = compute_bootstrap_bt( battles, num_round=num_bootstrap, num_cpu=num_cpu, offset=offset_score ) @@ -672,7 +694,10 @@ def pretty_print_elo_rating(rating): anchor_model_and_rating = None if args.anchor: anchor_model_and_rating = args.anchor.split(":") - anchor_model_and_rating = (anchor_model_and_rating[0], int(anchor_model_and_rating[1])) + anchor_model_and_rating = ( + anchor_model_and_rating[0], + int(anchor_model_and_rating[1]), + ) results = {} for cat in args.category: print(f"# Running {cat} conversations") @@ -701,9 +726,6 @@ def pretty_print_elo_rating(rating): battles, **kwargs_default, ) - - - for cat in args.category: print(f"# Results for {cat} conversations") @@ -712,7 +734,7 @@ def pretty_print_elo_rating(rating): if args.style_control: print("# Median (style control)") pretty_print_elo_rating(results[f"{cat}_style_control"]["elo_rating_final"]) - + print(f"last update : {results[cat]['last_updated_datetime']}") last_updated_tstamp = results[cat]["last_updated_tstamp"] diff --git a/fastchat/serve/monitor/filter_pkl.py b/fastchat/serve/monitor/filter_pkl.py index d3c084cda..a240cb891 100644 --- a/fastchat/serve/monitor/filter_pkl.py +++ b/fastchat/serve/monitor/filter_pkl.py @@ -22,10 +22,12 @@ parser.add_argument("--battle-file", type=str, required=True) parser.add_argument("--output-pkl", type=str, required=True) parser.add_argument("--exclude-models", type=str, nargs="+", required=True) - parser.add_argument("--rename-models", type=str, 
nargs="+", required=False) # format: model-a:rename-a model-b:rename-b - + parser.add_argument( + "--rename-models", type=str, nargs="+", required=False + ) # format: model-a:rename-a model-b:rename-b + args = parser.parse_args() - + # Load the data data = pkl.load(open(args.input_pkl, "rb")) battles = pd.read_json(args.battle_file) @@ -33,10 +35,10 @@ print("Before dedup: ", len(battles)) battles = battles[battles["dedup_tag"].apply(lambda x: x.get("sampled", False))] print("After dedup: ", len(battles)) - + battles = battles[battles["anony"]].reset_index(drop=True) battles_no_ties = battles[~battles["winner"].str.contains("tie")] - + # remove exclude models battles_no_ties = battles_no_ties[ ~( @@ -44,15 +46,22 @@ | battles_no_ties["model_b"].isin(args.exclude_models) ) ] - + # rename models if args.rename_models: for rename_str in args.rename_models: model_name, new_name = rename_str.split(":") - battles_no_ties.loc[battles_no_ties["model_a"] == model_name, "model_a"] = new_name - battles_no_ties.loc[battles_no_ties["model_b"] == model_name, "model_b"] = new_name - - model_list = list(set(battles_no_ties["model_a"].unique()) | set(battles_no_ties["model_b"].unique())) + battles_no_ties.loc[ + battles_no_ties["model_a"] == model_name, "model_a" + ] = new_name + battles_no_ties.loc[ + battles_no_ties["model_b"] == model_name, "model_b" + ] = new_name + + model_list = list( + set(battles_no_ties["model_a"].unique()) + | set(battles_no_ties["model_b"].unique()) + ) limit_show_number = 25 new_dict = {} @@ -66,8 +75,9 @@ filtered_indices = battles_no_ties.progress_apply(filter_func, axis=1) print(f"Number of battles after filtering: {sum(filtered_indices)}") battles_no_ties_cat = battles_no_ties[filtered_indices] - battles_no_ties_cat = battles_no_ties_cat.sort_values(ascending=True, by=["tstamp"]) - + battles_no_ties_cat = battles_no_ties_cat.sort_values( + ascending=True, by=["tstamp"] + ) print(data[k].keys()) elo_rating_final = data[k]["elo_rating_final"] @@ -77,22 +87,26 @@ # remove exclude models # elo_rating_online = elo_rating_online[~elo_rating_online.index.isin(args.exclude_models)] - elo_rating_final = elo_rating_final[~elo_rating_final.index.isin(args.exclude_models)] + elo_rating_final = elo_rating_final[ + ~elo_rating_final.index.isin(args.exclude_models) + ] bootstrap_df = bootstrap_df.drop(columns=args.exclude_models, errors="ignore") leaderboard_table_df = leaderboard_table_df[ ~leaderboard_table_df.index.isin(args.exclude_models) ] - + # rename models if args.rename_models: for rename_str in args.rename_models: model_name, new_name = rename_str.split(":") elo_rating_final = elo_rating_final.rename(index={model_name: new_name}) bootstrap_df = bootstrap_df.rename(columns={model_name: new_name}) - leaderboard_table_df = leaderboard_table_df.rename(index={model_name: new_name}) + leaderboard_table_df = leaderboard_table_df.rename( + index={model_name: new_name} + ) model_order = list(elo_rating_final.keys()) - + # intersaction of model list and model order model_order = [model for model in model_order if model in model_list] model_order = model_order[:limit_show_number] @@ -101,9 +115,7 @@ win_fraction_heatmap = visualize_pairwise_win_fraction( battles_no_ties_cat, model_order ) - battle_count_heatmap = visualize_battle_count( - battles_no_ties_cat, model_order - ) + battle_count_heatmap = visualize_battle_count(battles_no_ties_cat, model_order) average_win_rate_bar = visualize_average_win_rate( battles_no_ties_cat, limit_show_number ) @@ -125,7 +137,7 @@ "bootstrap_df": 
            "leaderboard_table_df": leaderboard_table_df,
         }
-
+
         # data[k] = filtered_data
         new_dict[k] = filtered_data

@@ -133,4 +145,3 @@
     # Save the data
     with open(args.output_pkl, "wb") as f:
         pkl.dump(new_dict, f)
-
\ No newline at end of file
diff --git a/fastchat/serve/monitor/merge.py b/fastchat/serve/monitor/merge.py
index 858091e75..2fb788392 100644
--- a/fastchat/serve/monitor/merge.py
+++ b/fastchat/serve/monitor/merge.py
@@ -5,20 +5,22 @@
     parser = argparse.ArgumentParser()
     parser.add_argument("--elo-battle-pkl", type=str)
     parser.add_argument("--vision-elo-battle-pkl", type=str)
-    parser.add_argument("--output-merged-battle-pkl", type=str, default="merged_battle.pkl")
+    parser.add_argument(
+        "--output-merged-battle-pkl", type=str, default="merged_battle.pkl"
+    )
     args = parser.parse_args()
-
+
     # read two pkls
     with open(args.elo_battle_pkl, "rb") as f:
         elo_battle = pk.load(f)
     with open(args.vision_elo_battle_pkl, "rb") as f:
         vision_elo_battle = pk.load(f)
-
+
     # merge two pkls (dicts). one in key "text", another in key "vision"
     merged_battle = {}
     merged_battle["text"] = elo_battle
     merged_battle["vision"] = vision_elo_battle
-
+
     # save the merged pkl
     with open(args.output_merged_battle_pkl, "wb") as f:
         pk.dump(merged_battle, f)
diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py
index 6d20b2833..ab44eb131 100644
--- a/fastchat/serve/monitor/monitor.py
+++ b/fastchat/serve/monitor/monitor.py
@@ -107,7 +107,12 @@ def update_elo_components(
     if elo_results_file is None:  # Do live update
         ban_ip_list = json.load(open(ban_ip_file)) if ban_ip_file else None
         battles = clean_battle_data(
-            log_files, exclude_model_names, ban_ip_list=ban_ip_list, run_dedup=True, vision=vision, num_threads=16,
+            log_files,
+            exclude_model_names,
+            ban_ip_list=ban_ip_list,
+            run_dedup=True,
+            vision=vision,
+            num_threads=16,
         )
         # rating_system = "bt" if not vision else "new_bt"
         rating_system = "new_bt"
@@ -115,7 +120,7 @@ def update_elo_components(
             battles,
             rating_system=rating_system,
             scale=2,
-            anchor_model_and_rating=("gpt-4o-mini-2024-07-18", 1273)
+            anchor_model_and_rating=("gpt-4o-mini-2024-07-18", 1273),
         )

         # dump to file periodically with timestamp in filename
@@ -158,12 +163,21 @@ def update_elo_components(


 def update_worker(
-    max_num_files, interval, elo_results_file, ban_ip_file, exclude_model_names, vision=False
+    max_num_files,
+    interval,
+    elo_results_file,
+    ban_ip_file,
+    exclude_model_names,
+    vision=False,
 ):
     while True:
         tic = time.time()
         update_elo_components(
-            max_num_files, elo_results_file, ban_ip_file, exclude_model_names, vision=vision
+            max_num_files,
+            elo_results_file,
+            ban_ip_file,
+            exclude_model_names,
+            vision=vision,
         )
         durtaion = time.time() - tic
         print(f"update duration: {durtaion:.2f} s")
@@ -607,9 +621,7 @@ def build_arena_tab(
                     "#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
B Battles", elem_id="plot-title", ) - plot_1 = gr.Plot( - p1, show_label=False, elem_id="plot-container" - ) + plot_1 = gr.Plot(p1, show_label=False, elem_id="plot-container") with gr.Column(): gr.Markdown( "#### Figure 4: Battle Count for Each Combination of Models (without Ties)", @@ -712,6 +724,7 @@ def highlight_rank_max(s): return elo_datarame.style.apply(highlight_max, subset=["Rank* (UB)"]).apply( highlight_rank_max, subset=["Delta"] ) + # last_updated_time = elo_results["full"]["last_updated_datetime"].split(" ")[0] # for k in key_to_category_name.keys(): # if k not in elo_results: diff --git a/fastchat/serve/monitor/process.py b/fastchat/serve/monitor/process.py index 6d554dce4..240c493ed 100644 --- a/fastchat/serve/monitor/process.py +++ b/fastchat/serve/monitor/process.py @@ -3,7 +3,7 @@ import argparse import hashlib -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--input", type=str, required=True) parser.add_argument("--output", type=str, required=True) @@ -11,42 +11,61 @@ parser.add_argument("--exclude-models", type=str, required=False, nargs="+") parser.add_argument("--sample-ratio", type=float, required=True) parser.add_argument("--external", action="store_true", default=False) - parser.add_argument("--rename-models", type=str, nargs="+", required=False) # format: model-a:rename-a model-b:rename-b - parser.add_argument("--start-time", type=str) # example: 2024-08-01 + parser.add_argument( + "--rename-models", type=str, nargs="+", required=False + ) # format: model-a:rename-a model-b:rename-b + parser.add_argument("--start-time", type=str) # example: 2024-08-01 parser.add_argument("--only-listed-model", action="store_true", default=False) parser.add_argument("--prompt-only", action="store_true", default=False) args = parser.parse_args() - + # Load the data data = pd.read_json(args.input) data = data[data["anony"]] - + print(len(data)) if args.start_time is not None: - data = data[pd.to_datetime(data["tstamp"], unit="s") >= pd.to_datetime(args.start_time)] + data = data[ + pd.to_datetime(data["tstamp"], unit="s") >= pd.to_datetime(args.start_time) + ] print(len(data)) - data_to_exclude = json.load(open("/mnt/disks/data-20240517/analysis/release/labelbench/FastChat/fastchat/serve/monitor/classify/kaggle_question_id_info_20241122.json")) - data_to_exclude_id = data_to_exclude["train_question_ids"] + data_to_exclude["test_question_ids"] + data_to_exclude = json.load( + open( + "/mnt/disks/data-20240517/analysis/release/labelbench/FastChat/fastchat/serve/monitor/classify/kaggle_question_id_info_20241122.json" + ) + ) + data_to_exclude_id = ( + data_to_exclude["train_question_ids"] + data_to_exclude["test_question_ids"] + ) data = data[~data["question_id"].isin(data_to_exclude_id)] print("Number of votes after excluding kaggle questions: ", len(data)) if args.target_models is not None: if args.only_listed_model: - data = data[data["model_a"].isin(args.target_models) & data["model_b"].isin(args.target_models)] + data = data[ + data["model_a"].isin(args.target_models) + & data["model_b"].isin(args.target_models) + ] else: - data = data[data["model_a"].isin(args.target_models) | data["model_b"].isin(args.target_models)] + data = data[ + data["model_a"].isin(args.target_models) + | data["model_b"].isin(args.target_models) + ] if args.exclude_models: - data = data[~data["model_a"].isin(args.exclude_models) & ~data["model_b"].isin(args.exclude_models)] - + data = data[ + ~data["model_a"].isin(args.exclude_models) + & 
~data["model_b"].isin(args.exclude_models) + ] + # rename models if args.rename_models: for rename_str in args.rename_models: model_name, new_name = rename_str.split(":") data.loc[data["model_a"] == model_name, "model_a"] = new_name data.loc[data["model_b"] == model_name, "model_b"] = new_name - + print("Number of votes: ", len(data)) # "dedup_tag":{ # "high_freq":true, @@ -58,7 +77,7 @@ # sample data if args.sample_ratio < 1: data = data.sample(frac=args.sample_ratio, random_state=42) - + if args.external: # drop timestamp data = data.drop(columns=["tstamp"]) @@ -81,6 +100,6 @@ print("Data statistics:") print(data["model_a"].value_counts()) print("Total number of data points: ", len(data)) - + # output the data - data.to_json(args.output, orient="records", indent=4, force_ascii=False) \ No newline at end of file + data.to_json(args.output, orient="records", indent=4, force_ascii=False) diff --git a/fastchat/serve/monitor/rating_systems.py b/fastchat/serve/monitor/rating_systems.py index a6400a90f..017319c04 100644 --- a/fastchat/serve/monitor/rating_systems.py +++ b/fastchat/serve/monitor/rating_systems.py @@ -221,7 +221,8 @@ def compute_bt( matchups, outcomes, models, weights = preprocess_for_bt(df) ratings = fit_bt(matchups, outcomes, weights, len(models), math.log(base), tol) scaled_ratings = scale_and_offset( - ratings, models, scale, init_rating, anchor_model_and_rating) + ratings, models, scale, init_rating, anchor_model_and_rating + ) return pd.Series(scaled_ratings, index=models).sort_values(ascending=False) @@ -253,7 +254,9 @@ def compute_bootstrap_bt( results = list(tqdm(pool.imap_unordered(bt_fn, boot_weights), total=num_round)) ratings = np.array(results) - scaled_ratings = scale_and_offset(ratings, models, scale, init_rating+offset, anchor_model_and_rating) + scaled_ratings = scale_and_offset( + ratings, models, scale, init_rating + offset, anchor_model_and_rating + ) df = pd.DataFrame(scaled_ratings, columns=models) return df[df.median().sort_values(ascending=False).index] @@ -334,7 +337,13 @@ def fit_contextual_bt( def compute_style_control( - df, alpha=math.log(10.0), reg=0.5, init_rating=1000.0, scale=400.0, tol=1e-6, anchor_model_and_rating=None, + df, + alpha=math.log(10.0), + reg=0.5, + init_rating=1000.0, + scale=400.0, + tol=1e-6, + anchor_model_and_rating=None, ): matchups, features, outcomes, models = preprocess_for_style(df) ratings_params = fit_contextual_bt( @@ -348,7 +357,9 @@ def compute_style_control( ) ratings = ratings_params[: len(models)] params = ratings_params[len(models) :] - scaled_ratings = scale_and_offset(ratings, models, scale, init_rating, anchor_model_and_rating) + scaled_ratings = scale_and_offset( + ratings, models, scale, init_rating, anchor_model_and_rating + ) scaled_ratings = pd.Series(scaled_ratings, index=models).sort_values( ascending=False ) @@ -392,6 +403,8 @@ def compute_bootstrap_style_control( ratings_params = np.array(results) ratings = ratings_params[:, : len(models)] params = ratings_params[:, len(models) :] - scaled_ratings = scale_and_offset(ratings, models, scale, init_rating+offset, anchor_model_and_rating) + scaled_ratings = scale_and_offset( + ratings, models, scale, init_rating + offset, anchor_model_and_rating + ) df = pd.DataFrame(scaled_ratings, columns=models) return df[df.median().sort_values(ascending=False).index], params diff --git a/fastchat/serve/monitor/read_data.py b/fastchat/serve/monitor/read_data.py index 08632af63..48c67435a 100644 --- a/fastchat/serve/monitor/read_data.py +++ 
@@ -2,13 +2,13 @@
 import argparse
 import pandas as pd

-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--data-path', type=str)
-    parser.add_argument('--model-list', type=str, nargs='+')
+    parser.add_argument("--data-path", type=str)
+    parser.add_argument("--model-list", type=str, nargs="+")
     args = parser.parse_args()

-    with open(args.data_path, 'rb') as f:
+    with open(args.data_path, "rb") as f:
         data = pkl.load(f)
     print(data.keys())
     # only the models in the model list. dataframe index is model name
@@ -17,14 +17,16 @@
     # pd.set_option('display.max_rows', None)
     print(leaderboard_df.index)
     sub_df = leaderboard_df.loc[args.model_list, :]
-    sub_df["95%CI"] = sub_df.apply(lambda x: f"{x['rating']:.2f} + {x['rating_q975']-x['rating']:.2f} - {x['rating']-x['rating_q025']:.2f}", axis=1)
+    sub_df["95%CI"] = sub_df.apply(
+        lambda x: f"{x['rating']:.2f} + {x['rating_q975']-x['rating']:.2f} - {x['rating']-x['rating_q025']:.2f}",
+        axis=1,
+    )
     # precision 2 decimal
-
     # drop variance column
     sub_df = sub_df.drop(columns=["variance", "rating_q025", "rating_q975"])
-
+
     # sort columns
     sub_df = sub_df[["rating", "95%CI", "num_battles"]]
-
-    print(sub_df)
\ No newline at end of file
+
+    print(sub_df)
diff --git a/fastchat/serve/monitor/utils.py b/fastchat/serve/monitor/utils.py
index 994a8af42..1ae2ae929 100644
--- a/fastchat/serve/monitor/utils.py
+++ b/fastchat/serve/monitor/utils.py
@@ -165,11 +165,15 @@ def process_battle_file(battle_file_path: str, n_cpus: int):
 def dedup_process(df: pd.DataFrame, percentile: float = 0.999) -> pd.DataFrame:
     """Deduplicate conversations based on the prompt."""
     # Preprocessing
-    df["post_process_conv"] = df["conversation_a"].apply(lambda x: " ".join([turn["content"] for turn in x if turn["role"] == "user"])[:10000])
-
+    df["post_process_conv"] = df["conversation_a"].apply(
+        lambda x: " ".join([turn["content"] for turn in x if turn["role"] == "user"])[
+            :10000
+        ]
+    )
+
     print("Number of conversations: ", len(df))

-    prompt_counts = df['post_process_conv'].value_counts()
+    prompt_counts = df["post_process_conv"].value_counts()
     # Show the top 20 most frequent prompts
     top_prompts = prompt_counts.head(20)
     print(top_prompts)
@@ -180,15 +184,21 @@ def dedup_process(df: pd.DataFrame, percentile: float = 0.999) -> pd.DataFrame:

     # prompts that are more common than the percentile cutoff
     high_frequency_prompts = prompt_counts[prompt_counts > percentile_cutoff].index
-    print(f"Number of high frequency prompts: {len(high_frequency_prompts)}/{len(prompt_counts)}")
-
+    print(
+        f"Number of high frequency prompts: {len(high_frequency_prompts)}/{len(prompt_counts)}"
+    )
+
     # initialize a new column dedup_tag
-    dedup_tags = np.array([{"high_freq": False, "sampled": True} for _ in range(len(df))])
-    high_freq_groups = df.groupby('post_process_conv')
+    dedup_tags = np.array(
+        [{"high_freq": False, "sampled": True} for _ in range(len(df))]
+    )
+    high_freq_groups = df.groupby("post_process_conv")
     # import pdb; pdb.set_trace()
     for prompt in tqdm(high_frequency_prompts):
         df_high_freq = high_freq_groups.get_group(prompt)
-        sampled_indices = df_high_freq.sample(n=int(percentile_cutoff), random_state=42).index
+        sampled_indices = df_high_freq.sample(
+            n=int(percentile_cutoff), random_state=42
+        ).index
         dedup_tags[df_high_freq.index] = {"high_freq": True, "sampled": False}
         dedup_tags[sampled_indices] = {"high_freq": True, "sampled": True}

@@ -196,9 +206,9 @@ def dedup_process(df: pd.DataFrame, percentile: float = 0.999) -> pd.DataFrame:
     # drop intermediate columns (post_process_conv)
     df = df.drop(columns=["post_process_conv"])
-
+
     return df
-
+

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()