HAFix initial commit

SAILResearch · Jan 14, 2025 · 16eac87 · 16eac87
commit 16eac87
Show file tree

Hide file tree

Showing 118 changed files with 264,795 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,30 @@
+readbymyself.txt
+
+.idea/
+analysis/inspect_model_output/
+analysis/result_csv/
+analysis/result_pictures/
+analysis/RQ3
+analysis/**/*.png
+
+backup/
+
+dataset/src/test.py
+
+evaluation/**/*.png
+
+model_inference/codellama_13b_instruct_hf_instruct/
+model_inference/codellama_34b_instruct_hf
+model_inference/codellama_34b_python_hf
+model_inference/codellama_70b_instruct_hf
+model_inference/codellama_70b_python_hf
+model_inference/examples
+model_inference/log
+model_inference/test
+model_inference/codellama_13b*
+model_inference/codellama_34b*
+model_inference/codellama_70b*
+
+subject_projects/
+
+**/__pycache__/
diff --git a/README.md b/README.md
@@ -0,0 +1 @@
+HAFix: History-Augmented Large Language Models for Bug Fixing
diff --git a/analysis/draw_figure.py b/analysis/draw_figure.py
@@ -0,0 +1,164 @@
+def main():
+    """Render every two-set Venn diagram produced by this script.
+
+    Each set literal below holds case IDs as strings. The last argument of
+    every helper call is the file stem of the PNG that helper writes.
+    """
+    # Instruct prompt: baseline compared with two other settings.
+    instruct_baseline = {'3', '5', '6', '7', '8', '9', '23', '24', '26', '27', '28', '36', '37', '38', '42', '48', '49', '54', '66', '67'}
+    instruct_combine = {'2', '3', '5', '6', '7', '8', '9', '10', '17', '23', '24', '26', '27', '28', '29', '32', '36', '37', '38', '42', '46', '47', '48', '49', '53', '54', '60', '66', '67'}
+    instruct_6 = {'2', '3', '5', '6', '7', '8', '9', '10', '17', '23', '24', '26', '27', '29', '32', '36', '38', '42', '48', '49', '54', '66'}
+
+    create_2_set_venn_percentage(instruct_baseline, instruct_combine, 'Baseline', 'HAFix-Agg', 'instruct_1_HAFix_percentage')
+    create_2_set_venn_percentage(instruct_baseline, instruct_6, 'baseline', 'FLN-all', 'instruct_6_FLN-all_percentage')
+    create_2_set_venn(instruct_baseline, instruct_combine, 'baseline', 'aggregation', 'instruct')
+
+    # The remaining comparisons all produce the same pair of figures:
+    # an ID-listing diagram named <stem> and a percentage diagram named
+    # <stem>_percentage.
+    paired_figures = [
+        (
+            'instruct_label',
+            {'3', '5', '6', '7', '8', '9', '23', '24', '26', '27', '28', '32', '36', '37', '41', '42', '44', '49', '54', '67'},
+            {'3', '6', '7', '8', '9', '10', '23', '24', '26', '27', '28', '32', '36', '37', '41', '42', '44', '46', '49', '53', '54', '60', '67'},
+        ),
+        (
+            'infill',
+            {'2', '23', '24', '26', '47', '48', '49'},
+            {'2', '5', '23', '26', '41', '47', '48', '49'},
+        ),
+        (
+            'union',
+            {'2', '3', '5', '6', '7', '8', '9', '23', '24', '26', '27', '28', '32', '36', '37', '38', '41', '42', '44', '47', '48', '49', '54', '66', '67'},
+            {'2', '3', '5', '6', '7', '8', '9', '10', '17', '23', '24', '26', '27', '28', '29', '32', '36', '37', '38', '41', '42', '44', '46', '47', '48', '49', '53', '54', '60', '66', '67'},
+        ),
+    ]
+    for stem, baseline_ids, aggregation_ids in paired_figures:
+        create_2_set_venn(baseline_ids, aggregation_ids, 'baseline', 'aggregation', stem)
+        create_2_set_venn_percentage(baseline_ids, aggregation_ids, 'baseline', 'aggregation', f'{stem}_percentage')
+
+
+
+def create_2_set_venn(set1, set2, set1_label, set2_label, picture_name):
+    """Draw a two-set Venn diagram whose regions list their member IDs.
+
+    Args:
+        set1: First set of numeric strings.
+        set2: Second set of numeric strings.
+        set1_label: Caption for the first circle.
+        set2_label: Caption for the second circle.
+        picture_name: File stem; the figure is saved as ``<picture_name>.png``.
+    """
+    import matplotlib.pyplot as plt
+    from matplotlib_venn import venn2, venn2_circles
+
+    def format_label_text(numbers):
+        """Lay the IDs out in numeric order on rows of 1, 2, 3, ... items."""
+        pending = iter(sorted(numbers, key=int))
+        rows = []
+        row_width = 1
+        while True:
+            # zip() stops as soon as range() is exhausted, so at most
+            # row_width items are pulled from the shared iterator.
+            row = [item for _, item in zip(range(row_width), pending)]
+            if not row:
+                break
+            rows.append(", ".join(row))
+            row_width += 1  # each row holds one more item than the last
+        return "\n".join(rows)
+
+    venn = venn2([set1, set2], (set1_label, set2_label))
+
+    # Region ids follow matplotlib_venn's convention: '10' is set1 only,
+    # '01' is set2 only, '11' is the overlap.
+    region_members = {
+        '10': set1 - set2,
+        '01': set2 - set1,
+        '11': set1 & set2,
+    }
+    for region_id, members in region_members.items():
+        region_label = venn.get_label_by_id(region_id)
+        region_label.set_text(format_label_text(members))
+        region_label.set_fontsize(8)  # small font so long ID lists fit
+
+    plt.savefig(f'{picture_name}.png', bbox_inches='tight')
+    plt.close()
+
+
+def create_2_set_venn_percentage(set1, set2, set1_label, set2_label, picture_name):
+    """Draw a two-set Venn diagram labelled with counts and percentages.
+
+    Every region shows its element count and, two lines below, its share of
+    ``len(set1 | set2)`` with two decimals. The set1-only region is left
+    blank when it is empty.
+
+    Args:
+        set1: First set.
+        set2: Second set.
+        set1_label: Caption for the first circle.
+        set2_label: Caption for the second circle.
+        picture_name: File stem; the figure is saved as ``<picture_name>.png``.
+    """
+    import matplotlib.pyplot as plt
+    from matplotlib_venn import venn2, venn2_circles
+
+    union_size = len(set1 | set2)
+
+    def caption(members):
+        """Return the 'count / percentage of union' text for one region."""
+        share = len(members) / union_size * 100
+        return f'{len(members)}\n\n{share:.2f}%'
+
+    set1_only = set1 - set2
+    set2_only = set2 - set1
+    shared = set1 & set2
+
+    # Captions are computed before drawing, so an empty union fails on the
+    # division before any figure is created.
+    set1_only_caption = caption(set1_only)
+    region_captions = {
+        '10': set1_only_caption if set1_only else '',
+        '01': caption(set2_only),
+        '11': caption(shared),
+    }
+
+    venn = venn2([set1, set2], (set1_label, set2_label))
+    for region_id, text in region_captions.items():
+        region_label = venn.get_label_by_id(region_id)
+        region_label.set_text(text)
+        region_label.set_fontsize(8)
+
+    # To recolour the regions, set the face colour on venn.patches, e.g.
+    # venn.patches[0].set_facecolor('red').
+
+    plt.savefig(f'{picture_name}.png', bbox_inches='tight')
+    plt.close()
+
+
+def create_3_set_venn():
+    """Placeholder for a three-set Venn diagram; currently does nothing.
+
+    The commented-out code below is a sketch of an implementation based on
+    ``matplotlib_venn.venn3`` with placeholder sets. It is never executed.
+    """
+    # import matplotlib.pyplot as plt
+    # from matplotlib_venn import venn3
+    #
+    # # Define your sets with case IDs
+    # set1 = {'ID1', 'ID2', 'ID3', 'ID4'}
+    # set2 = {'ID3', 'ID4', 'ID5', 'ID6'}
+    # set3 = {'ID4', 'ID6', 'ID7', 'ID8'}
+    #
+    # # Create the Venn diagram
+    # venn = venn3([set1, set2, set3], ('Set1', 'Set2', 'Set3'))
+    #
+    # # Customize the labels inside the circles
+    # venn.get_label_by_id('100').set_text('\n'.join(set1 - set2 - set3))
+    # venn.get_label_by_id('010').set_text('\n'.join(set2 - set1 - set3))
+    # venn.get_label_by_id('001').set_text('\n'.join(set3 - set1 - set2))
+    # venn.get_label_by_id('110').set_text('\n'.join(set1 & set2 - set3))
+    # venn.get_label_by_id('101').set_text('\n'.join(set1 & set3 - set2))
+    # venn.get_label_by_id('011').set_text('\n'.join(set2 & set3 - set1))
+    # venn.get_label_by_id('111').set_text('\n'.join(set1 & set2 & set3))
+    #
+    # # Optionally set the font size for better readability
+    # for subset in ['100', '010', '001', '110', '101', '011', '111']:
+    #     venn.get_label_by_id(subset).set_fontsize(8)
+    #
+    #
+    # # Display the plot
+    # plt.savefig('venn3.png', bbox_inches='tight')
+    # # Display the plot
+    # plt.show()
+    # plt.close()
+    pass
+
+
+# Generate the figures only when this file is run as a script.
+if __name__ == '__main__':
+    main()
diff --git a/analysis/inference_time_analysis.py b/analysis/inference_time_analysis.py
@@ -0,0 +1,51 @@
+import re
+
+
+def get_nucleus_sampling_start_time(file_path):
+    """Print a ``start_at`` timestamp for every bug found in a log file.
+
+    The reported timestamp is the one on a ``Current time ...: Finish
+    generation`` line; the bug id comes from the nearest following
+    ``finish the bug: <id>`` text.
+    NOTE(review): presumably the end of that generation step is when
+    nucleus sampling starts; confirm against the log format.
+
+    Args:
+        file_path: Path of the inference log to scan.
+    """
+    with open(file_path, 'r') as log_file:
+        log_text = log_file.read()
+
+    # DOTALL lets the lazy ".*?" cross line breaks between the two markers.
+    timestamp_then_bug = re.compile(
+        r'Current time (\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}): Finish generation.*?finish the bug: (\d+)',
+        re.DOTALL,
+    )
+
+    for found in timestamp_then_bug.finditer(log_text):
+        start_at, bug_id = found.groups()
+        print(f"bug_id={bug_id} start_at={start_at}")
+
+
+def get_nucleus_sampling_end_time(file_path):
+    """Print a ``finish_at`` timestamp for every bug found in a log file.
+
+    The bug id comes from a ``finish the bug: <id>`` text; the reported
+    timestamp is the one on the nearest following ``Current time ...: Start
+    generation`` line.
+    NOTE(review): presumably the next generation starting marks the end of
+    nucleus sampling for that bug; confirm against the log format. A bug
+    with no later "Start generation" line (e.g. the last one) is not printed.
+
+    Args:
+        file_path: Path of the inference log to scan.
+    """
+    with open(file_path, 'r') as file:
+        text = file.read()
+
+    # Bug id first, then the next "Start generation" timestamp. DOTALL lets
+    # the lazy ".*?" cross line breaks between the two markers.
+    pattern = re.compile(
+        r'finish the bug: (\d+).*?Current time (\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}): Start generation',
+        re.DOTALL,
+    )
+
+    # Print one line per (bug id, timestamp) pair, in log order.
+    for bug_id, finish_at in pattern.findall(text):
+        print(f"bug_id={bug_id} finish_at={finish_at}")
+
+
+def main():
+    """Print start and end timestamps for the eight numbered inference logs."""
+    log_dir = "/home/22ys22/project/fm-apr-replay/model_inference/log/old_prompt"
+    for log_number in range(1, 9):
+        file_path = f"{log_dir}/codellama_7b_instruct_{log_number}.log"
+        print(f"==========================log {log_number}==========================")
+        get_nucleus_sampling_start_time(file_path)
+        print("=============================================================")
+        get_nucleus_sampling_end_time(file_path)
+
+
+# Scan the logs only when this file is run as a script.
+if __name__ == '__main__':
+    main()
diff --git a/analysis/inspect_model_output.py b/analysis/inspect_model_output.py
@@ -0,0 +1,64 @@
+import os
+import json
+
+from util import HistoryCategory
+
+# Absolute path of the directory that contains this script.
+CURRENT_DIR_PATH = os.path.abspath(os.path.dirname(__file__))
+# Parent of the script directory (presumably the project root).
+PROJECT_DIR_BASE = os.path.abspath(os.path.join(CURRENT_DIR_PATH, '../'))
+# Model outputs and evaluation results are both read from <parent>/backup/.
+MODEL_INFERENCE_BASE_PATH = os.path.abspath(os.path.join(PROJECT_DIR_BASE, 'backup', 'model_inference'))
+MODEL_EVALUATION_BASE_PATH = os.path.abspath(os.path.join(PROJECT_DIR_BASE, 'backup', 'evaluation'))
+
+
+def _load_json(path):
+    """Parse one JSON file and close its handle before returning."""
+    with open(path, 'r') as handle:
+        return json.load(handle)
+
+
+def main():
+    """Write one text report per bug with each model's greedy-search output.
+
+    For bug ids 1..68 a file ``inspect_model_output/model_output_bug_<id>.txt``
+    is opened in append mode next to this script. For every model and every
+    history setting 1..8 that has both an inference entry and an evaluation
+    entry for the bug, the report gets the setting name, the
+    ``greedy_search_flag`` result and the generated code. The buggy code and
+    the ground-truth fix are written once per model, under setting 1.
+
+    Raises:
+        FileNotFoundError: if any expected inference or evaluation JSON file
+            is missing (raised while loading, before any report is written).
+    """
+    model_name_path = [
+        'codellama_7b_instruct_hf',
+        'codellama_7b_instruct_hf_infill',
+        'codellama_13b_instruct_hf',
+        'codellama_13b_instruct_hf_infill',
+        'codellama_34b_instruct_hf'
+    ]
+    history_settings_ids = [1, 2, 3, 4, 5, 6, 7, 8]
+
+    # Parse every result file once up front instead of once per bug.
+    loaded = {}
+    for model_name in model_name_path:
+        # Files inside an "*_infill" directory are named without the suffix;
+        # for every other model this replace() is a no-op.
+        file_stem = model_name.replace('_infill', '')
+        for history_id in history_settings_ids:
+            history_flag = str(history_id)
+            setting_name = HistoryCategory(history_flag).name
+            inference_path = f"{MODEL_INFERENCE_BASE_PATH}/{model_name}/{file_stem}_{setting_name}.json"
+            evaluate_path = f"{MODEL_EVALUATION_BASE_PATH}/{model_name}/unittest_result_{file_stem}_{setting_name}.json"
+            loaded[(model_name, history_flag)] = (
+                setting_name,
+                _load_json(inference_path),
+                _load_json(evaluate_path),
+            )
+
+    inspect_dir = f"{CURRENT_DIR_PATH}/inspect_model_output"
+    os.makedirs(inspect_dir, exist_ok=True)
+
+    for bug_id in range(1, 69):
+        bug_id_str = str(bug_id)
+        bug_id_inspect_path = f"{inspect_dir}/model_output_bug_{bug_id}.txt"
+        # 'a' keeps earlier runs' content; "with" closes the file even if a
+        # lookup below raises.
+        with open(bug_id_inspect_path, 'a') as bug_id_inspect_result:
+            bug_id_inspect_result.write(f"####################inspect model-generated code for bug {bug_id}####################\n")
+            for model_name in model_name_path:
+                bug_id_inspect_result.write(
+                    f"####################inspect {model_name}-generated code####################\n")
+                for history_id in history_settings_ids:
+                    history_flag = str(history_id)
+                    setting_name, inference_json, evaluate_result = loaded[(model_name, history_flag)]
+
+                    # Skip settings that lack either side for this bug.
+                    if bug_id_str not in inference_json or bug_id_str not in evaluate_result:
+                        continue
+                    if history_flag == '1':
+                        buggy_code = inference_json[bug_id_str]['input']['buggy_code']
+                        ground_fixed_code = inference_json[bug_id_str]['ground_fixed_code']
+                        bug_id_inspect_result.write("#1. buggy code\n")
+                        bug_id_inspect_result.write(f"{buggy_code}\n\n")
+                        bug_id_inspect_result.write("#2. ground fixed code\n")
+                        bug_id_inspect_result.write(f"{ground_fixed_code}\n\n")
+
+                    model_generated_code = inference_json[bug_id_str]['output']['greedy_search']
+                    result = evaluate_result[bug_id_str]['greedy_search_flag']
+                    bug_id_inspect_result.write(f"#setting_{history_flag}: {setting_name}\n")
+                    bug_id_inspect_result.write(f"{result}\n")
+                    bug_id_inspect_result.write(f"{model_generated_code}\n\n")
+
+
+# Build the inspection reports only when this file is run as a script.
+if __name__ == '__main__':
+    main()
diff --git a/analysis/result_analysis.py b/analysis/result_analysis.py
@@ -0,0 +1,55 @@
+import json
+import os
+import csv
+from util import HistoryCategory
+
+
+def main():
+    """Flatten per-setting unit-test results into one CSV of pass flags.
+
+    Every ``unittest_result_<setting name>.json`` file in ``source_path`` is
+    read, and one row per bug is appended through ``save_result``. With
+    ``k == 1`` a bug passes when its ``greedy_search_flag`` is ``'Pass'``;
+    otherwise it passes when any of its first ``k`` ``nucleus_sampling_flags``
+    is ``'Pass'``. An existing output file is deleted first.
+    """
+    model_size = '7b'  # 34b 13b 7b
+    # prompt_style = 'instruct'
+
+    k = 5  # 1, 5, 10
+
+    # NOTE(review): "[email protected]" looks like a scrape artifact of the
+    # real file name; confirm the intended output path.
+    save_csv_file = f"/home/22ys22/project/fm-apr-replay/analysis/[email protected]"
+    source_path = f"/home/22ys22/project/fm-apr-replay/backup/evaluation/codellama_7b_instruct_hf_infill/"
+    if os.path.exists(save_csv_file):
+        os.remove(save_csv_file)
+
+    model_id = f"codellama_{model_size}_instruct"
+    # Map each expected file name to its setting so a directory entry is
+    # matched with a single lookup.
+    setting_by_file_name = {
+        f'unittest_result_{setting.name}.json': setting for setting in HistoryCategory
+    }
+
+    for file_name in os.listdir(source_path):
+        setting = setting_by_file_name.get(file_name)
+        if setting is None:
+            continue
+        with open(os.path.join(source_path, file_name), 'r') as result_file:
+            name_suf_dict = json.load(result_file)
+
+        for bug_id, result in name_suf_dict.items():
+            if k == 1:
+                passed = result['greedy_search_flag'] == 'Pass'
+            else:
+                # Only the first k samples count.
+                sampled_flags = list(result['nucleus_sampling_flags'])[:k]
+                passed = any(sample_flag == 'Pass' for sample_flag in sampled_flags)
+            save_result(save_csv_file, model_id, setting.value, setting.name,
+                        bug_id, 1 if passed else 0)
+
+
+def save_result(save_csv_file, model_id, setting_id, setting_name, bug_id, test_result):
+    """Append one result row to a CSV file, writing the header if it is new.
+
+    Args:
+        save_csv_file: Path of the CSV file; created when it does not exist.
+        model_id: Value for the ``model_id`` column.
+        setting_id: Value for the ``setting_id`` column.
+        setting_name: Value for the ``setting_name`` column.
+        bug_id: Value for the ``bug_id`` column.
+        test_result: Value for the ``test_result`` column.
+    """
+    needs_header = not os.path.exists(save_csv_file)
+    # newline='' is what the csv module requires for files it writes;
+    # without it, platforms that translate "\n" emit an extra blank line
+    # after every row.
+    with open(save_csv_file, 'a', newline='') as f:
+        csv_write = csv.writer(f)
+        if needs_header:
+            csv_write.writerow(["model_id", "setting_id", "setting_name", "bug_id", "test_result"])
+        csv_write.writerow([model_id, setting_id, setting_name, bug_id, test_result])
+
+
+# Export the CSV only when this file is run as a script.
+if __name__ == "__main__":
+    main()
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		HAFix: History-Augmented Large Language Models for Bug Fixing