Minor tweaks

secure-software-engineering · Jul 22, 2024 · 3db15b5 · 3db15b5
1 parent 833427d
commit 3db15b5
Show file tree

Hide file tree

Showing 6 changed files with 187 additions and 18 deletions.
diff --git a/README.md b/README.md
@@ -14,6 +14,12 @@
 - 🔄 Efficiently transforms inferred types into a **standardized format**.
 - 📊 Automatically produces **meaningful metrics** for in-depth assessment and comparison.
 
+### [New] TypeEvalPy Autogen
+- 🤖 **Autogenerates code snippets** and ground truth to scale the benchmark based on the original `TypeEvalPy` benchmark.
+- 📈 The autogen benchmark now contains:
+  - **Python files**: 7121
+  - **Type annotations**: 78373
+
 ## 🛠️ Supported Tools
 
 | Supported :white_check_mark:                                          | In-progress :wrench:                                                 | Planned :bulb:                                        |
@@ -231,6 +237,29 @@ docker run \
 
 ---
 
+## Running TypeEvalPy Autogen
+
+To generate an extended version of the original TypeEvalPy benchmark that includes many more Python types, run the following commands:
+
+1.  **Navigate to the `autogen` Directory**
+
+    ```bash
+    cd autogen
+    ```
+
+
+2.  **Execute the Generation Script**
+
+    Run the following command to start the generation process:
+
+    ```bash
+    python generate_typeevalpy_dataset.py
+    ```
+
+This will generate a folder in the repo root containing the autogen benchmark, with the current date and time appended to the folder name.
+
+---
+
 ### 🤝 Contributing
 
 Thank you for your interest in contributing! To add support for a new tool, please utilize the Docker templates provided in our repository. After implementing and testing your tool, please submit a pull request (PR) with a descriptive message. Our maintainers will review your submission and merge it.

diff --git a/autogen/generate_typeevalpy_dataset.py b/autogen/generate_typeevalpy_dataset.py
@@ -3,12 +3,18 @@
 import shutil
 from pathlib import Path
 import tqdm
+import time
+import json
+import datetime
 
-SCRIPT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
+ROOT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
 
-output_folder = f"{SCRIPT_DIR}/.scrapy/generated_typeevalpy_dataset"
-error_folder = f"{SCRIPT_DIR}/.scrapy/error"
-benchmark_dir = f"{SCRIPT_DIR}/micro-benchmark-autogen-templates/python_features"
+current_datetime = datetime.datetime.now()
+formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S")
+
+output_folder = f"{ROOT_DIR}/generated_typeevalpy_dataset_{formatted_datetime}"
+error_folder = f"{ROOT_DIR}/.scrapy/error"
+benchmark_dir = f"{ROOT_DIR}/micro-benchmark-autogen-templates"
 shutil.rmtree(output_folder, ignore_errors=True)
 shutil.rmtree(error_folder, ignore_errors=True)
 
@@ -17,14 +23,21 @@
 files_analyzed = 0
 error_count = 0
 last_folder = ""
+start_time = time.time()
+total_start_time = time.time()
 for file in tqdm.tqdm(python_files, desc="Processing files"):
     try:
         # print the folder path if its not the same as the last one
         if str(file.parent.parent.name) != last_folder:
+            if last_folder:
+                print(
+                    f"Time taken for {last_folder}: {time.time() - start_time} seconds"
+                )
             print(
                 f"##################\nProcessing: {file.parent.parent.name}\n##################"
             )
             last_folder = str(file.parent.parent.name)
+            start_time = time.time()
 
         # ignore if not main.py
         if file.name != "main.py":
@@ -55,3 +68,109 @@
     except Exception as e:
         print(e)
         pass
+
+
+def get_fact_stats(json_files):  # Tally annotation statistics over the given ground-truth JSON file paths; returns a 6-tuple of counters.
+    total_annotations = 0  # number of entries under "ground_truth", summed over all files
+    total_types = 0  # sum of len(entry["type"]) over all entries
+    total_col = 0  # entries whose "col_offset" is truthy
+    rows = []  # NOTE(review): appended to below but never read or returned here
+    sum_functions = 0  # entries with "function" set and neither "parameter" nor "variable"
+    sum_params = 0  # entries with a truthy "parameter"
+    sum_variables = 0  # entries with a truthy "variable"
+    sum_empty_out_types = 0  # NOTE(review): never updated or returned in this function
+    sum_non_empty_out_types = 0  # NOTE(review): never updated or returned in this function
+    for json_file in json_files:
+        with open(json_file, "r") as f:
+            data = json.load(f)
+            if "ground_truth" not in data:  # files without this key contribute nothing
+                continue
+            data = data["ground_truth"]  # rebinds data; presumably a list of dict entries - TODO confirm
+            total_annotations += len(data)
+            merged_cell = json_file  # file path, stored as the first column of each row
+            for _t in data:
+                total_types += len(_t["type"])  # raises KeyError if "type" is missing, unlike the .get("type", []) used below
+                if _t.get("col_offset"):  # truthiness test: a missing or 0 col_offset is not counted
+                    total_col += 1
+                line_number = _t.get("line_number", "")
+                function = _t.get("function", "")
+                param = _t.get("parameter", "")
+                variable = _t.get("variable", "")
+                types = ", ".join(_t.get("type", []))  # assumes "type" holds strings - TODO confirm
+                rows.append(
+                    [
+                        merged_cell,
+                        line_number,
+                        function,
+                        param,
+                        variable,
+                        types,
+                    ]
+                )
+                if function:
+                    if not param and not variable:  # count only entries that name a function alone
+                        sum_functions += 1
+
+                if param:
+                    sum_params += 1
+
+                if variable:
+                    sum_variables += 1
+
+    return (  # rows is not part of the result
+        total_annotations,
+        total_types,
+        total_col,
+        sum_functions,
+        sum_params,
+        sum_variables,
+    )
+
+
+# python_features: print per-category statistics for the generated dataset
+print("python_features")
+print(
+    "category | Overall annotations | Overall types | Overall functions | overall param"  # header is pipe-separated; the rows printed below are space-separated
+    " | Overall variables"
+)
+python_features_dir = output_folder + "/python_features"
+pf_overall_annotations = 0  # running totals across all categories
+pf_overall_types = 0
+for cat in sorted(os.listdir(python_features_dir)):  # each directory entry is treated as one category
+    cat_dir = os.path.join(python_features_dir, cat)
+    json_files = [_file for _file in sorted(Path(cat_dir).rglob("*_gt.json"))]  # recursive search for ground-truth files
+
+    _a, _t, _, sum_functions, sum_params, sum_variables = get_fact_stats(json_files)  # third value (col_offset count) is discarded
+
+    pf_overall_annotations += _a
+    pf_overall_types += _t
+
+    print(cat, _a, _t, sum_functions, sum_params, sum_variables)
+
+print(pf_overall_annotations, pf_overall_types)
+
+json_files = [_file for _file in sorted(Path(output_folder).rglob("*_gt.json"))]  # now the whole output folder, not only python_features
+python_files = [_file for _file in sorted(Path(output_folder).rglob("*.py"))]
+
+print("\nOverall")
+total_annotations = 0
+total_types = 0
+total_col = 0  # NOTE(review): accumulated below but not printed in the lines that follow
+for json_file in json_files:  # repeats the annotation/type/col_offset counting done in get_fact_stats
+    with open(json_file, "r") as f:
+        data = json.load(f)
+        if "ground_truth" not in data:  # files without this key are skipped
+            continue
+        data = data["ground_truth"]
+        total_annotations += len(data)
+        for _t in data:
+            total_types += len(_t["type"])
+            if _t.get("col_offset"):
+                total_col += 1
+
+
+print(f"Total Python files: {len(python_files)}")
+print(f"Total annotations: {total_annotations}")
+print(f"Total types in annotations: {total_types}")
+
+print(f"Total time taken: {time.time() - total_start_time} seconds")  # measured from total_start_time, set before the processing loop
diff --git a/autogen/test.py b/autogen/test.py
@@ -1,14 +1,15 @@
-from helpers import read_template, process_file
+from helpers import read_template, process_file, process_import_case
 import os
 import shutil
 from pathlib import Path
 
 SCRIPT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
 
-output_folder = f"{SCRIPT_DIR}/.scrapy/test/test"
+output_folder = f"{SCRIPT_DIR}/.scrapy/test/"
 benchmark_dir = (
-    f"{SCRIPT_DIR}/micro-benchmark-autogen-templates/python_features/builtins/functools"
+    f"{SCRIPT_DIR}/.scrapy/micro-benchmark-autogen-templates/python_features"
 )
+
 shutil.rmtree(f"{SCRIPT_DIR}/.scrapy/test", ignore_errors=True)
 file = Path(
     "/home/ashwin/TypeEvalPy_AutoBench/micro-benchmark-autogen-templates/python_features/builtins/map/main.py"
@@ -22,12 +23,31 @@ def call_files():
     for file in python_files:
         try:
             print(file)
+            # ignore if not main.py
+            if file.name != "main.py":
+                print(f">> Ignoring: {file}")
+                continue
+
             template_data = read_template(file)
-            process_file(
-                *template_data,
-                str(file.parent).replace(benchmark_dir, ""),
-                output_folder,
-            )
+            if template_data["replacement_mode"] == "Imports":
+                process_import_case(
+                    name=template_data["name"],
+                    data_types=template_data["data_types"],
+                    code_template=template_data["code_template"],
+                    json_template=template_data["json_template"],
+                    file_path=str(file.parent).replace(benchmark_dir, ""),
+                    file_parent=str(file.parent),
+                    output_folder=output_folder,
+                )
+            else:
+                process_file(
+                    name=template_data["name"],
+                    data_types=template_data["data_types"],
+                    code_template=template_data["code_template"],
+                    json_template=template_data["json_template"],
+                    file_path=str(file.parent).replace(benchmark_dir, ""),
+                    output_folder=output_folder,
+                )
 
         except Exception as e:
             print(e)

diff --git a/micro-benchmark-autogen-templates/python_features/imports/import_from/main_gt.json b/micro-benchmark-autogen-templates/python_features/imports/import_from/main_gt.json
@@ -1,8 +1,7 @@
 {
     "replacement_mode": "Imports",
     "imports": [
-        "nested/mod.py",
-        "nested/__init__.py"
+        "from_mod.py"
     ],
     "type_replacements": [
         "int",

diff --git a/micro-benchmark-autogen-templates/python_features/imports/init_func_import/main_gt.json b/micro-benchmark-autogen-templates/python_features/imports/init_func_import/main_gt.json
@@ -1,7 +1,8 @@
 {
     "replacement_mode": "Imports",
     "imports": [
-        "to_import.py"
+        "nested/mod.py",
+        "nested/__init__.py"
     ],
     "type_replacements": [
         "int",

diff --git a/scripts/rewrite_jsons.py b/scripts/rewrite_jsons.py
@@ -6,8 +6,9 @@
 def modify_data(data):
     # Example modification: add a new key-value pair
     new_data = {
-        "replacement_mode": "",
-        "type_replacements": ["int"],
+        "replacement_mode": "Imports",
+        "imports": ["to_import.py"],
+        "type_replacements": ["int", "float", "str", "bool", "list", "dict", "tuple"],
         "ground_truth": data,
     }
     return new_data
@@ -40,5 +41,5 @@ def main(directory_path):
 # Example usage
 if __name__ == "__main__":
     main(
-        "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_AutoBench/.scrapy/micro-benchmark-autogen-templates copy"
+        "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_AutoBench/.scrapy/micro-benchmark-autogen-templates"
     )  # Change this