Skip to content

Commit

Permalink
Minor tweaks
Browse files Browse the repository at this point in the history
  • Loading branch information
ashwinprasadme committed Jul 22, 2024
1 parent 833427d commit 3db15b5
Show file tree
Hide file tree
Showing 6 changed files with 187 additions and 18 deletions.
29 changes: 29 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@
- 🔄 Efficiently transforms inferred types into a **standardized format**.
- 📊 Automatically produces **meaningful metrics** for in-depth assessment and comparison.

### [New] TypeEvalPy Autogen
- 🤖 **Autogenerates code snippets** and ground truth to scale the benchmark based on the original `TypeEvalPy` benchmark.
- 📈 The autogen benchmark now contains:
- **Python files**: 7121
- **Type annotations**: 78373

## 🛠️ Supported Tools

| Supported :white_check_mark: | In-progress :wrench: | Planned :bulb: |
Expand Down Expand Up @@ -231,6 +237,29 @@ docker run \

---

## Running TypeEvalPy Autogen

To generate an extended version of the original TypeEvalPy benchmark that covers many more Python types, run the following commands:

1. **Navigate to the `autogen` Directory**

```bash
cd autogen
```


2. **Execute the Generation Script**

Run the following command to start the generation process:

```bash
python generate_typeevalpy_dataset.py
```

This creates a folder in the repository root containing the autogen benchmark. The folder name is suffixed with the current date and time (`generated_typeevalpy_dataset_<YYYY-MM-DD_HH-MM-SS>`).

---

### 🤝 Contributing

Thank you for your interest in contributing! To add support for a new tool, please utilize the Docker templates provided in our repository. After implementing and testing your tool, please submit a pull request (PR) with a descriptive message. Our maintainers will review your submission and merge it.
Expand Down
127 changes: 123 additions & 4 deletions autogen/generate_typeevalpy_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,18 @@
import shutil
from pathlib import Path
import tqdm
import time
import json
import datetime

SCRIPT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
ROOT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))

output_folder = f"{SCRIPT_DIR}/.scrapy/generated_typeevalpy_dataset"
error_folder = f"{SCRIPT_DIR}/.scrapy/error"
benchmark_dir = f"{SCRIPT_DIR}/micro-benchmark-autogen-templates/python_features"
current_datetime = datetime.datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S")

output_folder = f"{ROOT_DIR}/generated_typeevalpy_dataset_{formatted_datetime}"
error_folder = f"{ROOT_DIR}/.scrapy/error"
benchmark_dir = f"{ROOT_DIR}/micro-benchmark-autogen-templates"
shutil.rmtree(output_folder, ignore_errors=True)
shutil.rmtree(error_folder, ignore_errors=True)

Expand All @@ -17,14 +23,21 @@
files_analyzed = 0
error_count = 0
last_folder = ""
start_time = time.time()
total_start_time = time.time()
for file in tqdm.tqdm(python_files, desc="Processing files"):
try:
# print the folder path if its not the same as the last one
if str(file.parent.parent.name) != last_folder:
if last_folder:
print(
f"Time taken for {last_folder}: {time.time() - start_time} seconds"
)
print(
f"##################\nProcessing: {file.parent.parent.name}\n##################"
)
last_folder = str(file.parent.parent.name)
start_time = time.time()

# ignore if not main.py
if file.name != "main.py":
Expand Down Expand Up @@ -55,3 +68,109 @@
except Exception as e:
print(e)
pass


def get_fact_stats(json_files):
    """Aggregate ground-truth statistics over a collection of JSON files.

    Every file is parsed as JSON. Files whose top-level object has no
    ``"ground_truth"`` key are skipped; for the others, each entry of the
    ``"ground_truth"`` list is counted as one annotation.

    Args:
        json_files: Iterable of paths to the JSON files to inspect.

    Returns:
        A 6-tuple ``(total_annotations, total_types, total_col,
        sum_functions, sum_params, sum_variables)`` where

        * ``total_annotations`` is the number of ground-truth entries,
        * ``total_types`` is the summed length of every entry's ``"type"``,
        * ``total_col`` is the number of entries with a truthy
          ``"col_offset"``,
        * ``sum_functions`` counts entries that have a ``"function"`` but
          neither a ``"parameter"`` nor a ``"variable"``,
        * ``sum_params`` counts entries with a truthy ``"parameter"``,
        * ``sum_variables`` counts entries with a truthy ``"variable"``.

    Raises:
        KeyError: If a ground-truth entry has no ``"type"`` key.
    """
    total_annotations = 0
    total_types = 0
    total_col = 0
    sum_functions = 0
    sum_params = 0
    sum_variables = 0

    for json_file in json_files:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        if "ground_truth" not in data:
            continue

        entries = data["ground_truth"]
        total_annotations += len(entries)

        for entry in entries:
            # Deliberately strict: an entry without "type" is malformed.
            total_types += len(entry["type"])
            if entry.get("col_offset"):
                total_col += 1

            param = entry.get("parameter", "")
            variable = entry.get("variable", "")

            # An entry describes a function itself (its return type) only when
            # it names a function without also naming a parameter or variable.
            if entry.get("function", "") and not param and not variable:
                sum_functions += 1
            if param:
                sum_params += 1
            if variable:
                sum_variables += 1

    return (
        total_annotations,
        total_types,
        total_col,
        sum_functions,
        sum_params,
        sum_variables,
    )


# ---------------------------------------------------------------------------
# Report: per-category statistics for the generated ``python_features`` set,
# followed by overall totals for everything under ``output_folder``.
# ---------------------------------------------------------------------------
print("python_features")
print(
    "category | Overall annotations | Overall types | Overall functions | overall param"
    " | Overall variables"
)
python_features_dir = output_folder + "/python_features"
pf_overall_annotations = 0
pf_overall_types = 0
for cat in sorted(os.listdir(python_features_dir)):
    cat_dir = os.path.join(python_features_dir, cat)
    # ``sorted`` already returns a list; no extra copy is needed.
    json_files = sorted(Path(cat_dir).rglob("*_gt.json"))

    _a, _t, _, sum_functions, sum_params, sum_variables = get_fact_stats(json_files)

    pf_overall_annotations += _a
    pf_overall_types += _t

    print(cat, _a, _t, sum_functions, sum_params, sum_variables)

print(pf_overall_annotations, pf_overall_types)

# Overall totals across the whole generated dataset (all ground-truth JSON
# files and all Python files found recursively below ``output_folder``).
json_files = sorted(Path(output_folder).rglob("*_gt.json"))
python_files = sorted(Path(output_folder).rglob("*.py"))

print("\nOverall")
# Reuse the shared counter instead of duplicating its loop; only the first
# three values (annotations, types, col_offset count) are needed here.
total_annotations, total_types, total_col = get_fact_stats(json_files)[:3]


print(f"Total Python files: {len(python_files)}")
print(f"Total annotations: {total_annotations}")
print(f"Total types in annotations: {total_types}")

print(f"Total time taken: {time.time() - total_start_time} seconds")
36 changes: 28 additions & 8 deletions autogen/test.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from helpers import read_template, process_file
from helpers import read_template, process_file, process_import_case
import os
import shutil
from pathlib import Path

SCRIPT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))

output_folder = f"{SCRIPT_DIR}/.scrapy/test/test"
output_folder = f"{SCRIPT_DIR}/.scrapy/test/"
benchmark_dir = (
f"{SCRIPT_DIR}/micro-benchmark-autogen-templates/python_features/builtins/functools"
f"{SCRIPT_DIR}/.scrapy/micro-benchmark-autogen-templates/python_features"
)

shutil.rmtree(f"{SCRIPT_DIR}/.scrapy/test", ignore_errors=True)
file = Path(
"/home/ashwin/TypeEvalPy_AutoBench/micro-benchmark-autogen-templates/python_features/builtins/map/main.py"
Expand All @@ -22,12 +23,31 @@ def call_files():
for file in python_files:
try:
print(file)
# ignore if not main.py
if file.name != "main.py":
print(f">> Ignoring: {file}")
continue

template_data = read_template(file)
process_file(
*template_data,
str(file.parent).replace(benchmark_dir, ""),
output_folder,
)
if template_data["replacement_mode"] == "Imports":
process_import_case(
name=template_data["name"],
data_types=template_data["data_types"],
code_template=template_data["code_template"],
json_template=template_data["json_template"],
file_path=str(file.parent).replace(benchmark_dir, ""),
file_parent=str(file.parent),
output_folder=output_folder,
)
else:
process_file(
name=template_data["name"],
data_types=template_data["data_types"],
code_template=template_data["code_template"],
json_template=template_data["json_template"],
file_path=str(file.parent).replace(benchmark_dir, ""),
output_folder=output_folder,
)

except Exception as e:
print(e)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
{
"replacement_mode": "Imports",
"imports": [
"nested/mod.py",
"nested/__init__.py"
"from_mod.py"
],
"type_replacements": [
"int",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
{
"replacement_mode": "Imports",
"imports": [
"to_import.py"
"nested/mod.py",
"nested/__init__.py"
],
"type_replacements": [
"int",
Expand Down
7 changes: 4 additions & 3 deletions scripts/rewrite_jsons.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
def modify_data(data):
# Example modification: add a new key-value pair
new_data = {
"replacement_mode": "",
"type_replacements": ["int"],
"replacement_mode": "Imports",
"imports": ["to_import.py"],
"type_replacements": ["int", "float", "str", "bool", "list", "dict", "tuple"],
"ground_truth": data,
}
return new_data
Expand Down Expand Up @@ -40,5 +41,5 @@ def main(directory_path):
# Example usage
if __name__ == "__main__":
main(
"/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_AutoBench/.scrapy/micro-benchmark-autogen-templates copy"
"/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_AutoBench/.scrapy/micro-benchmark-autogen-templates"
) # Change this

0 comments on commit 3db15b5

Please sign in to comment.