ECAI 23 - code and data.

pmorvalho · Jul 21, 2023 · 84ee930 · 84ee930
1 parent 9251949
commit 84ee930
Show file tree

Hide file tree

Showing 174 changed files with 32,336 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,5 @@
+*~
+*#
+#*
+.#*
+.DS_Store*
diff --git a/README.md b/README.md
@@ -0,0 +1,41 @@
+# ECAI 2023 - Graph Neural Networks For Mapping Variables Between Programs
+
+This repository contains the code and data used for the paper "*Graph Neural Networks For Mapping Variables Between Programs*", accepted at ECAI 2023.
+
+We present a novel graph program representation that is agnostic to the names of the variables and that, for each variable in the program, contains a representative variable node connected to all of the variable's occurrences. Furthermore, we use GNNs for mapping variables between programs based on our program representation, ignoring the variables' identifiers.
+We represent each program as a graph using the script gen_progs_repr.py, as explained in [1, 2].
+
+To understand the entire pipeline, from the dataset generation to the repair process, the interested reader is referred to our main script 'run_all.sh'.
+
+The generation of the datasets (training, validation and evaluation) and the training of our GNN have been commented out, since they take a few hours to compute. Only the mapping computations and the program repair tasks were left uncommented. In this repo, only the evaluation dataset is available.
+
+- How to execute:
+
+```
+chmod +x run_all.sh
+bash run_all.sh
+```
+
+## Installation Requirements
+
+The following script creates a new conda environment named 'gnn_env' and installs all the required dependencies in it.
+
+```
+chmod +x config_gnn.sh
+bash config_gnn.sh
+```
+
+## Introductory Programming Assignments (IPAs) Dataset
+
+
+To generate our evaluation set of C programs we used [MultIPAs](https://github.com/pmorvalho/MultIPAs) [3], which is a program transformation framework, to augment our dataset of IPAs, [C-Pack-IPAs](https://github.com/pmorvalho/C-Pack-IPAs) [4]. 
+
+## References
+
+[1] Pedro Orvalho, Jelle Piepenbrock, Mikoláš Janota, and Vasco Manquinho. Graph Neural Networks For Mapping Variables Between Programs. ECAI 2023. [PDF](). *[Accepted for Publication]*
+
+[2] Pedro Orvalho, Jelle Piepenbrock, Mikoláš Janota, and Vasco Manquinho. Project Proposal: Learning Variable Mappings to Repair Programs. AITP 2022. [PDF](http://aitp-conference.org/2022/abstract/AITP_2022_paper_15.pdf).
+
+[3] Pedro Orvalho, Mikoláš Janota, and Vasco Manquinho. MultIPAs: Applying Program Transformations to Introductory Programming Assignments for Data Augmentation. In 30th ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering, ESEC/FSE 2022. [PDF](https://dl.acm.org/doi/10.1145/3540250.3558931).
+
+[4] Pedro Orvalho, Mikoláš Janota, and Vasco Manquinho. C-Pack of IPAs: A C90 Program Benchmark of Introductory Programming Assignments. 2022. [PDF](https://arxiv.org/pdf/2206.08768.pdf).
diff --git a/config_gnn.sh b/config_gnn.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+#Title			: config_gnn.sh
+#Usage			: bash config_gnn.sh
+#Author			: pmorvalho
+#Date			: July 29, 2022
+#Description		: Jelle's commands to config the GNN's environment, plus the python packages needed to run MultIPAs
+#Notes			: 
+# (C) Copyright 2022 Pedro Orvalho.
+#==============================================================================
+
+#!/usr/bin/env bash
+source ~/anaconda3/etc/profile.d/conda.sh
+conda create -n gnn_env python=3.9
+conda activate gnn_env
+#conda install pytorch torchvision torchaudio cpuonly -c pytorch
+#pip3 install torch torchvision torchaudio
+pip install torch==1.11.0+cpu torchvision==0.12.0+cpu torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cpu
+# check with nvidia-smi if the 'cu102' should be replaced with another cuda version
+pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.11.0+cpu.html
+pip install pycparser==2.21
+conda activate gnn_env
+
+
diff --git a/eval-dataset.zip b/eval-dataset.zip
diff --git a/eval.py b/eval.py
@@ -0,0 +1,110 @@
+import argparse
+from sys import argv
+import gzip
+import pickle
+from gnn import VariableMappingGNN
+import torch
+import time
+import gzip
+
+def preprocess_data_test_time(left_ast, right_ast):
+    left_edge_index_pairs = []
+    left_edge_types = []
+    for triple in left_ast['edges']:
+        left_edge_index_pairs.append([triple[0], triple[1]])
+        left_edge_types.append(triple[2])
+
+    left_node_types = [left_ast['nodes2types'][k] for k in left_ast['nodes2types']]
+
+    right_edge_index_pairs = []
+    right_edge_types = []
+
+    for triple in right_ast['edges']:
+        right_edge_index_pairs.append([triple[0], triple[1]])
+        right_edge_types.append(triple[2])
+
+    right_node_types = [right_ast['nodes2types'][k] for k in right_ast['nodes2types']]
+
+    # var_norm_index = {k: e for (e, k) in enumerate(left_ast['vars2id'])}
+    # var_norm_index2 = {k: e for (e, k) in enumerate(right_ast['vars2id'])}
+
+    left_node_types = torch.as_tensor(left_node_types)
+    right_node_types = torch.as_tensor(right_node_types)
+
+    left_edge_index_pairs = torch.as_tensor(left_edge_index_pairs)
+    right_edge_index_pairs = torch.as_tensor(right_edge_index_pairs)
+
+    left_edge_types = torch.as_tensor(left_edge_types)
+    right_edge_types = torch.as_tensor(right_edge_types)
+
+    return ((left_node_types, left_edge_index_pairs, left_edge_types, left_ast),
+            (right_node_types, right_edge_index_pairs, right_edge_types, right_ast))
+
+def load_model(model_location):
+    device = "cpu"
+    with gzip.open("types2int.pkl.gz", 'rb') as f:
+        node_type_mapping = pickle.load(f)
+        print(node_type_mapping)
+        # find maximum index
+        num_types = node_type_mapping['diff_types']
+        print(num_types)
+
+    gnn = VariableMappingGNN(num_types, device).to(device)
+
+    gnn.load_state_dict(
+        torch.load(model_location, map_location=torch.device('cpu')))
+
+    return gnn
+
+def predict(gnn_model, left_ast_file, right_ast_file):
+
+    with gzip.open(left_ast_file, 'rb') as f:
+        left_ast = pickle.load(f)
+
+    with gzip.open(right_ast_file, 'rb') as f:
+        right_ast = pickle.load(f)
+
+    left_ast, right_ast = preprocess_data_test_time(left_ast, right_ast)
+
+    op_var_dict, op_dist_dict = gnn_model.test_time_output((left_ast, right_ast))
+
+    return op_var_dict, op_dist_dict
+
+def save_var_maps(var_dict, p_name):
+    fp=gzip.open(p_name,'wb')
+    pickle.dump(var_dict,fp)
+    fp.close()
+
+
+def parser():
+    parser = argparse.ArgumentParser(prog='prog_fixer.py', formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument('-ia', '--inc_ast', help='Incorrect Program\'s AST.')
+    parser.add_argument('-ca', '--cor_ast', help='Correct program\'s AST.')
+    parser.add_argument('-m', '--var_map', help='Variable mapping Path.')
+    parser.add_argument('-md', '--var_map_dist', help='Path for the each variable mapping distribution.')    
+    parser.add_argument('-gm', '--gnn_model', help='GNN model to use.')
+    parser.add_argument('-t', '--time', help='File where the time spent predicting the model will be written to.')
+    parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Prints debugging information.')
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+if __name__ == "__main__":
+    args = parser()
+
+    model_location = args.gnn_model
+    model = load_model(model_location)
+
+    buggy_ast_file = args.inc_ast
+    correct_ast_file = args.cor_ast
+
+    time_0 = time.time()
+    model_output, model_output_distributions = predict(model, buggy_ast_file, correct_ast_file)
+    time_f = time.time()-time_0
+    with open(args.time, 'w+') as writer:
+        writer.writelines("Time: {t}".format(t=round(time_f,3)))
+
+    print(model_output)
+    save_var_maps(model_output, args.var_map)
+    save_var_maps(model_output_distributions, args.var_map_dist)
+    # TODO Use the {model_output_distributions} to sample
diff --git a/gen_eval_dataset.py b/gen_eval_dataset.py
@@ -0,0 +1,65 @@
+#!/usr/bin/python
+#Title			: gen_eval_dataset.py
+#Usage			: python gen_eval_dataset.py -h
+#Author			: pmorvalho
+#Date			: July 25, 2022
+#Description	        : For each program mutation and mutilation configuration (input dir), this script randomly chooses a mutilated program for each student and guarantees that this program is semantically incorrect.
+#Notes			: 
+#Python Version: 3.8.5
+# (C) Copyright 2022 Pedro Orvalho.
+#==============================================================================
+
+import argparse
+from sys import argv
+import os
+import random
+
+from helper import *
+
+
+def choose_incorrect_programs(input_dir, output_dir):
+    subfolders = [f.path for f in os.scandir(input_dir) if f.is_dir()]
+    for sf in subfolders:
+        stu_id = str(sf).split("/")[-1]
+        # print(sf+"/")
+        muts = list(pathlib.Path(sf+"/").glob('*/')) # mutations directories
+        prog_found = False
+        while len(muts) > 0 and not prog_found:
+            mut = random.sample(muts, 1)[0]
+            muts.remove(mut)
+            m = int(str(mut).split("/")[-1].replace("-",""))
+            if m == 0: # ignoring programs that were not mutated (index 0, 00, 000). E.g. programs that have a for-loop and are inside the for2while directory.
+                print("Ignoring ", mut)
+                continue
+            progs = list(pathlib.Path("{d}".format(d=mut)).glob('*.c')) # mutilated programs
+            while len(progs) > 0:
+                p = random.sample(progs, 1)[0]
+                progs.remove(p)
+                p = str(p)
+                if not check_program(p, args.ipa):
+                    prog_found=True
+                    if args.verbose:
+                        print("Program ",p, " has been chosen.")
+                    p=p.split("/")[-1][:-2]
+                    os.system("cp {d}/{p}.c {o}/{s}.c".format(d=mut, p=p, o=args.output_dir, s=stu_id))
+                    os.system("cp {d}/ast-{p}.pkl.gz {o}/ast-{s}.pkl.gz".format(d=mut, p=p, o=args.output_dir, s=stu_id))
+                    os.system("cp {d}/bug_map-{z}-{p}.pkl.gz {o}/bug_map-{s}.pkl.gz".format(d=mut, z="0"*len(p),p=p, o=args.output_dir, s=stu_id))
+                    os.system("cp {d}/var_map-{z}_{p}.pkl.gz {o}/var_map-{s}.pkl.gz".format(d=mut, z="0"*len(p),p=p, o=args.output_dir, s=stu_id))
+                    os.system("rm {d}/*.o".format(d=mut))
+                    break
+            # os.system("rm {d}/*.o".format(d=mut))
+
+
+def parser():
+    parser = argparse.ArgumentParser(prog='gen_eval_dataset.py', formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument('-d', '--input_dir', nargs='?',help='input directory.')
+    parser.add_argument('-e', '--ipa', help='Name of the lab and exercise (IPA) so we can check the IO tests.')    
+    parser.add_argument('-i', '--ignore', action='store_true', default=False, help='ignores...')
+    parser.add_argument('-o', '--output_dir', nargs='?', help='the name of the output dir.')
+    parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Prints debugging information.')
+    args = parser.parse_args(argv[1:])
+    return args
+
+if __name__ == '__main__':
+    # `args` must remain a module-level name: choose_incorrect_programs reads
+    # options such as args.ipa and args.verbose from it directly.
+    args = parser()
+    choose_incorrect_programs(args.input_dir, args.output_dir)
diff --git a/gen_gnn_variable_mappings.sh b/gen_gnn_variable_mappings.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+#Title			: gen_gnn_variable_mappings.sh
+#Usage			: bash gen_gnn_variable_mappings.sh
+#Author			: pmorvalho
+#Date			: July 29, 2022
+#Description		: Generates a var mapping for each pair of incorrect/correct programs in the eval dataset using the GNN  
+#Notes			: 
+# (C) Copyright 2022 Pedro Orvalho.
+#==============================================================================
+
+# to activate clara python-environment
+if [ -f ~/opt/anaconda3/etc/profile.d/conda.sh ]; then
+  # if running on MacOS
+    source ~/opt/anaconda3/etc/profile.d/conda.sh
+else
+  # if running on ARSR Machines (Linux)
+    source ~/anaconda3/etc/profile.d/conda.sh
+fi
+
+conda activate gnn_env
+
+initial_dir=$(pwd)
+data_dir="eval-dataset"
+gnn_models_dir="gnn_models"
+var_maps_dir=$initial_dir"/variable_mappings"
+TIMEOUT_REPAIR=600 # in seconds 
+
+#labs=("lab02" "lab03" "lab04") 	# we are not considering lab05 for this dataset, and only year 2020/2021 has lab05.
+labs=("lab02")
+# we will only use lab02 of the second year as the evaluation dataset
+# models=("wco" "vm" "ed" "all")
+#models=("wco")
+#models=("vm")
+#models=("ed")
+models=("all")
+
+for((m=0;m<${#models[@]};m++));
+do
+    model=${models[$m]}
+    echo $model
+    results_dir="results/var_maps-"$model
+    mkdir -p $results_dir
+    for((l=0;l<${#labs[@]};l++));
+    do
+	lab=${labs[$l]}
+	for ex in $(find $data_dir/incorrect_submissions/$lab/ex* -maxdepth 0 -type d);
+	do
+	    ex=$(echo $ex | rev | cut -d '/' -f 1 | rev)
+	    for mut_dir in $(find $data_dir/incorrect_submissions/$lab/$ex/* -maxdepth 0 -mindepth 0 -type d);
+	    do
+		mut=$(echo $mut_dir | rev | cut -d '/' -f 1 | rev)
+		for mutl_dir in $(find $data_dir/incorrect_submissions/$lab/$ex/$mut/* -maxdepth 0 -mindepth 0 -type d);
+		do
+		    mutl=$(echo $mutl_dir | rev | cut -d '/' -f 1 | rev)
+		    echo "Dealing with "$mutl_dir
+		    for p in $(find $data_dir/incorrect_submissions/$lab/$ex/$mut/$mutl/*.c -maxdepth 0 -mindepth 0 -type f);
+		    do
+			stu_id=$(echo $p | rev | cut -d '/' -f 1 | rev)
+			stu_id=$(echo $stu_id | sed "s/.c//g")
+			d=$results_dir/$lab/$ex/$mut/$mutl/$stu_id-$model
+			mkdir -p $d $initial_dir/variable_mappings/$model/$lab/$ex/$mut/$mutl
+			c_prog_ast=$(find $data_dir/correct_submissions/$lab/$ex/"ast-"$stu_id* -type f | tail -n 1)
+			i_prog_ast=$mutl_dir/"ast-"$stu_id".pkl.gz"
+			gnn_model=$(find $gnn_models_dir/$model*.pt -type f | tail -1 )
+			# /home/pmorvalho/runsolver/src/runsolver -o $d/out.o -w $d/watcher.w -v $d/var.v -W $TIMEOUT_REPAIR --rss-swap-limit 32000 \
+			python3 eval.py -ia $i_prog_ast -ca $c_prog_ast -m $var_maps_dir/$model/$lab/$ex/$mut/$mutl"/var_map-"$stu_id".pkl.gz" -md $var_maps_dir/$model/$lab/$ex/$mut/$mutl"/var_map_distributions-"$stu_id".pkl.gz" -gm $gnn_model -t $d/var_map_time.txt > $d/out.o
+		    done
+		    # wait
+		done
+	    done
+	done
+    done
+done
+