# Copyright (c) 2019-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
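"""
Build the data needed for online self-training: the initial parallel caches for
each language pair and the self-labelled Java dataset with its translated unit
tests (see the `__main__` entry point below).
"""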
import argparse
from logging import getLogger
from pathlib import Path

import fastBPE
import numpy as np
import pandas as pd
import torch

from utils import ROOT_PATH, add_root_to_path  # type: ignore

# `add_root_to_path` must run before the `codegen_sources` imports below.
add_root_to_path()

from codegen_sources.model.src.cache import ListCache
from codegen_sources.model.preprocess import XLM_preprocess
from codegen_sources.model.src.data.dataset import MUTATION_SCORE, ASSERTS_COUNT

INITIAL_CACHE_FOLDER = "initial_cache"
LANGUAGES = ["cpp", "java", "python"]

logger = getLogger()

class Params:
    """Minimal set of parameters passed to `ListCache` when building the caches."""

    def __init__(self, pad_index=0, eos_index=1) -> None:
        self.pad_index = pad_index
        self.eos_index = eos_index
        self.tokens_per_batch = 1000
        self.st_remove_proba = 0.3


def get_tensors(reloaded_data):
    """Rebuild per-sentence tensors from a reloaded .pth dataset.

    Each sentence is wrapped with the index 1 (the default `eos_index`) at both
    ends before being returned together with its length.
    """
    pos = reloaded_data["positions"]
    sent = reloaded_data["sentences"]
    sentences = [sent[p[0] : p[1]] for p in pos]
    lengths = [torch.tensor(len(s) + 2) for s in sentences]
    out_sentences = []
    for s in sentences:
        l = len(s) + 2
        out_s = torch.LongTensor(l).fill_(1)
        out_s[1 : l - 1].copy_(torch.from_numpy(s.astype(np.int64)))
        out_sentences.append(out_s)
    return out_sentences, lengths
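
# Note: `get_tensors` assumes the layout of the preprocessed .pth files, i.e. a
# dict whose "sentences" entry is a flat array of token ids and whose
# "positions" entry holds (start, end) offsets delimiting each sentence. This is
# inferred from the slicing above rather than from a documented contract.
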
def initialize_cache(dataset_path, output_path: Path):
    """Build the initial parallel cache for every unordered language pair."""
    languages = [l + "_sa" for l in LANGUAGES]
    for l1 in languages:
        for l2 in [l for l in languages if l > l1]:
            print(f"computing initial cache for {l1}-{l2}")
            reloaded_data = torch.load(
                dataset_path.joinpath(f"train.{l1}-{l2}.{l1}.pth")
            )
            sents1, len1 = get_tensors(reloaded_data)
            sents2, len2 = get_tensors(
                torch.load(dataset_path.joinpath(f"train.{l1}-{l2}.{l2}.pth"))
            )
            assert len(sents1) == len(sents2) == len(len1) == len(len2)
            elements = list(zip(sents1, len1, sents2, len2))
            ListCache(elements, Params()).save(
                output_path.joinpath(f"cache_{l1}-{l2}.pkl")
            )
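
# With LANGUAGES = ["cpp", "java", "python"], the loop above produces three cache
# files: cache_cpp_sa-java_sa.pkl, cache_cpp_sa-python_sa.pkl and
# cache_java_sa-python_sa.pkl (names follow the f-string in `initialize_cache`).
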
def add_self_trained_dataset(data_df, dataset_path, vocab_path):
    """Write the self-labelled Java functions as BPE splits and binarize them."""
    logger.info(f"Writing self-labelled dataset to {dataset_path}")
    bpe_model = fastBPE.fastBPE(
        str(ROOT_PATH.joinpath("data/bpe/cpp-java-python/codes"))
    )
    print("unfiltered df:", len(data_df))
    # Keep only functions whose translated Python tests contain at least two asserts.
    data_df = data_df[
        data_df.python_translated_tests.apply(lambda x: x.count("assert")) > 1
    ]
    print("filtered df:", len(data_df))
    java_functions_with_indices = bpe_model.apply(
        pd.Series(data_df["TARGET_CLASS"] + " | " + data_df["java_function"])
    )
    output_folder = dataset_path
    # One output file per GPU split, plus one file containing all examples.
    output_files = [
        open(
            output_folder.joinpath(f"self_training.java_sa.{i}.bpe"),
            "w",
            encoding="utf-8",
            errors="ignore",
        )
        for i in range(args.n_gpus)
    ]
    output_files_all = open(
        output_folder.joinpath("self_training.java_sa.bpe"),
        "w",
        encoding="utf-8",
        errors="ignore",
    )
    for i, l in enumerate(sorted(java_functions_with_indices)):
        output_files_all.write(l.strip())
        output_files_all.write("\n")
        output_files[i % args.n_gpus].write(l.strip())
        output_files[i % args.n_gpus].write("\n")
    for f in output_files:
        f.close()
    output_files_all.close()
    for file_path in Path(output_folder).glob("*.bpe"):
        print(f"Processing {file_path} with vocab {Path(vocab_path).absolute()}")
        XLM_preprocess(
            str(Path(vocab_path).absolute()),
            str(file_path),
            str(file_path).replace(".bpe", ".pth"),
        )
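
# Each line written above is the BPE encoding of "<TARGET_CLASS> | <java_function>"
# (see the concatenation passed to `bpe_model.apply`); `XLM_preprocess` then
# binarizes every .bpe file into a matching .pth file.
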
def output_multilingual_tests_dataset(df_python, df_cpp, output_path):
    """Merge the Python and C++ test-translation results and dump them as JSON."""
    data_df = df_python[
        ["TARGET_CLASS", "java_function", "path_to_test", "python_translated_tests"]
    ].copy()
    data_df["cpp_translated_tests"] = df_cpp["cpp_translated_tests"]
    data_df[MUTATION_SCORE] = df_python["MutationScore"]
    data_df[ASSERTS_COUNT] = data_df.python_translated_tests.apply(
        lambda x: x.count("assert")
    )
    data_df[
        [
            "TARGET_CLASS",
            "path_to_test",
            "python_translated_tests",
            "cpp_translated_tests",
            MUTATION_SCORE,
            ASSERTS_COUNT,
        ]
    ].to_json(
        output_path.joinpath("translated_tests.json"), orient="records", lines=True
    )
    return data_df
if __name__ == "__main__":
    logger.info("#" * 10 + " Creating data for Online Self-Training " + "#" * 10)
    parser = argparse.ArgumentParser(
        description="Create data for online self-training"
    )
    parser.add_argument(
        "--dataset_path", help="path to the offline dataset",
    )
    parser.add_argument(
        "--input_dfs_path", help="path to the input dataframes",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        help="where the output files should be written",
        default=Path(ROOT_PATH).joinpath("data"),
    )
    parser.add_argument(
        "--n_gpus", type=int, help="number of train set splits", default=8
    )
    parser.add_argument(
        "--vocab_path",
        type=str,
        help="path to the vocab",
        default=Path(ROOT_PATH).joinpath("data", "bpe", "cpp-java-python", "vocab"),
    )
    args = parser.parse_args()

    output_path = Path(args.output_path)
    dataset_path = Path(args.dataset_path)

    # Build the initial caches for each language pair.
    initialize_cache(dataset_path, output_path.joinpath(INITIAL_CACHE_FOLDER))

    # Load the Python and C++ test-translation results.
    input_dfs_path = Path(args.input_dfs_path)
    assert input_dfs_path.is_dir()
    input_dfs_paths = {
        lang: input_dfs_path.joinpath(f"test_results_{lang}_df.csv")
        for lang in ["python", "cpp"]
    }
    test_results_dfs = {
        lang: pd.read_csv(path) for lang, path in input_dfs_paths.items()
    }

    # Dump the merged translated tests, then the self-labelled Java dataset.
    data_df = output_multilingual_tests_dataset(
        test_results_dfs["python"], test_results_dfs["cpp"], output_path
    )
    add_self_trained_dataset(data_df, output_path, args.vocab_path)
    logger.info("\n" * 2)
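
# Example invocation (paths below are illustrative, not part of the repository);
# `--input_dfs_path` must contain test_results_python_df.csv and
# test_results_cpp_df.csv:
#   python create_data_for_online_st.py \
#       --dataset_path /path/to/offline_dataset \
#       --input_dfs_path /path/to/test_results_dfs \
#       --output_path data/online_st \
#       --n_gpus 8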