import collections
import json
import pickle
import sqlite3

from SPARQLWrapper import SPARQLWrapper, JSON

# Meta variables: paths of the files this pipeline reads and writes
SentenceFile = "processed_data/preprocessed_data_with_entity_v1.jsonl"
Entity2SentencesFile = "processed_data/entity2sentence_v1.pkl"  # pickle mapping each entity to its occurrences (sentence ID, offset)
EntityReplacementFile = "processed_data/candidate_entity_replacement_list_v5.jsonl"  # JSONL recording candidate replacement entities for literal entities
SQLiteFile = "processed_data/weak_supervise.db"
SuperviseDataFile = "supervise_data/tagged_corpus_with_candidate_v3.jsonl"  # JSONL of supervised-training data (with tokenized sentences and other info)
SuperviseDataCSV = "supervise_data/tagged_corpus_with_candidate_v3.csv"  # CSV of supervised-training data (usable for DeepKE training)
HasRelationSuperviseDataCSV = "supervise_data/tagged_corpus_with_candidate_v3_has_relations.csv"  # CSV of samples that carry a relation (usable for DeepKE training)
NoneRelationSuperviseDataCSV = "supervise_data/tagged_corpus_with_candidate_v3_None_relations.csv"  # CSV of samples whose relation is None (usable for DeepKE training)

def squeeze_result(res):
    """Shorten a SPARQL JSON binding: URIs under .../action/ become "r:<name>",
    URIs under .../entity/ become "e:<name>"; any other value is returned as-is."""
    if res["type"] == "uri":
        tmp_split = res["value"].split("/")
        if tmp_split[-2].strip() == "action":
            return f"r:{tmp_split[-1]}"
        elif tmp_split[-2].strip() == "entity":
            return f"e:{tmp_split[-1]}"
    return res["value"]  # non-URI bindings fall through unchanged
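
# Illustrative example (the entity name is made up):
#   squeeze_result({"type": "uri", "value": "http://kg.course/entity/Jon_Snow"})
#   -> "e:Jon_Snow"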

def create_table():
    """Create the index table that stores one row per weakly supervised sample."""
    conn = sqlite3.connect(SQLiteFile)
    c = conn.cursor()
    c.execute(
        """
        CREATE TABLE data_index
        (sentenceID INT NOT NULL,
        entity1 TEXT NOT NULL,
        entity2 TEXT NOT NULL,
        relation TEXT NOT NULL,
        offset1 INT NOT NULL,
        offset2 INT NOT NULL);
        """
    )
    conn.commit()
    conn.close()

def insert_data(in_tuple):
    conn = sqlite3.connect(SQLiteFile)
    c = conn.cursor()
    c.execute(
        "INSERT INTO data_index (sentenceID, entity1, entity2, relation, offset1, offset2) "
        "VALUES (?, ?, ?, ?, ?, ?)", in_tuple
    )
    conn.commit()
    conn.close()
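
# insert_data opens, commits, and closes a connection per row; for bulk loads,
# an executemany variant along these lines (a sketch, not used elsewhere in
# this file) amortizes that cost:
def insert_data_batch(in_tuples):
    conn = sqlite3.connect(SQLiteFile)
    c = conn.cursor()
    c.executemany(
        "INSERT INTO data_index (sentenceID, entity1, entity2, relation, offset1, offset2) "
        "VALUES (?, ?, ?, ?, ?, ?)", in_tuples
    )
    conn.commit()
    conn.close()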

def sparql_get_all_two_entities_triple():
    """
    Return every triple whose head and tail are both entities (i.e. the object is not a literal).
    :return: list of (subject, relation, object) tuples in squeezed form
    """
    sparql = SPARQLWrapper("http://localhost:3030/testds/sparql")
    sparql.setQuery(
        """
        PREFIX r: <http://kg.course/action/>
        PREFIX e: <http://kg.course/entity/>
        SELECT DISTINCT *
        WHERE {
            ?s ?r ?o.
            MINUS{
                ?s ?r ?o
                FILTER isLiteral(?o)
            }
        }
        """
    )
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return [(squeeze_result(r["s"]), squeeze_result(r["r"]), squeeze_result(r["o"]))
            for r in results["results"]["bindings"]]
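
# Illustrative result shape (names are made up):
#   [("e:Jon_Snow", "r:father", "e:Ned_Stark"), ...]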

def list2jsonl_file(dict_list, filename):
    with open(filename, mode="w", encoding="utf-8", errors="ignore") as f:
        for i in dict_list:
            # ensure_ascii=False keeps Chinese text readable in the output file
            f.write(json.dumps(i, ensure_ascii=False) + "\n")
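
# Usage: list2jsonl_file([{"a": 1}, {"a": 2}], "out.jsonl") writes one JSON
# object per line ("out.jsonl" is a placeholder path).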

def jsonl_generator(filename, **kwargs):
    with open(filename, **kwargs) as f:
        for line in f:
            yield json.loads(line)

def combine_offset(list1, list2):
    """
    Pair every element of list1 with every element of list2, skipping equal pairs.
    :param list1:
    :param list2:
    :return: list of (i, j) tuples with i from list1, j from list2, i != j
    """
    ret = []
    for i in list1:
        for j in list2:
            if i != j:
                ret.append((i, j))
    return ret
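
# Example: combine_offset([1, 2], [2, 3]) -> [(1, 2), (1, 3), (2, 3)]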

def find_sentence(e1_occur, e2_occur):
    """
    Find the sentences in which both entities occur, and enumerate their offset pairs.
    :param e1_occur: list of tuples, each (sentence ID of the occurrence, offset within the sentence)
    :param e2_occur: same format as e1_occur
    :return: list of dicts {"sentenceID": ..., "offsets": [(offset1, offset2), ...]}
    """
    ret = []
    occur_sentence1 = set([s[0] for s in e1_occur])
    occur_sentence2 = set([s[0] for s in e2_occur])
    co_occur_sentences = list(occur_sentence1.intersection(occur_sentence2))
    for s_i in co_occur_sentences:
        # "pov" markers carry no usable position, so they are skipped
        offsets1 = [s[1] for s in e1_occur if s[0] == s_i and s[1] != "pov"]
        offsets2 = [s[1] for s in e2_occur if s[0] == s_i and s[1] != "pov"]
        ret.append({"sentenceID": s_i, "offsets": combine_offset(offsets1, offsets2)})
    return ret
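
# Illustrative example: if entity A occurs in sentence 7 at offset 0 and
# entity B occurs in sentence 7 at offset 4, the result contains
#   {"sentenceID": 7, "offsets": [(0, 4)]}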

def construct_weak_supervise_data():
    ret = []  # list of tuples, each (sentenceID, entity1, offset1, entity2, offset2, relation)
    with open(Entity2SentencesFile, mode="rb") as f:
        entity2sentence = pickle.load(f)
    # triples from the candidate replacement list (literal tails with candidate replacement entities)
    for triple in jsonl_generator(EntityReplacementFile):
        s_entity = triple["s"]
        r = triple["r"]
        o_literal = triple["o"]
        print(f"{s_entity}, {r}, {o_literal}")
        o_entities = triple["candidate_entity"]
        for o in o_entities:
            if s_entity not in entity2sentence or o not in entity2sentence:
                continue
            for cooccur_info in find_sentence(entity2sentence[s_entity], entity2sentence[o]):
                for offset1, offset2 in cooccur_info["offsets"]:
                    ret.append((cooccur_info["sentenceID"],
                                s_entity, offset1,
                                o_literal, offset2,
                                r))
    # triples whose head and tail are both already entities
    for triple in sparql_get_all_two_entities_triple():
        s_entity = triple[0]
        r = triple[1]
        o_entity = triple[2]
        print(f"{s_entity}, {r}, {o_entity}")
        if s_entity not in entity2sentence or o_entity not in entity2sentence:
            continue
        for cooccur_info in find_sentence(entity2sentence[s_entity], entity2sentence[o_entity]):
            for offset1, offset2 in cooccur_info["offsets"]:
                ret.append((cooccur_info["sentenceID"],
                            s_entity, offset1,
                            o_entity, offset2,
                            r))
    ret = list(set(ret))  # drop duplicate samples
    return ret

def instantiate_sentence_in_supervise_data(supervise_data):
    """Attach the tokenized sentence text to each sample; assumes supervise_data
    is sorted by sentenceID (see __main__)."""
    data_ptr = 0
    ret = []
    for i, s in enumerate(jsonl_generator(SentenceFile)):
        data = supervise_data[data_ptr]
        while data[0] == i:
            print(i, data_ptr)
            ret.append({
                "sentence": s["hanlp_tokens"],
                "s": data[1], "s_offset": data[2],
                "o": data[3], "o_offset": data[4],
                "r": data[5]
            })
            data_ptr += 1
            if data_ptr >= len(supervise_data):
                break
            data = supervise_data[data_ptr]
        if data_ptr >= len(supervise_data):
            break
    return ret
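
# The CSV writers in __main__ join fields with "," and would emit malformed
# rows whenever a sentence itself contains a comma. A quoting-safe alternative
# (a sketch with a hypothetical helper name, not wired into __main__):
import csv

def write_deepke_csv(items, filename):
    with open(filename, mode="w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)  # quotes fields that contain commas
        for item in items:
            writer.writerow([item["text"], item["relation"],
                             item["ents"][0][0], item["ents"][0][2],
                             item["ents"][1][0], item["ents"][1][2]])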

if __name__ == "__main__":
    # SuperviseData = construct_weak_supervise_data()
    # SuperviseData = sorted(SuperviseData, key=lambda x: x[0])
    # SuperviseData = instantiate_sentence_in_supervise_data(SuperviseData)
    #
    # list2jsonl_file(SuperviseData, SuperviseDataFile)
    # == Convert the JSONL into DeepKE's format: sentence, relation, head, head_offset, tail, tail_offset ==
    # NOTE: the fields read below (text, relation, ents) assume a JSONL schema
    # different from the dicts built by instantiate_sentence_in_supervise_data above.
    HowToReadJSONL = {"mode": "r", "encoding": "utf-8", "errors": "ignore"}
    RelationCounter = collections.Counter()  # relation distribution, for inspection
    with open(SuperviseDataCSV, mode="w", encoding="utf-8", errors="ignore") as OutCSVFile:
        for Item in jsonl_generator(SuperviseDataFile, **HowToReadJSONL):
            OutCSVFile.write(",".join([Item["text"], str(Item["relation"]),
                                       str(Item["ents"][0][0]), str(Item["ents"][0][2]),
                                       str(Item["ents"][1][0]), str(Item["ents"][1][2])]) + "\n")
            RelationCounter[str(Item["relation"])] += 1
    # == Write the samples that carry a relation to their own file ==
    with open(HasRelationSuperviseDataCSV, mode="w", encoding="utf-8", errors="ignore") as OutCSVFile:
        for Item in jsonl_generator(SuperviseDataFile, **HowToReadJSONL):
            if str(Item["relation"]) != "None":
                OutCSVFile.write(",".join([Item["text"], str(Item["relation"]),
                                           str(Item["ents"][0][0]), str(Item["ents"][0][2]),
                                           str(Item["ents"][1][0]), str(Item["ents"][1][2])]) + "\n")
    # == Write the samples whose relation is None to their own file ==
    with open(NoneRelationSuperviseDataCSV, mode="w", encoding="utf-8", errors="ignore") as OutCSVFile:
        for Item in jsonl_generator(SuperviseDataFile, **HowToReadJSONL):
            if str(Item["relation"]) == "None":
                OutCSVFile.write(",".join([Item["text"], str(Item["relation"]),
                                           str(Item["ents"][0][0]), str(Item["ents"][0][2]),
                                           str(Item["ents"][1][0]), str(Item["ents"][1][2])]) + "\n")