-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert_gold.py
executable file
·110 lines (95 loc) · 4.13 KB
/
convert_gold.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python
import sys, os, io, csv, json
from wtu.table import Table
# Require exactly one command line argument: the gold standard directory.
if len(sys.argv) != 2:
    # Passing a string to sys.exit() writes it to stderr and terminates with
    # exit status 1. That way a wrong invocation is reported as a failure and
    # the usage text does not end up in the JSON stream written to stdout.
    sys.exit('usage: {:s} <gold dir>'.format(sys.argv[0]))
# Locations of the gold standard parts, all relative to the directory that
# was passed as the first command line argument.
gold_dir = sys.argv[1]
classes_file, instance_dir, property_dir, tables_dir = (
    os.path.join(gold_dir, entry)
    for entry in ('classes_GS.csv', 'instance', 'property', 'tables')
)
# load classes_file: maps a table name to its (class name, class URI) pair
classes = dict()
# newline='' is what the csv module asks for when it is handed a file object;
# the explicit encoding makes decoding independent of the platform locale.
with io.open(classes_file, 'r', encoding='utf-8', newline='') as classes_fh:
    classes_reader = csv.reader(classes_fh, delimiter=',', quotechar='"')
    for class_row in classes_reader:
        # csv.reader yields an empty list for a blank line; nothing to load
        if not class_row:
            continue
        # a row of any other width cannot be unpacked below; report it on
        # stderr (stdout carries the JSON output) and carry on, like the
        # property and instance loaders do for malformed rows
        if len(class_row) != 3:
            sys.stderr.write(
                'warning: skipping malformed row in {:s}: {!r}\n'.format(
                    classes_file, class_row))
            continue
        table_name_ext, class_name, class_uri = class_row
        # keep only the text before the first dot, which also drops
        # multi-part extensions
        table_name = table_name_ext.split('.')[0]
        classes[table_name] = (class_name, class_uri)
# iterate over all tables; os.listdir() returns its entries in arbitrary
# order, so sort them to make the order of the output lines reproducible
for table_name_ext in sorted(os.listdir(tables_dir)):
    table_file = os.path.join(tables_dir, table_name_ext)
    table_name = os.path.splitext(table_name_ext)[0]

    # read table data & create Table object
    with io.open(table_file, 'r', encoding='utf-8', errors='ignore') as table_fh:
        table_data = json.load(table_fh)
    table = Table(table_data)

    # add class annotation if available
    if table_name in classes:
        class_name, class_uri = classes[table_name]
        table.annotations.append({
            'source': 'gold-v2',
            'task': 'ClassLinking',
            'type': 'class',
            'class_name': class_name,
            'class_uri': class_uri
        })

    # add property annotations (columns)
    # key_col_idx stays None unless a property row marks a key column; the
    # instance annotations below are only added when it was set
    key_col_idx = None
    property_file = os.path.join(property_dir, table_name + '.csv')
    if os.path.isfile(property_file):
        col_set = table.columns()
        # explicit encoding and newline='' so that parsing does not depend
        # on the platform locale or newline translation
        with io.open(property_file, encoding='utf-8', newline='') as property_fh:
            property_reader = csv.reader(property_fh, delimiter=',', quotechar='"')
            for property_row in property_reader:
                # there is at least one malformed file in the gold standard
                # (two instead of three columns). A row that does not unpack
                # into four fields, or whose index is not an integer, raises
                # ValueError; the only thing we can do is skip such rows.
                try:
                    property_uri, column_header, is_key_column, col_idx = property_row
                    col_idx = int(col_idx)
                    column = col_set[col_idx]
                    # property annotation
                    column.annotations.append({
                        'source': 'gold-v2',
                        'task': 'PropertyLinking',
                        'type': 'property',
                        'property_uri': property_uri
                    })
                    # key column annotation; if several rows are flagged,
                    # the last one determines key_col_idx
                    if is_key_column == 'True':
                        key_col_idx = col_idx
                        column.annotations.append({
                            'source': 'gold-v2',
                            'task': 'KeyColumnIdentification',
                            'type': 'key_column',
                        })
                except ValueError:
                    pass

    # add instance annotations (cells of the key column)
    if key_col_idx is not None:
        instance_file = os.path.join(instance_dir, table_name + '.csv')
        if os.path.isfile(instance_file):
            # fetch the cells once instead of once per row, the same way
            # col_set is obtained once for the property rows above
            cell_set = table.cells()
            with io.open(instance_file, encoding='utf-8', newline='') as instance_fh:
                instance_reader = csv.reader(instance_fh, delimiter=',', quotechar='"')
                for instance_row in instance_reader:
                    # again, skip malformed rows (wrong number of fields or
                    # a non-integer row index raise ValueError)
                    try:
                        resource_uri, value, row_idx = instance_row
                        row_idx = int(row_idx)
                        cell = cell_set[(key_col_idx, row_idx)]
                        # instance annotation
                        cell.annotations.append({
                            'source': 'gold-v2',
                            'task': 'EntityLinking',
                            'type': 'resource',
                            'resource_uri': resource_uri
                        })
                    except ValueError:
                        pass

    # write annotated table to stdout, skip tables that did not
    # receive any annotation
    # NOTE(review): this tests table._annotations only; presumably that is the
    # storage behind table.annotations, in which case a table with column or
    # cell annotations but no class annotation is skipped too - confirm
    # against wtu.table.Table
    if len(table._annotations):
        print(json.dumps(table.dump()))