forked from DSE-MSU/DeepRobust
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathogbn_converter.py
105 lines (92 loc) · 2.71 KB
/
ogbn_converter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import json
import sys
import os
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm
from ogb.nodeproppred import PygNodePropPredDataset
"""
Run this script to convert the graph from the open graph benchmark format
to the GraphSAINT format.
Right now, ogbn-products and ogbn-arxiv can be converted by this script.
"""
# ---------------------------------------------------------------------------
# Convert an OGB node-property-prediction graph into the GraphSAINT on-disk
# layout: feats.npy, role.json, class_map.json and adj_{full,train,val}.npz,
# all written under ./data/<dataset_name>/.
# ---------------------------------------------------------------------------

# Dataset to convert.  Defaults to 'ogbn-arxiv'; an optional first
# command-line argument (e.g. 'ogbn-products') selects another dataset.
dataset_name = sys.argv[1] if len(sys.argv) > 1 else 'ogbn-arxiv'

dataset = PygNodePropPredDataset(name=dataset_name)
split_idx = dataset.get_idx_split()
train_idx, valid_idx, test_idx = split_idx['train'], split_idx['valid'], split_idx['test']
graph = dataset[0]
num_node = graph.y.shape[0]

save_dir = './data/' + dataset_name + '/'
# makedirs (unlike mkdir) also creates a missing './data' parent, and
# exist_ok=True makes re-runs silent.  Any other OSError now propagates
# instead of being printed and followed by a failure at the first save.
os.makedirs(save_dir, exist_ok=True)

# feats.npy -- node feature matrix
feats = graph.x.numpy()
np.save(save_dir + 'feats.npy', feats)

# role.json -- node ids of the train / validation / test splits
role = {
    'tr': train_idx.numpy().tolist(),
    'va': valid_idx.numpy().tolist(),
    'te': test_idx.numpy().tolist(),
}
with open(save_dir + 'role.json', 'w') as f:
    json.dump(role, f)

# class_map.json -- node id (as a string key) -> integer label
# reshape(num_node) raises unless graph.y holds exactly one value per node,
# which is the only layout the per-node int() conversion could handle.
# NOTE(review): presumably graph.y is (num_node, 1) for OGB node datasets -- confirm.
labels = graph.y.numpy().reshape(num_node).tolist()
class_map = {str(node): int(label) for node, label in enumerate(labels)}
with open(save_dir + 'class_map.json', 'w') as f:
    json.dump(class_map, f)

# adj_*.npz -- adjacency matrices
edge_index = graph.edge_index.numpy()
row_full = edge_index[0]
col_full = edge_index[1]

# Per-node membership flags, so each edge can be classified with array
# indexing instead of a Python-level loop over every edge.
in_train = np.zeros(num_node, dtype=bool)
in_train[train_idx.numpy()] = True
in_test = np.zeros(num_node, dtype=bool)
in_test[test_idx.numpy()] = True

# Train edges: both endpoints are training nodes.
train_edge = in_train[row_full] & in_train[col_full]
# Validation edges: every train edge, plus any edge with no test endpoint.
touches_test = in_test[row_full] | in_test[col_full]
val_edge = train_edge | ~touches_test

# Boolean masking keeps the edges in their original order.
row_train = row_full[train_edge]
col_train = col_full[train_edge]
row_val = row_full[val_edge]
col_val = col_full[val_edge]

# Builtin bool: the np.bool alias was deprecated in NumPy 1.20 and removed
# in 1.24, where accessing it raises AttributeError.
dtype = bool


def _edges_to_csr(rows, cols):
    """Return a (num_node x num_node) CSR matrix with a True entry per (row, col) edge."""
    values = np.ones(rows.shape[0], dtype=dtype)
    return sp.coo_matrix((values, (rows, cols)), shape=(num_node, num_node)).tocsr()


adj_full = _edges_to_csr(row_full, col_full)
adj_train = _edges_to_csr(row_train, col_train)
adj_val = _edges_to_csr(row_val, col_val)

print('adj_full num edges:', adj_full.nnz)
print('adj_val num edges:', adj_val.nnz)
print('adj_train num edges:', adj_train.nnz)
sp.save_npz(save_dir + 'adj_full.npz', adj_full)
sp.save_npz(save_dir + 'adj_train.npz', adj_train)
# adj_val not used in GraphSAINT
sp.save_npz(save_dir + 'adj_val.npz', adj_val)