Skip to content

Commit

Permalink
add support for general graph datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
lrjconan committed Apr 8, 2019
1 parent 540c97f commit 97a3aca
Show file tree
Hide file tree
Showing 10 changed files with 1,023 additions and 4 deletions.
23 changes: 22 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
This is the PyTorch implementation of [Lanczos Network](https://arxiv.org/abs/1901.01484) as described in the following ICLR 2019 paper:

```
@inproceedings{liao2018lanczos,
@inproceedings{liao2019lanczos,
title={LanczosNet: Multi-Scale Deep Graph Convolutional Networks},
author={Liao, Renjie and Zhao, Zhizhen and Urtasun, Raquel and Zemel, Richard},
booktitle={ICLR},
Expand Down Expand Up @@ -84,6 +84,27 @@ Python 3, PyTorch(1.0), numpy, scipy, sklearn
```python run_exp.py -c config/X.yaml -t```


## Run on General Graph Datasets

I provide example code for a synthetic graph regression problem: given multiple graphs, each accompanied by node embeddings, the task is to predict a real-valued graph embedding vector for each graph.

* To generate the synthetic dataset:

```cd dataset```

```PYTHONPATH=../ python get_graph_data.py```

* To run the training:

```python run_exp.py -c config/graph_lanczos_net.yaml```


**Note**:
* Please read ```dataset/get_graph_data.py``` for more information on how to adapt it to your own graph datasets.
* I only added support for LanczosNet, by slightly modifying the learnable node embedding into an input node embedding in ```model/lanczos_net_general.py```. It should be straightforward to add support for the other models if you are interested.



## Cite
Please cite our paper if you use this code in your research work.

Expand Down
46 changes: 46 additions & 0 deletions config/graph_lanczos_net.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
---
# Experiment config for LanczosNet on the synthetic general-graph
# regression dataset (see dataset/get_graph_data.py).
exp_name: graph_lanczos_net
exp_dir: exp/graph_lanczos_net
runner: GraphRunner
use_gpu: true
gpus: [0]
seed: 1234
dataset:
  loader_name: GraphData
  name: synthetic
  data_path: data/synthetic
  node_emb_dim: 10      # D: input node embedding dimension, must match X
  graph_emb_dim: 2      # O: target graph embedding dimension, must match Y
  num_edge_type: 1      # E: # edge types (3rd axis of A)
model:
  name: LanczosNetGeneral
  short_diffusion_dist: []
  long_diffusion_dist: [1, 2, 3, 5, 7, 10, 20, 30]
  num_eig_vec: 20       # # Ritz vectors kept by the Lanczos step
  spectral_filter_kind: MLP
  input_dim: 10         # must equal dataset.node_emb_dim
  hidden_dim: [128, 128, 128, 128, 128, 128, 128]
  output_dim: 2         # must equal dataset.graph_emb_dim
  num_layer: 7
  loss: MSE
  output_func: MLP
train:
  optimizer: Adam
  lr_decay: 0.1
  lr_decay_steps: [10000]
  num_workers: 0
  max_epoch: 200
  batch_size: 10
  display_iter: 10      # log every N iterations
  snapshot_epoch: 10000 # checkpoint interval (epochs)
  valid_epoch: 100      # validation interval (epochs)
  lr: 1.0e-4
  wd: 0.0e-4            # weight decay (0.0 here)
  momentum: 0.9
  shuffle: true
  is_resume: false
  resume_model: None    # NOTE(review): YAML parses this as the string "None", not null — presumably handled downstream; confirm
test:
  batch_size: 64
  num_workers: 0
  test_model:           # empty: fill in the checkpoint path to test
1 change: 1 addition & 0 deletions dataset/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from dataset.qm8 import *
from dataset.graph_data import *
102 changes: 102 additions & 0 deletions dataset/get_graph_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import os
import glob
import pickle
import numpy as np
import networkx as nx
from utils.data_helper import get_multi_graph_laplacian_eigs, get_graph_laplacian_eigs, get_laplacian

# Output directory, relative to the dataset/ folder this script runs from.
save_dir = '../data/synthetic/'

if not os.path.exists(save_dir):
  # makedirs also creates the missing parent '../data' on a fresh checkout,
  # where a plain os.mkdir would raise FileNotFoundError.
  os.makedirs(save_dir)
  print('made directory {}'.format(save_dir))


def gen_data(min_num_nodes=20,
             max_num_nodes=100,
             num_graphs=10,
             node_emb_dim=10,
             graph_emb_dim=2,
             edge_prob=0.5,
             seed=123):
  """
  Generate synthetic graph data for graph regression, i.e., given node
  embedding and graph structure as input, predict a graph embedding
  as output.

  N.B.: modification to other tasks like node classification is
  straightforward.

  A single graph in your dataset should contain:
    X: Node embedding, numpy array, shape N X D where N is # nodes
    A: Graph structure, numpy array, shape N X N X E where E is # edge types
    Y: Graph embedding, numpy array, shape 1 X O where O is the embedding dim

  Args:
    min_num_nodes: int, minimum # nodes per graph (inclusive)
    max_num_nodes: int, maximum # nodes per graph (inclusive)
    num_graphs: int, # graphs to generate
    node_emb_dim: int, node embedding dimension D
    graph_emb_dim: int, target graph embedding dimension O
    edge_prob: float, edge probability of the Erdos-Renyi random graph
    seed: int, seed of a local RandomState so each split is reproducible

  Returns:
    data: list of dict, one dict with keys 'X', 'A', 'Y' per graph
  """
  npr = np.random.RandomState(seed)
  # high is exclusive in randint, hence +1 so max_num_nodes can be drawn
  num_nodes = npr.randint(min_num_nodes, high=max_num_nodes + 1, size=num_graphs)

  data = []
  for ii in range(num_graphs):
    data_dict = {}
    data_dict['X'] = npr.randn(num_nodes[ii], node_emb_dim)
    # nx.to_numpy_matrix was removed in networkx 3.0; to_numpy_array is the
    # drop-in replacement (returns an ndarray instead of np.matrix).
    # We assume # edge type = 1, but you can easily extend it to be more
    # than 1 by stacking adjacency matrices along axis 2.
    data_dict['A'] = np.expand_dims(
        nx.to_numpy_array(
            nx.fast_gnp_random_graph(
                num_nodes[ii], edge_prob, seed=npr.randint(1000))),
        axis=2)
    data_dict['Y'] = npr.randn(1, graph_emb_dim)
    data += [data_dict]

  return data


def dump_data(data_list, tag='train'):
  """
  Precompute Laplacians / eigendecompositions and pickle one file per graph.

  Args:
    data_list: list of dict as produced by gen_data, keys 'X', 'A', 'Y'
    tag: str, split name ('train'/'dev'/'test') used in the output filename

  Side effects:
    writes save_dir/synthetic_<tag>_<index>.p, one pickle per graph
  """
  print('Dump {} data!'.format(tag))
  for count, data in enumerate(data_list):
    data_dict = {}
    data_dict['node_feat'] = data['X']
    adjs = data['A']
    # collapse all edge types into one simple adjacency matrix
    adj_simple = np.sum(adjs, axis=2)

    # per-edge-type Laplacians and eigendecompositions
    D_list, V_list, L_list = get_multi_graph_laplacian_eigs(
        adjs, graph_laplacian_type='L4', use_eigen_decomp=True, is_sym=True)
    # Laplacian + eigendecomposition of the collapsed simple graph
    D, V, L4 = get_graph_laplacian_eigs(
        adj_simple,
        graph_laplacian_type='L4',
        use_eigen_decomp=True,
        is_sym=True)
    L6 = get_laplacian(adj_simple, graph_laplacian_type='L6')
    L7 = get_laplacian(adj_simple, graph_laplacian_type='L7')

    data_dict['L_multi'] = np.stack(L_list, axis=2)
    data_dict['L_simple_4'] = L4
    data_dict['L_simple_6'] = L6
    data_dict['L_simple_7'] = L7
    # N.B.: for some edge type, adjacency matrix may be diagonal, in which
    # case D/V come back as None; fall back to trivial spectra
    data_dict['D_simple'] = D if D is not None else np.ones(adjs.shape[0])
    data_dict['V_simple'] = V if V is not None else np.eye(adjs.shape[0])
    data_dict['D_multi'] = D_list
    data_dict['V_multi'] = V_list
    data_dict['label'] = data['Y']

    # context manager closes the file even on error; the original passed an
    # unclosed open() handle straight into pickle.dump
    out_path = os.path.join(save_dir,
                            'synthetic_{}_{:07d}.p'.format(tag, count))
    with open(out_path, 'wb') as f:
      pickle.dump(data_dict, f)

  # plain print does no %-formatting, so the original '100.0 %%' printed a
  # literal double percent sign
  print('100.0 %')


if __name__ == '__main__':
  # One (split, seed) pair per dataset; distinct seeds keep splits disjoint.
  for split_tag, split_seed in [('train', 123), ('dev', 456), ('test', 789)]:
    dump_data(gen_data(seed=split_seed), split_tag)
Loading

0 comments on commit 97a3aca

Please sign in to comment.