Commit: add code

weihua916 committed Dec 13, 2019
1 parent f8df86d commit 78b224a
Showing 34 changed files with 2,340 additions and 0 deletions.
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2019 OGB Team

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
162 changes: 162 additions & 0 deletions examples/graphproppred/main_dgl.py
@@ -0,0 +1,162 @@
### example code of GIN using DGL
import torch
from torch.utils.data import DataLoader
import dgl.function as fn
import dgl
import torch.optim as optim
import torch.nn.functional as F

from tqdm import tqdm
import argparse
import time
import numpy as np

### importing OGB

### for loading dataset
from ogb.graphproppred.dataset_dgl import DglGraphPropPredDataset, collate_dgl
### for encoding raw molecule features
from ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder
### for evaluation
from ogb.graphproppred import Evaluator

criterion = torch.nn.BCEWithLogitsLoss()

class GINConv(torch.nn.Module):
"""
- GIN architecture.
- Assume both node_feat and edge_feat have the dimensionality of emb_dim.
"""
def __init__(self, emb_dim):
super(GINConv, self).__init__()

self.mlp = torch.nn.Sequential(torch.nn.Linear(emb_dim, 2*emb_dim), torch.nn.BatchNorm1d(2*emb_dim), torch.nn.ReLU(), torch.nn.Linear(2*emb_dim, emb_dim))
self.eps = torch.nn.Parameter(torch.Tensor([0]))

def forward(self, graph, node_feat, edge_feat):
graph = graph.local_var()
graph.ndata['h_n'] = node_feat
graph.edata['h_e'] = edge_feat

        ### u, v, e denote source nodes, destination nodes, and the edges between them:
        ### each edge sends (h_u + h_e) as its message, and messages are summed at the destination
        graph.update_all(fn.u_add_e('h_n', 'h_e', 'm'), fn.sum('m', 'neigh'))
rst = (1 + self.eps) * node_feat + graph.ndata['neigh']
rst = self.mlp(rst)

return rst
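
### A quick standalone check of this layer (a sketch, not part of the original
### commit; assumes the 2019-era DGL graph-construction API, dgl.DGLGraph):
###     g = dgl.DGLGraph(); g.add_nodes(3); g.add_edges([0, 1], [1, 2])
###     conv = GINConv(emb_dim=4)
###     out = conv(g, torch.randn(3, 4), torch.randn(2, 4))   # -> shape (3, 4)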


class GIN(torch.nn.Module):
def __init__(self, num_layer = 5, emb_dim = 100, num_task = 2, device = "cpu"):
super(GIN, self).__init__()

self.num_layer = num_layer

self.gins = torch.nn.ModuleList()
self.batch_norms = torch.nn.ModuleList()
for layer in range(self.num_layer):
self.gins.append(GINConv(emb_dim))
self.batch_norms.append(torch.nn.BatchNorm1d(emb_dim))

        ### convenience modules that embed raw molecule node/edge features. (TODO) make them more efficient.
self.atom_encoder = AtomEncoder(emb_dim)
self.bond_encoder = BondEncoder(emb_dim)

self.graph_pred_linear = torch.nn.Linear(emb_dim, num_task)
self.device = device


def forward(self, g):
h_node = self.atom_encoder(g.ndata["feat"].to(self.device))
h_edge = self.bond_encoder(g.edata["feat"].to(self.device))

### iterative message passing to obtain node embeddings
for layer in range(self.num_layer):
h_node = self.gins[layer](g, h_node, h_edge)
h_node = self.batch_norms[layer](h_node)
h_node = F.relu(h_node)

### pooling
g.ndata['h_node'] = h_node
h_graph = dgl.mean_nodes(g, 'h_node')

return self.graph_pred_linear(h_graph)



def train(model, device, loader, optimizer):
model.train()

for step, (graphs, labels) in enumerate(tqdm(loader, desc="Iteration")):
labels = labels.to(device)
pred = model(graphs)
optimizer.zero_grad()
        ### ignore missing targets: NaN != NaN, so the mask is False exactly at NaN labels
        is_valid = labels == labels
        loss = criterion(pred.to(torch.float32)[is_valid], labels.to(torch.float32)[is_valid])
loss.backward()
optimizer.step()

def eval(model, device, loader, evaluator):
model.eval()
y_true = []
y_pred = []

for step, (graphs, labels) in enumerate(tqdm(loader, desc="Iteration")):

with torch.no_grad():
pred = model(graphs)

y_true.append(labels.view(pred.shape).detach().cpu())
y_pred.append(pred.detach().cpu())

y_true = torch.cat(y_true, dim = 0).numpy()
y_pred = torch.cat(y_pred, dim = 0).numpy()

input_dict = {"y_true": y_true, "y_pred": y_pred}

return evaluator.eval(input_dict)


def main():
# Training settings
    parser = argparse.ArgumentParser(description='GIN with DGL')
parser.add_argument('--device', type=int, default=0,
help='which gpu to use if any (default: 0)')
parser.add_argument('--batch_size', type=int, default=32,
help='input batch size for training (default: 32)')
parser.add_argument('--epochs', type=int, default=100,
help='number of epochs to train (default: 100)')
parser.add_argument('--num_workers', type=int, default=0,
help='number of workers (default: 0)')
parser.add_argument('--dataset', type=str, default="ogbg-mol-tox21",
help='dataset name (default: ogbg-mol-tox21)')
args = parser.parse_args()

device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")


### automatic dataloading and splitting
dataset = DglGraphPropPredDataset(name = args.dataset)
    split_idx = dataset.get_idx_split()

### automatic evaluator. takes dataset name as input
evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=args.batch_size, shuffle=True, collate_fn = collate_dgl, num_workers = args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=args.batch_size, shuffle=False, collate_fn = collate_dgl, num_workers = args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=args.batch_size, shuffle=False, collate_fn = collate_dgl, num_workers = args.num_workers)

model = GIN(num_task = dataset.num_tasks, device = device).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, args.epochs + 1):
train(model, device, train_loader, optimizer)
#print("Evaluating training...")
#print(eval(model, device, train_loader, evaluator))
print("Evaluating validation:")
print(eval(model, device, valid_loader, evaluator))


if __name__ == "__main__":
main()
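
Both example scripts mask missing labels before computing the loss: multi-task molecule datasets such as Tox21 leave some per-task labels as NaN, and the comparison labels == labels is False exactly at those entries. A minimal self-contained sketch of the trick (illustration only, not part of the commit):

import torch

criterion = torch.nn.BCEWithLogitsLoss()

### toy batch: 3 graphs x 2 tasks, with one missing (NaN) label
pred = torch.randn(3, 2)
labels = torch.tensor([[1., 0.], [0., float('nan')], [1., 1.]])

is_valid = labels == labels          # False exactly where labels are NaN
loss = criterion(pred[is_valid], labels[is_valid])
print(loss.item())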
164 changes: 164 additions & 0 deletions examples/graphproppred/main_pyg.py
@@ -0,0 +1,164 @@
### example code of GIN using PyTorch Geometric
import torch
from torch_geometric.nn import MessagePassing
from torch_geometric.nn import global_mean_pool
from torch_geometric.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F

from tqdm import tqdm
import argparse
import time
import numpy as np

### importing OGB

### for loading dataset
from ogb.graphproppred.dataset_pyg import PygGraphPropPredDataset
### for encoding raw molecule features
from ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder
### for evaluation
from ogb.graphproppred import Evaluator

criterion = torch.nn.BCEWithLogitsLoss()

class GINConv(MessagePassing):
"""
- GIN architecture.
- Assume both x and edge_attr have the dimensionality of emb_dim.
"""
def __init__(self, emb_dim):
super(GINConv, self).__init__(aggr="add")

self.mlp = torch.nn.Sequential(torch.nn.Linear(emb_dim, 2*emb_dim), torch.nn.BatchNorm1d(2*emb_dim), torch.nn.ReLU(), torch.nn.Linear(2*emb_dim, emb_dim))
self.eps = torch.nn.Parameter(torch.Tensor([0]))

def forward(self, x, edge_index, edge_attr):
### propagate = message -> aggr -> update
h = (1 + self.eps) * x + self.propagate(edge_index, x=x, edge_attr=edge_attr)
out = self.mlp(h)

return out

    ### message to be aggregated along each edge
    ### x_j is the feature of the edge's source node
def message(self, x_j, edge_attr):
return x_j + edge_attr

def update(self, aggr_out):
return aggr_out
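
    ### How propagate() wires these together (illustrative note, not part of the
    ### original commit): for each edge (j, i) in edge_index, x_j is x gathered at
    ### the source node j and edge_attr is taken per edge, so message() returns one
    ### vector per edge; aggr="add" sums the messages at each destination i, and
    ### update() passes the aggregated sum through unchanged.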


class GIN(torch.nn.Module):
def __init__(self, num_layer = 5, emb_dim = 100, num_task = 2):
super(GIN, self).__init__()

self.num_layer = num_layer

self.gins = torch.nn.ModuleList()
self.batch_norms = torch.nn.ModuleList()
for layer in range(self.num_layer):
self.gins.append(GINConv(emb_dim))
self.batch_norms.append(torch.nn.BatchNorm1d(emb_dim))

        ### convenience modules that embed raw molecule node/edge features. (TODO) make them more efficient.
self.atom_encoder = AtomEncoder(emb_dim)
self.bond_encoder = BondEncoder(emb_dim)

self.graph_pred_linear = torch.nn.Linear(emb_dim, num_task)

def forward(self, batch):
x, edge_index, edge_attr, batch = batch.x, batch.edge_index, batch.edge_attr, batch.batch

h = self.atom_encoder(x)
edge_emb = self.bond_encoder(edge_attr)

### iterative message passing to obtain node embeddings
for layer in range(self.num_layer):
h = self.gins[layer](h, edge_index, edge_emb)
h = self.batch_norms[layer](h)
h = F.relu(h)

### pooling
h_graph = global_mean_pool(h, batch)

return self.graph_pred_linear(h_graph)


def train(model, device, loader, optimizer):
model.train()

for step, batch in enumerate(tqdm(loader, desc="Iteration")):
batch = batch.to(device)
pred = model(batch)
optimizer.zero_grad()
        ### ignore missing targets: NaN != NaN, so the mask is False exactly at NaN labels
        is_valid = batch.y == batch.y
        loss = criterion(pred.to(torch.float32)[is_valid], batch.y.to(torch.float32)[is_valid])
loss.backward()
optimizer.step()

def eval(model, device, loader, evaluator):
model.eval()
y_true = []
y_pred = []

for step, batch in enumerate(tqdm(loader, desc="Iteration")):
batch = batch.to(device)

with torch.no_grad():
pred = model(batch)

y_true.append(batch.y.view(pred.shape).detach().cpu())
y_pred.append(pred.detach().cpu())

y_true = torch.cat(y_true, dim = 0).numpy()
y_pred = torch.cat(y_pred, dim = 0).numpy()

input_dict = {"y_true": y_true, "y_pred": y_pred}

return evaluator.eval(input_dict)


def main():
# Training settings
    parser = argparse.ArgumentParser(description='GIN with PyTorch Geometric')
parser.add_argument('--device', type=int, default=0,
help='which gpu to use if any (default: 0)')
parser.add_argument('--batch_size', type=int, default=32,
help='input batch size for training (default: 32)')
parser.add_argument('--epochs', type=int, default=100,
help='number of epochs to train (default: 100)')
parser.add_argument('--num_workers', type=int, default=0,
help='number of workers (default: 0)')
parser.add_argument('--dataset', type=str, default="ogbg-mol-tox21",
help='dataset name (default: ogbg-mol-tox21)')
args = parser.parse_args()

device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")


### automatic dataloading and splitting
dataset = PygGraphPropPredDataset(name = args.dataset)
    split_idx = dataset.get_idx_split()

### automatic evaluator. takes dataset name as input
evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=args.batch_size, shuffle=True, num_workers = args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=args.batch_size, shuffle=False, num_workers = args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=args.batch_size, shuffle=False, num_workers = args.num_workers)

model = GIN(num_task = dataset.num_tasks).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, args.epochs + 1):
train(model, device, train_loader, optimizer)
#print("Evaluating training...")
#print(eval(model, device, train_loader, evaluator))
print("Evaluating validation:")
print(eval(model, device, valid_loader, evaluator))


if __name__ == "__main__":
main()
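
As a quick smoke test of the PyG layer above (a sketch under assumptions: torch_geometric is installed and the GINConv class from this file is in scope; the toy sizes are hypothetical):

import torch

conv = GINConv(emb_dim=4)
x = torch.randn(3, 4)                                # 3 nodes, emb_dim = 4
edge_index = torch.tensor([[0, 1, 2], [1, 2, 0]])    # directed edges 0->1, 1->2, 2->0
edge_attr = torch.randn(3, 4)                        # one embedding per edge
print(conv(x, edge_index, edge_attr).shape)          # torch.Size([3, 4])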