diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f236977
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+### PyCharm ###
+\.idea/
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7603f1c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,67 @@
+BC learning for images
+=========================
+
+Implementation of [Between-class Learning for Image Classification](https://arxiv.org/abs/1711.10284) by Yuji Tokozume, Yoshitaka Ushiku, and Tatsuya Harada.
+
+#### Between-class (BC) learning:
+- We generate between-class examples by mixing two training examples belonging to different classes with a random ratio.
+- We then input the mixed data to the model and
+train the model to output the mixing ratio.
+- Original paper: [Learning from Between-class Examples for Deep Sound Recognition](https://arxiv.org/abs/1711.10282) by us ([github](https://github.com/mil-tokyo/bc_learning_sound))
+
+## Contents
+- BC learning for images
+	- BC: mix two images simply using internal divisions.
+	- BC+: mix two images treating them as waveforms.
+- Training of 11-layer CNN on CIFAR datasets
+
+
+## Setup
+- Install [Chainer](https://chainer.org/) v1.24 on a machine with CUDA GPU.
+- Prepare CIFAR datasets.
+
+
+## Training
+- Template:
+
+		python main.py --dataset [cifar10 or cifar100] --netType convnet --data path/to/dataset/directory/ (--BC) (--plus)
+ 
+- Recipes:
+	- Standard learning on CIFAR-10 (around 6.1% error):
+
+			python main.py --dataset cifar10 --netType convnet --data path/to/dataset/directory/
+	
+
+	- BC learning on CIFAR-10 (around 5.4% error):
+
+			python main.py --dataset cifar10 --netType convnet --data path/to/dataset/directory/ --BC
+	
+	- BC+ learning on CIFAR-10 (around 5.2% error):
+
+			python main.py --dataset cifar10 --netType convnet --data path/to/dataset/directory/ --BC --plus
+	
+- Notes:
+	- It uses the same data augmentation scheme as [fb.resnet.torch](https://github.com/facebook/fb.resnet.torch).
+	- By default, it runs training 10 times. You can specify the number of trials by using --nTrials command.
+	- Please check [opts.py](https://github.com/mil-tokyo/bc_learning_image/blob/master/opts.py) for other command line arguments.
+
+## Results
+
+Error rate (average of 10 trials)
+
+| Learning | CIFAR-10 | CIFAR-100 |
+|:--|:-:|:-:|
+| Standard | 6.07  | 26.68 |
+| BC (ours) | 5.40 | 24.28 |
+| BC+ (ours) | **5.22** | **23.68** |
+
+- Other results (please see [paper](https://arxiv.org/abs/1711.10284)):
+	- The performance of [Shake-Shake Regularization](https://github.com/xgastaldi/shake-shake) [[1]](#1) on CIFAR-10 was improved from 2.86% to 2.26%.
+	- The performance of [ResNeXt](https://github.com/facebookresearch/ResNeXt) [[2]](#2) on ImageNet-1K was improved from 20.4% to 19.4% (single-crop top-1 validation error).
+
+---
+
+#### Reference
+<i id=1></i>[1] X. Gastaldi. Shake-shake regularization. In *ICLR Workshop*, 2017.
+
+<i id=2></i>[2] S. Xie, R. Girshick, P. Dollar, Z. Tu, and K. He. Aggregated residual transformations for deep neural networks. In *CVPR*, 2017.
\ No newline at end of file
diff --git a/dataset.py b/dataset.py
new file mode 100644
index 0000000..eb22a09
--- /dev/null
+++ b/dataset.py
@@ -0,0 +1,117 @@
+import os
+import numpy as np
+import random
+import cPickle
+import chainer
+
+import utils as U
+
+
+class ImageDataset(chainer.dataset.DatasetMixin):
+    def __init__(self, images, labels, opt, train=True):
+        self.base = chainer.datasets.TupleDataset(images, labels)
+        self.opt = opt
+        self.train = train
+        self.mix = (opt.BC and train)
+        if opt.dataset == 'cifar10':
+            if opt.plus:
+                self.mean = np.array([4.60, 2.24, -6.84])
+                self.std = np.array([55.9, 53.7, 56.5])
+            else:
+                self.mean = np.array([125.3, 123.0, 113.9])
+                self.std = np.array([63.0, 62.1, 66.7])
+        else:
+            if opt.plus:
+                self.mean = np.array([7.37, 2.13, -9.50])
+                self.std = np.array([57.6, 54.0, 58.5])
+            else:
+                self.mean = np.array([129.3, 124.1, 112.4])
+                self.std = np.array([68.2, 65.4, 70.4])
+
+        self.preprocess_funcs = self.preprocess_setup()
+
+    def __len__(self):
+        return len(self.base)
+
+    def preprocess_setup(self):
+        if self.opt.plus:
+            normalize = U.zero_mean
+        else:
+            normalize = U.normalize
+        if self.train:
+            funcs = [normalize(self.mean, self.std),
+                     U.horizontal_flip(),
+                     U.padding(4),
+                     U.random_crop(32),
+                     ]
+        else:
+            funcs = [normalize(self.mean, self.std)]
+
+        return funcs
+
+    def preprocess(self, image):
+        for f in self.preprocess_funcs:
+            image = f(image)
+
+        return image
+
+    def get_example(self, i):
+        if self.mix:  # Training phase of BC learning
+            while True:  # Select two training examples
+                image1, label1 = self.base[random.randint(0, len(self.base) - 1)]
+                image2, label2 = self.base[random.randint(0, len(self.base) - 1)]
+                if label1 != label2:
+                    break
+            image1 = self.preprocess(image1)
+            image2 = self.preprocess(image2)
+
+            # Mix two images
+            r = np.array(random.random())
+            if self.opt.plus:
+                g1 = np.std(image1)
+                g2 = np.std(image2)
+                p = 1.0 / (1 + g1 / g2 * (1 - r) / r)
+                image = ((image1 * p + image2 * (1 - p)) / np.sqrt(p ** 2 + (1 - p) ** 2)).astype(np.float32)
+            else:
+                image = (image1 * r + image2 * (1 - r)).astype(np.float32)
+
+            # Mix two labels
+            eye = np.eye(self.opt.nClasses)
+            label = (eye[label1] * r + eye[label2] * (1 - r)).astype(np.float32)
+
+        else:  # Training phase of standard learning or testing phase
+            image, label = self.base[i]
+            image = self.preprocess(image).astype(np.float32)
+            label = np.array(label, dtype=np.int32)
+
+        return image, label
+
+
+def setup(opt):
+    def unpickle(fn):
+        with open(fn, 'rb') as f:
+            data = cPickle.load(f)
+        return data
+
+    if opt.dataset == 'cifar10':
+        train = [unpickle(os.path.join(opt.data, 'data_batch_{}'.format(i))) for i in range(1, 6)]
+        train_images = np.concatenate([d['data'] for d in train]).reshape((-1, 3, 32, 32))
+        train_labels = np.concatenate([d['labels'] for d in train])
+        val = unpickle(os.path.join(opt.data, 'test_batch'))
+        val_images = val['data'].reshape((-1, 3, 32, 32))
+        val_labels = val['labels']
+    else:
+        train = unpickle(os.path.join(opt.data, 'train'))
+        train_images = train['data'].reshape(-1, 3, 32, 32)
+        train_labels = train['fine_labels']
+        val = unpickle(os.path.join(opt.data, 'test'))
+        val_images = val['data'].reshape((-1, 3, 32, 32))
+        val_labels = val['fine_labels']
+
+    # Iterator setup
+    train_data = ImageDataset(train_images, train_labels, opt, train=True)
+    val_data = ImageDataset(val_images, val_labels, opt, train=False)
+    train_iter = chainer.iterators.MultiprocessIterator(train_data, opt.batchSize, repeat=False)
+    val_iter = chainer.iterators.SerialIterator(val_data, opt.batchSize, repeat=False, shuffle=False)
+
+    return train_iter, val_iter
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..4cf9037
--- /dev/null
+++ b/main.py
@@ -0,0 +1,55 @@
+"""
+ Between-class Learning for Image Classification.
+ Yuji Tokozume, Yoshitaka Ushiku, and Tatsuya Harada
+
+"""
+
+import sys
+
+sys.path.append('/home/harada/local/envs/chainer/site-packages/')
+
+import os
+import datetime
+import chainer
+
+import opts
+import models
+import dataset
+from train import Trainer
+
+
+def main():
+    opt = opts.parse()
+    chainer.cuda.get_device_from_id(opt.gpu).use()
+    for i in range(1, opt.nTrials + 1):
+        print('+-- Trial {} --+'.format(i))
+        train(opt, i)
+
+
+def train(opt, trial):
+    model = getattr(models, opt.netType)(opt.nClasses)
+    model.to_gpu()
+    optimizer = chainer.optimizers.NesterovAG(lr=opt.LR, momentum=opt.momentum)
+    optimizer.setup(model)
+    optimizer.add_hook(chainer.optimizer.WeightDecay(opt.weightDecay))
+    train_iter, val_iter = dataset.setup(opt)
+    trainer = Trainer(model, optimizer, train_iter, val_iter, opt)
+
+    for epoch in range(1, opt.nEpochs + 1):
+        train_loss, train_top1 = trainer.train(epoch)
+        val_top1 = trainer.val()
+        sys.stderr.write('\r\033[K')
+        sys.stdout.write(
+            '| Epoch: {}/{} | Train: LR {}  Loss {:.3f}  top1 {:.2f} | Val: top1 {:.2f}\n'.format(
+                epoch, opt.nEpochs, trainer.optimizer.lr, train_loss, train_top1, val_top1))
+        sys.stdout.flush()
+
+    if opt.save != 'None':
+        chainer.serializers.save_npz(
+            os.path.join(opt.save, 'model_trial{}.npz'.format(datetime.datetime.now())), model)
+        chainer.serializers.save_npz(
+            os.path.join(opt.save, 'optimizer_trial{}.npz'.format(datetime.datetime.now())), optimizer)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/models/__init__.py b/models/__init__.py
new file mode 100644
index 0000000..03821fc
--- /dev/null
+++ b/models/__init__.py
@@ -0,0 +1 @@
+from convnet import ConvNet as convnet
diff --git a/models/convbnrelu.py b/models/convbnrelu.py
new file mode 100644
index 0000000..a199ba2
--- /dev/null
+++ b/models/convbnrelu.py
@@ -0,0 +1,19 @@
+import chainer
+import chainer.functions as F
+import chainer.links as L
+
+
+class ConvBNReLU(chainer.Chain):
+    def __init__(self, in_channels, out_channels, ksize, stride=1, pad=0,
+                 initialW=chainer.initializers.HeNormal(), nobias=True):
+        super(ConvBNReLU, self).__init__(
+            conv=L.Convolution2D(in_channels, out_channels, ksize, stride, pad,
+                                 initialW=initialW, nobias=nobias),
+            bn=L.BatchNormalization(out_channels, eps=1e-5)
+        )
+
+    def __call__(self, x, train):
+        h = self.conv(x)
+        h = self.bn(h, test=not train)
+
+        return F.relu(h)
diff --git a/models/convnet.py b/models/convnet.py
new file mode 100644
index 0000000..d3d759c
--- /dev/null
+++ b/models/convnet.py
@@ -0,0 +1,44 @@
+import math
+import chainer
+import chainer.functions as F
+import chainer.links as L
+from chainer.initializers import Uniform
+from convbnrelu import ConvBNReLU
+
+
+class ConvNet(chainer.Chain):
+    def __init__(self, n_classes):
+        super(ConvNet, self).__init__(
+            conv11=ConvBNReLU(3, 64, 3, pad=1),
+            conv12=ConvBNReLU(64, 64, 3, pad=1),
+            conv21=ConvBNReLU(64, 128, 3, pad=1),
+            conv22=ConvBNReLU(128, 128, 3, pad=1),
+            conv31=ConvBNReLU(128, 256, 3, pad=1),
+            conv32=ConvBNReLU(256, 256, 3, pad=1),
+            conv33=ConvBNReLU(256, 256, 3, pad=1),
+            conv34=ConvBNReLU(256, 256, 3, pad=1),
+            fc4=L.Linear(256 * 4 * 4, 1024, initialW=Uniform(1. / math.sqrt(256 * 4 * 4))),
+            fc5=L.Linear(1024, 1024, initialW=Uniform(1. / math.sqrt(1024))),
+            fc6=L.Linear(1024, n_classes, initialW=Uniform(1. / math.sqrt(1024)))
+        )
+        self.train = True
+
+    def __call__(self, x):
+        h = self.conv11(x, self.train)
+        h = self.conv12(h, self.train)
+        h = F.max_pooling_2d(h, 2)
+
+        h = self.conv21(h, self.train)
+        h = self.conv22(h, self.train)
+        h = F.max_pooling_2d(h, 2)
+
+        h = self.conv31(h, self.train)
+        h = self.conv32(h, self.train)
+        h = self.conv33(h, self.train)
+        h = self.conv34(h, self.train)
+        h = F.max_pooling_2d(h, 2)
+
+        h = F.dropout(F.relu(self.fc4(h)), train=self.train)
+        h = F.dropout(F.relu(self.fc5(h)), train=self.train)
+
+        return self.fc6(h)
diff --git a/opts.py b/opts.py
new file mode 100644
index 0000000..ecb2f90
--- /dev/null
+++ b/opts.py
@@ -0,0 +1,77 @@
+import os
+import argparse
+
+
+def parse():
+    parser = argparse.ArgumentParser(description='BC learning for image classification')
+
+    # General settings
+    parser.add_argument('--dataset', required=True, choices=['cifar10', 'cifar100'])
+    parser.add_argument('--netType', required=True, choices=['convnet'])
+    parser.add_argument('--data', required=True, help='Path to dataset')
+    parser.add_argument('--nTrials', type=int, default=10)
+    parser.add_argument('--save', default='None', help='Directory to save the results')
+    parser.add_argument('--gpu', type=int, default=0)
+
+    # Learning settings
+    parser.add_argument('--BC', action='store_true', help='BC learning')
+    parser.add_argument('--plus', action='store_true', help='Use BC+')
+    parser.add_argument('--nEpochs', type=int, default=-1)
+    parser.add_argument('--LR', type=float, default=-1, help='Initial learning rate')
+    parser.add_argument('--schedule', type=float, nargs='*', default=-1, help='When to divide the LR')
+    parser.add_argument('--warmup', type=int, default=-1, help='Number of epochs to warm up')
+    parser.add_argument('--batchSize', type=int, default=-1)
+    parser.add_argument('--weightDecay', type=float, default=5e-4)
+    parser.add_argument('--momentum', type=float, default=0.9)
+
+    opt = parser.parse_args()
+    if opt.plus and not opt.BC:
+        raise Exception('Using only --plus option is invalid.')
+
+    # Dataset details
+    if opt.dataset == 'cifar10':
+        opt.nClasses = 10
+    else:  # cifar100
+        opt.nClasses = 100
+
+    # Default settings
+    default_settings = dict()
+    default_settings['cifar10'] = {
+        'convnet': {'nEpochs': 250, 'LR': 0.1, 'schedule': [0.4, 0.6, 0.8], 'warmup': 0, 'batchSize': 128}
+    }
+    default_settings['cifar100'] = {
+        'convnet': {'nEpochs': 250, 'LR': 0.1, 'schedule': [0.4, 0.6, 0.8], 'warmup': 0, 'batchSize': 128}
+    }
+    for key in ['nEpochs', 'LR', 'schedule', 'warmup', 'batchSize']:
+        if eval('opt.{}'.format(key)) == -1:
+            setattr(opt, key, default_settings[opt.dataset][opt.netType][key])
+
+    if opt.save != 'None' and not os.path.isdir(opt.save):
+        os.makedirs(opt.save)
+
+    display_info(opt)
+
+    return opt
+
+
+def display_info(opt):
+    if opt.BC:
+        if opt.plus:
+            learning = 'BC+'
+        else:
+            learning = 'BC'
+    else:
+        learning = 'standard'
+
+    print('+------------------------------+')
+    print('| CIFAR classification')
+    print('+------------------------------+')
+    print('| dataset  : {}'.format(opt.dataset))
+    print('| netType  : {}'.format(opt.netType))
+    print('| learning : {}'.format(learning))
+    print('| nEpochs  : {}'.format(opt.nEpochs))
+    print('| LRInit   : {}'.format(opt.LR))
+    print('| schedule : {}'.format(opt.schedule))
+    print('| warmup   : {}'.format(opt.warmup))
+    print('| batchSize: {}'.format(opt.batchSize))
+    print('+------------------------------+')
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..2ef9995
--- /dev/null
+++ b/train.py
@@ -0,0 +1,82 @@
+import sys
+import numpy as np
+import chainer
+from chainer import cuda
+import chainer.functions as F
+import time
+
+import utils
+
+
+class Trainer:
+    def __init__(self, model, optimizer, train_iter, val_iter, opt):
+        self.model = model
+        self.optimizer = optimizer
+        self.train_iter = train_iter
+        self.val_iter = val_iter
+        self.opt = opt
+        self.n_batches = (len(train_iter.dataset) - 1) // opt.batchSize + 1
+        self.start_time = time.time()
+
+    def train(self, epoch):
+        self.optimizer.lr = self.lr_schedule(epoch)
+        train_loss = 0
+        train_acc = 0
+        for i, batch in enumerate(self.train_iter):
+            x_array, t_array = chainer.dataset.concat_examples(batch)
+            x = chainer.Variable(cuda.to_gpu(x_array))
+            t = chainer.Variable(cuda.to_gpu(t_array))
+            self.optimizer.zero_grads()
+            y = self.model(x)
+            if self.opt.BC:
+                loss = utils.kl_divergence(y, t)
+                acc = F.accuracy(y, F.argmax(t, axis=1))
+            else:
+                loss = F.softmax_cross_entropy(y, t)
+                acc = F.accuracy(y, t)
+
+            loss.backward()
+            self.optimizer.update()
+            train_loss += float(loss.data) * len(t.data)
+            train_acc += float(acc.data) * len(t.data)
+
+            elapsed_time = time.time() - self.start_time
+            progress = (self.n_batches * (epoch - 1) + i + 1) * 1.0 / (self.n_batches * self.opt.nEpochs)
+            eta = elapsed_time / progress - elapsed_time
+
+            line = '* Epoch: {}/{} ({}/{}) | Train: LR {} | Time: {} (ETA: {})'.format(
+                epoch, self.opt.nEpochs, i + 1, self.n_batches,
+                self.optimizer.lr, utils.to_hms(elapsed_time), utils.to_hms(eta))
+            sys.stderr.write('\r\033[K' + line)
+            sys.stderr.flush()
+
+        self.train_iter.reset()
+        train_loss /= len(self.train_iter.dataset)
+        train_top1 = 100 * (1 - train_acc / len(self.train_iter.dataset))
+
+        return train_loss, train_top1
+
+    def val(self):
+        self.model.train = False
+        val_acc = 0
+        for batch in self.val_iter:
+            x_array, t_array = chainer.dataset.concat_examples(batch)
+            x = chainer.Variable(cuda.to_gpu(x_array), volatile=True)
+            t = chainer.Variable(cuda.to_gpu(t_array), volatile=True)
+            y = F.softmax(self.model(x))
+            acc = F.accuracy(y, t)
+            val_acc += float(acc.data) * len(t.data)
+
+        self.val_iter.reset()
+        self.model.train = True
+        val_top1 = 100 * (1 - val_acc / len(self.val_iter.dataset))
+
+        return val_top1
+
+    def lr_schedule(self, epoch):
+        divide_epoch = np.array([self.opt.nEpochs * i for i in self.opt.schedule])
+        decay = sum(epoch > divide_epoch)
+        if epoch <= self.opt.warmup:
+            decay = 1
+
+        return self.opt.LR * np.power(0.1, decay)
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..d0ec32f
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,64 @@
+import numpy as np
+import random
+import chainer.functions as F
+
+
+def padding(pad):
+    def f(image):
+        return np.pad(image, ((0, 0), (pad, pad), (pad, pad)), 'constant')
+
+    return f
+
+
+def random_crop(size):
+    def f(image):
+        _, h, w = image.shape
+        p = random.randint(0, h - size)
+        q = random.randint(0, w - size)
+        return image[:, p: p + size, q: q + size]
+
+    return f
+
+
+def horizontal_flip():
+    def f(image):
+        if random.randint(0, 1):
+            image = image[:, :, ::-1]
+        return image
+
+    return f
+
+
+def normalize(mean, std):
+    def f(image):
+        return (image - mean[:, None, None]) / std[:, None, None]
+
+    return f
+
+
+# For BC+
+def zero_mean(mean, std):
+    def f(image):
+        image_mean = np.mean(image, keepdims=True)
+        return (image - image_mean - mean[:, None, None]) / std[:, None, None]
+
+    return f
+
+
+def kl_divergence(y, t):
+    entropy = - F.sum(t[t.data.nonzero()] * F.log(t[t.data.nonzero()]))
+    crossEntropy = - F.sum(t * F.log_softmax(y))
+
+    return (crossEntropy - entropy) / y.shape[0]
+
+
+def to_hms(time):
+    h = int(time // 3600)
+    m = int((time - h * 3600) // 60)
+    s = int(time - h * 3600 - m * 60)
+    if h > 0:
+        line = '{}h{:02d}m'.format(h, m)
+    else:
+        line = '{}m{:02d}s'.format(m, s)
+
+    return line