diff --git a/dataset analysis/Dataset analysis.ipynb b/code/Dataset analysis.ipynb similarity index 100% rename from dataset analysis/Dataset analysis.ipynb rename to code/Dataset analysis.ipynb diff --git a/code/main.ipynb b/code/main.ipynb index 10b40a0..0edb2f3 100644 --- a/code/main.ipynb +++ b/code/main.ipynb @@ -1,264 +1,280 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Fg5GvKa0qXkT" - }, - "source": [ - "# Установка нужных библиотек" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "e51DLLWEqXkW", - "outputId": "27094984-95e4-4301-937e-2f3d1bd7f9b7", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\u001b[33m DEPRECATION: A future pip version will change local packages to be built in-place without first copying to a temporary directory. We recommend you use --use-feature=in-tree-build to test your packages with this new behavior before it becomes the default.\n", - " pip 21.3 will remove support for this functionality. You can find discussion regarding this at https://github.com/pypa/pip/issues/7555.\u001b[0m\n", - " Building wheel for mylib (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" - ] - } - ], - "source": [ - "import warnings\n", - "warnings.filterwarnings(\"ignore\")\n", - "\n", - "try:\n", - " import google.colab\n", - " IN_COLAB = True\n", - "except:\n", - " IN_COLAB = False\n", - " \n", - "if IN_COLAB:\n", - " !git clone -qq https://github.com/Intelligent-Systems-Phystech/ProjectTemplate.git /tmp/repo\n", - " !python3 -m pip install -qq /tmp/repo/src/ && rm -rf /tmp/repo" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7P4TWOOmqXkY" - }, - "source": [ - "# Импорт библиотек" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "4EVJmkwOqXkY" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "from sklearn.linear_model import LogisticRegression\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from mylib.train import cv_parameters, Trainer, SyntheticBernuliDataset" - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Настройка окружения" - ], - "metadata": { - "id": "stLbGQHDq6lS" - } - }, - { - "cell_type": "code", - "source": [ - "if IN_COLAB:\n", - " figures = '.'\n", - "else:\n", - " figures = '../figures'" - ], - "metadata": { - "id": "0TbwjK9Qq5Pg" - }, - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "f2HeCQ89qXkZ" - }, - "source": [ - "# Работа с данными" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dJJn3rfVqXka" - }, - "source": [ - "## Генерация синтетической выборки" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "OSQfsmRrqXka" - }, - "outputs": [], - "source": [ - "dataset = SyntheticBernuliDataset(n=10, m=100, seed=42)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KBgjk1tvqXkb" - }, - "source": [ - "# Эксперимент с логистической регрессией" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "19nb_usNqXkc" - }, - "source": [ - "## Обучение одной модели" - ] - }, - { - "cell_type": "code", - 
"source": [ - "trainer = Trainer(\n", - " LogisticRegression(penalty='l1', solver='saga', C=1.0),\n", - " dataset.X, dataset.y,\n", - ")\n", - "\n", - "trainer.train()\n", - "print(trainer.eval())" - ], - "metadata": { - "id": "ZMK7mqNQZPXJ", - "outputId": "a95524d6-db85-4a34-9c36-fa2befec2f34", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "execution_count": 5, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - " precision recall f1-score support\n", - "\n", - " 0 1.00 0.91 0.95 11\n", - " 1 0.93 1.00 0.97 14\n", - "\n", - " accuracy 0.96 25\n", - " macro avg 0.97 0.95 0.96 25\n", - "weighted avg 0.96 0.96 0.96 25\n", - "\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "g1Mq2ylfqXkd" - }, - "source": [ - "## Зависимость весов параметров от параметров регуляризации" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "HvHCcNPwqXkd" - }, - "outputs": [], - "source": [ - "Cs, accuracy, parameters = cv_parameters(dataset.X, dataset.y)" - ] - }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Fg5GvKa0qXkT" + }, + "source": [ + "# Установка нужных библиотек" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7P4TWOOmqXkY" + }, + "source": [ + "# Импорт библиотек" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "4EVJmkwOqXkY", + "ExecuteTime": { + "end_time": "2024-04-01T17:21:48.882165Z", + "start_time": "2024-04-01T17:21:45.522872Z" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "LQL1mX1VqXke", - "outputId": "0868006d-d6c1-4504-9e66-d66af0fb56e9", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 283 - } - }, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "
" - ], - "image/png": "\n" - }, - "metadata": { - "needs_background": "light" - } - } - ], - "source": [ - "plt.plot(Cs, parameters)\n", - "\n", - "plt.xlabel('Параметр регуляризации $C$')\n", - "plt.ylabel('$w$')\n", - "\n", - "plt.savefig(\n", - " os.path.join(figures, 'log_reg_cs_exp.eps'),\n", - " bbox_inches='tight')\n", - "\n", - "plt.show()" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-04-01 20:21:47.470487: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2024-04-01 20:21:48.253379: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" + ] } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - }, + ], + "source": [ + "import os\n", + "import json\n", + "import glob\n", + "import torch\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from src.mylib.models.models import BaselineModel, MHAModel\n", + "from src.mylib.train import Trainer\n", + "\n", + "file = os.path.abspath('')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f2HeCQ89qXkZ" + }, + "source": [ + "# Работа с данными" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Начальные параметры" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "batch_size = 64\n", + "\n", + "# Длина окна\n", + "window_length_seconds = 5 \n", + "sample_rate = 64\n", + "window_length = 
window_length_seconds * sample_rate\n", + "\n", + "# Расстояние между двумя окнами\n", + "hop_length_seconds = 1\n", + "hop_length = sample_rate * hop_length_seconds\n", + "\n", + "# Количество ложные стимулов\n", + "number_of_mismatch = 4" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-04-01T17:21:48.887002Z", + "start_time": "2024-04-01T17:21:48.883345Z" + } + }, + "execution_count": 2 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "experiment_folder = os.path.dirname(file)\n", + "\n", + "# Load the config file\n", + "with open(os.path.join(experiment_folder, \"src/mylib/utils/config.json\")) as file_path:\n", + " config = json.load(file_path)\n", + "\n", + "# Path to the dataset, which is already split to train, val, test\n", + "data_folder = os.path.join(config[\"dataset_folder\"], config['derivatives_folder'], config[\"split_folder\"])\n", + "\n", + "# Пути к данным тренировочным, валидационным и тестовым данным\n", + "train_files = [x for x in glob.glob(os.path.join(data_folder, \"train_-_*\")) if\n", + " os.path.basename(x).split(\"_-_\")[-1].split(\".\")[0] in [\"eeg\", \"envelope\"]]\n", + "val_files = [x for x in glob.glob(os.path.join(data_folder, \"val_-_*\")) if\n", + " os.path.basename(x).split(\"_-_\")[-1].split(\".\")[0] in [\"eeg\", \"envelope\"]]\n", + "test_files = [x for x in glob.glob(os.path.join(data_folder, \"test_-_*\")) if\n", + " os.path.basename(x).split(\"_-_\")[-1].split(\".\")[0] in [\"eeg\", \"envelope\"]]" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-04-01T17:21:48.897759Z", + "start_time": "2024-04-01T17:21:48.887963Z" + } + }, + "execution_count": 3 + }, + { + "cell_type": "markdown", + "metadata": { + "id": "19nb_usNqXkc" + }, + "source": [ + "## Обучение " + ] + }, + { + "cell_type": "markdown", + "source": [ + "Базовое решение" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "source": [ + "# model = 
BaselineModel()\n", + "args = {\"window_length\" : window_length, \"hop_length\" : hop_length, \"number_of_mismatch\" : number_of_mismatch, \"batch_size\" : batch_size, \n", + " \"max_files\" : 100}\n", + "for model in [BaselineModel, MHAModel]:\n", + " model = model()\n", + " print(f\"Model: {model.__class__.__name__}\")\n", + " \n", + " trainer = Trainer(\n", + " model, train_files, val_files, test_files, args, torch.optim.Adam(model.parameters(), lr=0.001), torch.nn.CrossEntropyLoss(), \n", + " )\n", + "\n", + " trainer.train_model(epochs=5, run_name=f\"{model.__class__.__name__}\")\n", + " print(trainer.test())\n", + " print(\"-----\" * 5)" + ], + "metadata": { + "id": "ZMK7mqNQZPXJ", + "outputId": "a95524d6-db85-4a34-9c36-fa2befec2f34", "colab": { - "name": "main.ipynb", - "provenance": [] + "base_uri": "https://localhost:8080/" + }, + "ExecuteTime": { + "end_time": "2024-04-01T17:30:37.468006Z", + "start_time": "2024-04-01T17:21:48.898678Z" } + }, + "execution_count": 4, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EPOCH 1:\n", + " batch 100 loss: 0.1482216489744413\n", + " batch 200 loss: 0.0002496870611862123\n", + " batch 300 loss: 0.0004293969416104437\n", + " batch 400 loss: 0.00014352307940049714\n", + " batch 500 loss: 0.0003824394412247301\n", + " batch 600 loss: 0.0005175532017259399\n", + " batch 700 loss: 0.00033389098716623877\n", + " batch 800 loss: 8.32907846506714e-05\n", + " batch 900 loss: 2.4057848494294377e-05\n", + "LOSS train 2.4057848494294377e-05 valid 0.0\n", + "EPOCH 2:\n", + " batch 100 loss: 2.239500692658325e-05\n", + " batch 200 loss: 1.717424765956821e-07\n", + " batch 300 loss: 2.382965994911501e-06\n", + " batch 400 loss: 2.0097729375834205e-07\n", + " batch 500 loss: 5.319957272149622e-06\n", + " batch 600 loss: 4.880248161498457e-07\n", + " batch 700 loss: 7.180011834861944e-08\n", + " batch 800 loss: 1.2190427505487378e-06\n", + " batch 900 loss: 2.3094077732821461e-07\n", + "LOSS train 
2.3094077732821461e-07 valid 0.0\n", + "EPOCH 3:\n", + " batch 100 loss: 1.9006388822617737e-06\n", + " batch 200 loss: 1.8029883719350436e-08\n", + " batch 300 loss: 5.650982788552028e-07\n", + " batch 400 loss: 3.155075988914291e-08\n", + " batch 500 loss: 2.0160868007224052e-06\n", + " batch 600 loss: 1.7061551261576823e-07\n", + " batch 700 loss: 1.929664222188876e-08\n", + " batch 800 loss: 5.126564428525882e-07\n", + " batch 900 loss: 6.992110229475657e-08\n", + "LOSS train 6.992110229475657e-08 valid 0.0\n", + "EPOCH 4:\n", + " batch 100 loss: 8.588485644622779e-07\n", + " batch 200 loss: 5.774140259262595e-09\n", + " batch 300 loss: 2.494339387482114e-07\n", + " batch 400 loss: 8.158223745446947e-09\n", + " batch 500 loss: 1.0297044354956597e-06\n", + " batch 600 loss: 7.403204108413774e-08\n", + " batch 700 loss: 6.612346510337375e-09\n", + " batch 800 loss: 2.495401167823541e-07\n", + " batch 900 loss: 2.607663617482103e-08\n", + "LOSS train 2.607663617482103e-08 valid 0.0\n", + "EPOCH 5:\n", + " batch 100 loss: 4.253824216959856e-07\n", + " batch 200 loss: 2.067528156457499e-09\n", + " batch 300 loss: 1.1764754141552203e-07\n", + " batch 400 loss: 2.4027991685215965e-09\n", + " batch 500 loss: 5.495035293279216e-07\n", + " batch 600 loss: 3.333958602524944e-08\n", + " batch 700 loss: 2.384180106673739e-09\n", + " batch 800 loss: 1.2554179193102755e-07\n", + " batch 900 loss: 1.0691511533877929e-08\n", + "LOSS train 1.0691511533877929e-08 valid 0.0\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 0.99 0.99 1551\n", + " 1 0.99 1.00 1.00 1524\n", + " 2 0.99 1.00 0.99 1501\n", + " 3 0.99 0.99 0.99 1475\n", + " 4 0.99 1.00 0.99 1507\n", + "\n", + " accuracy 0.99 7558\n", + " macro avg 0.99 0.99 0.99 7558\n", + "weighted avg 0.99 0.99 0.99 7558\n" + ] + } + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
class BaselineEEGEncoder(nn.Module):
    """Encode an EEG segment with a stack of dilated 1-D convolutions.

    Every layer is a ``Conv1d`` followed by a ``ReLU``. Layer ``k`` uses a
    dilation of ``kernel_size ** k`` and no padding, so the time axis gets
    shorter after each layer.

    :param in_channels: number of channels of the incoming EEG tensor
    :param dilation_filters: number of output channels of every layer
    :param kernel_size: convolution kernel size, also the dilation base
    :param layers: number of conv + ReLU layers
    """

    def __init__(self, in_channels=8, dilation_filters=16, kernel_size=3, layers=3):
        super(BaselineEEGEncoder, self).__init__()

        self.eeg_convos = nn.Sequential()
        for layer_index in range(layers):
            # Only the first layer sees the raw input channels; all later
            # layers consume the previous layer's ``dilation_filters`` maps.
            layer_in_channels = in_channels if layer_index == 0 else dilation_filters
            self.eeg_convos.add_module(
                f"conv1d_lay{layer_index}",
                nn.Conv1d(
                    in_channels=layer_in_channels,
                    out_channels=dilation_filters,
                    kernel_size=kernel_size,
                    dilation=kernel_size ** layer_index,
                    bias=True,
                ),
            )
            self.eeg_convos.add_module(f"relu_lay{layer_index}", nn.ReLU())

    def forward(self, eeg):
        """Return the encoded EEG; input is ``(batch, in_channels, time)``."""
        return self.eeg_convos(eeg)


class MultiheadAttentionEEGEncoder(nn.Module):
    """Single transformer encoder block (self-attention + feed-forward).

    The block expects **batch-first** input of shape
    ``(batch, sequence, embed_dim)`` and returns a tensor of the same shape.

    :param embed_dim: size of the last (feature) dimension of the input
    :param ff_dim: hidden width of the position-wise feed-forward network
    :param num_heads: number of attention heads; must divide ``embed_dim``
    :param dropout: dropout probability applied after the attention and
        after the feed-forward sub-layer
    """

    def __init__(self, embed_dim, ff_dim, num_heads=2, dropout=0.5):
        super(MultiheadAttentionEEGEncoder, self).__init__()

        # batch_first=True is essential: with the PyTorch default
        # (batch_first=False) a (batch, time, channels) input is read as
        # (sequence, batch, channels), i.e. attention would be computed
        # across the samples of a batch instead of across time steps.
        self.mha_attention = nn.MultiheadAttention(
            embed_dim=embed_dim, num_heads=num_heads, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )
        self.layer_norm1 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.layer_norm2 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.dropout1 = nn.Dropout(p=dropout)
        self.dropout2 = nn.Dropout(p=dropout)

    def forward(self, x):
        """Apply self-attention and the feed-forward net, each with a
        residual connection followed by layer normalisation."""
        attn_output, _ = self.mha_attention(x, x, x)
        attn_output = self.dropout1(attn_output)
        out1 = self.layer_norm1(attn_output + x)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layer_norm2(out1 + ffn_output)


def _match_scores(eeg, stimuli, stimulus_encoder, fc):
    """Score every candidate stimulus against the encoded EEG.

    Each stimulus is encoded with the shared ``stimulus_encoder``, multiplied
    with the EEG representation (``eeg @ stimulus^T``, an un-normalised dot
    product over the time axis), flattened and projected to one logit by
    ``fc``. The caller's ``stimuli`` sequence is never modified.

    :return: tensor of shape ``(batch, len(stimuli))`` with one logit per
        candidate stimulus
    """
    scores = []
    for stimulus in stimuli:
        encoded = stimulus_encoder(stimulus)
        similarity = eeg @ encoded.transpose(-1, -2)
        scores.append(fc(torch.flatten(similarity, start_dim=1)))
    return torch.cat(scores, dim=1)


class BaselineModel(nn.Module):
    """Baseline match/mismatch model.

    A 1x1 convolution mixes the EEG channels, dilated convolutions encode the
    EEG and (with shared weights) every candidate stimulus, and a linear layer
    turns each EEG/stimulus similarity matrix into a single logit.

    :param layers: number of dilated conv layers in both encoders
    :param kernel_size: kernel size (and dilation base) of the encoders
    :param spatial_filters: output channels of the 1x1 spatial convolution
    :param dilation_filters: output channels of the dilated conv layers
    :param eeg_channels: number of channels of the raw EEG input
    """

    def __init__(self,
                 layers=3,
                 kernel_size=3,
                 spatial_filters=8,
                 dilation_filters=16,
                 eeg_channels=64):
        super(BaselineModel, self).__init__()

        # EEG spatial transformation: kernel_size=1 mixes channels only.
        self.spatial_transformation = nn.Conv1d(
            in_channels=eeg_channels,
            out_channels=spatial_filters,
            kernel_size=1,
            bias=True
        )

        args = {"dilation_filters": dilation_filters, "kernel_size": kernel_size, "layers": layers}

        # EEG encoder
        self.eeg_encoder = BaselineEEGEncoder(in_channels=spatial_filters, **args)

        # Stimulus encoder (one instance, shared by all candidate stimuli)
        self.stimulus_encoder = BaselineStimulusEncoder(**args)

        # The similarity matrix is (dilation_filters x dilation_filters).
        self.fc = nn.Linear(in_features=dilation_filters * dilation_filters,
                            out_features=1,
                            bias=True)

    def forward(self, eeg, stimuli):
        """Return one logit per candidate stimulus.

        :param eeg: EEG tensor of shape ``(batch, eeg_channels, time)``
        :param stimuli: sequence of stimulus tensors; it is read, not mutated
        :return: tensor of shape ``(batch, len(stimuli))``
        """
        eeg = self.spatial_transformation(eeg)
        eeg = self.eeg_encoder(eeg)
        return _match_scores(eeg, stimuli, self.stimulus_encoder, self.fc)


class MHAModel(nn.Module):
    """Match/mismatch model that uses a transformer block as the EEG spatial
    transformation instead of the baseline's 1x1 convolution.

    :param layers: number of dilated conv layers in both encoders
    :param kernel_size: kernel size (and dilation base) of the encoders
    :param dilation_filters: output channels of the dilated conv layers
    :param eeg_channels: number of channels of the raw EEG input; used as the
        attention embedding size
    :param ff_dim: hidden width of the transformer's feed-forward network
    """

    def __init__(self,
                 layers=3,
                 kernel_size=3,
                 dilation_filters=16,
                 eeg_channels=64,
                 ff_dim=32):
        super(MHAModel, self).__init__()

        # EEG spatial transformation
        self.spatial_transformation = MultiheadAttentionEEGEncoder(
            embed_dim=eeg_channels, ff_dim=ff_dim)

        args = {"dilation_filters": dilation_filters, "kernel_size": kernel_size, "layers": layers}

        # EEG encoder
        self.eeg_encoder = BaselineEEGEncoder(in_channels=eeg_channels, **args)

        # Stimulus encoder (one instance, shared by all candidate stimuli)
        self.stimulus_encoder = BaselineStimulusEncoder(**args)

        # The similarity matrix is (dilation_filters x dilation_filters).
        self.fc = nn.Linear(in_features=dilation_filters * dilation_filters,
                            out_features=1,
                            bias=True)

    def forward(self, eeg, stimuli):
        """Return one logit per candidate stimulus.

        :param eeg: EEG tensor of shape ``(batch, eeg_channels, time)``
        :param stimuli: sequence of stimulus tensors; it is read, not mutated
        :return: tensor of shape ``(batch, len(stimuli))``
        """
        # The attention block works on (batch, time, channels), the
        # convolutional encoder on (batch, channels, time).
        eeg = self.spatial_transformation(eeg.transpose(1, 2))
        eeg = self.eeg_encoder(eeg.transpose(1, 2))
        return _match_scores(eeg, stimuli, self.stimulus_encoder, self.fc)
class BaselineStimulusEncoder(nn.Module):
    """Stimulus encoder from the baseline solution.

    A stack of dilated 1-D convolutions, each followed by a ``ReLU``. Layer
    ``k`` uses a dilation of ``kernel_size ** k`` and no padding, so the time
    axis gets shorter after each layer.

    :param dilation_filters: number of output channels of every layer
    :param kernel_size: convolution kernel size, also the dilation base
    :param layers: number of conv + ReLU layers
    :param in_channels: number of channels of the incoming stimulus
        (1 by default, which is what the original hard-coded)
    """

    def __init__(self, dilation_filters=16, kernel_size=3, layers=3, in_channels=1):
        super(BaselineStimulusEncoder, self).__init__()

        self.env_convos = nn.Sequential()
        for layer_index in range(layers):
            # Only the first layer sees the raw stimulus channels; all later
            # layers consume the previous layer's ``dilation_filters`` maps.
            layer_in_channels = in_channels if layer_index == 0 else dilation_filters
            self.env_convos.add_module(
                f"conv1d_lay{layer_index}",
                nn.Conv1d(
                    in_channels=layer_in_channels,
                    out_channels=dilation_filters,
                    kernel_size=kernel_size,
                    dilation=kernel_size ** layer_index,
                    bias=True,
                ),
            )
            self.env_convos.add_module(f"relu_lay{layer_index}", nn.ReLU())

    def forward(self, stimulus):
        """Return the encoded stimulus; input is ``(batch, in_channels, time)``."""
        return self.env_convos(stimulus)
class Trainer(object):
    r"""Train, validate and test a match/mismatch model."""

    def __init__(self, model, train_files, val_files, test_files, args, optimizer, loss_fn):
        r"""Constructor method

        :param model: model called as ``model(eeg, stimuli)``
        :type model: torch.nn.Module

        :param train_files: path to train files
        :type train_files: list
        :param val_files: path to val files
        :type val_files: list
        :param test_files: path to test files
        :type test_files: list

        :param args: settings; must contain the keys ``window_length``,
            ``hop_length``, ``number_of_mismatch``, ``max_files`` and
            ``batch_size``
        :type args: dict
        :param optimizer: optimizer already bound to ``model``'s parameters
        :param loss_fn: callable ``loss_fn(outputs, labels)`` returning a
            scalar tensor
        """
        self.model = model
        self.args = args
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.test_files = test_files
        self.initialize_dataloaders(train_files, val_files, test_files)

    def initialize_dataloaders(self, train_files, val_files, test_files):
        r"""Build the train, validation and test dataloaders.

        Train and validation loaders use ``args["batch_size"]``; the test
        loader yields one sample per batch.
        """

        conf = {"window_length": self.args["window_length"], "hop_length": self.args["hop_length"],
                "number_of_mismatch": self.args["number_of_mismatch"], "max_files": self.args["max_files"]}
        self.train_dataloader = torch.utils.data.DataLoader(TaskDataset(train_files, **conf),
                                                            batch_size=self.args["batch_size"])
        self.val_dataloader = torch.utils.data.DataLoader(TaskDataset(val_files, **conf),
                                                          batch_size=self.args["batch_size"])
        self.test_dataloader = torch.utils.data.DataLoader(TaskDataset(test_files, **conf),
                                                           batch_size=1)

    def train_one_epoch(self, epoch_index, writer, log_every=100):
        r"""Run one optimisation pass over the training dataloader.

        :param epoch_index: zero-based epoch number, used only to compute the
            global step of the logged scalars
        :type epoch_index: int
        :param writer: object with an ``add_scalar(tag, value, step)`` method;
            it is only touched every ``log_every`` batches
        :param log_every: number of batches averaged into one report
        :type log_every: int

        :return: mean loss of the last reported window of ``log_every``
            batches; if the epoch had fewer batches than that, the mean over
            all batches seen (0.0 for an empty dataloader)
        :rtype: float
        """

        running_loss = 0.0
        last_loss = 0.0
        num_batches = 0
        reported = False

        for i, (inputs, labels) in enumerate(self.train_dataloader):
            self.optimizer.zero_grad()
            # inputs[0] is the EEG, the remaining entries are the candidates.
            outputs = self.model(inputs[0], inputs[1:])

            # TODO: CLASSIFICATION METRIC DURING TRAINING

            loss = self.loss_fn(outputs, labels)
            loss.backward()

            self.optimizer.step()

            running_loss += loss.item()
            num_batches = i + 1
            if num_batches % log_every == 0:
                last_loss = running_loss / log_every
                print(' batch {} loss: {}'.format(i + 1, last_loss))
                x = epoch_index * len(self.train_dataloader) + i + 1
                writer.add_scalar('Loss/train', last_loss, x)
                running_loss = 0.0
                reported = True

        # Short epochs never reach a report; return their real mean loss
        # instead of the initial 0.
        if not reported and num_batches:
            last_loss = running_loss / num_batches

        return last_loss

    def train_model(self, epochs, run_name):
        r"""Train for ``epochs`` epochs, validating after each one.

        Losses are logged to TensorBoard under ``runs/`` and the model's
        ``state_dict`` is saved to ``saved_models/`` whenever the validation
        loss improves.

        :param epochs: number of epochs to run
        :type epochs: int
        :param run_name: prefix of the TensorBoard run directory
        :type run_name: str
        :raises ValueError: if the validation dataloader yields no batches
        """

        writer = SummaryWriter(f"runs/{run_name}_{self.model.__class__.__name__}")

        best_vloss = float("inf")
        os.makedirs("saved_models", exist_ok=True)

        try:
            for epoch in range(epochs):
                print(f"EPOCH {epoch + 1}:")
                self.model.train()
                # Zero-based index so the logged global step starts at the
                # first batch rather than one full epoch late.
                avg_loss = self.train_one_epoch(epoch, writer)

                running_vloss = 0.0
                num_val_batches = 0
                self.model.eval()
                with torch.no_grad():
                    for vinputs, vlabels in self.val_dataloader:
                        voutputs = self.model(vinputs[0], vinputs[1:])
                        running_vloss += self.loss_fn(voutputs, vlabels).item()
                        num_val_batches += 1

                if num_val_batches == 0:
                    raise ValueError("Validation dataloader yielded no batches")

                avg_vloss = running_vloss / num_val_batches
                print("LOSS train {} valid {}".format(avg_loss, avg_vloss))

                writer.add_scalars("Training vs. Validation Loss",
                                   {"Training": avg_loss, "Validation": avg_vloss},
                                   epoch + 1)
                writer.flush()

                if avg_vloss < best_vloss:
                    best_vloss = avg_vloss
                    model_path = f"saved_models/{self.model.__class__.__name__}_{epoch}"
                    torch.save(self.model.state_dict(), model_path)
        finally:
            # Release the event file even if training is interrupted.
            writer.close()

    def eval(self):
        r"""Evaluate model for initial validation dataset.

        Not implemented yet; currently does nothing and returns ``None``.
        """
        pass

    def test(self):
        r"""Evaluate the model on the test dataloader.

        :return: text report produced by
            ``sklearn.metrics.classification_report``
        :rtype: str
        """

        self.model.eval()
        y_pred = []
        y_true = []
        with torch.no_grad():
            for inputs, labels in self.test_dataloader:
                outputs = self.model(inputs[0], inputs[1:])
                # Predicted class = position of the largest logit.
                _, predicted = torch.max(outputs, 1)

                y_pred.extend(predicted.tolist())
                y_true.extend(labels.tolist())

        return classification_report(y_true, y_pred)
"split_data", - "test_folder": "test_set" + "stimuli": "stimuli" } diff --git a/src/mylib/utils/data.py b/src/mylib/utils/data.py new file mode 100644 index 0000000..573f4d8 --- /dev/null +++ b/src/mylib/utils/data.py @@ -0,0 +1,71 @@ +import torch +import numpy as np +import itertools +import os +from torch.utils.data import Dataset + + +class TaskDataset(Dataset): + """Generate data for the Match/Mismatch task.""" + + def __init__(self, files, window_length, hop_length, number_of_mismatch, max_files=100): + self.labels = [] + assert number_of_mismatch != 0 + self.window_length = window_length + self.hop_length = hop_length + self.number_of_mismatch = number_of_mismatch + self.files = files + self.max_files = max_files + self.group_recordings() + self.frame_recordings() + self.create_imposter_segments() + self.create_labels_randomize_positions() + + def group_recordings(self): + new_files = [] + grouped = itertools.groupby(sorted(self.files), lambda x: "_-_".join(os.path.basename(x).split("_-_")[:3])) + + for recording_name, feature_paths in grouped: + sub_recordings = sorted(feature_paths, key=lambda x: "0" if x == "eeg" else x) + eeg, envelope = np.load(sub_recordings[0]), np.load(sub_recordings[1]) # eeg [L, C], env [L, 1] + new_files += [[torch.tensor(eeg.T).float(), torch.tensor(envelope.T).float()]] + + if self.max_files is not None and len(new_files) == self.max_files: + break + + self.files = new_files + + def frame_recordings(self): + new_files = [] + for i in range(len(self.files)): + self.files[i][0] = self.files[i][0].unfold( + 1, self.window_length, self.hop_length).transpose(0, 1) # [num_of_frames, C, window_length] + self.files[i][1] = self.files[i][1].unfold( + 1, self.window_length, self.hop_length).transpose(0, 1) # [num_of_frames, C, window_length] + eegs = list(torch.tensor_split(self.files[i][0], self.files[i][0].shape[0], dim=0)) + envs = list(torch.tensor_split(self.files[i][1], self.files[i][1].shape[0], dim=0)) + for eeg, env in zip(eegs, 
envs): + new_files.append([eeg.squeeze(), env.squeeze(dim=0)]) + self.files = new_files + + def create_imposter_segments(self): + for i in range(len(self.files)): + for _ in range(self.number_of_mismatch): + t = self.files[i][-1].view(-1) + t = t[torch.randperm(t.shape[-1])].view(self.files[i][-1].shape) + self.files[i].append(t) + + def create_labels_randomize_positions(self): + for i in range(len(self.files)): + idx_permutation = torch.randperm(self.number_of_mismatch + 1) + 1 + permuted = [] + for idx in idx_permutation: + permuted.append(self.files[i][idx]) + self.files[i][1:] = permuted + self.labels.append(torch.argmax((idx_permutation == 1).long())) + + def __len__(self): + return len(self.files) + + def __getitem__(self, idx): + return self.files[idx], self.labels[idx]