296 changes: 296 additions & 0 deletions
Submission/Code/3. Various Modeling/Modeling_Keras_Embedding_wandb.ipynb
@@ -0,0 +1,296 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "5. Modeling_wandb.ipynb",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "Acu-SKVaK03i",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"outputId": "b0b477e5-7252-44f5-ea6f-1982736a4e26"
},
"source": [
"import os\r\n",
"os.getcwd()\r\n",
"# os.chdir('drive/MyDrive/NLP/Dacon_NH competition/')"
],
"execution_count": 12,
"outputs": [
{
"output_type": "execute_result",
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
},
"text/plain": [
"'/content/drive/My Drive/NLP/Dacon_NH competition'"
]
},
"metadata": {
"tags": []
},
"execution_count": 12
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bMZd0rEV2-0F",
"outputId": "01a2e7ca-e688-4098-8469-654a2935d4e3"
},
"source": [
"%pip install wandb -q\r\n",
"\r\n",
"import numpy as np\r\n",
"import pandas as pd\r\n",
"import tensorflow as tf\r\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\r\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\r\n",
"from tensorflow.keras.models import Sequential\r\n",
"from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dropout, Dense\r\n",
"from tensorflow.keras import regularizers\r\n",
"from tensorflow.keras.callbacks import EarlyStopping\r\n",
"\r\n",
"import wandb\r\n",
"from wandb.keras import WandbCallback\r\n",
"wandb.login()"
],
"execution_count": 13,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {
"tags": []
},
"execution_count": 13
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "cbP3p5bi--S3"
},
"source": [
"import logging\r\n",
"logger = logging.getLogger(\"wandb\")\r\n",
"logger.setLevel(logging.ERROR)"
],
"execution_count": 15,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "KE1-bLBnDtLx"
},
"source": [
"train = pd.read_csv('data/news_train_preprocessing_Mecab_12.24.csv')\r\n",
"train = train.fillna('')\r\n",
"\r\n",
"token_list = [t.split(' ') for t in train['text']]\r\n",
"max_len = max(len(l) for l in token_list)\r\n",
"\r\n",
"def text2sequence(train_text, max_len): # max_len: maximum article length in tokens\r\n",
"    max_words = 10000\r\n",
"    tokenizer = Tokenizer(num_words = max_words) # instantiate Keras' vectorizing tokenizer\r\n",
"    tokenizer.fit_on_texts(train_text) # fit on the training sentences\r\n",
"    train_X_seq = tokenizer.texts_to_sequences(train_text) # assign an integer index to each token\r\n",
"    vocabulary = tokenizer.word_index\r\n",
"    vocab_size = len(tokenizer.word_index) + 1 # vocabulary size to report to the model (+1 to include the padding value 0)\r\n",
"    # print('vocab_size : ', vocab_size)\r\n",
"    X_train = pad_sequences(train_X_seq, maxlen = max_len) # pad each sequence to the configured maximum length\r\n",
"\r\n",
"    return X_train, vocabulary, vocab_size, tokenizer\r\n",
"\r\n",
"train_y = train['info']\r\n",
"train_X, vocabulary, vocab_size, vectorizer = text2sequence(token_list, max_len)\r\n",
"\r\n",
"test = pd.read_csv('data/news_test_preprocessing_Mecab_12.24.csv')\r\n",
"test = test.fillna('')\r\n",
"\r\n",
"token_list_test = [t.split(' ') for t in test['text']]\r\n",
"\r\n",
"test_X_seq = vectorizer.texts_to_sequences(token_list_test) # assign an integer index to each token\r\n",
"X_test = pad_sequences(test_X_seq, maxlen = max_len) # pad each sequence to the configured maximum length\r\n",
"\r\n",
"submission_data = pd.read_csv('data/sample_submission.csv')"
],
"execution_count": 16,
"outputs": []
},
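{
"cell_type": "code",
"metadata": {},
"source": [
"# Added sanity check (not in the original notebook): illustrates how\r\n",
"# text2sequence maps whitespace-split tokens to padded integer sequences.\r\n",
"# The toy token lists below are hypothetical examples, not competition data.\r\n",
"toy_tokens = [['stocks', 'rise'], ['kospi', 'falls', 'at', 'close']]\r\n",
"toy_X, toy_vocab, toy_vocab_size, _ = text2sequence(toy_tokens, max_len = 5)\r\n",
"print(toy_vocab)       # e.g. {'stocks': 1, 'rise': 2, ...}\r\n",
"print(toy_X)           # sequences left-padded with 0 to length 5\r\n",
"print(toy_vocab_size)  # len(word_index) + 1, reserving index 0 for padding"
],
"execution_count": null,
"outputs": []
},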
{
"cell_type": "code",
"metadata": {
"id": "dKHHsWaSC3Tj"
},
"source": [
"# Configure the sweep: specify the parameters to search through, the search strategy, the optimization metric, etc.\r\n",
"\r\n",
"sweep_config = {\r\n",
"    'method': 'random', # grid, random\r\n",
"    'metric': {\r\n",
"        'name': 'val_loss',\r\n",
"        'goal': 'minimize'\r\n",
"    },\r\n",
"    'parameters': {\r\n",
"        'embedding_dim': {\r\n",
"            'values': [50, 100, 150, 200]\r\n",
"        },\r\n",
"        'SpatialDropout1D': {\r\n",
"            'values': [0.1, 0.2, 0.3, 0.4, 0.5]\r\n",
"        },\r\n",
"        'LSTM_hidden': {\r\n",
"            'values': [16, 32, 64]\r\n",
"        },\r\n",
"        'dropout_rate1': {\r\n",
"            'values': [0.2, 0.3, 0.4, 0.5]\r\n",
"        },\r\n",
"        'dropout_rate2': {\r\n",
"            'values': [0.2, 0.3, 0.4, 0.5]\r\n",
"        },\r\n",
"        'regularizer': {\r\n",
"            'values': [0.01, 0.001, 0.0001]\r\n",
"        },\r\n",
"        'learning_rate': {\r\n",
"            'values': [1e-2, 1e-3, 1e-4, 3e-4, 3e-5, 1e-5]\r\n",
"        },\r\n",
"        'optimizer': {\r\n",
"            'values': ['adam', 'rmsprop']\r\n",
"        },\r\n",
"        'batch_size': {\r\n",
"            'values': [32, 64]\r\n",
"        },\r\n",
"        'epochs': {\r\n",
"            'values': [3, 5, 10, 20]\r\n",
"        }\r\n",
"    }\r\n",
"}"
],
"execution_count": 23,
"outputs": []
},
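{
"cell_type": "code",
"metadata": {},
"source": [
"# Added illustration (not in the original notebook): a random sweep samples\r\n",
"# one value per parameter from 'values' on each run. Counting the full grid\r\n",
"# shows why exhaustive search is impractical here and random search with a\r\n",
"# fixed run budget is used instead.\r\n",
"import math\r\n",
"grid_size = math.prod(len(p['values']) for p in sweep_config['parameters'].values())\r\n",
"print(grid_size)  # 276,480 possible configurations"
],
"execution_count": null,
"outputs": []
},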
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tAc11QgXEPZ1",
"outputId": "6f9da855-16e3-4ace-c66d-9bbe96c20551"
},
"source": [
"# Initialize a new sweep\r\n",
"# Arguments:\r\n",
"#   - sweep_config: the sweep config dictionary defined above\r\n",
"#   - entity: the username (or team) for the sweep\r\n",
"#   - project: the project name for the sweep\r\n",
"\r\n",
"sweep_id = wandb.sweep(sweep_config, entity=\"ldc\", project=\"NH_competition_keras_Embedding\")"
],
"execution_count": 24,
"outputs": [
{
"output_type": "stream",
"text": [
"Create sweep with ID: snalj0xq\n",
"Sweep URL: https://wandb.ai/ldc/NH_competition_test/sweeps/snalj0xq\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "XXcQ6K-NEP8Y"
},
"source": [
"def train_model(): # renamed from train() so it no longer shadows the train DataFrame loaded above\r\n",
"    # Default values for the hyperparameters we're going to sweep over\r\n",
"    default_config = {'embedding_dim' : 50, 'SpatialDropout1D' : 0.3, 'LSTM_hidden' : 64, 'dropout_rate1' : 0.2, 'dropout_rate2' : 0.3,\r\n",
"                      'regularizer' : 0.001, 'batch_size' : 32, 'learning_rate' : 0.001, 'optimizer' : 'rmsprop', 'epochs' : 10}\r\n",
"\r\n",
"    # Initialize a new wandb run\r\n",
"    wandb.init(config = default_config)\r\n",
"\r\n",
"    # config holds and saves the hyperparameters and inputs for this run\r\n",
"    config = wandb.config\r\n",
"\r\n",
"    max_words = 10000 # must match the Tokenizer's num_words so embedding indices stay in range\r\n",
"    embedding_dim = config.embedding_dim # set appropriately between 50 and 200\r\n",
"\r\n",
"    model = Sequential()\r\n",
"    model.add(Embedding(input_dim = max_words, output_dim = embedding_dim, input_length = max_len))\r\n",
"    model.add(SpatialDropout1D(config.SpatialDropout1D))\r\n",
"    model.add(LSTM(config.LSTM_hidden))\r\n",
"    model.add(Dropout(config.dropout_rate1))\r\n",
"    model.add(Dense(32, activation = 'relu', kernel_regularizer = regularizers.l2(config.regularizer)))\r\n",
"    model.add(Dropout(config.dropout_rate2))\r\n",
"    # model.add(Dense(16, activation = 'relu'))\r\n",
"    model.add(Dense(1, activation='sigmoid'))\r\n",
"\r\n",
"    if config.optimizer == 'rmsprop':\r\n",
"        optimizer = tf.keras.optimizers.RMSprop(learning_rate = config.learning_rate, rho=0.9, momentum=0.0, epsilon=1e-07, centered=False)\r\n",
"    elif config.optimizer == 'adam':\r\n",
"        optimizer = tf.keras.optimizers.Adam(learning_rate = config.learning_rate, beta_1=0.9, beta_2=0.999)\r\n",
"\r\n",
"    model.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['acc'])\r\n",
"    # print(model.summary())\r\n",
"\r\n",
"    # callbacks\r\n",
"    early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3,\r\n",
"                               mode='min', restore_best_weights=False)\r\n",
"    wandb_check = WandbCallback()\r\n",
"    callbacks_list = [early_stop, wandb_check]\r\n",
"\r\n",
"    model.fit(train_X, train_y, epochs=config.epochs, batch_size=config.batch_size, validation_split=0.2, callbacks = callbacks_list)\r\n",
"\r\n",
"    pred_test = model.predict(X_test)\r\n",
"    submission_data.loc[:, 'info'] = np.where(pred_test > 0.5, 1, 0).reshape(-1)"
],
"execution_count": 25,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "QOj_4itZEDGN"
},
"source": [
"# Run the sweep agent\r\n",
"# Arguments:\r\n",
"#   - sweep_id: the sweep_id to run - this was returned above by wandb.sweep()\r\n",
"#   - function: the function that defines your model architecture and trains it\r\n",
"#   - count: the maximum number of runs to execute\r\n",
"wandb.agent(sweep_id, train_model, count=150)"
],
"execution_count": null,
"outputs": []
},
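{
"cell_type": "code",
"metadata": {},
"source": [
"# Added sketch (not in the original notebook): train_model() fills\r\n",
"# submission_data in place but never writes it out, so after the agent\r\n",
"# finishes only the final run's predictions remain. Something like this\r\n",
"# would persist them; the output filename is an assumption.\r\n",
"submission_data.to_csv('submission_keras_embedding.csv', index = False)"
],
"execution_count": null,
"outputs": []
}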
]
}