Skip to content

Commit

Permalink
upload code
Browse files Browse the repository at this point in the history
  • Loading branch information
donggrii authored Oct 13, 2021
1 parent e4f7ec1 commit 99f0d9b
Show file tree
Hide file tree
Showing 2 changed files with 1,588 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,296 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "5. Modeling_wandb.ipynb",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "Acu-SKVaK03i",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"outputId": "b0b477e5-7252-44f5-ea6f-1982736a4e26"
},
"source": [
"import os\r\n",
"os.getcwd()\r\n",
"# os.chdir('drive/MyDrive/NLP/Dacon_NH competition/')"
],
"execution_count": 12,
"outputs": [
{
"output_type": "execute_result",
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
},
"text/plain": [
"'/content/drive/My Drive/NLP/Dacon_NH competition'"
]
},
"metadata": {
"tags": []
},
"execution_count": 12
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bMZd0rEV2-0F",
"outputId": "01a2e7ca-e688-4098-8469-654a2935d4e3"
},
"source": [
"%pip install wandb -q\r\n",
"\r\n",
"import numpy as np\r\n",
"import pandas as pd\r\n",
"import tensorflow as tf\r\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\r\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\r\n",
"from tensorflow.keras.models import Sequential\r\n",
"from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dropout, Dense\r\n",
"from tensorflow.keras import regularizers\r\n",
"from tensorflow.keras.callbacks import EarlyStopping\r\n",
"\r\n",
"import wandb\r\n",
"from wandb.keras import WandbCallback\r\n",
"wandb.login()"
],
"execution_count": 13,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {
"tags": []
},
"execution_count": 13
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "cbP3p5bi--S3"
},
"source": [
"import logging\r\n",
"logger = logging.getLogger(\"wandb\")\r\n",
"logger.setLevel(logging.ERROR)"
],
"execution_count": 15,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "KE1-bLBnDtLx"
},
"source": [
"train = pd.read_csv('data/news_train_preprocessing_Mecab_12.24.csv')\r\n",
"train = train.fillna('')\r\n",
"\r\n",
"token_list = [t.split(' ') for t in train['text']]\r\n",
"max_len = max(len(l) for l in token_list)\r\n",
"\r\n",
"def text2sequence(train_text, max_len): # max_len에 뉴스의 최대 길이\r\n",
" max_words = 10000\r\n",
" tokenizer = Tokenizer(num_words = max_words) # keras의 vectorizing 함수 호출\r\n",
" tokenizer.fit_on_texts(train_text) # train 문장에 fit\r\n",
" train_X_seq = tokenizer.texts_to_sequences(train_text) # 각 토큰들에 정수값 부여\r\n",
" vocabulary = tokenizer.word_index\r\n",
" vocab_size = len(tokenizer.word_index) + 1 # 모델에 알려줄 vocabulary의 크기 계산(padding값 0을 포함해야 하므로 +1)\r\n",
" # print('vocab_size : ', vocab_size)\r\n",
" X_train = pad_sequences(train_X_seq, maxlen = max_len) # 설정한 문장의 최대 길이만큼 padding\r\n",
"\r\n",
" return X_train, vocabulary, vocab_size, tokenizer\r\n",
"\r\n",
"train_y = train['info']\r\n",
"train_X, vocabulary, vocab_size, vectorizer = text2sequence(token_list, max_len)\r\n",
"\r\n",
"test = pd.read_csv('data/news_test_preprocessing_Mecab_12.24.csv')\r\n",
"test = test.fillna('')\r\n",
"\r\n",
"token_list_test = [t.split(' ') for t in test['text']]\r\n",
"\r\n",
"test_X_seq = vectorizer.texts_to_sequences(token_list_test) # 각 토큰들에 정수값 부여\r\n",
"X_test = pad_sequences(test_X_seq, maxlen = max_len) # 설정한 문장의 최대 길이만큼 padding\r\n",
"\r\n",
"submission_data = pd.read_csv('data/sample_submission.csv')"
],
"execution_count": 16,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "dKHHsWaSC3Tj"
},
"source": [
"# Configure the sweep – specify the parameters to search through, the search strategy, the optimization metric et all.\r\n",
"\r\n",
"sweep_config = {\r\n",
" 'method': 'random', # grid, random\r\n",
" 'metric': {\r\n",
" 'name': 'val_loss',\r\n",
" 'goal': 'minimize' \r\n",
" },\r\n",
" 'parameters': {\r\n",
" 'embedding_dim': {\r\n",
" 'values': [50, 100, 150, 200]\r\n",
" },\r\n",
" 'SpatialDropout1D': {\r\n",
" 'values': [0.1, 0.2, 0.3, 0.4, 0.5]\r\n",
" },\r\n",
" 'LSTM_hidden': {\r\n",
" 'values': [16, 32, 64]\r\n",
" },\r\n",
" 'dropout_rate1': {\r\n",
" 'values': [0.2, 0.3, 0.4, 0.5]\r\n",
" },\r\n",
" 'dropout_rate2': {\r\n",
" 'values': [0.2, 0.3, 0.4, 0.5]\r\n",
" },\r\n",
" 'regularizer': {\r\n",
" 'values': [0.01, 0.001, 0.0001]\r\n",
" },\r\n",
" 'learning_rate': {\r\n",
" 'values': [1e-2, 1e-3, 1e-4, 3e-4, 3e-5, 1e-5]\r\n",
" },\r\n",
" 'optimizer': {\r\n",
" 'values': ['adam', 'rmsprop']\r\n",
" },\r\n",
" 'batch_size': {\r\n",
" 'values': [32, 64]\r\n",
" },\r\n",
" 'epochs': {\r\n",
" 'values': [3, 5, 10, 20]\r\n",
" }\r\n",
" }\r\n",
"}"
],
"execution_count": 23,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tAc11QgXEPZ1",
"outputId": "6f9da855-16e3-4ace-c66d-9bbe96c20551"
},
"source": [
"# Initialize a new sweep\r\n",
"# Arguments:\r\n",
"# – sweep_config: the sweep config dictionary defined above\r\n",
"# – entity: Set the username for the sweep\r\n",
"# – project: Set the project name for the sweep\r\n",
"\r\n",
"sweep_id = wandb.sweep(sweep_config, entity=\"ldc\", project=\"NH_competition_keras_Embedding\")"
],
"execution_count": 24,
"outputs": [
{
"output_type": "stream",
"text": [
"Create sweep with ID: snalj0xq\n",
"Sweep URL: https://wandb.ai/ldc/NH_competition_test/sweeps/snalj0xq\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "XXcQ6K-NEP8Y"
},
"source": [
"def train():\r\n",
" # Default values for hyper-parameters we're going to sweep over\r\n",
" default_config = {'embedding_dim' : 50, 'SpatialDropout1D' : 0.3, 'LSTM_hidden' : 64, 'dropout_rate1' : 0.2, 'dropout_rate2' : 0.3, \r\n",
" 'regularizer' : 0.001, 'batch_size' : 32, 'learning_rate' : 0.001, 'optimizer' : 'rmsprop', 'epochs' : 10}\r\n",
"\r\n",
" # Initialize a new wandb run\r\n",
" wandb.init(config = default_config)\r\n",
"\r\n",
" # Config is a variable that holds and saves hyperparameters and inputs\r\n",
" config = wandb.config\r\n",
"\r\n",
" max_words = 10000\r\n",
" embedding_dim = config.embedding_dim # 50 ~ 200 사이에서 적절히 설정\r\n",
"\r\n",
" model = Sequential()\r\n",
" model.add(Embedding(input_dim = max_words, output_dim = embedding_dim, input_length = max_len))\r\n",
" model.add(SpatialDropout1D(config.SpatialDropout1D))\r\n",
" model.add(LSTM(config.LSTM_hidden))\r\n",
" model.add(Dropout(config.dropout_rate1))\r\n",
" model.add(Dense(32, activation = 'relu', kernel_regularizer = regularizers.l2(config.regularizer)))\r\n",
" model.add(Dropout(config.dropout_rate2))\r\n",
" # model.add(Dense(16, activation = 'relu'))\r\n",
" model.add(Dense(1, activation='sigmoid'))\r\n",
"\r\n",
" if config.optimizer == 'rmsprop':\r\n",
" optimizer = tf.keras.optimizers.RMSprop(learning_rate = config.learning_rate, rho=0.9, momentum=0.0, epsilon=1e-07, centered=False)\r\n",
" elif config.optimizer == 'adam':\r\n",
" optimizer = tf.keras.optimizers.Adam(learning_rate = config.learning_rate, beta_1=0.9, beta_2=0.999)\r\n",
"\r\n",
" model.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['acc'])\r\n",
" # print(model.summary())\r\n",
"\r\n",
" # callbacks\r\n",
" early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3,\r\n",
" mode='min', restore_best_weights=False)\r\n",
" wandb_check = WandbCallback()\r\n",
" callbacks_list = [early_stop, wandb_check]\r\n",
"\r\n",
" model.fit(train_X, train_y, epochs=config.epochs, batch_size=config.batch_size, validation_split=0.2, callbacks = callbacks_list)\r\n",
"\r\n",
" pred_test = model.predict(X_test)\r\n",
" submission_data.loc[:,'info'] = np.where(pred_test > 0.5, 1,0).reshape(-1)"
],
"execution_count": 25,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "QOj_4itZEDGN"
},
"source": [
"# Initialize a new sweep\r\n",
"# Arguments:\r\n",
"# – sweep_id: the sweep_id to run - this was returned above by wandb.sweep()\r\n",
"# – function: function that defines your model architecture and trains it\r\n",
"wandb.agent(sweep_id, train, count=150)"
],
"execution_count": null,
"outputs": []
}
]
}
Loading

0 comments on commit 99f0d9b

Please sign in to comment.