From db48b10234b97a54fa3c277a9847efb7a528d91b Mon Sep 17 00:00:00 2001 From: Junghwan Park Date: Sun, 27 Mar 2022 13:34:19 +0900 Subject: [PATCH] Rebuild --- docs/.buildinfo | 2 +- .../mario_rl_tutorial.ipynb | 54 +-- .../mario_rl_tutorial.py | 395 ++++++++-------- .../super_resolution_with_onnxruntime.ipynb | 2 +- .../super_resolution_with_onnxruntime.py | 2 +- .../sphx_glr_mario_rl_tutorial_001.png | Bin 2396 -> 2396 bytes docs/_sources/index.rst.txt | 2 +- docs/_static/check-solid.svg | 4 + docs/_static/clipboard.min.js | 8 +- docs/_static/copy-button.svg | 2 +- docs/_static/copybutton.css | 64 ++- docs/_static/copybutton.js | 103 +++- docs/_static/copybutton_funcs.js | 29 +- docs/advanced/ONNXLive.html | 8 +- docs/advanced/cpp_autograd.html | 10 +- docs/advanced/cpp_export.html | 8 +- docs/advanced/cpp_extension.html | 8 +- docs/advanced/cpp_frontend.html | 10 +- docs/advanced/ddp_pipeline.html | 8 +- docs/advanced/dispatcher.html | 8 +- .../dynamic_quantization_tutorial.html | 8 +- docs/advanced/extend_dispatcher.html | 8 +- docs/advanced/generic_join.html | 8 +- docs/advanced/neural_style_tutorial.html | 8 +- docs/advanced/numpy_extensions_tutorial.html | 8 +- docs/advanced/rpc_ddp_tutorial.html | 8 +- docs/advanced/sg_execution_times.html | 8 +- .../static_quantization_tutorial.html | 325 +++++++------ .../super_resolution_with_onnxruntime.html | 10 +- docs/advanced/torch-script-parallelism.html | 8 +- .../advanced/torch_script_custom_classes.html | 8 +- docs/advanced/torch_script_custom_ops.html | 8 +- .../Intro_to_TorchScript_tutorial.html | 8 +- .../audio_data_augmentation_tutorial.html | 8 +- docs/beginner/audio_datasets_tutorial.html | 8 +- .../audio_feature_augmentation_tutorial.html | 8 +- .../audio_feature_extractions_tutorial.html | 8 +- docs/beginner/audio_io_tutorial.html | 8 +- .../audio_preprocessing_tutorial.html | 8 +- docs/beginner/audio_resampling_tutorial.html | 8 +- docs/beginner/basics/autogradqs_tutorial.html | 8 +- docs/beginner/basics/buildmodel_tutorial.html | 8 +- docs/beginner/basics/data_tutorial.html | 8 +- docs/beginner/basics/intro.html | 8 +- .../basics/optimization_tutorial.html | 8 +- docs/beginner/basics/quickstart_tutorial.html | 8 +- .../beginner/basics/saveloadrun_tutorial.html | 8 +- docs/beginner/basics/sg_execution_times.html | 8 +- docs/beginner/basics/tensorqs_tutorial.html | 8 +- docs/beginner/basics/transforms_tutorial.html | 8 +- docs/beginner/blitz/autograd_tutorial.html | 8 +- docs/beginner/blitz/cifar10_tutorial.html | 8 +- .../blitz/data_parallel_tutorial.html | 8 +- .../blitz/neural_networks_tutorial.html | 8 +- docs/beginner/blitz/sg_execution_times.html | 8 +- docs/beginner/blitz/tensor_tutorial.html | 8 +- docs/beginner/chatbot_tutorial.html | 8 +- docs/beginner/colab.html | 8 +- docs/beginner/data_loading_tutorial.html | 8 +- docs/beginner/dcgan_faces_tutorial.html | 8 +- docs/beginner/deep_learning_60min_blitz.html | 8 +- docs/beginner/deep_learning_nlp_tutorial.html | 8 +- docs/beginner/deeplabv3_on_android.html | 8 +- docs/beginner/deeplabv3_on_ios.html | 8 +- ...ploy_seq2seq_hybrid_frontend_tutorial.html | 8 +- docs/beginner/dist_overview.html | 12 +- .../polynomial_autograd.html | 8 +- .../polynomial_custom_function.html | 8 +- docs/beginner/examples_nn/dynamic_net.html | 8 +- .../examples_nn/polynomial_module.html | 8 +- docs/beginner/examples_nn/polynomial_nn.html | 8 +- .../examples_nn/polynomial_optim.html | 8 +- .../examples_tensor/polynomial_numpy.html | 8 +- .../examples_tensor/polynomial_tensor.html | 8 +- 
docs/beginner/fgsm_tutorial.html | 8 +- ...inetuning_torchvision_models_tutorial.html | 8 +- .../autograd_tutorial_old.html | 8 +- .../former_torchies/nnft_tutorial.html | 8 +- .../former_torchies/parallelism_tutorial.html | 8 +- .../former_torchies/sg_execution_times.html | 8 +- .../former_torchies/tensor_tutorial_old.html | 8 +- docs/beginner/former_torchies_tutorial.html | 8 +- ...rid_frontend_through_example_tutorial.html | 8 +- .../hybrid_frontend/sg_execution_times.html | 8 +- docs/beginner/hybrid_frontend_tutorial.html | 8 +- .../hyperparameter_tuning_tutorial.html | 8 +- docs/beginner/introyt.html | 8 +- .../beginner/introyt/autogradyt_tutorial.html | 8 +- docs/beginner/introyt/captumyt.html | 8 +- docs/beginner/introyt/introyt1_tutorial.html | 8 +- docs/beginner/introyt/modelsyt_tutorial.html | 8 +- docs/beginner/introyt/sg_execution_times.html | 8 +- .../introyt/tensorboardyt_tutorial.html | 8 +- .../introyt/tensors_deeper_tutorial.html | 8 +- docs/beginner/introyt/trainingyt.html | 8 +- docs/beginner/nlp/advanced_tutorial.html | 8 +- docs/beginner/nlp/deep_learning_tutorial.html | 8 +- docs/beginner/nlp/pytorch_tutorial.html | 8 +- .../nlp/sequence_models_tutorial.html | 8 +- docs/beginner/nlp/sg_execution_times.html | 8 +- .../nlp/word_embeddings_tutorial.html | 8 +- docs/beginner/nn_tutorial.html | 8 +- docs/beginner/profiler.html | 8 +- docs/beginner/ptcheat.html | 8 +- docs/beginner/pytorch_with_examples.html | 8 +- docs/beginner/saving_loading_models.html | 8 +- docs/beginner/sg_execution_times.html | 8 +- .../text_sentiment_ngrams_tutorial.html | 8 +- docs/beginner/torchtext_translation.html | 8 +- docs/beginner/transfer_learning_tutorial.html | 8 +- docs/beginner/transformer_tutorial.html | 8 +- docs/beginner/translation_transformer.html | 8 +- docs/beginner/vt_tutorial.html | 8 +- docs/genindex.html | 8 +- docs/index.html | 10 +- ...autograd_saved_tensors_hooks_tutorial.html | 8 +- .../char_rnn_classification_tutorial.html | 8 +- .../char_rnn_generation_tutorial.html | 8 +- .../custom_function_conv_bn_tutorial.html | 8 +- ...tom_function_double_backward_tutorial.html | 8 +- docs/intermediate/ddp_tutorial.html | 8 +- .../dist_pipeline_parallel_tutorial.html | 8 +- docs/intermediate/dist_tuto.html | 8 +- .../dynamic_quantization_bert_tutorial.html | 8 +- .../intermediate/flask_rest_api_tutorial.html | 12 +- ...ed_alignment_with_torchaudio_tutorial.html | 8 +- docs/intermediate/forward_ad_usage.html | 8 +- docs/intermediate/fx_conv_bn_fuser.html | 8 +- docs/intermediate/fx_profiling_tutorial.html | 8 +- docs/intermediate/mario_rl_tutorial.html | 441 +++++++++--------- docs/intermediate/memory_format_tutorial.html | 8 +- .../intermediate/model_parallel_tutorial.html | 8 +- docs/intermediate/named_tensor_tutorial.html | 8 +- docs/intermediate/parametrizations.html | 8 +- docs/intermediate/pipeline_tutorial.html | 8 +- docs/intermediate/pruning_tutorial.html | 8 +- .../quantized_transfer_learning_tutorial.html | 12 +- .../reinforcement_q_learning.html | 12 +- docs/intermediate/rpc_async_execution.html | 8 +- .../rpc_param_server_tutorial.html | 8 +- docs/intermediate/rpc_tutorial.html | 8 +- .../seq2seq_translation_tutorial.html | 8 +- docs/intermediate/sg_execution_times.html | 31 +- .../spatial_transformer_tutorial.html | 8 +- ...assification_with_torchaudio_tutorial.html | 8 +- ..._recognition_with_torchaudio_tutorial.html | 8 +- .../speech_recognition_pipeline_tutorial.html | 8 +- .../tensorboard_profiler_tutorial.html | 8 +- docs/intermediate/tensorboard_tutorial.html | 8 
+- .../text_to_speech_with_torchaudio.html | 8 +- docs/intermediate/torchvision_tutorial.html | 8 +- docs/objects.inv | Bin 11391 -> 11407 bytes docs/prototype/distributed_rpc_profiling.html | 8 +- docs/prototype/fx_graph_mode_ptq_dynamic.html | 8 +- docs/prototype/fx_graph_mode_ptq_static.html | 8 +- docs/prototype/fx_graph_mode_quant_guide.html | 8 +- .../graph_mode_dynamic_bert_tutorial.html | 8 +- docs/prototype/ios_coreml_workflow.html | 8 +- docs/prototype/ios_gpu_workflow.html | 8 +- docs/prototype/nnapi_mobilenetv2.html | 8 +- docs/prototype/numeric_suite_tutorial.html | 8 +- docs/prototype/prototype_index.html | 8 +- docs/prototype/sg_execution_times.html | 8 +- docs/prototype/skip_param_init.html | 8 +- docs/prototype/torchscript_freezing.html | 8 +- .../tracing_based_selective_build.html | 8 +- docs/prototype/vmap_recipe.html | 8 +- docs/prototype/vulkan_workflow.html | 8 +- .../android_native_app_with_custom_op.html | 8 +- docs/recipes/bundled_inputs.html | 8 +- docs/recipes/cuda_rpc.html | 8 +- docs/recipes/deployment_with_flask.html | 8 +- .../distributed_optim_torchscript.html | 8 +- docs/recipes/distributed_rpc_profiling.html | 8 +- docs/recipes/fuse.html | 8 +- docs/recipes/intel_extension_for_pytorch.html | 8 +- .../intel_neural_compressor_for_pytorch.html | 8 +- docs/recipes/mobile_interpreter.html | 8 +- docs/recipes/mobile_perf.html | 8 +- docs/recipes/model_preparation_android.html | 8 +- docs/recipes/model_preparation_ios.html | 8 +- docs/recipes/ptmobile_recipes_summary.html | 8 +- docs/recipes/quantization.html | 8 +- docs/recipes/recipes/Captum_Recipe.html | 8 +- docs/recipes/recipes/amp_recipe.html | 8 +- docs/recipes/recipes/benchmark.html | 8 +- .../recipes/defining_a_neural_network.html | 8 +- .../recipes/recipes/dynamic_quantization.html | 8 +- docs/recipes/recipes/loading_data_recipe.html | 8 +- docs/recipes/recipes/profiler_recipe.html | 8 +- .../recipes/save_load_across_devices.html | 8 +- ...ving_and_loading_a_general_checkpoint.html | 8 +- ...ving_and_loading_models_for_inference.html | 8 +- .../saving_multiple_models_in_one_file.html | 8 +- .../recipes/tensorboard_with_pytorch.html | 8 +- docs/recipes/recipes/timer_quick_start.html | 8 +- docs/recipes/recipes/tuning_guide.html | 8 +- ...ing_parameters_from_a_different_model.html | 8 +- docs/recipes/recipes/what_is_state_dict.html | 8 +- .../recipes/zeroing_out_gradients.html | 8 +- docs/recipes/recipes_index.html | 8 +- docs/recipes/script_optimized.html | 8 +- docs/recipes/torchscript_inference.html | 8 +- docs/recipes/zero_redundancy_optimizer.html | 8 +- docs/search.html | 8 +- docs/searchindex.js | 2 +- 206 files changed, 1547 insertions(+), 1447 deletions(-) create mode 100644 docs/_static/check-solid.svg diff --git a/docs/.buildinfo b/docs/.buildinfo index 1db7d27f8..d01d986f2 100644 --- a/docs/.buildinfo +++ b/docs/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 
-config: d3c78aee0796eaca17b472b08560486c +config: 85edaa44f46a784d15b287d6d8ea99db tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/_downloads/1ceade89aedc4c99a944f752a51c6d35/mario_rl_tutorial.ipynb b/docs/_downloads/1ceade89aedc4c99a944f752a51c6d35/mario_rl_tutorial.ipynb index 8dd537b1c..717a3057e 100644 --- a/docs/_downloads/1ceade89aedc4c99a944f752a51c6d35/mario_rl_tutorial.ipynb +++ b/docs/_downloads/1ceade89aedc4c99a944f752a51c6d35/mario_rl_tutorial.ipynb @@ -15,7 +15,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\nTrain a Mario-playing RL Agent\n================\n\nAuthors: `Yuansong Feng `__, `Suraj\nSubramanian `__, `Howard\nWang `__, `Steven\nGuo `__.\n\n\nThis tutorial walks you through the fundamentals of Deep Reinforcement\nLearning. At the end, you will implement an AI-powered Mario (using\n`Double Deep Q-Networks `__) that\ncan play the game by itself.\n\nAlthough no prior knowledge of RL is necessary for this tutorial, you\ncan familiarize yourself with these RL\n`concepts `__,\nand have this handy\n`cheatsheet `__\nas your companion. The full code is available\n`here `__.\n\n.. figure:: /_static/img/mario.gif\n :alt: mario\n" + "\n\ub9c8\ub9ac\uc624 \uac8c\uc784 RL \uc5d0\uc774\uc804\ud2b8\ub85c \ud559\uc2b5\ud558\uae30\n===============================\n\n\uc800\uc790: `Yuansong Feng `__, `Suraj\nSubramanian `__, `Howard\nWang `__, `Steven\nGuo `__.\n\n\ubc88\uc5ed: `\uae40\ud0dc\uc601 `__. \n\n\uc774\ubc88 \ud29c\ud1a0\ub9ac\uc5bc\uc5d0\uc11c\ub294 \uc2ec\uce35 \uac15\ud654 \ud559\uc2b5\uc758 \uae30\ubcf8 \uc0ac\ud56d\ub4e4\uc5d0 \ub300\ud574 \uc774\uc57c\uae30\ud574\ubcf4\ub3c4\ub85d \ud558\uaca0\uc2b5\ub2c8\ub2e4.\n\ub9c8\uc9c0\ub9c9\uc5d0\ub294, \uc2a4\uc2a4\ub85c \uac8c\uc784\uc744 \ud560 \uc218 \uc788\ub294 AI \uae30\ubc18 \ub9c8\ub9ac\uc624\ub97c \n(`Double Deep Q-Networks `__ \uc0ac\uc6a9) \n\uad6c\ud604\ud558\uac8c \ub429\ub2c8\ub2e4.\n\n\uc774 \ud29c\ud1a0\ub9ac\uc5bc\uc5d0\uc11c\ub294 RL\uc5d0 \ub300\ud55c \uc0ac\uc804 \uc9c0\uc2dd\uc774 \ud544\uc694\ud558\uc9c0 \uc54a\uc9c0\ub9cc, \n\uc774\ub7ec\ud55c `\ub9c1\ud06c `__\n\ub97c \ud1b5\ud574 RL \uac1c\ub150\uc5d0 \uce5c\uc219\ud574 \uc9c8 \uc218 \uc788\uc73c\uba70,\n\uc5ec\uae30 \uc788\ub294\n`\uce58\ud2b8\uc2dc\ud2b8 `__\n\ub97c \ud65c\uc6a9\ud560 \uc218\ub3c4 \uc788\uc2b5\ub2c8\ub2e4. \ud29c\ud1a0\ub9ac\uc5bc\uc5d0\uc11c \uc0ac\uc6a9\ud558\ub294 \uc804\uccb4 \ucf54\ub4dc\ub294\n`\uc5ec\uae30 `__\n\uc5d0\uc11c \ud655\uc778 \ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.\n\n.. 
figure:: /_static/img/mario.gif\n :alt: mario\n" ] }, { @@ -26,21 +26,21 @@ }, "outputs": [], "source": [ - "# !pip install gym-super-mario-bros==7.3.0\n\nimport torch\nfrom torch import nn\nfrom torchvision import transforms as T\nfrom PIL import Image\nimport numpy as np\nfrom pathlib import Path\nfrom collections import deque\nimport random, datetime, os, copy\n\n# Gym is an OpenAI toolkit for RL\nimport gym\nfrom gym.spaces import Box\nfrom gym.wrappers import FrameStack\n\n# NES Emulator for OpenAI Gym\nfrom nes_py.wrappers import JoypadSpace\n\n# Super Mario environment for OpenAI Gym\nimport gym_super_mario_bros" + "# !pip install gym-super-mario-bros==7.3.0\n\nimport torch\nfrom torch import nn\nfrom torchvision import transforms as T\nfrom PIL import Image\nimport numpy as np\nfrom pathlib import Path\nfrom collections import deque\nimport random, datetime, os, copy\n\n# Gym\uc740 \uac15\ud654\ud559\uc2b5\uc744 \uc704\ud55c OpenAI \ud234\ud0b7\uc785\ub2c8\ub2e4.\nimport gym\nfrom gym.spaces import Box\nfrom gym.wrappers import FrameStack\n\n# OpenAI Gym\uc744 \uc704\ud55c NES \uc5d0\ubbac\ub808\uc774\ud130\nfrom nes_py.wrappers import JoypadSpace\n\n# OpenAI Gym\uc5d0\uc11c\uc758 \uc288\ud37c \ub9c8\ub9ac\uc624 \ud658\uacbd \uc138\ud305\nimport gym_super_mario_bros" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "RL Definitions\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n**Environment** The world that an agent interacts with and learns from.\n\n**Action** $a$ : How the Agent responds to the Environment. The\nset of all possible Actions is called *action-space*.\n\n**State** $s$ : The current characteristic of the Environment. The\nset of all possible States the Environment can be in is called\n*state-space*.\n\n**Reward** $r$ : Reward is the key feedback from Environment to\nAgent. It is what drives the Agent to learn and to change its future\naction. An aggregation of rewards over multiple time steps is called\n**Return**.\n\n**Optimal Action-Value function** $Q^*(s,a)$ : Gives the expected\nreturn if you start in state $s$, take an arbitrary action\n$a$, and then for each future time step take the action that\nmaximizes returns. $Q$ can be said to stand for the \u201cquality\u201d of\nthe action in a state. We try to approximate this function.\n\n\n" + "\uac15\ud654\ud559\uc2b5 \uac1c\ub150\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n**\ud658\uacbd(Environment)** : \uc5d0\uc774\uc804\ud2b8\uac00 \uc0c1\ud638\uc791\uc6a9\ud558\uba70 \uc2a4\uc2a4\ub85c \ubc30\uc6b0\ub294 \uc138\uacc4\uc785\ub2c8\ub2e4.\n\n**\ud589\ub3d9(Action)** $a$ : \uc5d0\uc774\uc804\ud2b8\uac00 \ud658\uacbd\uc5d0 \uc5b4\ub5bb\uac8c \uc751\ub2f5\ud558\ub294\uc9c0 \ud589\ub3d9\uc744 \ud1b5\ud574 \ub098\ud0c0\ub0c5\ub2c8\ub2e4. 
\n\uac00\ub2a5\ud55c \ubaa8\ub4e0 \ud589\ub3d9\uc758 \uc9d1\ud569\uc744 *\ud589\ub3d9 \uacf5\uac04* \uc774\ub77c\uace0 \ud569\ub2c8\ub2e4.\n\n**\uc0c1\ud0dc(State)** $s$ : \ud658\uacbd\uc758 \ud604\uc7ac \ud2b9\uc131\uc744 \uc0c1\ud0dc\ub97c \ud1b5\ud574 \ub098\ud0c0\ub0c5\ub2c8\ub2e4.\n\ud658\uacbd\uc774 \uc788\uc744 \uc218 \uc788\ub294 \ubaa8\ub4e0 \uac00\ub2a5\ud55c \uc0c1\ud0dc \uc9d1\ud569\uc744 *\uc0c1\ud0dc \uacf5\uac04* \uc774\ub77c\uace0 \ud569\ub2c8\ub2e4.\n\n**\ud3ec\uc0c1(Reward)** $r$ : \ud3ec\uc0c1\uc740 \ud658\uacbd\uc5d0\uc11c \uc5d0\uc774\uc804\ud2b8\ub85c \uc804\ub2ec\ub418\ub294 \ud575\uc2ec \ud53c\ub4dc\ubc31\uc785\ub2c8\ub2e4.\n\uc5d0\uc774\uc804\ud2b8\uac00 \ud559\uc2b5\ud558\uace0 \ud5a5\ud6c4 \ud589\ub3d9\uc744 \ubcc0\uacbd\ud558\ub3c4\ub85d \uc720\ub3c4\ud558\ub294 \uac83\uc785\ub2c8\ub2e4.\n\uc5ec\ub7ec \uc2dc\uac04 \ub2e8\uacc4\uc5d0 \uac78\uce5c \ud3ec\uc0c1\uc758 \ud569\uc744 **\ub9ac\ud134(Return)** \uc774\ub77c\uace0 \ud569\ub2c8\ub2e4.\n\n**\ucd5c\uc801\uc758 \ud589\ub3d9-\uac00\uce58 \ud568\uc218(Action-Value function)** $Q^*(s,a)$ : \uc0c1\ud0dc $s$\n\uc5d0\uc11c \uc2dc\uc791\ud558\uba74 \uc608\uc0c1\ub418\ub294 \ub9ac\ud134\uc744 \ubc18\ud658\ud558\uace0, \uc784\uc758\uc758 \ud589\ub3d9 $a$\n\ub97c \uc120\ud0dd\ud569\ub2c8\ub2e4. \uadf8\ub9ac\uace0 \uac01\uac01\uc758 \ubbf8\ub798\uc758 \ub2e8\uacc4\uc5d0\uc11c \ud3ec\uc0c1\uc758 \ud569\uc744 \uadf9\ub300\ud654\ud558\ub294 \ud589\ub3d9\uc744 \uc120\ud0dd\ud558\ub3c4\ub85d \ud569\ub2c8\ub2e4.\n$Q$ \ub294 \uc0c1\ud0dc\uc5d0\uc11c \ud589\ub3d9\uc758 \u201c\ud488\uc9c8\u201d \n\uc744 \ub098\ud0c0\ub0c5\ub2c8\ub2e4. \uc6b0\ub9ac\ub294 \uc774 \ud568\uc218\ub97c \uadfc\uc0ac \uc2dc\ud0a4\ub824\uace0 \ud569\ub2c8\ub2e4.\n\n\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Environment\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nInitialize Environment\n------------------------\n\nIn Mario, the environment consists of tubes, mushrooms and other\ncomponents.\n\nWhen Mario makes an action, the environment responds with the changed\n(next) state, reward and other info.\n\n\n" + "\ud658\uacbd(Environment)\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n\ud658\uacbd \ucd08\uae30\ud654\ud558\uae30\n------------------------\n\n\ub9c8\ub9ac\uc624 \uac8c\uc784\uc5d0\uc11c \ud658\uacbd\uc740 \ud29c\ube0c, \ubc84\uc12f, \uadf8 \uc774\uc678 \ub2e4\ub978 \uc5ec\ub7ec \uc694\uc18c\ub4e4\ub85c \uad6c\uc131\ub418\uc5b4 \uc788\uc2b5\ub2c8\ub2e4.\n\n\ub9c8\ub9ac\uc624\uac00 \ud589\ub3d9\uc744 \ucde8\ud558\uba74, \ud658\uacbd\uc740 \ubcc0\uacbd\ub41c (\ub2e4\uc74c)\uc0c1\ud0dc, \ud3ec\uc0c1 \uadf8\ub9ac\uace0\n\ub2e4\ub978 \uc815\ubcf4\ub4e4\ub85c \uc751\ub2f5\ud569\ub2c8\ub2e4.\n\n\n" ] }, { @@ -51,14 +51,14 @@ }, "outputs": [], "source": [ - "# Initialize Super Mario environment\nenv = gym_super_mario_bros.make(\"SuperMarioBros-1-1-v0\")\n\n# Limit the action-space to\n# 0. walk right\n# 1. jump right\nenv = JoypadSpace(env, [[\"right\"], [\"right\", \"A\"]])\n\nenv.reset()\nnext_state, reward, done, info = env.step(action=0)\nprint(f\"{next_state.shape},\\n {reward},\\n {done},\\n {info}\")" + "# \uc288\ud37c \ub9c8\ub9ac\uc624 \ud658\uacbd \ucd08\uae30\ud654\ud558\uae30\nenv = gym_super_mario_bros.make(\"SuperMarioBros-1-1-v0\")\n\n# \uc0c1\ud0dc \uacf5\uac04\uc744 2\uac00\uc9c0\ub85c \uc81c\ud55c\ud558\uae30\n# 0. \uc624\ub978\ucabd\uc73c\ub85c \uac77\uae30\n# 1. 
\uc624\ub978\ucabd\uc73c\ub85c \uc810\ud504\ud558\uae30\nenv = JoypadSpace(env, [[\"right\"], [\"right\", \"A\"]])\n\nenv.reset()\nnext_state, reward, done, info = env.step(action=0)\nprint(f\"{next_state.shape},\\n {reward},\\n {done},\\n {info}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Preprocess Environment\n------------------------\n\nEnvironment data is returned to the agent in ``next_state``. As you saw\nabove, each state is represented by a ``[3, 240, 256]`` size array.\nOften that is more information than our agent needs; for instance,\nMario\u2019s actions do not depend on the color of the pipes or the sky!\n\nWe use **Wrappers** to preprocess environment data before sending it to\nthe agent.\n\n``GrayScaleObservation`` is a common wrapper to transform an RGB image\nto grayscale; doing so reduces the size of the state representation\nwithout losing useful information. Now the size of each state:\n``[1, 240, 256]``\n\n``ResizeObservation`` downsamples each observation into a square image.\nNew size: ``[1, 84, 84]``\n\n``SkipFrame`` is a custom wrapper that inherits from ``gym.Wrapper`` and\nimplements the ``step()`` function. Because consecutive frames don\u2019t\nvary much, we can skip n-intermediate frames without losing much\ninformation. The n-th frame aggregates rewards accumulated over each\nskipped frame.\n\n``FrameStack`` is a wrapper that allows us to squash consecutive frames\nof the environment into a single observation point to feed to our\nlearning model. This way, we can identify if Mario was landing or\njumping based on the direction of his movement in the previous several\nframes.\n\n\n" + "\ud658\uacbd \uc804\ucc98\ub9ac \uacfc\uc815 \uac70\uce58\uae30\n------------------------\n\n``\ub2e4\uc74c \uc0c1\ud0dc(next_state)`` \uc5d0\uc11c \ud658\uacbd \ub370\uc774\ud130\uac00 \uc5d0\uc774\uc804\ud2b8\ub85c \ubc18\ud658\ub429\ub2c8\ub2e4.\n\uc55e\uc11c \uc0b4\ud3b4\ubcf4\uc558\ub4ef\uc774, \uac01\uac01\uc758 \uc0c1\ud0dc\ub294 ``[3, 240, 256]`` \uc758 \ubc30\uc5f4\ub85c \ub098\ud0c0\ub0b4\uace0 \uc788\uc2b5\ub2c8\ub2e4.\n\uc885\uc885 \uc0c1\ud0dc\uac00 \uc81c\uacf5\ud558\ub294 \uac83\uc740 \uc5d0\uc774\uc804\ud2b8\uac00 \ud544\uc694\ub85c \ud558\ub294 \uac83\ubcf4\ub2e4 \ub354 \ub9ce\uc740 \uc815\ubcf4\uc785\ub2c8\ub2e4.\n\uc608\ub97c \ub4e4\uc5b4, \ub9c8\ub9ac\uc624\uc758 \ud589\ub3d9\uc740 \ud30c\uc774\ud504\uc758 \uc0c9\uae54\uc774\ub098 \ud558\ub298\uc758 \uc0c9\uae54\uc5d0 \uc88c\uc6b0\ub418\uc9c0 \uc54a\uc2b5\ub2c8\ub2e4!\n\n\uc544\ub798\uc5d0 \uc124\uba85\ud560 \ud074\ub798\uc2a4\ub4e4\uc740 \ud658\uacbd \ub370\uc774\ud130\ub97c \uc5d0\uc774\uc804\ud2b8\uc5d0 \ubcf4\ub0b4\uae30 \uc804 \ub2e8\uacc4\uc5d0\uc11c \uc804\ucc98\ub9ac \uacfc\uc815\uc5d0 \uc0ac\uc6a9\ud560\n**\ub798\ud37c(Wrappers)** \uc785\ub2c8\ub2e4.\n\n``GrayScaleObservation`` \uc740 RGB \uc774\ubbf8\uc9c0\ub97c \ud751\ubc31 \uc774\ubbf8\uc9c0\ub85c \ubc14\uafb8\ub294 \uc77c\ubc18\uc801\uc778 \ub798\ud37c\uc785\ub2c8\ub2e4.\n``GrayScaleObservation`` \ud074\ub798\uc2a4\ub97c \uc0ac\uc6a9\ud558\uba74 \uc720\uc6a9\ud55c \uc815\ubcf4\ub97c \uc783\uc9c0 \uc54a\uace0 \uc0c1\ud0dc\uc758 \ud06c\uae30\ub97c \uc904\uc77c \uc218 \uc788\uc2b5\ub2c8\ub2e4.\n``GrayScaleObservation`` \ub97c \uc801\uc6a9\ud558\uba74 \uac01\uac01 \uc0c1\ud0dc\uc758 \ud06c\uae30\ub294\n``[1, 240, 256]`` \uc774 \ub429\ub2c8\ub2e4.\n\n``ResizeObservation`` \uc740 \uac01\uac01\uc758 \uc0c1\ud0dc(Observation)\ub97c \uc815\uc0ac\uac01\ud615 \uc774\ubbf8\uc9c0\ub85c \ub2e4\uc6b4 
\uc0d8\ud50c\ub9c1\ud569\ub2c8\ub2e4.\n\uc774 \ub798\ud37c\ub97c \uc801\uc6a9\ud558\uba74 \uac01\uac01 \uc0c1\ud0dc\uc758 \ud06c\uae30\ub294 ``[1, 84, 84]`` \uc774 \ub429\ub2c8\ub2e4.\n\n``SkipFrame`` \uc740 ``gym.Wrapper`` \uc73c\ub85c\ubd80\ud130 \uc0c1\uc18d\uc744 \ubc1b\uc740 \uc0ac\uc6a9\uc790 \uc9c0\uc815 \ud074\ub798\uc2a4\uc774\uace0,\n``step()`` \ud568\uc218\ub97c \uad6c\ud604\ud569\ub2c8\ub2e4. \uc65c\ub0d0\ud558\uba74 \uc5f0\uc18d\ub418\ub294 \ud504\ub808\uc784\uc740 \ud070 \ucc28\uc774\uac00 \uc5c6\uae30 \ub54c\ubb38\uc5d0\nn\uac1c\uc758 \uc911\uac04 \ud504\ub808\uc784\uc744 \ud070 \uc815\ubcf4\uc758 \uc190\uc2e4 \uc5c6\uc774 \uac74\ub108\ub6f8 \uc218 \uc788\uae30 \ub54c\ubb38\uc785\ub2c8\ub2e4.\nn\ubc88\uc9f8 \ud504\ub808\uc784\uc740 \uac74\ub108\ub6f4 \uac01 \ud504\ub808\uc784\uc5d0 \uac78\uccd0 \ub204\uc801\ub41c \ud3ec\uc0c1\uc744\n\uc9d1\uacc4\ud569\ub2c8\ub2e4.\n\n``FrameStack`` \uc740 \ud658\uacbd\uc758 \uc5f0\uc18d \ud504\ub808\uc784\uc744\n\ub2e8\uc77c \uad00\ucc30 \uc9c0\uc810\uc73c\ub85c \ubc14\uafb8\uc5b4 \ud559\uc2b5 \ubaa8\ub378\uc5d0 \uc81c\uacf5\ud560 \uc218 \uc788\ub294 \ub798\ud37c\uc785\ub2c8\ub2e4.\n\uc774\ub807\uac8c \ud558\uba74 \ub9c8\ub9ac\uc624\uac00 \ucc29\uc9c0 \uc911\uc774\uc600\ub294\uc9c0 \ub610\ub294 \uc810\ud504 \uc911\uc774\uc5c8\ub294\uc9c0\n\uc774\uc804 \uba87 \ud504\ub808\uc784\uc758 \uc6c0\uc9c1\uc784 \ubc29\ud5a5\uc5d0 \ub530\ub77c \ud655\uc778\ud560 \uc218\n\uc788\uc2b5\ub2c8\ub2e4.\n\n\n" ] }, { @@ -69,21 +69,21 @@ }, "outputs": [], "source": [ - "class SkipFrame(gym.Wrapper):\n def __init__(self, env, skip):\n \"\"\"Return only every `skip`-th frame\"\"\"\n super().__init__(env)\n self._skip = skip\n\n def step(self, action):\n \"\"\"Repeat action, and sum reward\"\"\"\n total_reward = 0.0\n done = False\n for i in range(self._skip):\n # Accumulate reward and repeat the same action\n obs, reward, done, info = self.env.step(action)\n total_reward += reward\n if done:\n break\n return obs, total_reward, done, info\n\n\nclass GrayScaleObservation(gym.ObservationWrapper):\n def __init__(self, env):\n super().__init__(env)\n obs_shape = self.observation_space.shape[:2]\n self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)\n\n def permute_orientation(self, observation):\n # permute [H, W, C] array to [C, H, W] tensor\n observation = np.transpose(observation, (2, 0, 1))\n observation = torch.tensor(observation.copy(), dtype=torch.float)\n return observation\n\n def observation(self, observation):\n observation = self.permute_orientation(observation)\n transform = T.Grayscale()\n observation = transform(observation)\n return observation\n\n\nclass ResizeObservation(gym.ObservationWrapper):\n def __init__(self, env, shape):\n super().__init__(env)\n if isinstance(shape, int):\n self.shape = (shape, shape)\n else:\n self.shape = tuple(shape)\n\n obs_shape = self.shape + self.observation_space.shape[2:]\n self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)\n\n def observation(self, observation):\n transforms = T.Compose(\n [T.Resize(self.shape), T.Normalize(0, 255)]\n )\n observation = transforms(observation).squeeze(0)\n return observation\n\n\n# Apply Wrappers to environment\nenv = SkipFrame(env, skip=4)\nenv = GrayScaleObservation(env)\nenv = ResizeObservation(env, shape=84)\nenv = FrameStack(env, num_stack=4)" + "class SkipFrame(gym.Wrapper):\n def __init__(self, env, skip):\n \"\"\"\ubaa8\ub4e0 `skip` \ud504\ub808\uc784\ub9cc \ubc18\ud658\ud569\ub2c8\ub2e4.\"\"\"\n super().__init__(env)\n 
self._skip = skip\n\n def step(self, action):\n \"\"\"\ud589\ub3d9\uc744 \ubc18\ubcf5\ud558\uace0 \ud3ec\uc0c1\uc744 \ub354\ud569\ub2c8\ub2e4.\"\"\"\n total_reward = 0.0\n done = False\n for i in range(self._skip):\n # \ud3ec\uc0c1\uc744 \ub204\uc801\ud558\uace0 \ub3d9\uc77c\ud55c \uc791\uc5c5\uc744 \ubc18\ubcf5\ud569\ub2c8\ub2e4.\n obs, reward, done, info = self.env.step(action)\n total_reward += reward\n if done:\n break\n return obs, total_reward, done, info\n\n\nclass GrayScaleObservation(gym.ObservationWrapper):\n def __init__(self, env):\n super().__init__(env)\n obs_shape = self.observation_space.shape[:2]\n self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)\n\n def permute_orientation(self, observation):\n # [H, W, C] \ubc30\uc5f4\uc744 [C, H, W] \ud150\uc11c\ub85c \ubc14\uafc9\ub2c8\ub2e4.\n observation = np.transpose(observation, (2, 0, 1))\n observation = torch.tensor(observation.copy(), dtype=torch.float)\n return observation\n\n def observation(self, observation):\n observation = self.permute_orientation(observation)\n transform = T.Grayscale()\n observation = transform(observation)\n return observation\n\n\nclass ResizeObservation(gym.ObservationWrapper):\n def __init__(self, env, shape):\n super().__init__(env)\n if isinstance(shape, int):\n self.shape = (shape, shape)\n else:\n self.shape = tuple(shape)\n\n obs_shape = self.shape + self.observation_space.shape[2:]\n self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)\n\n def observation(self, observation):\n transforms = T.Compose(\n [T.Resize(self.shape), T.Normalize(0, 255)]\n )\n observation = transforms(observation).squeeze(0)\n return observation\n\n\n# \ub798\ud37c\ub97c \ud658\uacbd\uc5d0 \uc801\uc6a9\ud569\ub2c8\ub2e4.\nenv = SkipFrame(env, skip=4)\nenv = GrayScaleObservation(env)\nenv = ResizeObservation(env, shape=84)\nenv = FrameStack(env, num_stack=4)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "After applying the above wrappers to the environment, the final wrapped\nstate consists of 4 gray-scaled consecutive frames stacked together, as\nshown above in the image on the left. Each time Mario makes an action,\nthe environment responds with a state of this structure. The structure\nis represented by a 3-D array of size ``[4, 84, 84]``.\n\n.. figure:: /_static/img/mario_env.png\n :alt: picture\n\n\n\n" + "\uc55e\uc11c \uc18c\uac1c\ud55c \ub798\ud37c\ub97c \ud658\uacbd\uc5d0 \uc801\uc6a9\ud55c \ud6c4,\n\ucd5c\uc885 \ub798\ud551 \uc0c1\ud0dc\ub294 \uc67c\ucabd \uc544\ub798 \uc774\ubbf8\uc9c0\uc5d0 \ud45c\uc2dc\ub41c \uac83\ucc98\ub7fc 4\uac1c\uc758 \uc5f0\uc18d\ub41c \ud751\ubc31 \ud504\ub808\uc784\uc73c\ub85c \n\uad6c\uc131\ub429\ub2c8\ub2e4. \ub9c8\ub9ac\uc624\uac00 \ud589\ub3d9\uc744 \ud560 \ub54c\ub9c8\ub2e4,\n\ud658\uacbd\uc740 \uc774 \uad6c\uc870\uc758 \uc0c1\ud0dc\ub85c \uc751\ub2f5\ud569\ub2c8\ub2e4.\n\uad6c\uc870\ub294 ``[4, 84, 84]`` \ud06c\uae30\uc758 3\ucc28\uc6d0 \ubc30\uc5f4\ub85c \uad6c\uc131\ub418\uc5b4 \uc788\uc2b5\ub2c8\ub2e4.\n\n.. figure:: /_static/img/mario_env.png\n :alt: picture\n\n\n\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Agent\n\"\"\"\"\"\"\"\"\"\n\nWe create a class ``Mario`` to represent our agent in the game. Mario\nshould be able to:\n\n- **Act** according to the optimal action policy based on the current\n state (of the environment).\n\n- **Remember** experiences. Experience = (current state, current\n action, reward, next state). 
Mario *caches* and later *recalls* his\n experiences to update his action policy.\n\n- **Learn** a better action policy over time\n\n\n" + "\uc5d0\uc774\uc804\ud2b8(Agent)\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n``Mario`` \ub77c\ub294 \ud074\ub798\uc2a4\ub97c \uc774 \uac8c\uc784\uc758 \uc5d0\uc774\uc804\ud2b8\ub85c \uc0dd\uc131\ud569\ub2c8\ub2e4.\n\ub9c8\ub9ac\uc624\ub294 \ub2e4\uc74c\uacfc \uac19\uc740 \uae30\ub2a5\uc744 \ud560 \uc218 \uc788\uc5b4\uc57c \ud569\ub2c8\ub2e4.\n\n- **\ud589\ub3d9(Act)** \uc740 (\ud658\uacbd\uc758) \ud604\uc7ac \uc0c1\ud0dc\ub97c \uae30\ubc18\uc73c\ub85c \n \ucd5c\uc801\uc758 \ud589\ub3d9 \uc815\ucc45\uc5d0 \ub530\ub77c \uc815\ud574\uc9d1\ub2c8\ub2e4.\n\n- \uacbd\ud5d8\uc744 **\uae30\uc5b5(Remember)** \ud558\ub294 \uac83. \n \uacbd\ud5d8\uc740 (\ud604\uc7ac \uc0c1\ud0dc, \ud604\uc7ac \ud589\ub3d9, \ud3ec\uc0c1, \ub2e4\uc74c \uc0c1\ud0dc) \ub85c \uc774\ub8e8\uc5b4\uc838 \uc788\uc2b5\ub2c8\ub2e4. \n \ub9c8\ub9ac\uc624\ub294 \uadf8\uc758 \ud589\ub3d9 \uc815\ucc45\uc744 \uc5c5\ub370\uc774\ud2b8 \ud558\uae30 \uc704\ud574 *\uce90\uc2dc(caches)* \ub97c \ud55c \ub2e4\uc74c, \uadf8\uc758 \uacbd\ud5d8\uc744 *\ub9ac\ucf5c(recalls)* \ud569\ub2c8\ub2e4.\n\n- **\ud559\uc2b5(Learn)** \uc744 \ud1b5\ud574 \uc2dc\uac04\uc774 \uc9c0\ub0a8\uc5d0 \ub530\ub77c \ub354 \ub098\uc740 \ud589\ub3d9 \uc815\ucc45\uc744 \ud0dd\ud569\ub2c8\ub2e4.\n\n\n" ] }, { @@ -94,21 +94,21 @@ }, "outputs": [], "source": [ - "class Mario:\n def __init__():\n pass\n\n def act(self, state):\n \"\"\"Given a state, choose an epsilon-greedy action\"\"\"\n pass\n\n def cache(self, experience):\n \"\"\"Add the experience to memory\"\"\"\n pass\n\n def recall(self):\n \"\"\"Sample experiences from memory\"\"\"\n pass\n\n def learn(self):\n \"\"\"Update online action value (Q) function with a batch of experiences\"\"\"\n pass" + "class Mario:\n def __init__():\n pass\n\n def act(self, state):\n \"\"\"\uc0c1\ud0dc\uac00 \uc8fc\uc5b4\uc9c0\uba74, \uc785\uc2e4\ub860-\uadf8\ub9ac\ub514 \ud589\ub3d9(epsilon-greedy action)\uc744 \uc120\ud0dd\ud574\uc57c \ud569\ub2c8\ub2e4.\"\"\"\n pass\n\n def cache(self, experience):\n \"\"\"\uba54\ubaa8\ub9ac\uc5d0 \uacbd\ud5d8\uc744 \ucd94\uac00\ud569\ub2c8\ub2e4.\"\"\"\n pass\n\n def recall(self):\n \"\"\"\uba54\ubaa8\ub9ac\ub85c\ubd80\ud130 \uacbd\ud5d8\uc744 \uc0d8\ud50c\ub9c1\ud569\ub2c8\ub2e4.\"\"\"\n pass\n\n def learn(self):\n \"\"\"\uc77c\ub828\uc758 \uacbd\ud5d8\ub4e4\ub85c \uc2e4\uc2dc\uac04 \ud589\ub3d9 \uac00\uce58(online action value) (Q) \ud568\uc218\ub97c \uc5c5\ub370\uc774\ud2b8 \ud569\ub2c8\ub2e4.\"\"\"\n pass" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In the following sections, we will populate Mario\u2019s parameters and\ndefine his functions.\n\n\n" + "\uc774\ubc88 \uc139\uc158\uc5d0\uc11c\ub294 \ub9c8\ub9ac\uc624 \ud074\ub798\uc2a4\uc758 \ub9e4\uac1c\ubcc0\uc218\ub97c \ucc44\uc6b0\uace0, \n\ub9c8\ub9ac\uc624 \ud074\ub798\uc2a4\uc758 \ud568\uc218\ub4e4\uc744 \uc815\uc758\ud558\uaca0\uc2b5\ub2c8\ub2e4.\n\n\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Act\n--------------\n\nFor any given state, an agent can choose to do the most optimal action\n(**exploit**) or a random action (**explore**).\n\nMario randomly explores with a chance of ``self.exploration_rate``; when\nhe chooses to exploit, he relies on ``MarioNet`` (implemented in\n``Learn`` section) to provide the most optimal action.\n\n\n" + "\ud589\ub3d9\ud558\uae30(Act)\n--------------\n\n\uc8fc\uc5b4\uc9c4 \uc0c1\ud0dc\uc5d0 \ub300\ud574, \uc5d0\uc774\uc804\ud2b8\ub294 
\ucd5c\uc801\uc758 \ud589\ub3d9\uc744 \uc774\uc6a9\ud560 \uac83\uc778\uc9c0\n\uc784\uc758\uc758 \ud589\ub3d9\uc744 \uc120\ud0dd\ud558\uc5ec \ubd84\uc11d\ud560 \uac83\uc778\uc9c0 \uc120\ud0dd\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.\n\n\ub9c8\ub9ac\uc624\ub294 \uc784\uc758\uc758 \ud589\ub3d9\uc744 \uc120\ud0dd\ud588\uc744 \ub54c ``self.exploration_rate`` \ub97c \ud65c\uc6a9\ud569\ub2c8\ub2e4.\n\ucd5c\uc801\uc758 \ud589\ub3d9\uc744 \uc774\uc6a9\ud55c\ub2e4\uace0 \ud588\uc744 \ub54c, \uadf8\ub294 \ucd5c\uc801\uc758 \ud589\ub3d9\uc744 \uc218\ud589\ud558\uae30 \uc704\ud574 \n(``\ud559\uc2b5\ud558\uae30(Learn)`` \uc139\uc158\uc5d0\uc11c \uad6c\ud604\ub41c) ``MarioNet`` \uc774 \ud544\uc694\ud569\ub2c8\ub2e4.\n\n\n" ] }, { @@ -119,14 +119,14 @@ }, "outputs": [], "source": [ - "class Mario:\n def __init__(self, state_dim, action_dim, save_dir):\n self.state_dim = state_dim\n self.action_dim = action_dim\n self.save_dir = save_dir\n\n self.use_cuda = torch.cuda.is_available()\n\n # Mario's DNN to predict the most optimal action - we implement this in the Learn section\n self.net = MarioNet(self.state_dim, self.action_dim).float()\n if self.use_cuda:\n self.net = self.net.to(device=\"cuda\")\n\n self.exploration_rate = 1\n self.exploration_rate_decay = 0.99999975\n self.exploration_rate_min = 0.1\n self.curr_step = 0\n\n self.save_every = 5e5 # no. of experiences between saving Mario Net\n\n def act(self, state):\n \"\"\"\n Given a state, choose an epsilon-greedy action and update value of step.\n\n Inputs:\n state(LazyFrame): A single observation of the current state, dimension is (state_dim)\n Outputs:\n action_idx (int): An integer representing which action Mario will perform\n \"\"\"\n # EXPLORE\n if np.random.rand() < self.exploration_rate:\n action_idx = np.random.randint(self.action_dim)\n\n # EXPLOIT\n else:\n state = state.__array__()\n if self.use_cuda:\n state = torch.tensor(state).cuda()\n else:\n state = torch.tensor(state)\n state = state.unsqueeze(0)\n action_values = self.net(state, model=\"online\")\n action_idx = torch.argmax(action_values, axis=1).item()\n\n # decrease exploration_rate\n self.exploration_rate *= self.exploration_rate_decay\n self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate)\n\n # increment step\n self.curr_step += 1\n return action_idx" + "class Mario:\n def __init__(self, state_dim, action_dim, save_dir):\n self.state_dim = state_dim\n self.action_dim = action_dim\n self.save_dir = save_dir\n\n self.use_cuda = torch.cuda.is_available()\n\n # \ub9c8\ub9ac\uc624\uc758 DNN\uc740 \ucd5c\uc801\uc758 \ud589\ub3d9\uc744 \uc608\uce21\ud569\ub2c8\ub2e4 - \uc774\ub294 \ud559\uc2b5\ud558\uae30 \uc139\uc158\uc5d0\uc11c \uad6c\ud604\ud569\ub2c8\ub2e4.\n self.net = MarioNet(self.state_dim, self.action_dim).float()\n if self.use_cuda:\n self.net = self.net.to(device=\"cuda\")\n\n self.exploration_rate = 1\n self.exploration_rate_decay = 0.99999975\n self.exploration_rate_min = 0.1\n self.curr_step = 0\n\n self.save_every = 5e5 # Mario Net \uc800\uc7a5 \uc0ac\uc774\uc758 \uacbd\ud5d8 \ud69f\uc218\n\n def act(self, state):\n \"\"\"\n \uc8fc\uc5b4\uc9c4 \uc0c1\ud0dc\uc5d0\uc11c, \uc785\uc2e4\ub860-\uadf8\ub9ac\ub514 \ud589\ub3d9(epsilon-greedy action)\uc744 \uc120\ud0dd\ud558\uace0, \uc2a4\ud15d\uc758 \uac12\uc744 \uc5c5\ub370\uc774\ud2b8 \ud569\ub2c8\ub2e4.\n\n \uc785\ub825\uac12:\n state(LazyFrame): \ud604\uc7ac \uc0c1\ud0dc\uc5d0\uc11c\uc758 \ub2e8\uc77c \uc0c1\ud0dc(observation)\uac12\uc744 \ub9d0\ud569\ub2c8\ub2e4. 
\ucc28\uc6d0\uc740 (state_dim)\uc785\ub2c8\ub2e4.\n \ucd9c\ub825\uac12:\n action_idx (int): Mario\uac00 \uc218\ud589\ud560 \ud589\ub3d9\uc744 \ub098\ud0c0\ub0b4\ub294 \uc815\uc218 \uac12\uc785\ub2c8\ub2e4.\n \"\"\"\n # \uc784\uc758\uc758 \ud589\ub3d9\uc744 \uc120\ud0dd\ud558\uae30\n if np.random.rand() < self.exploration_rate:\n action_idx = np.random.randint(self.action_dim)\n\n # \ucd5c\uc801\uc758 \ud589\ub3d9\uc744 \uc774\uc6a9\ud558\uae30\n else:\n state = state.__array__()\n if self.use_cuda:\n state = torch.tensor(state).cuda()\n else:\n state = torch.tensor(state)\n state = state.unsqueeze(0)\n action_values = self.net(state, model=\"online\")\n action_idx = torch.argmax(action_values, axis=1).item()\n\n # exploration_rate \uac10\uc18c\ud558\uae30\n self.exploration_rate *= self.exploration_rate_decay\n self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate)\n\n # \uc2a4\ud15d \uc218 \uc99d\uac00\ud558\uae30\n self.curr_step += 1\n return action_idx" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Cache and Recall\n----------------------\n\nThese two functions serve as Mario\u2019s \u201cmemory\u201d process.\n\n``cache()``: Each time Mario performs an action, he stores the\n``experience`` to his memory. His experience includes the current\n*state*, *action* performed, *reward* from the action, the *next state*,\nand whether the game is *done*.\n\n``recall()``: Mario randomly samples a batch of experiences from his\nmemory, and uses that to learn the game.\n\n\n" + "\uce90\uc2dc(Cache)\uc640 \ub9ac\ucf5c(Recall)\ud558\uae30\n------------------------------\n\n\uc774 \ub450\uac00\uc9c0 \ud568\uc218\ub294 \ub9c8\ub9ac\uc624\uc758 \u201c\uba54\ubaa8\ub9ac\u201d \ud504\ub85c\uc138\uc2a4 \uc5ed\ud560\uc744 \ud569\ub2c8\ub2e4.\n\n``cache()``: \ub9c8\ub9ac\uc624\uac00 \ud589\ub3d9\uc744 \ud560 \ub54c\ub9c8\ub2e4, \uadf8\ub294\n``\uacbd\ud5d8`` \uc744 \uadf8\uc758 \uba54\ubaa8\ub9ac\uc5d0 \uc800\uc7a5\ud569\ub2c8\ub2e4. 
\uadf8\uc758 \uacbd\ud5d8\uc5d0\ub294 \ud604\uc7ac *\uc0c1\ud0dc* \uc5d0 \ub530\ub978 \uc218\ud589\ub41c\n*\ud589\ub3d9* , \ud589\ub3d9\uc73c\ub85c\ubd80\ud130 \uc5bb\uc740 *\ud3ec\uc0c1* , *\ub2e4\uc74c \uc0c1\ud0dc*,\n\uadf8\ub9ac\uace0 \uac8c\uc784 *\uc644\ub8cc* \uc5ec\ubd80\uac00 \ud3ec\ud568\ub429\ub2c8\ub2e4.\n\n``recall()``: Mario\ub294 \uc790\uc2e0\uc758 \uae30\uc5b5\uc5d0\uc11c \ubb34\uc791\uc704\ub85c \uc77c\ub828\uc758 \uacbd\ud5d8\uc744 \uc0d8\ud50c\ub9c1\ud558\uc5ec\n\uac8c\uc784\uc744 \ud559\uc2b5\ud558\ub294 \ub370 \uc0ac\uc6a9\ud569\ub2c8\ub2e4.\n\n\n" ] }, { @@ -137,14 +137,14 @@ }, "outputs": [], "source": [ - "class Mario(Mario): # subclassing for continuity\n def __init__(self, state_dim, action_dim, save_dir):\n super().__init__(state_dim, action_dim, save_dir)\n self.memory = deque(maxlen=100000)\n self.batch_size = 32\n\n def cache(self, state, next_state, action, reward, done):\n \"\"\"\n Store the experience to self.memory (replay buffer)\n\n Inputs:\n state (LazyFrame),\n next_state (LazyFrame),\n action (int),\n reward (float),\n done(bool))\n \"\"\"\n state = state.__array__()\n next_state = next_state.__array__()\n\n if self.use_cuda:\n state = torch.tensor(state).cuda()\n next_state = torch.tensor(next_state).cuda()\n action = torch.tensor([action]).cuda()\n reward = torch.tensor([reward]).cuda()\n done = torch.tensor([done]).cuda()\n else:\n state = torch.tensor(state)\n next_state = torch.tensor(next_state)\n action = torch.tensor([action])\n reward = torch.tensor([reward])\n done = torch.tensor([done])\n\n self.memory.append((state, next_state, action, reward, done,))\n\n def recall(self):\n \"\"\"\n Retrieve a batch of experiences from memory\n \"\"\"\n batch = random.sample(self.memory, self.batch_size)\n state, next_state, action, reward, done = map(torch.stack, zip(*batch))\n return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze()" + "class Mario(Mario): # \uc5f0\uc18d\uc131\uc744 \uc704\ud55c \ud558\uc704 \ud074\ub798\uc2a4\uc785\ub2c8\ub2e4.\n def __init__(self, state_dim, action_dim, save_dir):\n super().__init__(state_dim, action_dim, save_dir)\n self.memory = deque(maxlen=100000)\n self.batch_size = 32\n\n def cache(self, state, next_state, action, reward, done):\n \"\"\"\n Store the experience to self.memory (replay buffer)\n\n \uc785\ub825\uac12:\n state (LazyFrame),\n next_state (LazyFrame),\n action (int),\n reward (float),\n done (bool))\n \"\"\"\n state = state.__array__()\n next_state = next_state.__array__()\n\n if self.use_cuda:\n state = torch.tensor(state).cuda()\n next_state = torch.tensor(next_state).cuda()\n action = torch.tensor([action]).cuda()\n reward = torch.tensor([reward]).cuda()\n done = torch.tensor([done]).cuda()\n else:\n state = torch.tensor(state)\n next_state = torch.tensor(next_state)\n action = torch.tensor([action])\n reward = torch.tensor([reward])\n done = torch.tensor([done])\n\n self.memory.append((state, next_state, action, reward, done,))\n\n def recall(self):\n \"\"\"\n \uba54\ubaa8\ub9ac\uc5d0\uc11c \uc77c\ub828\uc758 \uacbd\ud5d8\ub4e4\uc744 \uac80\uc0c9\ud569\ub2c8\ub2e4.\n \"\"\"\n batch = random.sample(self.memory, self.batch_size)\n state, next_state, action, reward, done = map(torch.stack, zip(*batch))\n return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Learn\n--------------\n\nMario uses the `DDQN algorithm `__\nunder the hood. 
DDQN uses two ConvNets - $Q_{online}$ and\n$Q_{target}$ - that independently approximate the optimal\naction-value function.\n\nIn our implementation, we share feature generator ``features`` across\n$Q_{online}$ and $Q_{target}$, but maintain separate FC\nclassifiers for each. $\\theta_{target}$ (the parameters of\n$Q_{target}$) is frozen to prevent updation by backprop. Instead,\nit is periodically synced with $\\theta_{online}$ (more on this\nlater).\n\nNeural Network\n~~~~~~~~~~~~~~~~~~\n\n" + "\ud559\uc2b5\ud558\uae30(Learn)\n-----------------\n\n\ub9c8\ub9ac\uc624\ub294 `DDQN \uc54c\uace0\ub9ac\uc998 `__\n\uc744 \uc0ac\uc6a9\ud569\ub2c8\ub2e4. DDQN \ub450\uac1c\uc758 ConvNets ( $Q_{online}$ \uacfc\n$Q_{target}$ ) \uc744 \uc0ac\uc6a9\ud558\uace0, \ub3c5\ub9bd\uc801\uc73c\ub85c \ucd5c\uc801\uc758 \ud589\ub3d9-\uac00\uce58 \ud568\uc218\uc5d0 \n\uadfc\uc0ac \uc2dc\ud0a4\ub824\uace0 \ud569\ub2c8\ub2e4.\n\n\uad6c\ud604\uc744 \ud560 \ub54c, \ud2b9\uc9d5 \uc0dd\uc131\uae30\uc5d0\uc11c ``\ud2b9\uc9d5\ub4e4`` \uc744 $Q_{online}$ \uc640 $Q_{target}$\n\uc5d0 \uacf5\uc720\ud569\ub2c8\ub2e4. \uadf8\ub7ec\ub098 \uac01\uac01\uc758 FC \ubd84\ub958\uae30\ub294\n\uac00\uc9c0\uace0 \uc788\ub3c4\ub85d \uc124\uacc4\ud569\ub2c8\ub2e4. $\\theta_{target}$ ($Q_{target}$\n\uc758 \ub9e4\uac1c\ubcc0\uc218 \uac12) \ub294 \uc5ed\uc804\ud30c\uc5d0 \uc758\ud574 \uac12\uc774 \uc5c5\ub370\uc774\ud2b8 \ub418\uc9c0 \uc54a\ub3c4\ub85d \uace0\uc815\ub418\uc5c8\uc2b5\ub2c8\ub2e4.\n\ub300\uc2e0, $\\theta_{online}$ \uc640 \uc8fc\uae30\uc801\uc73c\ub85c \ub3d9\uae30\ud654\ub97c \uc9c4\ud589\ud569\ub2c8\ub2e4. \n\uc774\uac83\uc5d0 \ub300\ud574\uc11c\ub294 \ucd94\ud6c4\uc5d0 \ub2e4\ub8e8\ub3c4\ub85d \ud558\uaca0\uc2b5\ub2c8\ub2e4.)\n\n\uc2e0\uacbd\ub9dd\n~~~~~~~~~~~~~~~~~~\n\n" ] }, { @@ -155,14 +155,14 @@ }, "outputs": [], "source": [ - "class MarioNet(nn.Module):\n \"\"\"mini cnn structure\n input -> (conv2d + relu) x 3 -> flatten -> (dense + relu) x 2 -> output\n \"\"\"\n\n def __init__(self, input_dim, output_dim):\n super().__init__()\n c, h, w = input_dim\n\n if h != 84:\n raise ValueError(f\"Expecting input height: 84, got: {h}\")\n if w != 84:\n raise ValueError(f\"Expecting input width: 84, got: {w}\")\n\n self.online = nn.Sequential(\n nn.Conv2d(in_channels=c, out_channels=32, kernel_size=8, stride=4),\n nn.ReLU(),\n nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),\n nn.ReLU(),\n nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),\n nn.ReLU(),\n nn.Flatten(),\n nn.Linear(3136, 512),\n nn.ReLU(),\n nn.Linear(512, output_dim),\n )\n\n self.target = copy.deepcopy(self.online)\n\n # Q_target parameters are frozen.\n for p in self.target.parameters():\n p.requires_grad = False\n\n def forward(self, input, model):\n if model == \"online\":\n return self.online(input)\n elif model == \"target\":\n return self.target(input)" + "class MarioNet(nn.Module):\n \"\"\"\uc791\uc740 cnn \uad6c\uc870\n \uc785\ub825 -> (conv2d + relu) x 3 -> flatten -> (dense + relu) x 2 -> \ucd9c\ub825\n \"\"\"\n\n def __init__(self, input_dim, output_dim):\n super().__init__()\n c, h, w = input_dim\n\n if h != 84:\n raise ValueError(f\"Expecting input height: 84, got: {h}\")\n if w != 84:\n raise ValueError(f\"Expecting input width: 84, got: {w}\")\n\n self.online = nn.Sequential(\n nn.Conv2d(in_channels=c, out_channels=32, kernel_size=8, stride=4),\n nn.ReLU(),\n nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),\n nn.ReLU(),\n nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, 
stride=1),\n nn.ReLU(),\n nn.Flatten(),\n nn.Linear(3136, 512),\n nn.ReLU(),\n nn.Linear(512, output_dim),\n )\n\n self.target = copy.deepcopy(self.online)\n\n # Q_target \ub9e4\uac1c\ubcc0\uc218 \uac12\uc740 \uace0\uc815\uc2dc\ud0b5\ub2c8\ub2e4.\n for p in self.target.parameters():\n p.requires_grad = False\n\n def forward(self, input, model):\n if model == \"online\":\n return self.online(input)\n elif model == \"target\":\n return self.target(input)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "TD Estimate & TD Target\n~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nTwo values are involved in learning:\n\n**TD Estimate** - the predicted optimal $Q^*$ for a given state\n$s$\n\n\\begin{align}{TD}_e = Q_{online}^*(s,a)\\end{align}\n\n**TD Target** - aggregation of current reward and the estimated\n$Q^*$ in the next state $s'$\n\n\\begin{align}a' = argmax_{a} Q_{online}(s', a)\\end{align}\n\n\\begin{align}{TD}_t = r + \\gamma Q_{target}^*(s',a')\\end{align}\n\nBecause we don\u2019t know what next action $a'$ will be, we use the\naction $a'$ maximizes $Q_{online}$ in the next state\n$s'$.\n\nNotice we use the\n`@torch.no_grad() `__\ndecorator on ``td_target()`` to disable gradient calculations here\n(because we don\u2019t need to backpropagate on $\\theta_{target}$).\n\n\n" + "TD \ucd94\uc815 & TD \ubaa9\ud45c\uac12\n~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n\ud559\uc2b5\uc744 \ud558\ub294\ub370 \ub450 \uac00\uc9c0 \uac12\ub4e4\uc774 \ud3ec\ud568\ub429\ub2c8\ub2e4.\n\n**TD \ucd94\uc815** - \uc8fc\uc5b4\uc9c4 \uc0c1\ud0dc $s$ \uc5d0\uc11c \ucd5c\uc801\uc758 \uc608\uce21 $Q^*$. \n\n\\begin{align}{TD}_e = Q_{online}^*(s,a)\\end{align}\n\n**TD \ubaa9\ud45c** - \ud604\uc7ac\uc758 \ud3ec\uc0c1\uacfc \ub2e4\uc74c\uc0c1\ud0dc $s'$ \uc5d0\uc11c \ucd94\uc815\ub41c $Q^*$ \uc758 \ud569.\n\n\\begin{align}a' = argmax_{a} Q_{online}(s', a)\\end{align}\n\n\\begin{align}{TD}_t = r + \\gamma Q_{target}^*(s',a')\\end{align}\n\n\ub2e4\uc74c \ud589\ub3d9 $a'$ \uac00 \uc5b4\ub5a8\uc9c0 \ubaa8\ub974\uae30 \ub54c\ubb38\uc5d0 \n\ub2e4\uc74c \uc0c1\ud0dc $s'$ \uc5d0\uc11c $Q_{online}$ \uac12\uc774 \ucd5c\ub300\uac00 \ub418\ub3c4\ub85d \ud558\ub294\n\ud589\ub3d9 $a'$ \ub97c \uc0ac\uc6a9\ud569\ub2c8\ub2e4.\n\n\uc5ec\uae30\uc5d0\uc11c \ubcc0\ud654\ub3c4 \uacc4\uc0b0\uc744 \ube44\ud65c\uc131\ud654\ud558\uae30 \uc704\ud574\n``td_target()`` \uc5d0\uc11c `@torch.no_grad() `__\n\ub370\ucf54\ub808\uc774\ud130(decorator)\ub97c \uc0ac\uc6a9\ud569\ub2c8\ub2e4.\n($\\theta_{target}$ \uc758 \uc5ed\uc804\ud30c \uacc4\uc0b0\uc774 \ud544\uc694\ub85c \ud558\uc9c0 \uc54a\uae30 \ub54c\ubb38\uc785\ub2c8\ub2e4.)\n\n\n" ] }, { @@ -180,7 +180,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Updating the model\n~~~~~~~~~~~~~~~~~~~~~~\n\nAs Mario samples inputs from his replay buffer, we compute $TD_t$\nand $TD_e$ and backpropagate this loss down $Q_{online}$ to\nupdate its parameters $\\theta_{online}$ ($\\alpha$ is the\nlearning rate ``lr`` passed to the ``optimizer``)\n\n\\begin{align}\\theta_{online} \\leftarrow \\theta_{online} + \\alpha \\nabla(TD_e - TD_t)\\end{align}\n\n$\\theta_{target}$ does not update through backpropagation.\nInstead, we periodically copy $\\theta_{online}$ to\n$\\theta_{target}$\n\n\\begin{align}\\theta_{target} \\leftarrow \\theta_{online}\\end{align}\n\n\n\n" + "\ubaa8\ub378\uc744 \uc5c5\ub370\uc774\ud2b8 \ud558\uae30.\n~~~~~~~~~~~~~~~~~~~~~~\n\n\ub9c8\ub9ac\uc624\uac00 \uc7ac\uc0dd \ubc84\ud37c\uc5d0\uc11c \uc785\ub825\uc744 \uc0d8\ud50c\ub9c1\ud560 \ub54c, $TD_t$\n\uc640 $TD_e$ \ub97c 
\uacc4\uc0b0\ud569\ub2c8\ub2e4. \uadf8\ub9ac\uace0 \uc774 \uc190\uc2e4\uc744 \uc774\uc6a9\ud558\uc5ec $Q_{online}$ \uc5ed\uc804\ud30c\ud558\uc5ec\n\ub9e4\uac1c\ubcc0\uc218 $\\theta_{online}$ \ub97c \uc5c5\ub370\uc774\ud2b8\ud569\ub2c8\ub2e4. ($\\alpha$ \ub294 \n``optimizer`` \uc5d0 \uc804\ub2ec\ub418\ub294 \ud559\uc2b5\ub960 ``lr`` \uc785\ub2c8\ub2e4.)\n\n\\begin{align}\\theta_{online} \\leftarrow \\theta_{online} + \\alpha \\nabla(TD_e - TD_t)\\end{align}\n\n$\\theta_{target}$ \uc740 \uc5ed\uc804\ud30c\ub97c \ud1b5\ud574 \uc5c5\ub370\uc774\ud2b8 \ub418\uc9c0 \uc54a\uc2b5\ub2c8\ub2e4.\n\ub300\uc2e0, \uc8fc\uae30\uc801\uc73c\ub85c $\\theta_{online}$ \uc758 \uac12\uc744 $\\theta_{target}$ \n\ub85c \ubcf5\uc0ac\ud569\ub2c8\ub2e4.\n\n\\begin{align}\\theta_{target} \\leftarrow \\theta_{online}\\end{align}\n\n\n\n" ] }, { @@ -198,7 +198,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Save checkpoint\n~~~~~~~~~~~~~~~~~~\n\n\n" + "\uccb4\ud06c\ud3ec\uc778\ud2b8\ub97c \uc800\uc7a5\ud569\ub2c8\ub2e4.\n~~~~~~~~~~~~~~~~~~~~~~~\n\n\n" ] }, { @@ -216,7 +216,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Putting it all together\n~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n\n" + "\ubaa8\ub4e0 \uae30\ub2a5\uc744 \uc885\ud569\ud574\ubd05\uc2dc\ub2e4.\n~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n\n" ] }, { @@ -227,14 +227,14 @@ }, "outputs": [], "source": [ - "class Mario(Mario):\n def __init__(self, state_dim, action_dim, save_dir):\n super().__init__(state_dim, action_dim, save_dir)\n self.burnin = 1e4 # min. experiences before training\n self.learn_every = 3 # no. of experiences between updates to Q_online\n self.sync_every = 1e4 # no. of experiences between Q_target & Q_online sync\n\n def learn(self):\n if self.curr_step % self.sync_every == 0:\n self.sync_Q_target()\n\n if self.curr_step % self.save_every == 0:\n self.save()\n\n if self.curr_step < self.burnin:\n return None, None\n\n if self.curr_step % self.learn_every != 0:\n return None, None\n\n # Sample from memory\n state, next_state, action, reward, done = self.recall()\n\n # Get TD Estimate\n td_est = self.td_estimate(state, action)\n\n # Get TD Target\n td_tgt = self.td_target(reward, next_state, done)\n\n # Backpropagate loss through Q_online\n loss = self.update_Q_online(td_est, td_tgt)\n\n return (td_est.mean().item(), loss)" + "class Mario(Mario):\n def __init__(self, state_dim, action_dim, save_dir):\n super().__init__(state_dim, action_dim, save_dir)\n self.burnin = 1e4 # \ud559\uc2b5\uc744 \uc9c4\ud589\ud558\uae30 \uc804 \ucd5c\uc18c\ud55c\uc758 \uacbd\ud5d8\uac12.\n self.learn_every = 3 # Q_online \uc5c5\ub370\uc774\ud2b8 \uc0ac\uc774\uc758 \uacbd\ud5d8 \ud69f\uc218.\n self.sync_every = 1e4 # Q_target\uacfc Q_online sync \uc0ac\uc774\uc758 \uacbd\ud5d8 \uc218\n\n def learn(self):\n if self.curr_step % self.sync_every == 0:\n self.sync_Q_target()\n\n if self.curr_step % self.save_every == 0:\n self.save()\n\n if self.curr_step < self.burnin:\n return None, None\n\n if self.curr_step % self.learn_every != 0:\n return None, None\n\n # \uba54\ubaa8\ub9ac\ub85c\ubd80\ud130 \uc0d8\ud50c\ub9c1\uc744 \ud569\ub2c8\ub2e4.\n state, next_state, action, reward, done = self.recall()\n\n # TD \ucd94\uc815\uac12\uc744 \uac00\uc838\uc635\ub2c8\ub2e4.\n td_est = self.td_estimate(state, action)\n\n # TD \ubaa9\ud45c\uac12\uc744 \uac00\uc838\uc635\ub2c8\ub2e4.\n td_tgt = self.td_target(reward, next_state, done)\n\n # \uc2e4\uc2dc\uac04 Q(Q_online)\uc744 \ud1b5\ud574 \uc5ed\uc804\ud30c \uc190\uc2e4\uc744 \uacc4\uc0b0\ud569\ub2c8\ub2e4.\n loss = 
self.update_Q_online(td_est, td_tgt)\n\n return (td_est.mean().item(), loss)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Logging\n--------------\n\n\n" + "\uae30\ub85d\ud558\uae30\n--------------\n\n\n" ] }, { @@ -245,14 +245,14 @@ }, "outputs": [], "source": [ - "import numpy as np\nimport time, datetime\nimport matplotlib.pyplot as plt\n\n\nclass MetricLogger:\n def __init__(self, save_dir):\n self.save_log = save_dir / \"log\"\n with open(self.save_log, \"w\") as f:\n f.write(\n f\"{'Episode':>8}{'Step':>8}{'Epsilon':>10}{'MeanReward':>15}\"\n f\"{'MeanLength':>15}{'MeanLoss':>15}{'MeanQValue':>15}\"\n f\"{'TimeDelta':>15}{'Time':>20}\\n\"\n )\n self.ep_rewards_plot = save_dir / \"reward_plot.jpg\"\n self.ep_lengths_plot = save_dir / \"length_plot.jpg\"\n self.ep_avg_losses_plot = save_dir / \"loss_plot.jpg\"\n self.ep_avg_qs_plot = save_dir / \"q_plot.jpg\"\n\n # History metrics\n self.ep_rewards = []\n self.ep_lengths = []\n self.ep_avg_losses = []\n self.ep_avg_qs = []\n\n # Moving averages, added for every call to record()\n self.moving_avg_ep_rewards = []\n self.moving_avg_ep_lengths = []\n self.moving_avg_ep_avg_losses = []\n self.moving_avg_ep_avg_qs = []\n\n # Current episode metric\n self.init_episode()\n\n # Timing\n self.record_time = time.time()\n\n def log_step(self, reward, loss, q):\n self.curr_ep_reward += reward\n self.curr_ep_length += 1\n if loss:\n self.curr_ep_loss += loss\n self.curr_ep_q += q\n self.curr_ep_loss_length += 1\n\n def log_episode(self):\n \"Mark end of episode\"\n self.ep_rewards.append(self.curr_ep_reward)\n self.ep_lengths.append(self.curr_ep_length)\n if self.curr_ep_loss_length == 0:\n ep_avg_loss = 0\n ep_avg_q = 0\n else:\n ep_avg_loss = np.round(self.curr_ep_loss / self.curr_ep_loss_length, 5)\n ep_avg_q = np.round(self.curr_ep_q / self.curr_ep_loss_length, 5)\n self.ep_avg_losses.append(ep_avg_loss)\n self.ep_avg_qs.append(ep_avg_q)\n\n self.init_episode()\n\n def init_episode(self):\n self.curr_ep_reward = 0.0\n self.curr_ep_length = 0\n self.curr_ep_loss = 0.0\n self.curr_ep_q = 0.0\n self.curr_ep_loss_length = 0\n\n def record(self, episode, epsilon, step):\n mean_ep_reward = np.round(np.mean(self.ep_rewards[-100:]), 3)\n mean_ep_length = np.round(np.mean(self.ep_lengths[-100:]), 3)\n mean_ep_loss = np.round(np.mean(self.ep_avg_losses[-100:]), 3)\n mean_ep_q = np.round(np.mean(self.ep_avg_qs[-100:]), 3)\n self.moving_avg_ep_rewards.append(mean_ep_reward)\n self.moving_avg_ep_lengths.append(mean_ep_length)\n self.moving_avg_ep_avg_losses.append(mean_ep_loss)\n self.moving_avg_ep_avg_qs.append(mean_ep_q)\n\n last_record_time = self.record_time\n self.record_time = time.time()\n time_since_last_record = np.round(self.record_time - last_record_time, 3)\n\n print(\n f\"Episode {episode} - \"\n f\"Step {step} - \"\n f\"Epsilon {epsilon} - \"\n f\"Mean Reward {mean_ep_reward} - \"\n f\"Mean Length {mean_ep_length} - \"\n f\"Mean Loss {mean_ep_loss} - \"\n f\"Mean Q Value {mean_ep_q} - \"\n f\"Time Delta {time_since_last_record} - \"\n f\"Time {datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}\"\n )\n\n with open(self.save_log, \"a\") as f:\n f.write(\n f\"{episode:8d}{step:8d}{epsilon:10.3f}\"\n f\"{mean_ep_reward:15.3f}{mean_ep_length:15.3f}{mean_ep_loss:15.3f}{mean_ep_q:15.3f}\"\n f\"{time_since_last_record:15.3f}\"\n f\"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'):>20}\\n\"\n )\n\n for metric in [\"ep_rewards\", \"ep_lengths\", \"ep_avg_losses\", \"ep_avg_qs\"]:\n plt.plot(getattr(self, 
f\"moving_avg_{metric}\"))\n plt.savefig(getattr(self, f\"{metric}_plot\"))\n plt.clf()" + "import numpy as np\nimport time, datetime\nimport matplotlib.pyplot as plt\n\n\nclass MetricLogger:\n def __init__(self, save_dir):\n self.save_log = save_dir / \"log\"\n with open(self.save_log, \"w\") as f:\n f.write(\n f\"{'Episode':>8}{'Step':>8}{'Epsilon':>10}{'MeanReward':>15}\"\n f\"{'MeanLength':>15}{'MeanLoss':>15}{'MeanQValue':>15}\"\n f\"{'TimeDelta':>15}{'Time':>20}\\n\"\n )\n self.ep_rewards_plot = save_dir / \"reward_plot.jpg\"\n self.ep_lengths_plot = save_dir / \"length_plot.jpg\"\n self.ep_avg_losses_plot = save_dir / \"loss_plot.jpg\"\n self.ep_avg_qs_plot = save_dir / \"q_plot.jpg\"\n\n # \uc9c0\ud45c(Metric)\uc640 \uad00\ub828\ub41c \ub9ac\uc2a4\ud2b8\uc785\ub2c8\ub2e4.\n self.ep_rewards = []\n self.ep_lengths = []\n self.ep_avg_losses = []\n self.ep_avg_qs = []\n\n # \ubaa8\ub4e0 record() \ud568\uc218\ub97c \ud638\ucd9c\ud55c \ud6c4 \uc774\ub3d9 \ud3c9\uade0(Moving average)\uc744 \uacc4\uc0b0\ud569\ub2c8\ub2e4.\n self.moving_avg_ep_rewards = []\n self.moving_avg_ep_lengths = []\n self.moving_avg_ep_avg_losses = []\n self.moving_avg_ep_avg_qs = []\n\n # \ud604\uc7ac \uc5d0\ud53c\uc2a4\ub4dc\uc5d0 \ub300\ud55c \uc9c0\ud45c\ub97c \uae30\ub85d\ud569\ub2c8\ub2e4.\n self.init_episode()\n\n # \uc2dc\uac04\uc5d0 \ub300\ud55c \uae30\ub85d\uc785\ub2c8\ub2e4.\n self.record_time = time.time()\n\n def log_step(self, reward, loss, q):\n self.curr_ep_reward += reward\n self.curr_ep_length += 1\n if loss:\n self.curr_ep_loss += loss\n self.curr_ep_q += q\n self.curr_ep_loss_length += 1\n\n def log_episode(self):\n \"\uc5d0\ud53c\uc2a4\ub4dc\uc758 \ub05d\uc744 \ud45c\uc2dc\ud569\ub2c8\ub2e4.\"\n self.ep_rewards.append(self.curr_ep_reward)\n self.ep_lengths.append(self.curr_ep_length)\n if self.curr_ep_loss_length == 0:\n ep_avg_loss = 0\n ep_avg_q = 0\n else:\n ep_avg_loss = np.round(self.curr_ep_loss / self.curr_ep_loss_length, 5)\n ep_avg_q = np.round(self.curr_ep_q / self.curr_ep_loss_length, 5)\n self.ep_avg_losses.append(ep_avg_loss)\n self.ep_avg_qs.append(ep_avg_q)\n\n self.init_episode()\n\n def init_episode(self):\n self.curr_ep_reward = 0.0\n self.curr_ep_length = 0\n self.curr_ep_loss = 0.0\n self.curr_ep_q = 0.0\n self.curr_ep_loss_length = 0\n\n def record(self, episode, epsilon, step):\n mean_ep_reward = np.round(np.mean(self.ep_rewards[-100:]), 3)\n mean_ep_length = np.round(np.mean(self.ep_lengths[-100:]), 3)\n mean_ep_loss = np.round(np.mean(self.ep_avg_losses[-100:]), 3)\n mean_ep_q = np.round(np.mean(self.ep_avg_qs[-100:]), 3)\n self.moving_avg_ep_rewards.append(mean_ep_reward)\n self.moving_avg_ep_lengths.append(mean_ep_length)\n self.moving_avg_ep_avg_losses.append(mean_ep_loss)\n self.moving_avg_ep_avg_qs.append(mean_ep_q)\n\n last_record_time = self.record_time\n self.record_time = time.time()\n time_since_last_record = np.round(self.record_time - last_record_time, 3)\n\n print(\n f\"Episode {episode} - \"\n f\"Step {step} - \"\n f\"Epsilon {epsilon} - \"\n f\"Mean Reward {mean_ep_reward} - \"\n f\"Mean Length {mean_ep_length} - \"\n f\"Mean Loss {mean_ep_loss} - \"\n f\"Mean Q Value {mean_ep_q} - \"\n f\"Time Delta {time_since_last_record} - \"\n f\"Time {datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}\"\n )\n\n with open(self.save_log, \"a\") as f:\n f.write(\n f\"{episode:8d}{step:8d}{epsilon:10.3f}\"\n f\"{mean_ep_reward:15.3f}{mean_ep_length:15.3f}{mean_ep_loss:15.3f}{mean_ep_q:15.3f}\"\n f\"{time_since_last_record:15.3f}\"\n 
f\"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'):>20}\\n\"\n )\n\n for metric in [\"ep_rewards\", \"ep_lengths\", \"ep_avg_losses\", \"ep_avg_qs\"]:\n plt.plot(getattr(self, f\"moving_avg_{metric}\"))\n plt.savefig(getattr(self, f\"{metric}_plot\"))\n plt.clf()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let\u2019s play!\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nIn this example we run the training loop for 10 episodes, but for Mario to truly learn the ways of\nhis world, we suggest running the loop for at least 40,000 episodes!\n\n\n" + "\uac8c\uc784\uc744 \uc2e4\ud589\uc2dc\ucf1c\ubd05\uc2dc\ub2e4!\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n\uc774\ubc88 \uc608\uc81c\uc5d0\uc11c\ub294 10\uac1c\uc758 \uc5d0\ud53c\uc18c\ub4dc\uc5d0 \ub300\ud574 \ud559\uc2b5 \ub8e8\ud504\ub97c \uc2e4\ud589\uc2dc\ucf30\uc2b5\ub2c8\ub2e4.\ud558\uc9c0\ub9cc \ub9c8\ub9ac\uc624\uac00 \uc9c4\uc815\uc73c\ub85c \n\uc138\uacc4\ub97c \ud559\uc2b5\ud558\uae30 \uc704\ud574\uc11c\ub294 \uc801\uc5b4\ub3c4 40000\uac1c\uc758 \uc5d0\ud53c\uc18c\ub4dc\uc5d0 \ub300\ud574 \ud559\uc2b5\uc744 \uc2dc\ud0ac \uac83\uc744 \uc81c\uc548\ud569\ub2c8\ub2e4!\n\n\n" ] }, { @@ -263,14 +263,14 @@ }, "outputs": [], "source": [ - "use_cuda = torch.cuda.is_available()\nprint(f\"Using CUDA: {use_cuda}\")\nprint()\n\nsave_dir = Path(\"checkpoints\") / datetime.datetime.now().strftime(\"%Y-%m-%dT%H-%M-%S\")\nsave_dir.mkdir(parents=True)\n\nmario = Mario(state_dim=(4, 84, 84), action_dim=env.action_space.n, save_dir=save_dir)\n\nlogger = MetricLogger(save_dir)\n\nepisodes = 10\nfor e in range(episodes):\n\n state = env.reset()\n\n # Play the game!\n while True:\n\n # Run agent on the state\n action = mario.act(state)\n\n # Agent performs action\n next_state, reward, done, info = env.step(action)\n\n # Remember\n mario.cache(state, next_state, action, reward, done)\n\n # Learn\n q, loss = mario.learn()\n\n # Logging\n logger.log_step(reward, loss, q)\n\n # Update state\n state = next_state\n\n # Check if end of game\n if done or info[\"flag_get\"]:\n break\n\n logger.log_episode()\n\n if e % 20 == 0:\n logger.record(episode=e, epsilon=mario.exploration_rate, step=mario.curr_step)" + "use_cuda = torch.cuda.is_available()\nprint(f\"Using CUDA: {use_cuda}\")\nprint()\n\nsave_dir = Path(\"checkpoints\") / datetime.datetime.now().strftime(\"%Y-%m-%dT%H-%M-%S\")\nsave_dir.mkdir(parents=True)\n\nmario = Mario(state_dim=(4, 84, 84), action_dim=env.action_space.n, save_dir=save_dir)\n\nlogger = MetricLogger(save_dir)\n\nepisodes = 10\nfor e in range(episodes):\n\n state = env.reset()\n\n # \uac8c\uc784\uc744 \uc2e4\ud589\uc2dc\ucf1c\ubd05\uc2dc\ub2e4!\n while True:\n\n # \ud604\uc7ac \uc0c1\ud0dc\uc5d0\uc11c \uc5d0\uc774\uc804\ud2b8 \uc2e4\ud589\ud558\uae30\n action = mario.act(state)\n\n # \uc5d0\uc774\uc804\ud2b8\uac00 \uc561\uc158 \uc218\ud589\ud558\uae30\n next_state, reward, done, info = env.step(action)\n\n # \uae30\uc5b5\ud558\uae30\n mario.cache(state, next_state, action, reward, done)\n\n # \ubc30\uc6b0\uae30\n q, loss = mario.learn()\n\n # \uae30\ub85d\ud558\uae30\n logger.log_step(reward, loss, q)\n\n # \uc0c1\ud0dc \uc5c5\ub370\uc774\ud2b8\ud558\uae30\n state = next_state\n\n # \uac8c\uc784\uc774 \ub05d\ub0ac\ub294\uc9c0 \ud655\uc778\ud558\uae30\n if done or info[\"flag_get\"]:\n break\n\n logger.log_episode()\n\n if e % 20 == 0:\n logger.record(episode=e, epsilon=mario.exploration_rate, step=mario.curr_step)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Conclusion\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nIn this 
tutorial, we saw how we can use PyTorch to train a game-playing AI. You can use the same methods\nto train an AI to play any of the games at the `OpenAI gym `__. Hope you enjoyed this tutorial, feel free to reach us at\n`our github `__!\n\n" + "\uacb0\ub860\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n\uc774 \ud29c\ud1a0\ub9ac\uc5bc\uc5d0\uc11c\ub294 PyTorch\ub97c \uc0ac\uc6a9\ud558\uc5ec \uac8c\uc784 \ud50c\ub808\uc774 AI\ub97c \ud6c8\ub828\ud558\ub294 \ubc29\ubc95\uc744 \uc0b4\ud3b4\ubcf4\uc558\uc2b5\ub2c8\ub2e4. `OpenAI gym `__\n\uc5d0 \uc788\ub294 \uc5b4\ub5a4 \uac8c\uc784\uc774\ub4e0 \ub3d9\uc77c\ud55c \ubc29\ubc95\uc73c\ub85c AI\ub97c \ud6c8\ub828\uc2dc\ud0a4\uace0 \uac8c\uc784\uc744 \uc9c4\ud589\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4. \uc774 \ud29c\ud1a0\ub9ac\uc5bc\uc774 \ub3c4\uc6c0\uc774 \ub418\uc5c8\uae30\ub97c \ubc14\ub77c\uba70, \n`Github \uc800\uc7a5\uc18c `__ \uc5d0\uc11c \ud3b8\ud558\uac8c \uc800\uc790\ub4e4\uc5d0\uac8c \uc5f0\ub77d\uc744 \ud558\uc154\ub3c4 \ub429\ub2c8\ub2e4!\n\n" ] } ], diff --git a/docs/_downloads/934f425a64afc04a1466c778c945fb84/mario_rl_tutorial.py b/docs/_downloads/934f425a64afc04a1466c778c945fb84/mario_rl_tutorial.py index a2d856c41..d038f356f 100644 --- a/docs/_downloads/934f425a64afc04a1466c778c945fb84/mario_rl_tutorial.py +++ b/docs/_downloads/934f425a64afc04a1466c778c945fb84/mario_rl_tutorial.py @@ -1,26 +1,28 @@ # -*- coding: utf-8 -*- """ -Train a Mario-playing RL Agent -================ +마리오 게임 RL 에이전트로 학습하기 +=============================== -Authors: `Yuansong Feng `__, `Suraj +저자: `Yuansong Feng `__, `Suraj Subramanian `__, `Howard Wang `__, `Steven Guo `__. +번역: `김태영 `__. -This tutorial walks you through the fundamentals of Deep Reinforcement -Learning. At the end, you will implement an AI-powered Mario (using -`Double Deep Q-Networks `__) that -can play the game by itself. +이번 튜토리얼에서는 심층 강화 학습의 기본 사항들에 대해 이야기해보도록 하겠습니다. +마지막에는, 스스로 게임을 할 수 있는 AI 기반 마리오를 +(`Double Deep Q-Networks `__ 사용) +구현하게 됩니다. -Although no prior knowledge of RL is necessary for this tutorial, you -can familiarize yourself with these RL -`concepts `__, -and have this handy -`cheatsheet `__ -as your companion. The full code is available -`here `__. +이 튜토리얼에서는 RL에 대한 사전 지식이 필요하지 않지만, +이러한 `링크 `__ +를 통해 RL 개념에 친숙해 질 수 있으며, +여기 있는 +`치트시트 `__ +를 활용할 수도 있습니다. 튜토리얼에서 사용하는 전체 코드는 +`여기 `__ +에서 확인 할 수 있습니다. .. figure:: /_static/img/mario.gif :alt: mario @@ -43,64 +45,61 @@ from collections import deque import random, datetime, os, copy -# Gym is an OpenAI toolkit for RL +# Gym은 강화학습을 위한 OpenAI 툴킷입니다. import gym from gym.spaces import Box from gym.wrappers import FrameStack -# NES Emulator for OpenAI Gym +# OpenAI Gym을 위한 NES 에뮬레이터 from nes_py.wrappers import JoypadSpace -# Super Mario environment for OpenAI Gym +# OpenAI Gym에서의 슈퍼 마리오 환경 세팅 import gym_super_mario_bros ###################################################################### -# RL Definitions +# 강화학습 개념 # """""""""""""""""" # -# **Environment** The world that an agent interacts with and learns from. +# **환경(Environment)** : 에이전트가 상호작용하며 스스로 배우는 세계입니다. # -# **Action** :math:`a` : How the Agent responds to the Environment. The -# set of all possible Actions is called *action-space*. +# **행동(Action)** :math:`a` : 에이전트가 환경에 어떻게 응답하는지 행동을 통해 나타냅니다. +# 가능한 모든 행동의 집합을 *행동 공간* 이라고 합니다. # -# **State** :math:`s` : The current characteristic of the Environment. The -# set of all possible States the Environment can be in is called -# *state-space*. -# -# **Reward** :math:`r` : Reward is the key feedback from Environment to -# Agent. 
It is what drives the Agent to learn and to change its future -# action. An aggregation of rewards over multiple time steps is called -# **Return**. -# -# **Optimal Action-Value function** :math:`Q^*(s,a)` : Gives the expected -# return if you start in state :math:`s`, take an arbitrary action -# :math:`a`, and then for each future time step take the action that -# maximizes returns. :math:`Q` can be said to stand for the “quality” of -# the action in a state. We try to approximate this function. +# **상태(State)** :math:`s` : 환경의 현재 특성을 상태를 통해 나타냅니다. +# 환경이 있을 수 있는 모든 가능한 상태 집합을 *상태 공간* 이라고 합니다. +# +# **포상(Reward)** :math:`r` : 포상은 환경에서 에이전트로 전달되는 핵심 피드백입니다. +# 에이전트가 학습하고 향후 행동을 변경하도록 유도하는 것입니다. +# 여러 시간 단계에 걸친 포상의 합을 **리턴(Return)** 이라고 합니다. +# +# **최적의 행동-가치 함수(Action-Value function)** :math:`Q^*(s,a)` : 상태 :math:`s` +# 에서 시작하면 예상되는 리턴을 반환하고, 임의의 행동 :math:`a` +# 를 선택합니다. 그리고 각각의 미래의 단계에서 포상의 합을 극대화하는 행동을 선택하도록 합니다. +# :math:`Q` 는 상태에서 행동의 “품질” +# 을 나타냅니다. 우리는 이 함수를 근사 시키려고 합니다. # ###################################################################### -# Environment -# """""""""""""""" +# 환경(Environment) +# """""""""""""""""""" # -# Initialize Environment +# 환경 초기화하기 # ------------------------ # -# In Mario, the environment consists of tubes, mushrooms and other -# components. -# -# When Mario makes an action, the environment responds with the changed -# (next) state, reward and other info. +# 마리오 게임에서 환경은 튜브, 버섯, 그 이외 다른 여러 요소들로 구성되어 있습니다. +# +# 마리오가 행동을 취하면, 환경은 변경된 (다음)상태, 포상 그리고 +# 다른 정보들로 응답합니다. # -# Initialize Super Mario environment +# 슈퍼 마리오 환경 초기화하기 env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0") -# Limit the action-space to -# 0. walk right -# 1. jump right +# 상태 공간을 2가지로 제한하기 +# 0. 오른쪽으로 걷기 +# 1. 오른쪽으로 점프하기 env = JoypadSpace(env, [["right"], ["right", "A"]]) env.reset() @@ -109,51 +108,51 @@ ###################################################################### -# Preprocess Environment +# 환경 전처리 과정 거치기 # ------------------------ # -# Environment data is returned to the agent in ``next_state``. As you saw -# above, each state is represented by a ``[3, 240, 256]`` size array. -# Often that is more information than our agent needs; for instance, -# Mario’s actions do not depend on the color of the pipes or the sky! +# ``다음 상태(next_state)`` 에서 환경 데이터가 에이전트로 반환됩니다. +# 앞서 살펴보았듯이, 각각의 상태는 ``[3, 240, 256]`` 의 배열로 나타내고 있습니다. +# 종종 상태가 제공하는 것은 에이전트가 필요로 하는 것보다 더 많은 정보입니다. +# 예를 들어, 마리오의 행동은 파이프의 색깔이나 하늘의 색깔에 좌우되지 않습니다! # -# We use **Wrappers** to preprocess environment data before sending it to -# the agent. +# 아래에 설명할 클래스들은 환경 데이터를 에이전트에 보내기 전 단계에서 전처리 과정에 사용할 +# **래퍼(Wrappers)** 입니다. # -# ``GrayScaleObservation`` is a common wrapper to transform an RGB image -# to grayscale; doing so reduces the size of the state representation -# without losing useful information. Now the size of each state: -# ``[1, 240, 256]`` +# ``GrayScaleObservation`` 은 RGB 이미지를 흑백 이미지로 바꾸는 일반적인 래퍼입니다. +# ``GrayScaleObservation`` 클래스를 사용하면 유용한 정보를 잃지 않고 상태의 크기를 줄일 수 있습니다. +# ``GrayScaleObservation`` 를 적용하면 각각 상태의 크기는 +# ``[1, 240, 256]`` 이 됩니다. # -# ``ResizeObservation`` downsamples each observation into a square image. -# New size: ``[1, 84, 84]`` +# ``ResizeObservation`` 은 각각의 상태(Observation)를 정사각형 이미지로 다운 샘플링합니다. +# 이 래퍼를 적용하면 각각 상태의 크기는 ``[1, 84, 84]`` 이 됩니다. # -# ``SkipFrame`` is a custom wrapper that inherits from ``gym.Wrapper`` and -# implements the ``step()`` function. Because consecutive frames don’t -# vary much, we can skip n-intermediate frames without losing much -# information. 
The n-th frame aggregates rewards accumulated over each -# skipped frame. +# ``SkipFrame`` 은 ``gym.Wrapper`` 으로부터 상속을 받은 사용자 지정 클래스이고, +# ``step()`` 함수를 구현합니다. 왜냐하면 연속되는 프레임은 큰 차이가 없기 때문에 +# n개의 중간 프레임을 큰 정보의 손실 없이 건너뛸 수 있기 때문입니다. +# n번째 프레임은 건너뛴 각 프레임에 걸쳐 누적된 포상을 +# 집계합니다. # -# ``FrameStack`` is a wrapper that allows us to squash consecutive frames -# of the environment into a single observation point to feed to our -# learning model. This way, we can identify if Mario was landing or -# jumping based on the direction of his movement in the previous several -# frames. +# ``FrameStack`` 은 환경의 연속 프레임을 +# 단일 관찰 지점으로 바꾸어 학습 모델에 제공할 수 있는 래퍼입니다. +# 이렇게 하면 마리오가 착지 중이였는지 또는 점프 중이었는지 +# 이전 몇 프레임의 움직임 방향에 따라 확인할 수 +# 있습니다. # class SkipFrame(gym.Wrapper): def __init__(self, env, skip): - """Return only every `skip`-th frame""" + """모든 `skip` 프레임만 반환합니다.""" super().__init__(env) self._skip = skip def step(self, action): - """Repeat action, and sum reward""" + """행동을 반복하고 포상을 더합니다.""" total_reward = 0.0 done = False for i in range(self._skip): - # Accumulate reward and repeat the same action + # 포상을 누적하고 동일한 작업을 반복합니다. obs, reward, done, info = self.env.step(action) total_reward += reward if done: @@ -168,7 +167,7 @@ def __init__(self, env): self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8) def permute_orientation(self, observation): - # permute [H, W, C] array to [C, H, W] tensor + # [H, W, C] 배열을 [C, H, W] 텐서로 바꿉니다. observation = np.transpose(observation, (2, 0, 1)) observation = torch.tensor(observation.copy(), dtype=torch.float) return observation @@ -199,7 +198,7 @@ def observation(self, observation): return observation -# Apply Wrappers to environment +# 래퍼를 환경에 적용합니다. env = SkipFrame(env, skip=4) env = GrayScaleObservation(env) env = ResizeObservation(env, shape=84) @@ -207,11 +206,11 @@ def observation(self, observation): ###################################################################### -# After applying the above wrappers to the environment, the final wrapped -# state consists of 4 gray-scaled consecutive frames stacked together, as -# shown above in the image on the left. Each time Mario makes an action, -# the environment responds with a state of this structure. The structure -# is represented by a 3-D array of size ``[4, 84, 84]``. +# 앞서 소개한 래퍼를 환경에 적용한 후, +# 최종 래핑 상태는 왼쪽 아래 이미지에 표시된 것처럼 4개의 연속된 흑백 프레임으로 +# 구성됩니다. 마리오가 행동을 할 때마다, +# 환경은 이 구조의 상태로 응답합니다. +# 구조는 ``[4, 84, 84]`` 크기의 3차원 배열로 구성되어 있습니다. # # .. figure:: /_static/img/mario_env.png # :alt: picture @@ -220,20 +219,20 @@ def observation(self, observation): ###################################################################### -# Agent -# """"""""" +# 에이전트(Agent) +# """"""""""""""""" # -# We create a class ``Mario`` to represent our agent in the game. Mario -# should be able to: +# ``Mario`` 라는 클래스를 이 게임의 에이전트로 생성합니다. +# 마리오는 다음과 같은 기능을 할 수 있어야 합니다. # -# - **Act** according to the optimal action policy based on the current -# state (of the environment). +# - **행동(Act)** 은 (환경의) 현재 상태를 기반으로 +# 최적의 행동 정책에 따라 정해집니다. # -# - **Remember** experiences. Experience = (current state, current -# action, reward, next state). Mario *caches* and later *recalls* his -# experiences to update his action policy. +# - 경험을 **기억(Remember)** 하는 것. +# 경험은 (현재 상태, 현재 행동, 포상, 다음 상태) 로 이루어져 있습니다. +# 마리오는 그의 행동 정책을 업데이트 하기 위해 *캐시(caches)* 를 한 다음, 그의 경험을 *리콜(recalls)* 합니다. # -# - **Learn** a better action policy over time +# - **학습(Learn)** 을 통해 시간이 지남에 따라 더 나은 행동 정책을 택합니다. 
# @@ -242,38 +241,38 @@ def __init__(): pass def act(self, state): - """Given a state, choose an epsilon-greedy action""" + """상태가 주어지면, 입실론-그리디 행동(epsilon-greedy action)을 선택해야 합니다.""" pass def cache(self, experience): - """Add the experience to memory""" + """메모리에 경험을 추가합니다.""" pass def recall(self): - """Sample experiences from memory""" + """메모리로부터 경험을 샘플링합니다.""" pass def learn(self): - """Update online action value (Q) function with a batch of experiences""" + """일련의 경험들로 실시간 행동 가치(online action value) (Q) 함수를 업데이트 합니다.""" pass ###################################################################### -# In the following sections, we will populate Mario’s parameters and -# define his functions. +# 이번 섹션에서는 마리오 클래스의 매개변수를 채우고, +# 마리오 클래스의 함수들을 정의하겠습니다. # ###################################################################### -# Act +# 행동하기(Act) # -------------- # -# For any given state, an agent can choose to do the most optimal action -# (**exploit**) or a random action (**explore**). +# 주어진 상태에 대해, 에이전트는 최적의 행동을 이용할 것인지 +# 임의의 행동을 선택하여 분석할 것인지 선택할 수 있습니다. # -# Mario randomly explores with a chance of ``self.exploration_rate``; when -# he chooses to exploit, he relies on ``MarioNet`` (implemented in -# ``Learn`` section) to provide the most optimal action. +# 마리오는 임의의 행동을 선택했을 때 ``self.exploration_rate`` 를 활용합니다. +# 최적의 행동을 이용한다고 했을 때, 그는 최적의 행동을 수행하기 위해 +# (``학습하기(Learn)`` 섹션에서 구현된) ``MarioNet`` 이 필요합니다. # @@ -285,7 +284,7 @@ def __init__(self, state_dim, action_dim, save_dir): self.use_cuda = torch.cuda.is_available() - # Mario's DNN to predict the most optimal action - we implement this in the Learn section + # 마리오의 DNN은 최적의 행동을 예측합니다 - 이는 학습하기 섹션에서 구현합니다. self.net = MarioNet(self.state_dim, self.action_dim).float() if self.use_cuda: self.net = self.net.to(device="cuda") @@ -295,22 +294,22 @@ def __init__(self, state_dim, action_dim, save_dir): self.exploration_rate_min = 0.1 self.curr_step = 0 - self.save_every = 5e5 # no. of experiences between saving Mario Net + self.save_every = 5e5 # Mario Net 저장 사이의 경험 횟수 def act(self, state): """ - Given a state, choose an epsilon-greedy action and update value of step. + 주어진 상태에서, 입실론-그리디 행동(epsilon-greedy action)을 선택하고, 스텝의 값을 업데이트 합니다. - Inputs: - state(LazyFrame): A single observation of the current state, dimension is (state_dim) - Outputs: - action_idx (int): An integer representing which action Mario will perform + 입력값: + state(LazyFrame): 현재 상태에서의 단일 상태(observation)값을 말합니다. 차원은 (state_dim)입니다. + 출력값: + action_idx (int): Mario가 수행할 행동을 나타내는 정수 값입니다. """ - # EXPLORE + # 임의의 행동을 선택하기 if np.random.rand() < self.exploration_rate: action_idx = np.random.randint(self.action_dim) - # EXPLOIT + # 최적의 행동을 이용하기 else: state = state.__array__() if self.use_cuda: @@ -321,32 +320,32 @@ def act(self, state): action_values = self.net(state, model="online") action_idx = torch.argmax(action_values, axis=1).item() - # decrease exploration_rate + # exploration_rate 감소하기 self.exploration_rate *= self.exploration_rate_decay self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate) - # increment step + # 스텝 수 증가하기 self.curr_step += 1 return action_idx ###################################################################### -# Cache and Recall -# ---------------------- +# 캐시(Cache)와 리콜(Recall)하기 +# ------------------------------ # -# These two functions serve as Mario’s “memory” process. +# 이 두가지 함수는 마리오의 “메모리” 프로세스 역할을 합니다. # -# ``cache()``: Each time Mario performs an action, he stores the -# ``experience`` to his memory. 
His experience includes the current -# *state*, *action* performed, *reward* from the action, the *next state*, -# and whether the game is *done*. +# ``cache()``: 마리오가 행동을 할 때마다, 그는 +# ``경험`` 을 그의 메모리에 저장합니다. 그의 경험에는 현재 *상태* 에 따른 수행된 +# *행동* , 행동으로부터 얻은 *포상* , *다음 상태*, +# 그리고 게임 *완료* 여부가 포함됩니다. # -# ``recall()``: Mario randomly samples a batch of experiences from his -# memory, and uses that to learn the game. +# ``recall()``: Mario는 자신의 기억에서 무작위로 일련의 경험을 샘플링하여 +# 게임을 학습하는 데 사용합니다. # -class Mario(Mario): # subclassing for continuity +class Mario(Mario): # 연속성을 위한 하위 클래스입니다. def __init__(self, state_dim, action_dim, save_dir): super().__init__(state_dim, action_dim, save_dir) self.memory = deque(maxlen=100000) @@ -356,12 +355,12 @@ def cache(self, state, next_state, action, reward, done): """ Store the experience to self.memory (replay buffer) - Inputs: + 입력값: state (LazyFrame), next_state (LazyFrame), action (int), reward (float), - done(bool)) + done (bool)) """ state = state.__array__() next_state = next_state.__array__() @@ -383,7 +382,7 @@ def cache(self, state, next_state, action, reward, done): def recall(self): """ - Retrieve a batch of experiences from memory + 메모리에서 일련의 경험들을 검색합니다. """ batch = random.sample(self.memory, self.batch_size) state, next_state, action, reward, done = map(torch.stack, zip(*batch)) @@ -391,28 +390,28 @@ def recall(self): ###################################################################### -# Learn -# -------------- -# -# Mario uses the `DDQN algorithm `__ -# under the hood. DDQN uses two ConvNets - :math:`Q_{online}` and -# :math:`Q_{target}` - that independently approximate the optimal -# action-value function. -# -# In our implementation, we share feature generator ``features`` across -# :math:`Q_{online}` and :math:`Q_{target}`, but maintain separate FC -# classifiers for each. :math:`\theta_{target}` (the parameters of -# :math:`Q_{target}`) is frozen to prevent updation by backprop. Instead, -# it is periodically synced with :math:`\theta_{online}` (more on this -# later). -# -# Neural Network +# 학습하기(Learn) +# ----------------- +# +# 마리오는 `DDQN 알고리즘 `__ +# 을 사용합니다. DDQN 두개의 ConvNets ( :math:`Q_{online}` 과 +# :math:`Q_{target}` ) 을 사용하고, 독립적으로 최적의 행동-가치 함수에 +# 근사 시키려고 합니다. +# +# 구현을 할 때, 특징 생성기에서 ``특징들`` 을 :math:`Q_{online}` 와 :math:`Q_{target}` +# 에 공유합니다. 그러나 각각의 FC 분류기는 +# 가지고 있도록 설계합니다. :math:`\theta_{target}` (:math:`Q_{target}` +# 의 매개변수 값) 는 역전파에 의해 값이 업데이트 되지 않도록 고정되었습니다. +# 대신, :math:`\theta_{online}` 와 주기적으로 동기화를 진행합니다. +# 이것에 대해서는 추후에 다루도록 하겠습니다.) +# +# 신경망 # ~~~~~~~~~~~~~~~~~~ class MarioNet(nn.Module): - """mini cnn structure - input -> (conv2d + relu) x 3 -> flatten -> (dense + relu) x 2 -> output + """작은 cnn 구조 + 입력 -> (conv2d + relu) x 3 -> flatten -> (dense + relu) x 2 -> 출력 """ def __init__(self, input_dim, output_dim): @@ -439,7 +438,7 @@ def __init__(self, input_dim, output_dim): self.target = copy.deepcopy(self.online) - # Q_target parameters are frozen. + # Q_target 매개변수 값은 고정시킵니다. for p in self.target.parameters(): p.requires_grad = False @@ -451,21 +450,19 @@ def forward(self, input, model): ###################################################################### -# TD Estimate & TD Target +# TD 추정 & TD 목표값 # ~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# Two values are involved in learning: -# -# **TD Estimate** - the predicted optimal :math:`Q^*` for a given state -# :math:`s` +# 학습을 하는데 두 가지 값들이 포함됩니다. # +# **TD 추정** - 주어진 상태 :math:`s` 에서 최적의 예측 :math:`Q^*`. +# # .. 
math:: # # # {TD}_e = Q_{online}^*(s,a) # -# **TD Target** - aggregation of current reward and the estimated -# :math:`Q^*` in the next state :math:`s'` +# **TD 목표** - 현재의 포상과 다음상태 :math:`s'` 에서 추정된 :math:`Q^*` 의 합. # # .. math:: # @@ -477,14 +474,14 @@ def forward(self, input, model): # # {TD}_t = r + \gamma Q_{target}^*(s',a') # -# Because we don’t know what next action :math:`a'` will be, we use the -# action :math:`a'` maximizes :math:`Q_{online}` in the next state -# :math:`s'`. +# 다음 행동 :math:`a'` 가 어떨지 모르기 때문에 +# 다음 상태 :math:`s'` 에서 :math:`Q_{online}` 값이 최대가 되도록 하는 +# 행동 :math:`a'` 를 사용합니다. # -# Notice we use the -# `@torch.no_grad() `__ -# decorator on ``td_target()`` to disable gradient calculations here -# (because we don’t need to backpropagate on :math:`\theta_{target}`). +# 여기에서 변화도 계산을 비활성화하기 위해 +# ``td_target()`` 에서 `@torch.no_grad() `__ +# 데코레이터(decorator)를 사용합니다. +# (:math:`\theta_{target}` 의 역전파 계산이 필요로 하지 않기 때문입니다.) # @@ -510,22 +507,22 @@ def td_target(self, reward, next_state, done): ###################################################################### -# Updating the model +# 모델을 업데이트 하기. # ~~~~~~~~~~~~~~~~~~~~~~ # -# As Mario samples inputs from his replay buffer, we compute :math:`TD_t` -# and :math:`TD_e` and backpropagate this loss down :math:`Q_{online}` to -# update its parameters :math:`\theta_{online}` (:math:`\alpha` is the -# learning rate ``lr`` passed to the ``optimizer``) +# 마리오가 재생 버퍼에서 입력을 샘플링할 때, :math:`TD_t` +# 와 :math:`TD_e` 를 계산합니다. 그리고 이 손실을 이용하여 :math:`Q_{online}` 역전파하여 +# 매개변수 :math:`\theta_{online}` 를 업데이트합니다. (:math:`\alpha` 는 +# ``optimizer`` 에 전달되는 학습률 ``lr`` 입니다.) # # .. math:: # # # \theta_{online} \leftarrow \theta_{online} + \alpha \nabla(TD_e - TD_t) # -# :math:`\theta_{target}` does not update through backpropagation. -# Instead, we periodically copy :math:`\theta_{online}` to -# :math:`\theta_{target}` +# :math:`\theta_{target}` 은 역전파를 통해 업데이트 되지 않습니다. +# 대신, 주기적으로 :math:`\theta_{online}` 의 값을 :math:`\theta_{target}` +# 로 복사합니다. # # .. math:: # @@ -553,8 +550,8 @@ def sync_Q_target(self): ###################################################################### -# Save checkpoint -# ~~~~~~~~~~~~~~~~~~ +# 체크포인트를 저장합니다. +# ~~~~~~~~~~~~~~~~~~~~~~~ # @@ -571,7 +568,7 @@ def save(self): ###################################################################### -# Putting it all together +# 모든 기능을 종합해봅시다. # ~~~~~~~~~~~~~~~~~~~~~~~~~~ # @@ -579,9 +576,9 @@ def save(self): class Mario(Mario): def __init__(self, state_dim, action_dim, save_dir): super().__init__(state_dim, action_dim, save_dir) - self.burnin = 1e4 # min. experiences before training - self.learn_every = 3 # no. of experiences between updates to Q_online - self.sync_every = 1e4 # no. of experiences between Q_target & Q_online sync + self.burnin = 1e4 # 학습을 진행하기 전 최소한의 경험값. + self.learn_every = 3 # Q_online 업데이트 사이의 경험 횟수. + self.sync_every = 1e4 # Q_target과 Q_online sync 사이의 경험 수 def learn(self): if self.curr_step % self.sync_every == 0: @@ -596,23 +593,23 @@ def learn(self): if self.curr_step % self.learn_every != 0: return None, None - # Sample from memory + # 메모리로부터 샘플링을 합니다. state, next_state, action, reward, done = self.recall() - # Get TD Estimate + # TD 추정값을 가져옵니다. td_est = self.td_estimate(state, action) - # Get TD Target + # TD 목표값을 가져옵니다. td_tgt = self.td_target(reward, next_state, done) - # Backpropagate loss through Q_online + # 실시간 Q(Q_online)을 통해 역전파 손실을 계산합니다. 
loss = self.update_Q_online(td_est, td_tgt) return (td_est.mean().item(), loss) ###################################################################### -# Logging +# 기록하기 # -------------- # @@ -635,22 +632,22 @@ def __init__(self, save_dir): self.ep_avg_losses_plot = save_dir / "loss_plot.jpg" self.ep_avg_qs_plot = save_dir / "q_plot.jpg" - # History metrics + # 지표(Metric)와 관련된 리스트입니다. self.ep_rewards = [] self.ep_lengths = [] self.ep_avg_losses = [] self.ep_avg_qs = [] - # Moving averages, added for every call to record() + # 모든 record() 함수를 호출한 후 이동 평균(Moving average)을 계산합니다. self.moving_avg_ep_rewards = [] self.moving_avg_ep_lengths = [] self.moving_avg_ep_avg_losses = [] self.moving_avg_ep_avg_qs = [] - # Current episode metric + # 현재 에피스드에 대한 지표를 기록합니다. self.init_episode() - # Timing + # 시간에 대한 기록입니다. self.record_time = time.time() def log_step(self, reward, loss, q): @@ -662,7 +659,7 @@ def log_step(self, reward, loss, q): self.curr_ep_loss_length += 1 def log_episode(self): - "Mark end of episode" + "에피스드의 끝을 표시합니다." self.ep_rewards.append(self.curr_ep_reward) self.ep_lengths.append(self.curr_ep_length) if self.curr_ep_loss_length == 0: @@ -724,11 +721,11 @@ def record(self, episode, epsilon, step): ###################################################################### -# Let’s play! -# """"""""""""""" +# 게임을 실행시켜봅시다! +# """"""""""""""""""" # -# In this example we run the training loop for 10 episodes, but for Mario to truly learn the ways of -# his world, we suggest running the loop for at least 40,000 episodes! +# 이번 예제에서는 10개의 에피소드에 대해 학습 루프를 실행시켰습니다.하지만 마리오가 진정으로 +# 세계를 학습하기 위해서는 적어도 40000개의 에피소드에 대해 학습을 시킬 것을 제안합니다! # use_cuda = torch.cuda.is_available() print(f"Using CUDA: {use_cuda}") @@ -746,28 +743,28 @@ def record(self, episode, epsilon, step): state = env.reset() - # Play the game! + # 게임을 실행시켜봅시다! while True: - # Run agent on the state + # 현재 상태에서 에이전트 실행하기 action = mario.act(state) - # Agent performs action + # 에이전트가 액션 수행하기 next_state, reward, done, info = env.step(action) - # Remember + # 기억하기 mario.cache(state, next_state, action, reward, done) - # Learn + # 배우기 q, loss = mario.learn() - # Logging + # 기록하기 logger.log_step(reward, loss, q) - # Update state + # 상태 업데이트하기 state = next_state - # Check if end of game + # 게임이 끝났는지 확인하기 if done or info["flag_get"]: break @@ -778,9 +775,9 @@ def record(self, episode, epsilon, step): ###################################################################### -# Conclusion +# 결론 # """"""""""""""" # -# In this tutorial, we saw how we can use PyTorch to train a game-playing AI. You can use the same methods -# to train an AI to play any of the games at the `OpenAI gym `__. Hope you enjoyed this tutorial, feel free to reach us at -# `our github `__! +# 이 튜토리얼에서는 PyTorch를 사용하여 게임 플레이 AI를 훈련하는 방법을 살펴보았습니다. `OpenAI gym `__ +# 에 있는 어떤 게임이든 동일한 방법으로 AI를 훈련시키고 게임을 진행할 수 있습니다. 이 튜토리얼이 도움이 되었기를 바라며, +# `Github 저장소 `__ 에서 편하게 저자들에게 연락을 하셔도 됩니다! 
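[Editor's note, not part of the patch] The translated TD 추정/TD 목표 and 학습하기 comments above describe the Double-DQN update that the tutorial's td_estimate(), td_target() and update_Q_online() methods implement. The sketch below is a minimal, self-contained illustration of that same logic under stated assumptions: a hypothetical two-layer MLP and a randomly generated batch stand in for the tutorial's MarioNet and replay buffer, so only the target computation mirrors the patched file.

import copy
import torch
import torch.nn as nn

gamma = 0.9                        # discount factor; the tutorial uses 0.9 as well
online = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 2))  # hypothetical stand-in net
target = copy.deepcopy(online)     # frozen copy, periodically synced from the online net
for p in target.parameters():
    p.requires_grad = False

# A fake batch of 32 transitions (illustrative shapes, not the tutorial's [4, 84, 84] states)
state = torch.randn(32, 4)
action = torch.randint(0, 2, (32,))
reward = torch.randn(32)
next_state = torch.randn(32, 4)
done = torch.zeros(32)

# TD estimate: Q_online(s, a) for the actions actually taken
td_est = online(state)[torch.arange(32), action]

# TD target: choose a' with the online net, evaluate it with the frozen target net.
# Gradients are disabled here, which is why the tutorial decorates td_target() with @torch.no_grad().
with torch.no_grad():
    best_action = online(next_state).argmax(dim=1)
    next_q = target(next_state)[torch.arange(32), best_action]
    td_tgt = reward + (1 - done) * gamma * next_q

loss = nn.SmoothL1Loss()(td_est, td_tgt)   # the tutorial also uses SmoothL1Loss
loss.backward()                            # gradients flow only into the online parameters

The Double-DQN detail worth noticing is the split between action selection (online net) and action evaluation (target net); theta_target is never updated by backpropagation and is only refreshed by the periodic sync_Q_target() copy described in the patch.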
diff --git a/docs/_downloads/dba562a36c81e67ddf3b2f9503b7009b/super_resolution_with_onnxruntime.ipynb b/docs/_downloads/dba562a36c81e67ddf3b2f9503b7009b/super_resolution_with_onnxruntime.ipynb index 101581b8f..494efd448 100644 --- a/docs/_downloads/dba562a36c81e67ddf3b2f9503b7009b/super_resolution_with_onnxruntime.ipynb +++ b/docs/_downloads/dba562a36c81e67ddf3b2f9503b7009b/super_resolution_with_onnxruntime.ipynb @@ -80,7 +80,7 @@ }, "outputs": [], "source": [ - "# \ubaa8\ub378\uc5d0 \ub300\ud55c \uc785\ub825\uac12\nx = torch.randn(batch_size, 1, 224, 224, requires_grad=True)\ntorch_out = torch_model(x)\n\n# \ubaa8\ub378 \ubcc0\ud658\ntorch.onnx.export(torch_model, # \uc2e4\ud589\ub420 \ubaa8\ub378\n x, # \ubaa8\ub378 \uc785\ub825\uac12 (\ud29c\ud50c \ub610\ub294 \uc5ec\ub7ec \uc785\ub825\uac12\ub4e4\ub3c4 \uac00\ub2a5)\n \"super_resolution.onnx\", # \ubaa8\ub378 \uc800\uc7a5 \uacbd\ub85c (\ud30c\uc77c \ub610\ub294 \ud30c\uc77c\uacfc \uc720\uc0ac\ud55c \uac1d\uccb4 \ubaa8\ub450 \uac00\ub2a5)\n export_params=True, # \ubaa8\ub378 \ud30c\uc77c \uc548\uc5d0 \ud559\uc2b5\ub41c \ubaa8\ub378 \uac00\uc911\uce58\ub97c \uc800\uc7a5\ud560\uc9c0\uc758 \uc5ec\ubd80\n opset_version=10, # \ubaa8\ub378\uc744 \ubcc0\ud658\ud560 \ub54c \uc0ac\uc6a9\ud560 ONNX \ubc84\uc804\n do_constant_folding=True, # \ucd5c\uc801\ud558\uc2dc \uc0c1\uc218\ud3f4\ub529\uc744 \uc0ac\uc6a9\ud560\uc9c0\uc758 \uc5ec\ubd80\n input_names = ['input'], # \ubaa8\ub378\uc758 \uc785\ub825\uac12\uc744 \uac00\ub9ac\ud0a4\ub294 \uc774\ub984\n output_names = ['output'], # \ubaa8\ub378\uc758 \ucd9c\ub825\uac12\uc744 \uac00\ub9ac\ud0a4\ub294 \uc774\ub984\n dynamic_axes={'input' : {0 : 'batch_size'}, # \uac00\ubcc0\uc801\uc778 \uae38\uc774\ub97c \uac00\uc9c4 \ucc28\uc6d0\n 'output' : {0 : 'batch_size'}})" + "# \ubaa8\ub378\uc5d0 \ub300\ud55c \uc785\ub825\uac12\nx = torch.randn(batch_size, 1, 224, 224, requires_grad=True)\ntorch_out = torch_model(x)\n\n# \ubaa8\ub378 \ubcc0\ud658\ntorch.onnx.export(torch_model, # \uc2e4\ud589\ub420 \ubaa8\ub378\n x, # \ubaa8\ub378 \uc785\ub825\uac12 (\ud29c\ud50c \ub610\ub294 \uc5ec\ub7ec \uc785\ub825\uac12\ub4e4\ub3c4 \uac00\ub2a5)\n \"super_resolution.onnx\", # \ubaa8\ub378 \uc800\uc7a5 \uacbd\ub85c (\ud30c\uc77c \ub610\ub294 \ud30c\uc77c\uacfc \uc720\uc0ac\ud55c \uac1d\uccb4 \ubaa8\ub450 \uac00\ub2a5)\n export_params=True, # \ubaa8\ub378 \ud30c\uc77c \uc548\uc5d0 \ud559\uc2b5\ub41c \ubaa8\ub378 \uac00\uc911\uce58\ub97c \uc800\uc7a5\ud560\uc9c0\uc758 \uc5ec\ubd80\n opset_version=10, # \ubaa8\ub378\uc744 \ubcc0\ud658\ud560 \ub54c \uc0ac\uc6a9\ud560 ONNX \ubc84\uc804\n do_constant_folding=True, # \ucd5c\uc801\ud654\uc2dc \uc0c1\uc218\ud3f4\ub529\uc744 \uc0ac\uc6a9\ud560\uc9c0\uc758 \uc5ec\ubd80\n input_names = ['input'], # \ubaa8\ub378\uc758 \uc785\ub825\uac12\uc744 \uac00\ub9ac\ud0a4\ub294 \uc774\ub984\n output_names = ['output'], # \ubaa8\ub378\uc758 \ucd9c\ub825\uac12\uc744 \uac00\ub9ac\ud0a4\ub294 \uc774\ub984\n dynamic_axes={'input' : {0 : 'batch_size'}, # \uac00\ubcc0\uc801\uc778 \uae38\uc774\ub97c \uac00\uc9c4 \ucc28\uc6d0\n 'output' : {0 : 'batch_size'}})" ] }, { diff --git a/docs/_downloads/f6608f362ad07183c0ee305ce6aaa917/super_resolution_with_onnxruntime.py b/docs/_downloads/f6608f362ad07183c0ee305ce6aaa917/super_resolution_with_onnxruntime.py index 8728b9bae..7c2886342 100644 --- a/docs/_downloads/f6608f362ad07183c0ee305ce6aaa917/super_resolution_with_onnxruntime.py +++ b/docs/_downloads/f6608f362ad07183c0ee305ce6aaa917/super_resolution_with_onnxruntime.py @@ -126,7 +126,7 @@ def 
_initialize_weights(self): "super_resolution.onnx", # 모델 저장 경로 (파일 또는 파일과 유사한 객체 모두 가능) export_params=True, # 모델 파일 안에 학습된 모델 가중치를 저장할지의 여부 opset_version=10, # 모델을 변환할 때 사용할 ONNX 버전 - do_constant_folding=True, # 최적하시 상수폴딩을 사용할지의 여부 + do_constant_folding=True, # 최적화시 상수폴딩을 사용할지의 여부 input_names = ['input'], # 모델의 입력값을 가리키는 이름 output_names = ['output'], # 모델의 출력값을 가리키는 이름 dynamic_axes={'input' : {0 : 'batch_size'}, # 가변적인 길이를 가진 차원 diff --git a/docs/_images/sphx_glr_mario_rl_tutorial_001.png b/docs/_images/sphx_glr_mario_rl_tutorial_001.png index 4d4c61d996415ae2dcf07e467201488cf4b1715b..4b76927a57ef500630c5495b552a4285d6385662 100644 GIT binary patch delta 43 zcmca3bVq1{n}VsHp^idENl8JmmA-y%Vo5 + + + diff --git a/docs/_static/clipboard.min.js b/docs/_static/clipboard.min.js index 02c549e35..54b3c4638 100644 --- a/docs/_static/clipboard.min.js +++ b/docs/_static/clipboard.min.js @@ -1,7 +1,7 @@ /*! - * clipboard.js v2.0.4 - * https://zenorocha.github.io/clipboard.js - * + * clipboard.js v2.0.8 + * https://clipboardjs.com/ + * * Licensed MIT © Zeno Rocha */ -!function(t,e){"object"==typeof exports&&"object"==typeof module?module.exports=e():"function"==typeof define&&define.amd?define([],e):"object"==typeof exports?exports.ClipboardJS=e():t.ClipboardJS=e()}(this,function(){return function(n){var o={};function r(t){if(o[t])return o[t].exports;var e=o[t]={i:t,l:!1,exports:{}};return n[t].call(e.exports,e,e.exports,r),e.l=!0,e.exports}return r.m=n,r.c=o,r.d=function(t,e,n){r.o(t,e)||Object.defineProperty(t,e,{enumerable:!0,get:n})},r.r=function(t){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(t,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(t,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return e[t]}.bind(null,o));return n},r.n=function(t){var e=t&&t.__esModule?function(){return t.default}:function(){return t};return r.d(e,"a",e),e},r.o=function(t,e){return Object.prototype.hasOwnProperty.call(t,e)},r.p="",r(r.s=0)}([function(t,e,n){"use strict";var r="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(t){return typeof t}:function(t){return t&&"function"==typeof Symbol&&t.constructor===Symbol&&t!==Symbol.prototype?"symbol":typeof t},i=function(){function o(t,e){for(var n=0;n + diff --git a/docs/_static/copybutton.css b/docs/_static/copybutton.css index 75b17a83d..40eafe5fc 100644 --- a/docs/_static/copybutton.css +++ b/docs/_static/copybutton.css @@ -1,30 +1,50 @@ /* Copy buttons */ -a.copybtn { +button.copybtn { position: absolute; - top: .2em; - right: .2em; - width: 1em; - height: 1em; - opacity: .3; - transition: opacity 0.5s; - border: none; + display: flex; + top: .3em; + right: .3em; + width: 1.7em; + height: 1.7em; + opacity: 0; + transition: opacity 0.3s, border .3s, background-color .3s; user-select: none; + padding: 0; + border: none; + outline: none; + border-radius: 0.4em; + /* The colors that GitHub uses */ + border: #1b1f2426 1px solid; + background-color: #f6f8fa; + color: #57606a; +} + +button.copybtn.success { + border-color: #22863a; + color: #22863a; +} + +button.copybtn svg { + stroke: currentColor; + width: 1.5em; + height: 1.5em; + padding: 0.1em; } div.highlight { position: relative; } -a.copybtn > img { - vertical-align: top; - margin: 0; - top: 0; - left: 0; - position: absolute; 
+.highlight:hover button.copybtn { + opacity: 1; } -.highlight:hover .copybtn { - opacity: 1; +.highlight button.copybtn:hover { + background-color: rgb(235, 235, 235); +} + +.highlight button.copybtn:active { + background-color: rgb(187, 187, 187); } /** @@ -44,11 +64,10 @@ a.copybtn > img { visibility: hidden; position: absolute; content: attr(data-tooltip); - padding: 2px; - top: 0; + padding: .2em; + font-size: .8em; left: -.2em; background: grey; - font-size: 1rem; color: white; white-space: nowrap; z-index: 2; @@ -65,3 +84,10 @@ a.copybtn > img { transition: opacity 0.2s cubic-bezier(0.64, 0.09, 0.08, 1), transform 0.2s cubic-bezier(0.64, 0.09, 0.08, 1); transition-delay: .5s; } + +/* By default the copy button shouldn't show up when printing a page */ +@media print { + button.copybtn { + display: none; + } +} diff --git a/docs/_static/copybutton.js b/docs/_static/copybutton.js index 65a59167a..40ac33108 100644 --- a/docs/_static/copybutton.js +++ b/docs/_static/copybutton.js @@ -17,6 +17,30 @@ const messages = { 'copy_to_clipboard': 'In die Zwischenablage kopieren', 'copy_success': 'Kopiert!', 'copy_failure': 'Fehler beim Kopieren', + }, + 'fr' : { + 'copy': 'Copier', + 'copy_to_clipboard': 'Copié dans le presse-papier', + 'copy_success': 'Copié !', + 'copy_failure': 'Échec de la copie', + }, + 'ru': { + 'copy': 'Скопировать', + 'copy_to_clipboard': 'Скопировать в буфер', + 'copy_success': 'Скопировано!', + 'copy_failure': 'Не удалось скопировать', + }, + 'zh-CN': { + 'copy': '复制', + 'copy_to_clipboard': '复制到剪贴板', + 'copy_success': '复制成功!', + 'copy_failure': '复制失败', + }, + 'it' : { + 'copy': 'Copiare', + 'copy_to_clipboard': 'Copiato negli appunti', + 'copy_success': 'Copiato!', + 'copy_failure': 'Errore durante la copia', } } @@ -26,6 +50,31 @@ if( document.documentElement.lang !== undefined locale = document.documentElement.lang } +let doc_url_root = DOCUMENTATION_OPTIONS.URL_ROOT; +if (doc_url_root == '#') { + doc_url_root = ''; +} + +/** + * SVG files for our copy buttons + */ +let iconCheck = ` + ${messages[locale]['copy_success']} + + +` + +// If the user specified their own SVG use that, otherwise use the default +let iconCopy = ``; +if (!iconCopy) { + iconCopy = ` + ${messages[locale]['copy_to_clipboard']} + + + +` +} + /** * Set up copy/paste for code blocks */ @@ -54,10 +103,17 @@ const clearSelection = () => { } // Changes tooltip text for two seconds, then changes it back -const temporarilyChangeTooltip = (el, newText) => { - const oldText = el.getAttribute('data-tooltip') +const temporarilyChangeTooltip = (el, oldText, newText) => { el.setAttribute('data-tooltip', newText) + el.classList.add('success') setTimeout(() => el.setAttribute('data-tooltip', oldText), 2000) + setTimeout(() => el.classList.remove('success'), 2000) +} + +// Changes the copy button icon for two seconds, then changes it back +const temporarilyChangeIcon = (el) => { + el.innerHTML = iconCheck; + setTimeout(() => {el.innerHTML = iconCopy}, 2000) } const addCopyButtonToCodeCells = () => { @@ -73,12 +129,11 @@ const addCopyButtonToCodeCells = () => { codeCells.forEach((codeCell, index) => { const id = codeCellId(index) codeCell.setAttribute('id', id) - const pre_bg = getComputedStyle(codeCell).backgroundColor; const clipboardButton = id => - ` - ${messages[locale]['copy_to_clipboard']} - ` + `` codeCell.insertAdjacentHTML('afterend', clipboardButton(id)) }) @@ -88,11 +143,15 @@ function escapeRegExp(string) { // Callback when a copy button is clicked. 
Will be passed the node that was clicked // should then grab the text and replace pieces of text that shouldn't be used in output -function formatCopyText(textContent, copybuttonPromptText, isRegexp = false, onlyCopyPromptLines = true, removePrompts = true) { +function formatCopyText(textContent, copybuttonPromptText, isRegexp = false, onlyCopyPromptLines = true, removePrompts = true, copyEmptyLines = true, lineContinuationChar = "", hereDocDelim = "") { var regexp; var match; + // Do we check for line continuation characters and "HERE-documents"? + var useLineCont = !!lineContinuationChar + var useHereDoc = !!hereDocDelim + // create regexp to capture prompt and remaining line if (isRegexp) { regexp = new RegExp('^(' + copybuttonPromptText + ')(.*)') @@ -102,24 +161,31 @@ function formatCopyText(textContent, copybuttonPromptText, isRegexp = false, onl const outputLines = []; var promptFound = false; + var gotLineCont = false; + var gotHereDoc = false; + const lineGotPrompt = []; for (const line of textContent.split('\n')) { match = line.match(regexp) - if (match) { - promptFound = true - if (removePrompts) { + if (match || gotLineCont || gotHereDoc) { + promptFound = regexp.test(line) + lineGotPrompt.push(promptFound) + if (removePrompts && promptFound) { outputLines.push(match[2]) } else { outputLines.push(line) } - } else { - if (!onlyCopyPromptLines) { - outputLines.push(line) - } + gotLineCont = line.endsWith(lineContinuationChar) & useLineCont + if (line.includes(hereDocDelim) & useHereDoc) + gotHereDoc = !gotHereDoc + } else if (!onlyCopyPromptLines) { + outputLines.push(line) + } else if (copyEmptyLines && line.trim() === '') { + outputLines.push(line) } } // If no lines with the prompt were found then just use original lines - if (promptFound) { + if (lineGotPrompt.some(v => v === true)) { textContent = outputLines.join('\n'); } @@ -133,7 +199,7 @@ function formatCopyText(textContent, copybuttonPromptText, isRegexp = false, onl var copyTargetText = (trigger) => { var target = document.querySelector(trigger.attributes['data-clipboard-target'].value); - return formatCopyText(target.innerText, '', false, true, true) + return formatCopyText(target.innerText, '', false, true, true, true, '', '') } // Initialize with a callback so we can modify the text before copy @@ -142,11 +208,12 @@ var copyTargetText = (trigger) => { // Update UI with error/success messages clipboard.on('success', event => { clearSelection() - temporarilyChangeTooltip(event.trigger, messages[locale]['copy_success']) + temporarilyChangeTooltip(event.trigger, messages[locale]['copy'], messages[locale]['copy_success']) + temporarilyChangeIcon(event.trigger) }) clipboard.on('error', event => { - temporarilyChangeTooltip(event.trigger, messages[locale]['copy_failure']) + temporarilyChangeTooltip(event.trigger, messages[locale]['copy'], messages[locale]['copy_failure']) }) } diff --git a/docs/_static/copybutton_funcs.js b/docs/_static/copybutton_funcs.js index 57caa5585..b9168c556 100644 --- a/docs/_static/copybutton_funcs.js +++ b/docs/_static/copybutton_funcs.js @@ -4,11 +4,15 @@ function escapeRegExp(string) { // Callback when a copy button is clicked. 
Will be passed the node that was clicked // should then grab the text and replace pieces of text that shouldn't be used in output -export function formatCopyText(textContent, copybuttonPromptText, isRegexp = false, onlyCopyPromptLines = true, removePrompts = true) { +export function formatCopyText(textContent, copybuttonPromptText, isRegexp = false, onlyCopyPromptLines = true, removePrompts = true, copyEmptyLines = true, lineContinuationChar = "", hereDocDelim = "") { var regexp; var match; + // Do we check for line continuation characters and "HERE-documents"? + var useLineCont = !!lineContinuationChar + var useHereDoc = !!hereDocDelim + // create regexp to capture prompt and remaining line if (isRegexp) { regexp = new RegExp('^(' + copybuttonPromptText + ')(.*)') @@ -18,24 +22,31 @@ export function formatCopyText(textContent, copybuttonPromptText, isRegexp = fal const outputLines = []; var promptFound = false; + var gotLineCont = false; + var gotHereDoc = false; + const lineGotPrompt = []; for (const line of textContent.split('\n')) { match = line.match(regexp) - if (match) { - promptFound = true - if (removePrompts) { + if (match || gotLineCont || gotHereDoc) { + promptFound = regexp.test(line) + lineGotPrompt.push(promptFound) + if (removePrompts && promptFound) { outputLines.push(match[2]) } else { outputLines.push(line) } - } else { - if (!onlyCopyPromptLines) { - outputLines.push(line) - } + gotLineCont = line.endsWith(lineContinuationChar) & useLineCont + if (line.includes(hereDocDelim) & useHereDoc) + gotHereDoc = !gotHereDoc + } else if (!onlyCopyPromptLines) { + outputLines.push(line) + } else if (copyEmptyLines && line.trim() === '') { + outputLines.push(line) } } // If no lines with the prompt were found then just use original lines - if (promptFound) { + if (lineGotPrompt.some(v => v === true)) { textContent = outputLines.join('\n'); } diff --git a/docs/advanced/ONNXLive.html b/docs/advanced/ONNXLive.html index 0df27da2a..b1ea12153 100644 --- a/docs/advanced/ONNXLive.html +++ b/docs/advanced/ONNXLive.html @@ -9,7 +9,7 @@ - ONNX Live 튜토리얼 — PyTorch Tutorials 1.10.2+cu102 documentation + ONNX Live 튜토리얼 — PyTorch Tutorials 1.11.0+cu102 documentation @@ -111,7 +111,7 @@
- 1.10.2+cu102 + 1.11.0+cu102
@@ -207,7 +207,7 @@

강화학습

PyTorch 모델을 프로덕션 환경에 배포하기

병렬 및 분산 학습