`__
+
.. raw:: html
@@ -359,7 +368,7 @@
:card_description: This tutorial covers how to run quantized and fused models on a Raspberry Pi 4 at 30 fps.
:image: _static/img/thumbnails/cropped/realtime_rpi.png
:link: intermediate/realtime_rpi.html
- :tags: TorchScript,Model Optimization,Image/Video,Quantization
+ :tags: TorchScript,Model-Optimization,Image/Video,Quantization
.. customcarditem::
:header: Autograd in C++ Frontend
@@ -475,6 +484,13 @@
:link: advanced/static_quantization_tutorial.html
:tags: Quantization
+.. customcarditem::
+ :header: Grokking PyTorch Intel CPU Performance from First Principles
+ :card_description: A case study on the TorchServe inference framework optimized with Intel® Extension for PyTorch.
+ :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png
+ :link: intermediate/torchserve_with_ipex
+ :tags: Model-Optimization,Production
+
.. Parallel-and-Distributed-Training
.. customcarditem::
@@ -592,6 +608,14 @@
:link: intermediate/torchrec_tutorial.html
:tags: TorchRec,Recommender
+.. customcarditem::
+ :header: Exploring TorchRec sharding
+ :card_description: This tutorial covers the sharding schemes of embedding tables by using the EmbeddingPlanner and DistributedModelParallel API.
+ :image: _static/img/thumbnails/torchrec.png
+ :link: advanced/sharding.html
+ :tags: TorchRec,Recommender
+
+
.. End of tutorial card section
.. raw:: html
@@ -831,6 +855,7 @@
intermediate/dynamic_quantization_bert_tutorial
intermediate/quantized_transfer_learning_tutorial
advanced/static_quantization_tutorial
+ intermediate/torchserve_with_ipex
.. toctree::
:maxdepth: 2
@@ -868,4 +893,5 @@
:hidden:
:caption: Recommendation Systems
- intermediate/torchrec_tutorial
\ No newline at end of file
+ intermediate/torchrec_tutorial
+ advanced/sharding
diff --git a/intermediate_source/FSDP_adavnced_tutorial.rst b/intermediate_source/FSDP_adavnced_tutorial.rst
new file mode 100644
index 000000000..1adbf9722
--- /dev/null
+++ b/intermediate_source/FSDP_adavnced_tutorial.rst
@@ -0,0 +1,602 @@
+Advanced Fully Sharded Data Parallel (FSDP) Tutorial
+=====================================================
+
+**Author**: `Hamid Shojanazeri `__, `Less Wright `__, `Rohan Varma `__, `Yanli Zhao `__
+
+
+This tutorial introduces more advanced features of Fully Sharded Data Parallel (FSDP) as part of the PyTorch 1.12 release. To get familiar with FSDP, please refer to the `FSDP getting started tutorial `__.
+
+In this tutorial, we fine-tune a HuggingFace (HF) T5 model with FSDP for text summarization as a working example.
+
+The example uses the WikiHow dataset and, for simplicity, we will showcase training on a single node: a P4dn instance with 8 A100 GPUs. We will soon publish a blog post on large-scale FSDP training on a multi-node cluster; please stay tuned for that on the PyTorch medium channel.
+
+FSDP is a production-ready package with a focus on ease of use, performance, and long-term support.
+One of the main benefits of FSDP is reducing the memory footprint on each GPU. This enables training of larger models with lower total memory compared to DDP, and leverages the overlap of computation and communication to train models efficiently.
+This reduced memory pressure can be leveraged to either train larger models or increase batch size, potentially improving overall training throughput.
+You can read more about PyTorch FSDP `here `__.
+
+
+FSDP Features in This Tutorial
+------------------------------
+* Transformer Auto Wrap Policy
+* Mixed Precision
+* Initializing FSDP Model on Device
+* Sharding Strategy
+* Backward Prefetch
+* Model Checkpoint Saving via Streaming to CPU
+
+
+
+Recap on How FSDP Works
+-----------------------
+
+At a high level, FSDP works as follows:
+
+*In constructor*
+
+* Shard model parameters and each rank only keeps its own shard
+
+*In forward pass*
+
+* Run `all_gather` to collect all shards from all ranks to recover the full parameter for this FSDP unit
+* Run forward computation
+* Discard non-owned parameter shards it has just collected to free memory
+
+*In backward pass*
+
+* Run `all_gather` to collect all shards from all ranks to recover the full parameter in this FSDP unit
+* Run backward computation
+* Discard non-owned parameters to free memory.
+* Run reduce_scatter to sync gradients
+
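+To make this flow concrete, here is a minimal, self-contained sketch (not part of the tutorial script, and not how FSDP is implemented internally, which operates on flattened parameters per FSDP unit) that mimics one unit with raw collectives. It assumes a hypothetical file name ``fsdp_flow_sketch.py`` launched with ``torchrun`` on a host with at least 2 GPUs:
+
+.. code-block:: python
+
+    # fsdp_flow_sketch.py -- launch with: torchrun --nproc_per_node 2 fsdp_flow_sketch.py
+    import torch
+    import torch.distributed as dist
+
+    dist.init_process_group("nccl")
+    rank, world_size = dist.get_rank(), dist.get_world_size()
+    device = torch.device("cuda", rank % torch.cuda.device_count())
+    torch.cuda.set_device(device)
+
+    # constructor: each rank keeps only its own shard of the parameter
+    shard = torch.full((4,), float(rank), device=device)
+
+    # forward/backward: all_gather recovers the full parameter for this unit
+    gathered = [torch.empty(4, device=device) for _ in range(world_size)]
+    dist.all_gather(gathered, shard)
+    full_param = torch.cat(gathered)
+
+    # ... forward/backward computation on full_param would run here ...
+    full_grad = torch.ones_like(full_param)  # stand-in for the computed gradient
+
+    # discard the non-owned shards it has just collected, to free memory
+    del gathered, full_param
+
+    # backward: reduce_scatter leaves each rank with the summed gradient of its own shard
+    grad_shard = torch.empty(4, device=device)
+    dist.reduce_scatter(grad_shard, list(full_grad.chunk(world_size)), op=dist.ReduceOp.SUM)
+
+    dist.destroy_process_group()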
+
+Fine-tuning HF T5
+-----------------
+HF T5 pre-trained models are available in four different sizes, ranging from small with 60 million parameters to XXL with 11 billion parameters. In this tutorial, we demonstrate fine-tuning a T5 3B model with FSDP for text summarization using the WikiHow dataset.
+The main focus of this tutorial is to highlight the different available features in FSDP that are helpful for training large-scale models above 3B parameters. Also, we cover specific features for transformer-based models. The code for this tutorial is available in `PyTorch Examples `__.
+
+
+*Setup*
+
+1.1 Install PyTorch Nightlies
+
+We will install PyTorch nightlies, since some of the features, such as activation checkpointing, are only available in nightlies and will be added to the next PyTorch release after 1.12.
+
+.. code-block:: bash
+
+ pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html
+
+1.2 Dataset Setup
+
+Please create a `data` folder, download the WikiHow dataset from `wikihowAll.csv `__ and `wikihowSep.csv `__, and place them in the `data` folder.
+We will use the WikiHow dataset from `summarization_dataset `__.
+
+Next, we add the following code snippets to a Python script “T5_training.py”. Note - The full source code for this tutorial is available in `PyTorch examples `__.
+
+1.3 Import necessary packages:
+
+.. code-block:: python
+
+ import os
+ import argparse
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.optim as optim
+ from transformers import AutoTokenizer, GPT2TokenizerFast
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
+ import functools
+ from torch.optim.lr_scheduler import StepLR
+ import torch.nn.functional as F
+ import torch.distributed as dist
+ import torch.multiprocessing as mp
+ from torch.nn.parallel import DistributedDataParallel as DDP
+ from torch.utils.data.distributed import DistributedSampler
+ from transformers.models.t5.modeling_t5 import T5Block
+
+ from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+ checkpoint_wrapper,
+ CheckpointImpl,
+ apply_activation_checkpointing_wrapper)
+
+ from torch.distributed.fsdp import (
+ FullyShardedDataParallel as FSDP,
+ MixedPrecision,
+ BackwardPrefetch,
+ ShardingStrategy,
+ FullStateDictConfig,
+ StateDictType,
+ )
+ from torch.distributed.fsdp.wrap import (
+ transformer_auto_wrap_policy,
+ enable_wrap,
+ wrap,
+ )
+ from functools import partial
+ from torch.utils.data import DataLoader
+ from pathlib import Path
+ from summarization_dataset import *
+ from transformers.models.t5.modeling_t5 import T5Block
+ from typing import Type
+ import time
+ import tqdm
+    from datetime import datetime
+    # used below in the BFloat16 support check (see the "Mixed Precision" section)
+    from distutils.version import LooseVersion
+    from torch.cuda import nccl
+
+1.4 Distributed training setup.
+Here we use two helper functions to initialize the processes for distributed training and to clean up after training completion.
+In this tutorial, we are going to use torch elastic, via `torchrun `__, which will set the worker `RANK` and `WORLD_SIZE` automatically.
+
+.. code-block:: python
+
+ def setup():
+ # initialize the process group
+ dist.init_process_group("nccl")
+
+ def cleanup():
+ dist.destroy_process_group()
+
+2.1 Set up the HuggingFace T5 model:
+
+.. code-block:: python
+
+ def setup_model(model_name):
+ model = T5ForConditionalGeneration.from_pretrained(model_name)
+ tokenizer = T5Tokenizer.from_pretrained(model_name)
+ return model, tokenizer
+
+We also add a couple of helper functions here, one for creating a date-time stamp and one for formatting memory metrics.
+
+.. code-block:: python
+
+ def get_date_of_run():
+ """create date and time for file save uniqueness
+ example: 2022-05-07-08:31:12_PM'
+ """
+ date_of_run = datetime.now().strftime("%Y-%m-%d-%I:%M:%S_%p")
+ print(f"--> current date and time of run = {date_of_run}")
+ return date_of_run
+
+ def format_metrics_to_gb(item):
+ """quick function to format numbers to gigabyte and round to 4 digit precision"""
+ metric_num = item / g_gigabyte
+ metric_num = round(metric_num, ndigits=4)
+ return metric_num
+
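+Note that ``format_metrics_to_gb`` references a module-level constant ``g_gigabyte`` that is not shown above; a reasonable definition, consistent with converting bytes to gigabytes, is:
+
+.. code-block:: python
+
+    g_gigabyte = 1024**3  # bytes per gigabyte, used by format_metrics_to_gb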
+
+2.2 Define a train function:
+
+.. code-block:: python
+
+ def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None):
+ model.train()
+ local_rank = int(os.environ['LOCAL_RANK'])
+ fsdp_loss = torch.zeros(2).to(local_rank)
+
+ if sampler:
+ sampler.set_epoch(epoch)
+ if rank==0:
+ inner_pbar = tqdm.tqdm(
+ range(len(train_loader)), colour="blue", desc="r0 Training Epoch"
+ )
+ for batch in train_loader:
+ for key in batch.keys():
+ batch[key] = batch[key].to(local_rank)
+ optimizer.zero_grad()
+ output = model(input_ids=batch["source_ids"],attention_mask=batch["source_mask"],labels=batch["target_ids"] )
+ loss = output["loss"]
+ loss.backward()
+ optimizer.step()
+ fsdp_loss[0] += loss.item()
+ fsdp_loss[1] += len(batch)
+ if rank==0:
+ inner_pbar.update(1)
+
+ dist.all_reduce(fsdp_loss, op=dist.ReduceOp.SUM)
+ train_accuracy = fsdp_loss[0] / fsdp_loss[1]
+
+
+ if rank == 0:
+ inner_pbar.close()
+ print(
+ f"Train Epoch: \t{epoch}, Loss: \t{train_accuracy:.4f}"
+ )
+ return train_accuracy
+
+2.3 Define a validation function:
+
+.. code-block:: python
+
+ def validation(model, rank, world_size, val_loader):
+ model.eval()
+ correct = 0
+ local_rank = int(os.environ['LOCAL_RANK'])
+ fsdp_loss = torch.zeros(3).to(local_rank)
+ if rank == 0:
+ inner_pbar = tqdm.tqdm(
+ range(len(val_loader)), colour="green", desc="Validation Epoch"
+ )
+ with torch.no_grad():
+ for batch in val_loader:
+ for key in batch.keys():
+ batch[key] = batch[key].to(local_rank)
+ output = model(input_ids=batch["source_ids"],attention_mask=batch["source_mask"],labels=batch["target_ids"])
+ fsdp_loss[0] += output["loss"].item() # sum up batch loss
+ fsdp_loss[1] += len(batch)
+
+ if rank==0:
+ inner_pbar.update(1)
+
+ dist.all_reduce(fsdp_loss, op=dist.ReduceOp.SUM)
+ val_loss = fsdp_loss[0] / fsdp_loss[1]
+ if rank == 0:
+ inner_pbar.close()
+ print(f"Validation Loss: {val_loss:.4f}")
+ return val_loss
+
+
+2.4 Define a distributed train function that wraps the model in FSDP:
+
+
+.. code-block:: python
+
+
+ def fsdp_main(args):
+
+ model, tokenizer = setup_model("t5-base")
+
+ local_rank = int(os.environ['LOCAL_RANK'])
+ rank = int(os.environ['RANK'])
+ world_size = int(os.environ['WORLD_SIZE'])
+
+
+ dataset = load_dataset('wikihow', 'all', data_dir='data/')
+ print(dataset.keys())
+ print("Size of train dataset: ", dataset['train'].shape)
+ print("Size of Validation dataset: ", dataset['validation'].shape)
+
+
+ #wikihow(tokenizer, type_path, num_samples, input_length, output_length, print_text=False)
+ train_dataset = wikihow(tokenizer, 'train', 1500, 512, 150, False)
+ val_dataset = wikihow(tokenizer, 'validation', 300, 512, 150, False)
+
+ sampler1 = DistributedSampler(train_dataset, rank=rank, num_replicas=world_size, shuffle=True)
+ sampler2 = DistributedSampler(val_dataset, rank=rank, num_replicas=world_size)
+
+ setup()
+
+
+ train_kwargs = {'batch_size': args.batch_size, 'sampler': sampler1}
+ test_kwargs = {'batch_size': args.test_batch_size, 'sampler': sampler2}
+ cuda_kwargs = {'num_workers': 2,
+ 'pin_memory': True,
+ 'shuffle': False}
+ train_kwargs.update(cuda_kwargs)
+ test_kwargs.update(cuda_kwargs)
+
+ train_loader = torch.utils.data.DataLoader(train_dataset,**train_kwargs)
+ val_loader = torch.utils.data.DataLoader(val_dataset, **test_kwargs)
+
+ t5_auto_wrap_policy = functools.partial(
+ transformer_auto_wrap_policy,
+ transformer_layer_cls={
+ T5Block,
+ },
+ )
+ sharding_strategy: ShardingStrategy = ShardingStrategy.SHARD_GRAD_OP #for Zero2 and FULL_SHARD for Zero3
+ torch.cuda.set_device(local_rank)
+
+
+ #init_start_event = torch.cuda.Event(enable_timing=True)
+ #init_end_event = torch.cuda.Event(enable_timing=True)
+
+ #init_start_event.record()
+
+ bf16_ready = (
+ torch.version.cuda
+ and torch.cuda.is_bf16_supported()
+ and LooseVersion(torch.version.cuda) >= "11.0"
+ and dist.is_nccl_available()
+ and nccl.version() >= (2, 10)
+ )
+
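+    # bfSixteen is the bfloat16 MixedPrecision policy shown in the
+    # "Mixed Precision" section later in this tutorial.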
+ if bf16_ready:
+ mp_policy = bfSixteen
+ else:
+ mp_policy = None # defaults to fp32
+
+ # model is on CPU before input to FSDP
+ model = FSDP(model,
+ auto_wrap_policy=t5_auto_wrap_policy,
+ mixed_precision=mp_policy,
+ #sharding_strategy=sharding_strategy,
+ device_id=torch.cuda.current_device())
+
+ optimizer = optim.AdamW(model.parameters(), lr=args.lr)
+
+ scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
+ best_val_loss = float("inf")
+ curr_val_loss = float("inf")
+ file_save_name = "T5-model-"
+
+ if rank == 0:
+ time_of_run = get_date_of_run()
+ dur = []
+ train_acc_tracking = []
+ val_acc_tracking = []
+ training_start_time = time.time()
+
+ if rank == 0 and args.track_memory:
+ mem_alloc_tracker = []
+ mem_reserved_tracker = []
+
+ for epoch in range(1, args.epochs + 1):
+ t0 = time.time()
+ train_accuracy = train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=sampler1)
+ if args.run_validation:
+ curr_val_loss = validation(model, rank, world_size, val_loader)
+ scheduler.step()
+
+ if rank == 0:
+
+ print(f"--> epoch {epoch} completed...entering save and stats zone")
+
+ dur.append(time.time() - t0)
+ train_acc_tracking.append(train_accuracy.item())
+
+ if args.run_validation:
+ val_acc_tracking.append(curr_val_loss.item())
+
+ if args.track_memory:
+ mem_alloc_tracker.append(
+ format_metrics_to_gb(torch.cuda.memory_allocated())
+ )
+ mem_reserved_tracker.append(
+ format_metrics_to_gb(torch.cuda.memory_reserved())
+ )
+ print(f"completed save and stats zone...")
+
+ if args.save_model and curr_val_loss < best_val_loss:
+
+ # save
+ if rank == 0:
+ print(f"--> entering save model state")
+
+ save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
+ with FSDP.state_dict_type(
+ model, StateDictType.FULL_STATE_DICT, save_policy
+ ):
+ cpu_state = model.state_dict()
+ #print(f"saving process: rank {rank} done w state_dict")
+
+
+ if rank == 0:
+ print(f"--> saving model ...")
+ currEpoch = (
+ "-" + str(epoch) + "-" + str(round(curr_val_loss.item(), 4)) + ".pt"
+ )
+ print(f"--> attempting to save model prefix {currEpoch}")
+ save_name = file_save_name + "-" + time_of_run + "-" + currEpoch
+ print(f"--> saving as model name {save_name}")
+
+ torch.save(cpu_state, save_name)
+
+ if curr_val_loss < best_val_loss:
+
+ best_val_loss = curr_val_loss
+ if rank==0:
+ print(f"-->>>> New Val Loss Record: {best_val_loss}")
+
+ dist.barrier()
+ cleanup()
+
+
+2.5 Parse the arguments and set the main function:
+
+.. code-block:: python
+
+
+ if __name__ == '__main__':
+ # Training settings
+ parser = argparse.ArgumentParser(description='PyTorch T5 FSDP Example')
+ parser.add_argument('--batch-size', type=int, default=4, metavar='N',
+ help='input batch size for training (default: 64)')
+ parser.add_argument('--test-batch-size', type=int, default=4, metavar='N',
+ help='input batch size for testing (default: 1000)')
+ parser.add_argument('--epochs', type=int, default=2, metavar='N',
+ help='number of epochs to train (default: 3)')
+ parser.add_argument('--lr', type=float, default=.002, metavar='LR',
+ help='learning rate (default: .002)')
+ parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
+ help='Learning rate step gamma (default: 0.7)')
+ parser.add_argument('--no-cuda', action='store_true', default=False,
+ help='disables CUDA training')
+ parser.add_argument('--seed', type=int, default=1, metavar='S',
+ help='random seed (default: 1)')
+ parser.add_argument('--track_memory', action='store_false', default=True,
+ help='track the gpu memory')
+ parser.add_argument('--run_validation', action='store_false', default=True,
+ help='running the validation')
+ parser.add_argument('--save-model', action='store_false', default=True,
+ help='For Saving the current Model')
+ args = parser.parse_args()
+
+ torch.manual_seed(args.seed)
+
+ fsdp_main(args)
+
+
+To run the training using torchrun:
+
+.. code-block:: bash
+
+ torchrun --nnodes 1 --nproc_per_node 4 T5_training.py
+
+.. _transformer_wrapping_policy:
+
+Transformer Wrapping Policy
+---------------------------
+As discussed in the `previous tutorial `__, ``auto_wrap_policy`` is one of the FSDP features that makes it easy to automatically shard a given model and put the model, optimizer, and gradient shards into distinct FSDP units.
+
+For some architectures, such as Transformer encoder-decoders, some parts of the model, such as the embedding table, are shared between the encoder and decoder.
+In this case, we need to place the embedding table in the outer FSDP unit so that it can be accessed from both the encoder and the decoder. In addition, by registering the layer class for a transformer, the sharding plan can be made much more communication efficient. In PyTorch 1.12, FSDP added this support and now we have a wrapping policy for transformers.
+
+It can be created as follows, where ``T5Block`` represents the T5 transformer layer class (holding MHSA and FFN).
+
+
+.. code-block:: python
+
+ t5_auto_wrap_policy = functools.partial(
+ transformer_auto_wrap_policy,
+ transformer_layer_cls={
+ T5Block,
+ },
+ )
+ torch.cuda.set_device(local_rank)
+
+
+ model = FSDP(model,
+        auto_wrap_policy=t5_auto_wrap_policy)
+
+To see the wrapped model, you can simply print the model and visually inspect the sharding and the FSDP units, as in the sketch below.
+
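+For example, printing only on rank 0 to avoid duplicated output (a minimal sketch, reusing ``rank`` from section 2.4):
+
+.. code-block:: python
+
+    if rank == 0:
+        print(model)  # shows nested FullyShardedDataParallel units wrapping each T5Block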
+
+Mixed Precision
+---------------
+FSDP supports flexible mixed precision training, allowing for arbitrary reduced-precision types (such as fp16 or bfloat16). Currently, BFloat16 is only available natively on Ampere GPUs, so you need to confirm native support before you use it. On V100s, for example, BFloat16 can still be run, but because it runs non-natively, it can result in significant slowdowns.
+
+To check if BFloat16 is natively supported, you can use the following:
+
+.. code-block:: python
+
+ bf16_ready = (
+ torch.version.cuda
+ and torch.cuda.is_bf16_supported()
+ and LooseVersion(torch.version.cuda) >= "11.0"
+ and dist.is_nccl_available()
+ and nccl.version() >= (2, 10)
+ )
+
+One of the advantages of mixed precision in FSDP is providing granular control over different precision levels for parameters, gradients, and buffers, as follows:
+
+.. code-block:: python
+
+ fpSixteen = MixedPrecision(
+ param_dtype=torch.float16,
+ # Gradient communication precision.
+ reduce_dtype=torch.float16,
+ # Buffer precision.
+ buffer_dtype=torch.float16,
+ )
+
+ bfSixteen = MixedPrecision(
+ param_dtype=torch.bfloat16,
+ # Gradient communication precision.
+ reduce_dtype=torch.bfloat16,
+ # Buffer precision.
+ buffer_dtype=torch.bfloat16,
+ )
+
+ fp32_policy = MixedPrecision(
+ param_dtype=torch.float32,
+ # Gradient communication precision.
+ reduce_dtype=torch.float32,
+ # Buffer precision.
+ buffer_dtype=torch.float32,
+ )
+
+Note that if a certain type (parameter, reduce, buffer) is not specified, it will not be cast at all.
+
+This flexibility allows users fine-grained control, such as having only gradient communication happen in reduced precision while all parameter and buffer computation is done in full precision. This is potentially useful in cases where intra-node communication is the main bottleneck and parameters/buffers must be in full precision to avoid accuracy issues. This can be done with the following policy:
+
+.. code-block:: python
+
+ grad_bf16 = MixedPrecision(reduce_dtype=torch.bfloat16)
+
+
+In section 2.4, we simply add the relevant mixed precision policy to the FSDP wrapper:
+
+
+.. code-block:: python
+
+ model = FSDP(model,
+ auto_wrap_policy=t5_auto_wrap_policy,
+ mixed_precision=bfSixteen)
+
+In our experiments, we have observed up to a 4x speedup by using BFloat16 for training, along with a memory reduction of approximately 30% in some experiments, which can be used to increase batch sizes.
+
+
+Initializing FSDP Model on Device
+---------------------------------
+In 1.12, FSDP supports a `device_id` argument meant to initialize the input CPU module on the device given by `device_id`. This is useful when the entire model does not fit on a single GPU, but fits in the host's CPU memory. When `device_id` is specified, FSDP will move the model to the specified device on a per-FSDP-unit basis, avoiding GPU OOM issues while initializing several times faster than CPU-based initialization:
+
+.. code-block:: python
+
+ torch.cuda.set_device(local_rank)
+
+ model = FSDP(model,
+ auto_wrap_policy=t5_auto_wrap_policy,
+ mixed_precision=bfSixteen,
+ device_id=torch.cuda.current_device())
+
+
+
+Sharding Strategy
+-----------------
+By default, the FSDP sharding strategy is set to fully shard the model parameters, gradients, and optimizer states across all ranks (also termed Zero3 sharding). If you are interested in the Zero2 sharding strategy, where only the optimizer states and gradients are sharded, FSDP supports this by passing ``ShardingStrategy.SHARD_GRAD_OP``, instead of ``ShardingStrategy.FULL_SHARD``, to the FSDP initialization as follows:
+
+.. code-block:: python
+
+ torch.cuda.set_device(local_rank)
+
+ model = FSDP(model,
+ auto_wrap_policy=t5_auto_wrap_policy,
+ mixed_precision=bfSixteen,
+ device_id=torch.cuda.current_device(),
+        sharding_strategy=ShardingStrategy.SHARD_GRAD_OP) # Zero2; use FULL_SHARD for Zero3
+
+This will reduce the communication overhead in FSDP: in this case, full parameters are kept after the forward pass and through the backward pass.
+
+This saves an all_gather during the backward pass, so there is less communication, at the cost of a higher memory footprint. Note that the full model parameters are freed at the end of the backward pass and an all_gather will happen again on the next forward pass.
+
+Backward Prefetch
+-----------------
+The backward prefetch setting controls the timing of when the next FSDP unit's parameters should be requested. By setting it to `BACKWARD_PRE`, the next FSDP unit's params can begin to be requested and arrive sooner, before the computation of the current unit finishes. This overlaps the `all_gather` communication and gradient computation, which can increase training speed in exchange for slightly higher memory consumption. It can be utilized in the FSDP wrapper in section 2.4 as follows:
+
+.. code-block:: python
+
+ torch.cuda.set_device(local_rank)
+
+ model = FSDP(model,
+ auto_wrap_policy=t5_auto_wrap_policy,
+ mixed_precision=bfSixteen,
+ device_id=torch.cuda.current_device(),
+ backward_prefetch = BackwardPrefetch.BACKWARD_PRE)
+
+`backward_prefetch` has two modes, `BACKWARD_PRE` and `BACKWARD_POST`. `BACKWARD_POST` means that the next FSDP unit's params will not be requested until the current FSDP unit's processing is complete, thus minimizing memory overhead. In some cases, using `BACKWARD_PRE` can increase model training speed by up to 2-10%, with even higher speed improvements noted for larger models.
+
+Model Checkpoint Saving, by streaming to the Rank0 CPU
+------------------------------------------------------
+To save model checkpoints using FULL_STATE_DICT saving, which saves the model in the same fashion as a local model, PyTorch 1.12 offers a few utilities to support saving larger models.
+
+First, a FullStateDictConfig can be specified, allowing the state_dict to be populated on rank 0 only and offloaded to the CPU.
+
+When using this configuration, FSDP will all_gather the model parameters, offloading them to the CPU one by one, only on rank 0. When the state_dict is finally saved, it will only be populated on rank 0 and contain CPU tensors. This avoids potential OOM for models that are larger than a single GPU's memory and allows users to checkpoint models whose size is roughly the available CPU RAM on the user's machine.
+
+This feature can be run as follows:
+
+.. code-block:: python
+
+ save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
+ with FSDP.state_dict_type(
+ model, StateDictType.FULL_STATE_DICT, save_policy
+ ):
+ cpu_state = model.state_dict()
+ if rank == 0:
+ save_name = file_save_name + "-" + time_of_run + "-" + currEpoch
+ torch.save(cpu_state, save_name)
+
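+Because the saved state_dict has the same keys as a local (unwrapped) model, it can later be restored into a fresh copy of the model as usual. A minimal sketch, assuming ``save_name`` refers to the checkpoint file produced above:
+
+.. code-block:: python
+
+    # load on CPU, e.g. on a single process, before any FSDP wrapping
+    model, tokenizer = setup_model("t5-base")
+    state_dict = torch.load(save_name, map_location="cpu")
+    model.load_state_dict(state_dict)
+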
+Summary
+-------
+In this tutorial, we have introduced many of the new FSDP features available in PyTorch 1.12 and used HF T5 as the running example.
+Using the proper wrapping policy, especially for transformer models, along with mixed precision and backward prefetch should speed up your training runs. Also, features such as initializing the model on device and checkpoint saving via streaming to CPU should help to avoid OOM errors when dealing with large models.
+
+We are actively working to add new features to FSDP for the next release. If you have feedback, feature requests, questions, or are encountering issues using FSDP, please feel free to contact us by opening an issue in the `PyTorch GitHub repository `__.
diff --git a/intermediate_source/FSDP_tutorial.rst b/intermediate_source/FSDP_tutorial.rst
index d51f38800..421e966ee 100644
--- a/intermediate_source/FSDP_tutorial.rst
+++ b/intermediate_source/FSDP_tutorial.rst
@@ -3,6 +3,8 @@ Getting Started with Fully Sharded Data Parallel(FSDP)
**Author**: `Hamid Shojanazeri `__, `Yanli Zhao `__, `Shen Li `__
+.. note::
+ View the source code for this tutorial in `github `__.
Training AI models at a large scale is a challenging task that requires a lot of compute power and resources.
It also comes with considerable engineering complexity to handle the training of these very large models.
@@ -33,13 +35,13 @@ At high level FDSP works as follow:
*In forward path*
-* Run allgather to collect all shards from all ranks to recover the full parameter in this FSDP unit
+* Run all_gather to collect all shards from all ranks to recover the full parameter in this FSDP unit
* Run forward computation
* Discard parameter shards it has just collected
*In backward path*
-* Run allgather to collect all shards from all ranks to recover the full parameter in this FSDP unit
+* Run all_gather to collect all shards from all ranks to recover the full parameter in this FSDP unit
* Run backward computation
* Run reduce_scatter to sync gradients
* Discard parameters.
@@ -153,7 +155,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
ddp_loss[0] += loss.item()
ddp_loss[1] += len(data)
- dist.reduce(ddp_loss, 0, op=dist.ReduceOp.SUM)
+ dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM)
if rank == 0:
print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch, ddp_loss[0] / ddp_loss[1]))
@@ -174,7 +176,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
ddp_loss[1] += pred.eq(target.view_as(pred)).sum().item()
ddp_loss[2] += len(data)
- dist.reduce(ddp_loss, 0, op=dist.ReduceOp.SUM)
+ dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM)
if rank == 0:
test_loss = ddp_loss[0] / ddp_loss[2]
diff --git a/intermediate_source/ddp_tutorial.rst b/intermediate_source/ddp_tutorial.rst
index 7406612da..fcdf92461 100644
--- a/intermediate_source/ddp_tutorial.rst
+++ b/intermediate_source/ddp_tutorial.rst
@@ -6,6 +6,9 @@
**번역**: `조병근 `_
+.. note::
+ 이 튜토리얼의 소스 코드는 `GitHub `__ 에서 확인할 수 있습니다.
+
선수과목(Prerequisites):
- `PyTorch 분산 처리 개요 <../beginner/dist_overview.html>`__
@@ -56,7 +59,7 @@ checkpointing 모델 및 DDP와 모델 병렬 처리의 결합을 포함한 추
기본적인 사용법
---------------
-DDP 모듈을 생성하기 전에 우선 작업 그룹을 올바르게 설정해야 합니다. 자세한 내용은
+DDP 모듈을 생성하기 전에 반드시 우선 작업 그룹을 올바르게 설정해야 합니다. 자세한 내용은
`PYTORCH로 분산 어플리케이션 개발하기 `__\에서 확인할 수 있습니다.
.. code:: python
@@ -167,7 +170,7 @@ DDP를 사용할 때, 최적의 방법은 모델을 한 작업에만 저장하
이는 모든 작업이 같은 매개변수로부터 시작되고 변화도는
역전파 전달로 동기화되므로 옵티마이저(optimizer)는
매개변수를 동일한 값으로 계속 설정해야 하기 때문에 정확합니다. 이러한 최적화를 사용하는 경우,
-저장이 완료되기 전에 읽어오는 작업을 시작하지 않도록 해야 합니다. 게다가, 모듈을 읽어올 때,
+저장이 완료되기 전에 불러오는 어떠한 작업도 시작하지 않도록 해야 합니다. 더불어, 모듈을 읽어올 때
작업이 다른 기기에 접근하지 않도록 적절한 ``map_location`` 인자를 제공해야합니다.
``map_location``\값이 없을 경우, ``torch.load``\는 먼저 모듈을 CPU에 읽어온 다음 각 매개변수가
저장된 위치로 복사하여 동일한 장치를 사용하는 동일한 기기에서 모든 작업을 발생시킵니다.
@@ -182,9 +185,6 @@ DDP를 사용할 때, 최적의 방법은 모델을 한 작업에만 저장하
model = ToyModel().to(rank)
ddp_model = DDP(model, device_ids=[rank])
- loss_fn = nn.MSELoss()
- optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
-
CHECKPOINT_PATH = tempfile.gettempdir() + "/model.checkpoint"
if rank == 0:
# 모든 작업은 같은 매개변수로부터 시작된다고 생각해야 합니다.
@@ -199,10 +199,13 @@ DDP를 사용할 때, 최적의 방법은 모델을 한 작업에만 저장하
ddp_model.load_state_dict(
torch.load(CHECKPOINT_PATH, map_location=map_location))
+ loss_fn = nn.MSELoss()
+ optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
+
optimizer.zero_grad()
outputs = ddp_model(torch.randn(20, 10))
labels = torch.randn(20, 5).to(rank)
- loss_fn = nn.MSELoss()
+
loss_fn(outputs, labels).backward()
optimizer.step()
@@ -215,10 +218,10 @@ DDP를 사용할 때, 최적의 방법은 모델을 한 작업에만 저장하
cleanup()
모델 병렬 처리를 활용한 DDP
----------------------------
+------------------------------
-DDP는 다중 – GPU 모델에서도 작동합니다.
-다중 – GPU 모델을 활용한 DDP는 대용량의 데이터를 가진 대용량 모델을 학습시킬 때 특히 유용합니다.
+DDP는 다중 GPU 모델에서도 작동합니다.
+다중 GPU 모델을 활용한 DDP는 대용량의 데이터를 가진 대용량 모델을 학습시킬 때 특히 유용합니다.
.. code:: python
@@ -272,3 +275,76 @@ DDP는 다중 – GPU 모델에서도 작동합니다.
run_demo(demo_basic, world_size)
run_demo(demo_checkpoint, world_size)
run_demo(demo_model_parallel, world_size)
+
+Initialize DDP with torch.distributed.run/torchrun
+--------------------------------------------------------------------
+
+We can leverage PyTorch Elastic to simplify the DDP code and initialize the job more easily.
+Let's still use the ToyModel example and create a file named ``elastic_ddp.py``.
+
+.. code:: python
+
+ import torch
+ import torch.distributed as dist
+ import torch.nn as nn
+ import torch.optim as optim
+
+ from torch.nn.parallel import DistributedDataParallel as DDP
+
+ class ToyModel(nn.Module):
+ def __init__(self):
+ super(ToyModel, self).__init__()
+ self.net1 = nn.Linear(10, 10)
+ self.relu = nn.ReLU()
+ self.net2 = nn.Linear(10, 5)
+
+ def forward(self, x):
+ return self.net2(self.relu(self.net1(x)))
+
+ def demo_basic():
+ dist.init_process_group("nccl")
+ rank = dist.get_rank()
+ print(f"Start running basic DDP example on rank {rank}.")
+
+ # create model and move it to GPU with id rank
+ device_id = rank % torch.cuda.device_count()
+ model = ToyModel().to(device_id)
+ ddp_model = DDP(model, device_ids=[device_id])
+
+ loss_fn = nn.MSELoss()
+ optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
+
+ optimizer.zero_grad()
+ outputs = ddp_model(torch.randn(20, 10))
+ labels = torch.randn(20, 5).to(device_id)
+ loss_fn(outputs, labels).backward()
+     optimizer.step()
+     dist.destroy_process_group()
+
+ if __name__ == "__main__":
+ demo_basic()
+
+One can then run a `torch elastic/torchrun`__ command
+on all nodes to initialize the DDP job created above:
+
+.. code:: bash
+
+    torchrun --nnodes=2 --nproc_per_node=8 --rdzv_id=100 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR:29400 elastic_ddp.py
+
+We are running the DDP script on two hosts, with 8 processes on each host; that is, we
+are running it on 16 GPUs. Note that ``$MASTER_ADDR`` must be the same across all nodes.
+
+Here torchrun will launch 8 processes and invoke ``elastic_ddp.py``
+on each process on the node it is launched on, but the user also needs to apply cluster
+management tools like SLURM to actually run this command on 2 nodes.
+
+For example, on a SLURM-enabled cluster, we can write a script to run the command above
+and set ``MASTER_ADDR`` as:
+
+.. code:: bash
+
+    export MASTER_ADDR=$(scontrol show hostname ${SLURM_NODELIST} | head -n 1)
+
+Then we can just run this script using the SLURM command: ``srun --nodes=2 ./torchrun_script.sh``.
+Of course, this is just an example; you can choose your own cluster scheduling tools
+to initiate the torchrun job.
+
+For more information about Elastic run, please see the
+`quick start document `__.
diff --git a/intermediate_source/dist_pipeline_parallel_tutorial.rst b/intermediate_source/dist_pipeline_parallel_tutorial.rst
index 7bc978898..da57a5c56 100644
--- a/intermediate_source/dist_pipeline_parallel_tutorial.rst
+++ b/intermediate_source/dist_pipeline_parallel_tutorial.rst
@@ -2,11 +2,14 @@ Distributed Pipeline Parallelism Using RPC
==========================================
**Author**: `Shen Li `_
+.. note::
+ View the source code for this tutorial in `github `__.
+
Prerequisites:
- `PyTorch Distributed Overview <../beginner/dist_overview.html>`__
-- `Single-Machine Model Parallel Best Practices `__
-- `Getting started with Distributed RPC Framework `__
+- `Single-Machine Model Parallel Best Practices `__
+- `Getting started with Distributed RPC Framework `__
- RRef helper functions:
`RRef.rpc_sync() `__,
`RRef.rpc_async() `__, and
diff --git a/intermediate_source/dist_tuto.rst b/intermediate_source/dist_tuto.rst
index 292aad7cc..1686bd914 100644
--- a/intermediate_source/dist_tuto.rst
+++ b/intermediate_source/dist_tuto.rst
@@ -3,6 +3,9 @@ PyTorch로 분산 어플리케이션 개발하기
**Author**: `Séb Arnold `_
**번역**: `박정환 `_
+.. note::
+ 이 튜토리얼의 소스 코드는 `GitHub `__ 에서 확인할 수 있습니다.
+
선수과목(Prerequisites):
- `PyTorch Distributed Overview <../beginner/dist_overview.html>`__
diff --git a/intermediate_source/memory_format_tutorial.py b/intermediate_source/memory_format_tutorial.py
index a8aa9d015..cd92877b5 100644
--- a/intermediate_source/memory_format_tutorial.py
+++ b/intermediate_source/memory_format_tutorial.py
@@ -28,7 +28,7 @@
"""
######################################################################
-# Channels last 메모리 형식은 오직 4D NCWH Tensors에서만 실행할 수 있습니다.
+# Channels last 메모리 형식은 오직 4D NCHW Tensors에서만 실행할 수 있습니다.
#
######################################################################
@@ -147,9 +147,10 @@
######################################################################
# 성능 향상
# -------------------------------------------------------------------------------------------
-# 정밀도를 줄인(reduced precision ``torch.float16``) 상태에서 Tensor Cores를 지원하는 Nvidia의 하드웨어에서
-# 가장 의미심장한 성능 향상을 보였습니다. `AMP (Automated Mixed Precision)` 학습 스크립트를 활용하여
-# 연속적인 형식에 비해 Channels last 방식이 22% 이상의 성능 향승을 확인할 수 있었습니다.
+# Channels last 메모리 형식 최적화는 GPU와 CPU에서 모두 사용 가능합니다.
+# GPU에서는 정밀도를 줄인(reduced precision ``torch.float16``) 상태에서 Tensor Cores를 지원하는 Nvidia의
+# 하드웨어에서 가장 의미심장한 성능 향상을 보였습니다. `AMP (Automated Mixed Precision)` 학습 스크립트를
+# 활용하여 연속적인 형식에 비해 Channels last 방식이 22% 이상의 성능 향상을 확인할 수 있었습니다.
# 이 때, Nvidia가 제공하는 AMP를 사용했습니다. https://github.com/NVIDIA/apex
#
# ``python main_amp.py -a resnet50 --b 200 --workers 16 --opt-level O2 ./data``
@@ -232,6 +233,11 @@
# ``alexnet``, ``mnasnet0_5``, ``mnasnet0_75``, ``mnasnet1_0``, ``mnasnet1_3``, ``mobilenet_v2``, ``resnet101``, ``resnet152``, ``resnet18``, ``resnet34``, ``resnet50``, ``resnext50_32x4d``, ``shufflenet_v2_x0_5``, ``shufflenet_v2_x1_0``, ``shufflenet_v2_x1_5``, ``shufflenet_v2_x2_0``, ``squeezenet1_0``, ``squeezenet1_1``, ``vgg11``, ``vgg11_bn``, ``vgg13``, ``vgg13_bn``, ``vgg16``, ``vgg16_bn``, ``vgg19``, ``vgg19_bn``, ``wide_resnet101_2``, ``wide_resnet50_2``
#
+######################################################################
+# 아래 목록의 모델들은 Channels last 형식을 전적으로 지원하며 Intel(R) Xeon(R) Ice Lake (또는 최신) CPU에서 26%-76% 성능 향상을 보여줍니다:
+# ``alexnet``, ``densenet121``, ``densenet161``, ``densenet169``, ``googlenet``, ``inception_v3``, ``mnasnet0_5``, ``mnasnet1_0``, ``resnet101``, ``resnet152``, ``resnet18``, ``resnet34``, ``resnet50``, ``resnext101_32x8d``, ``resnext50_32x4d``, ``shufflenet_v2_x0_5``, ``shufflenet_v2_x1_0``, ``squeezenet1_0``, ``squeezenet1_1``, ``vgg11``, ``vgg11_bn``, ``vgg13``, ``vgg13_bn``, ``vgg16``, ``vgg16_bn``, ``vgg19``, ``vgg19_bn``, ``wide_resnet101_2``, ``wide_resnet50_2``
+#
+
######################################################################
# 기존 모델들 변환하기
# --------------------------
diff --git a/intermediate_source/named_tensor_tutorial.py b/intermediate_source/named_tensor_tutorial.py
deleted file mode 100644
index 349416040..000000000
--- a/intermediate_source/named_tensor_tutorial.py
+++ /dev/null
@@ -1,545 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-(prototype) Introduction to Named Tensors in PyTorch
-*******************************************************
-**Author**: `Richard Zou `_
-
-Named Tensors aim to make tensors easier to use by allowing users to associate
-explicit names with tensor dimensions. In most cases, operations that take
-dimension parameters will accept dimension names, avoiding the need to track
-dimensions by position. In addition, named tensors use names to automatically
-check that APIs are being used correctly at runtime, providing extra safety.
-Names can also be used to rearrange dimensions, for example, to support
-"broadcasting by name" rather than "broadcasting by position".
-
-This tutorial is intended as a guide to the functionality that will
-be included with the 1.3 launch. By the end of it, you will be able to:
-
-- Create Tensors with named dimensions, as well as remove or rename those
- dimensions
-- Understand the basics of how operations propagate dimension names
-- See how naming dimensions enables clearer code in two key areas:
- - Broadcasting operations
- - Flattening and unflattening dimensions
-
-Finally, we'll put this into practice by writing a multi-head attention module
-using named tensors.
-
-Named tensors in PyTorch are inspired by and done in collaboration with
-`Sasha Rush `_.
-Sasha proposed the original idea and proof of concept in his
-`January 2019 blog post `_.
-
-Basics: named dimensions
-========================
-
-PyTorch now allows Tensors to have named dimensions; factory functions
-take a new `names` argument that associates a name with each dimension.
-This works with most factory functions, such as
-
-- `tensor`
-- `empty`
-- `ones`
-- `zeros`
-- `randn`
-- `rand`
-
-Here we construct a tensor with names:
-"""
-
-import torch
-imgs = torch.randn(1, 2, 2, 3, names=('N', 'C', 'H', 'W'))
-print(imgs.names)
-
-######################################################################
-# Unlike in
-# `the original named tensors blogpost `_,
-# named dimensions are ordered: ``tensor.names[i]`` is the name of the ``i`` th
-# dimension of ``tensor``.
-#
-# There are two ways to rename a ``Tensor``'s dimensions:
-
-# Method #1: set the .names attribute (this changes name in-place)
-imgs.names = ['batch', 'channel', 'width', 'height']
-print(imgs.names)
-
-# Method #2: specify new names (this changes names out-of-place)
-imgs = imgs.rename(channel='C', width='W', height='H')
-print(imgs.names)
-
-######################################################################
-# The preferred way to remove names is to call ``tensor.rename(None)``:
-
-imgs = imgs.rename(None)
-print(imgs.names)
-
-######################################################################
-# Unnamed tensors (tensors with no named dimensions) still work as
-# normal and do not have names in their ``repr``.
-
-unnamed = torch.randn(2, 1, 3)
-print(unnamed)
-print(unnamed.names)
-
-######################################################################
-# Named tensors do not require that all dimensions be named.
-
-imgs = torch.randn(3, 1, 1, 2, names=('N', None, None, None))
-print(imgs.names)
-
-######################################################################
-# Because named tensors can coexist with unnamed tensors, we need a nice way to
-# write named tensor-aware code that works with both named and unnamed tensors.
-# Use ``tensor.refine_names(*names)`` to refine dimensions and lift unnamed
-# dims to named dims. Refining a dimension is defined as a "rename" with the
-# following constraints:
-#
-# - A ``None`` dim can be refined to have any name
-# - A named dim can only be refined to have the same name.
-
-imgs = torch.randn(3, 1, 1, 2)
-named_imgs = imgs.refine_names('N', 'C', 'H', 'W')
-print(named_imgs.names)
-
-# Refine the last two dims to 'H' and 'W'. In Python 2, use the string '...'
-# instead of ...
-named_imgs = imgs.refine_names(..., 'H', 'W')
-print(named_imgs.names)
-
-
-def catch_error(fn):
- try:
- fn()
- assert False
- except RuntimeError as err:
- err = str(err)
- if len(err) > 180:
- err = err[:180] + "..."
- print(err)
-
-
-named_imgs = imgs.refine_names('N', 'C', 'H', 'W')
-
-# Tried to refine an existing name to a different name
-catch_error(lambda: named_imgs.refine_names('N', 'C', 'H', 'width'))
-
-######################################################################
-# Most simple operations propagate names. The ultimate goal for named tensors
-# is for all operations to propagate names in a reasonable, intuitive manner.
-# Support for many common operations has been added at the time of the 1.3
-# release; here, for example, is ``.abs()``:
-
-print(named_imgs.abs().names)
-
-######################################################################
-# Accessors and Reduction
-# -----------------------
-#
-# One can use dimension names to refer to dimensions instead of the positional
-# dimension. These operations also propagate names. Indexing (basic and
-# advanced) has not been implemented yet but is on the roadmap. Using the
-# ``named_imgs`` tensor from above, we can do:
-
-output = named_imgs.sum('C') # Perform a sum over the channel dimension
-print(output.names)
-
-img0 = named_imgs.select('N', 0) # get one image
-print(img0.names)
-
-######################################################################
-# Name inference
-# --------------
-#
-# Names are propagated on operations in a two step process called
-# **name inference**:
-#
-# 1. **Check names**: an operator may perform automatic checks at runtime that
-# check that certain dimension names must match.
-# 2. **Propagate names**: name inference propagates output names to output
-# tensors.
-#
-# Let's go through the very small example of adding 2 one-dim tensors with no
-# broadcasting.
-
-x = torch.randn(3, names=('X',))
-y = torch.randn(3)
-z = torch.randn(3, names=('Z',))
-
-######################################################################
-# **Check names**: first, we will check whether the names of these two tensors
-# *match*. Two names match if and only if they are equal (string equality) or
-# at least one is ``None`` (``None`` is essentially a special wildcard name).
-# The only one of these three that will error, therefore, is ``x + z``:
-
-catch_error(lambda: x + z)
-
-######################################################################
-# **Propagate names**: *unify* the two names by returning the most refined name
-# of the two. With ``x + y``, ``X`` is more refined than ``None``.
-
-print((x + y).names)
-
-######################################################################
-# Most name inference rules are straightforward but some of them can have
-# unexpected semantics. Let's go through a couple you're likely to encounter:
-# broadcasting and matrix multiply.
-#
-# Broadcasting
-# ^^^^^^^^^^^^
-#
-# Named tensors do not change broadcasting behavior; they still broadcast by
-# position. However, when checking two dimensions for if they can be
-# broadcasted, PyTorch also checks that the names of those dimensions match.
-#
-# This results in named tensors preventing unintended alignment during
-# operations that broadcast. In the below example, we apply a
-# ``per_batch_scale`` to ``imgs``.
-
-imgs = torch.randn(2, 2, 2, 2, names=('N', 'C', 'H', 'W'))
-per_batch_scale = torch.rand(2, names=('N',))
-catch_error(lambda: imgs * per_batch_scale)
-
-######################################################################
-# Without ``names``, the ``per_batch_scale`` tensor is aligned with the last
-# dimension of ``imgs``, which is not what we intended. We really wanted to
-# perform the operation by aligning ``per_batch_scale`` with the batch
-# dimension of ``imgs``.
-# See the new "explicit broadcasting by names" functionality for how to
-# align tensors by name, covered below.
-#
-# Matrix multiply
-# ^^^^^^^^^^^^^^^
-#
-# ``torch.mm(A, B)`` performs a dot product between the second dim of ``A``
-# and the first dim of ``B``, returning a tensor with the first dim of ``A``
-# and the second dim of ``B``. (other matmul functions, such as
-# ``torch.matmul``, ``torch.mv``, and ``torch.dot``, behave similarly).
-
-markov_states = torch.randn(128, 5, names=('batch', 'D'))
-transition_matrix = torch.randn(5, 5, names=('in', 'out'))
-
-# Apply one transition
-new_state = markov_states @ transition_matrix
-print(new_state.names)
-
-######################################################################
-# As you can see, matrix multiply does not check if the contracted dimensions
-# have the same name.
-#
-# Next, we'll cover two new behaviors that named tensors enable: explicit
-# broadcasting by names and flattening and unflattening dimensions by names
-#
-# New behavior: Explicit broadcasting by names
-# --------------------------------------------
-#
-# One of the main complaints about working with multiple dimensions is the need
-# to ``unsqueeze`` "dummy" dimensions so that operations can occur.
-# For example, in our per-batch-scale example before, with unnamed tensors
-# we'd do the following:
-
-imgs = torch.randn(2, 2, 2, 2) # N, C, H, W
-per_batch_scale = torch.rand(2) # N
-
-correct_result = imgs * per_batch_scale.view(2, 1, 1, 1) # N, C, H, W
-incorrect_result = imgs * per_batch_scale.expand_as(imgs)
-assert not torch.allclose(correct_result, incorrect_result)
-
-######################################################################
-# We can make these operations safer (and easily agnostic to the number of
-# dimensions) by using names. We provide a new ``tensor.align_as(other)``
-# operation that permutes the dimensions of tensor to match the order specified
-# in ``other.names``, adding one-sized dimensions where appropriate
-# (``tensor.align_to(*names)`` works as well):
-
-imgs = imgs.refine_names('N', 'C', 'H', 'W')
-per_batch_scale = per_batch_scale.refine_names('N')
-
-named_result = imgs * per_batch_scale.align_as(imgs)
-# note: named tensors do not yet work with allclose
-assert torch.allclose(named_result.rename(None), correct_result)
-
-######################################################################
-# New behavior: Flattening and unflattening dimensions by names
-# -------------------------------------------------------------
-#
-# One common operation is flattening and unflattening dimensions. Right now,
-# users perform this using either ``view``, ``reshape``, or ``flatten``; use
-# cases include flattening batch dimensions to send tensors into operators that
-# must take inputs with a certain number of dimensions (i.e., conv2d takes 4D
-# input).
-#
-# To make these operation more semantically meaningful than view or reshape, we
-# introduce a new ``tensor.unflatten(dim, namedshape)`` method and update
-# ``flatten`` to work with names: ``tensor.flatten(dims, new_dim)``.
-#
-# ``flatten`` can only flatten adjacent dimensions but also works on
-# non-contiguous dims. One must pass into ``unflatten`` a **named shape**,
-# which is a list of ``(dim, size)`` tuples, to specify how to unflatten the
-# dim. It is possible to save the sizes during a ``flatten`` for ``unflatten``
-# but we do not yet do that.
-
-imgs = imgs.flatten(['C', 'H', 'W'], 'features')
-print(imgs.names)
-
-imgs = imgs.unflatten('features', (('C', 2), ('H', 2), ('W', 2)))
-print(imgs.names)
-
-######################################################################
-# Autograd support
-# ----------------
-#
-# Autograd currently ignores names on all tensors and just treats them like
-# regular tensors. Gradient computation is correct but we lose the safety that
-# names give us. It is on the roadmap to introduce handling of names to
-# autograd.
-
-x = torch.randn(3, names=('D',))
-weight = torch.randn(3, names=('D',), requires_grad=True)
-loss = (x - weight).abs()
-grad_loss = torch.randn(3)
-loss.backward(grad_loss)
-
-correct_grad = weight.grad.clone()
-print(correct_grad) # Unnamed for now. Will be named in the future
-
-weight.grad.zero_()
-grad_loss = grad_loss.refine_names('C')
-loss = (x - weight).abs()
-# Ideally we'd check that the names of loss and grad_loss match, but we don't
-# yet
-loss.backward(grad_loss)
-
-print(weight.grad) # still unnamed
-assert torch.allclose(weight.grad, correct_grad)
-
-######################################################################
-# Other supported (and unsupported) features
-# ------------------------------------------
-#
-# `See here `_ for a
-# detailed breakdown of what is supported with the 1.3 release.
-#
-# In particular, we want to call out three important features that are not
-# currently supported:
-#
-# - Saving or loading named tensors via ``torch.save`` or ``torch.load``
-# - Multi-processing via ``torch.multiprocessing``
-# - JIT support; for example, the following will error
-
-imgs_named = torch.randn(1, 2, 2, 3, names=('N', 'C', 'H', 'W'))
-
-
-@torch.jit.script
-def fn(x):
- return x
-
-
-catch_error(lambda: fn(imgs_named))
-
-######################################################################
-# As a workaround, please drop names via ``tensor = tensor.rename(None)``
-# before using anything that does not yet support named tensors.
-#
-# Longer example: Multi-head attention
-# --------------------------------------
-#
-# Now we'll go through a complete example of implementing a common
-# PyTorch ``nn.Module``: multi-head attention. We assume the reader is already
-# familiar with multi-head attention; for a refresher, check out
-# `this explanation `_
-# or
-# `this explanation `_.
-#
-# We adapt the implementation of multi-head attention from
-# `ParlAI `_; specifically
-# `here `_.
-# Read through the code at that example; then, compare with the code below,
-# noting that there are four places labeled (I), (II), (III), and (IV), where
-# using named tensors enables more readable code; we will dive into each of
-# these after the code block.
-
-import torch.nn as nn
-import torch.nn.functional as F
-import math
-
-
-class MultiHeadAttention(nn.Module):
- def __init__(self, n_heads, dim, dropout=0):
- super(MultiHeadAttention, self).__init__()
- self.n_heads = n_heads
- self.dim = dim
-
- self.attn_dropout = nn.Dropout(p=dropout)
- self.q_lin = nn.Linear(dim, dim)
- self.k_lin = nn.Linear(dim, dim)
- self.v_lin = nn.Linear(dim, dim)
- nn.init.xavier_normal_(self.q_lin.weight)
- nn.init.xavier_normal_(self.k_lin.weight)
- nn.init.xavier_normal_(self.v_lin.weight)
- self.out_lin = nn.Linear(dim, dim)
- nn.init.xavier_normal_(self.out_lin.weight)
-
- def forward(self, query, key=None, value=None, mask=None):
- # (I)
- query = query.refine_names(..., 'T', 'D')
- self_attn = key is None and value is None
- if self_attn:
- mask = mask.refine_names(..., 'T')
- else:
- mask = mask.refine_names(..., 'T', 'T_key') # enc attn
-
- dim = query.size('D')
- assert dim == self.dim, \
- f'Dimensions do not match: {dim} query vs {self.dim} configured'
- assert mask is not None, 'Mask is None, please specify a mask'
- n_heads = self.n_heads
- dim_per_head = dim // n_heads
- scale = math.sqrt(dim_per_head)
-
- # (II)
- def prepare_head(tensor):
- tensor = tensor.refine_names(..., 'T', 'D')
- return (tensor.unflatten('D', [('H', n_heads), ('D_head', dim_per_head)])
- .align_to(..., 'H', 'T', 'D_head'))
-
- assert value is None
- if self_attn:
- key = value = query
- elif value is None:
- # key and value are the same, but query differs
- key = key.refine_names(..., 'T', 'D')
- value = key
- dim = key.size('D')
-
- # Distinguish between query_len (T) and key_len (T_key) dims.
- k = prepare_head(self.k_lin(key)).rename(T='T_key')
- v = prepare_head(self.v_lin(value)).rename(T='T_key')
- q = prepare_head(self.q_lin(query))
-
- dot_prod = q.div_(scale).matmul(k.align_to(..., 'D_head', 'T_key'))
- dot_prod.refine_names(..., 'H', 'T', 'T_key') # just a check
-
- # (III)
- attn_mask = (mask == 0).align_as(dot_prod)
- dot_prod.masked_fill_(attn_mask, -float(1e20))
-
- attn_weights = self.attn_dropout(F.softmax(dot_prod / scale,
- dim='T_key'))
-
- # (IV)
- attentioned = (
- attn_weights.matmul(v).refine_names(..., 'H', 'T', 'D_head')
- .align_to(..., 'T', 'H', 'D_head')
- .flatten(['H', 'D_head'], 'D')
- )
-
- return self.out_lin(attentioned).refine_names(..., 'T', 'D')
-
-######################################################################
-# **(I) Refining the input tensor dims**
-
-def forward(self, query, key=None, value=None, mask=None):
- # (I)
- query = query.refine_names(..., 'T', 'D')
-
-######################################################################
-# The ``query = query.refine_names(..., 'T', 'D')`` serves as enforcable documentation
-# and lifts input dimensions to being named. It checks that the last two dimensions
-# can be refined to ``['T', 'D']``, preventing potentially silent or confusing size
-# mismatch errors later down the line.
-#
-# **(II) Manipulating dimensions in prepare_head**
-
-# (II)
-def prepare_head(tensor):
- tensor = tensor.refine_names(..., 'T', 'D')
- return (tensor.unflatten('D', [('H', n_heads), ('D_head', dim_per_head)])
- .align_to(..., 'H', 'T', 'D_head'))
-
-######################################################################
-# The first thing to note is how the code clearly states the input and
-# output dimensions: the input tensor must end with the ``T`` and ``D`` dims
-# and the output tensor ends in ``H``, ``T``, and ``D_head`` dims.
-#
-# The second thing to note is how clearly the code describes what is going on.
-# prepare_head takes the key, query, and value and splits the embedding dim into
-# multiple heads, finally rearranging the dim order to be ``[..., 'H', 'T', 'D_head']``.
-# ParlAI implements ``prepare_head`` as the following, using ``view`` and ``transpose``
-# operations:
-
-def prepare_head(tensor):
- # input is [batch_size, seq_len, n_heads * dim_per_head]
- # output is [batch_size * n_heads, seq_len, dim_per_head]
- batch_size, seq_len, _ = tensor.size()
- tensor = tensor.view(batch_size, tensor.size(1), n_heads, dim_per_head)
- tensor = (
- tensor.transpose(1, 2)
- .contiguous()
- .view(batch_size * n_heads, seq_len, dim_per_head)
- )
- return tensor
-
-######################################################################
-# Our named tensor variant uses ops that, though more verbose, have more
-# semantic meaning than ``view`` and ``transpose`` and includes enforcable
-# documentation in the form of names.
-#
-# **(III) Explicit broadcasting by names**
-
-def ignore():
- # (III)
- attn_mask = (mask == 0).align_as(dot_prod)
- dot_prod.masked_fill_(attn_mask, -float(1e20))
-
-######################################################################
-# ``mask`` usually has dims ``[N, T]`` (in the case of self attention) or
-# ``[N, T, T_key]`` (in the case of encoder attention) while ``dot_prod``
-# has dims ``[N, H, T, T_key]``. To make ``mask`` broadcast correctly with
-# ``dot_prod``, we would usually `unsqueeze` dims ``1`` and ``-1`` in the case
-# of self attention or ``unsqueeze`` dim ``1`` in the case of encoder
-# attention. Using named tensors, we simply align ``attn_mask`` to ``dot_prod``
-# using ``align_as`` and stop worrying about where to ``unsqueeze`` dims.
-#
-# **(IV) More dimension manipulation using align_to and flatten**
-
-def ignore():
- # (IV)
- attentioned = (
- attn_weights.matmul(v).refine_names(..., 'H', 'T', 'D_head')
- .align_to(..., 'T', 'H', 'D_head')
- .flatten(['H', 'D_head'], 'D')
- )
-
-######################################################################
-# Here, as in (II), ``align_to`` and ``flatten`` are more semantically
-# meaningful than ``view`` and ``transpose`` (despite being more verbose).
-#
-# Running the example
-# -------------------
-
-n, t, d, h = 7, 5, 2 * 3, 3
-query = torch.randn(n, t, d, names=('N', 'T', 'D'))
-mask = torch.ones(n, t, names=('N', 'T'))
-attn = MultiHeadAttention(h, d)
-output = attn(query, mask=mask)
-# works as expected!
-print(output.names)
-
-######################################################################
-# The above works as expected. Furthermore, note that in the code we
-# did not mention the name of the batch dimension at all. In fact,
-# our ``MultiHeadAttention`` module is agnostic to the existence of batch
-# dimensions.
-
-query = torch.randn(t, d, names=('T', 'D'))
-mask = torch.ones(t, names=('T',))
-output = attn(query, mask=mask)
-print(output.names)
-
-######################################################################
-# Conclusion
-# ----------
-#
-# Thank you for reading! Named tensors are still very much in development;
-# if you have feedback and/or suggestions for improvement, please let us
-# know by creating `an issue `_.
diff --git a/intermediate_source/process_group_cpp_extension_tutorial.rst b/intermediate_source/process_group_cpp_extension_tutorial.rst
index da70fb62b..de029cb8e 100644
--- a/intermediate_source/process_group_cpp_extension_tutorial.rst
+++ b/intermediate_source/process_group_cpp_extension_tutorial.rst
@@ -3,6 +3,8 @@ Customize Process Group Backends Using Cpp Extensions
**Author**: `Feng Tian `__, `Shen Li `__
+.. note::
+ View the source code for this tutorial on `GitHub `__.
Prerequisites:
diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py
index 4dd9801ec..96e86601c 100644
--- a/intermediate_source/reinforcement_q_learning.py
+++ b/intermediate_source/reinforcement_q_learning.py
@@ -5,7 +5,7 @@
**Author**: `Adam Paszke `_
**Translated by**: `황성수 `_
-This tutorial shows how to use PyTorch to train a DQN (Deep Q Learning) agent on the `OpenAI Gym `__
+This tutorial shows how to use PyTorch to train a DQN (Deep Q Learning) agent on the `OpenAI Gym `__
CartPole-v0 task.
@@ -14,7 +14,7 @@
The agent has to choose one of two actions, moving the cart to the left or to the right,
so that the attached pole stays upright.
You can find an official leaderboard with various algorithms and visualizations
-on the `Gym website `__.
+on the `Gym website `__.
.. figure:: /_static/img/cartpole.gif
:alt: cartpole
@@ -40,7 +40,7 @@
**Packages**
First, let's import the packages we need. To start with, we need
-`gym `__ for the environment.
+`gym `__ for the environment.
(Install it with `pip install gym`.)
We also use the following from PyTorch:
diff --git a/intermediate_source/rpc_async_execution.rst b/intermediate_source/rpc_async_execution.rst
index d2b4ff29f..68158e3b5 100644
--- a/intermediate_source/rpc_async_execution.rst
+++ b/intermediate_source/rpc_async_execution.rst
@@ -2,6 +2,8 @@ Implementing Batch RPC Processing Using Asynchronous Executions
===============================================================
**Author**: `Shen Li `_
+.. note::
+ View the source code for this tutorial on `GitHub `__.
Prerequisites:
@@ -190,7 +192,7 @@ implement batch RPC applications using the
`@rpc.functions.async_execution `__
decorator. In the next section, we re-implement the reinforcement learning
example in the previous
-`Getting started with Distributed RPC Framework `__
+`Getting started with Distributed RPC Framework `__
tutorial using batch processing, and demonstrate its impact on the training
speed.
@@ -264,7 +266,7 @@ which will be presented shortly, and this function will be decorated with
self.select_action = Agent.select_action_batch if batch else Agent.select_action
Compared to the previous tutorial
-`Getting started with Distributed RPC Framework `__,
+`Getting started with Distributed RPC Framework `__,
observers behave a little differently. Instead of exiting when the environment
is stopped, it always runs ``n_steps`` iterations in every episode. When the
environment returns, the observer simply resets the environment and start over
@@ -520,4 +522,4 @@ Learn More
- `Batch-Updating Parameter Server Source Code `__
- `Batch-Processing CartPole Solver `__
- `Distributed Autograd `__
-- `Distributed Pipeline Parallelism `__
\ No newline at end of file
+- `Distributed Pipeline Parallelism `__
diff --git a/intermediate_source/rpc_param_server_tutorial.rst b/intermediate_source/rpc_param_server_tutorial.rst
index 0d5d57b12..6d74f82a2 100644
--- a/intermediate_source/rpc_param_server_tutorial.rst
+++ b/intermediate_source/rpc_param_server_tutorial.rst
@@ -4,6 +4,9 @@ Implementing a Parameter Server Using Distributed RPC Framework
**Author**\ : `Rohan Varma `_
+.. note::
+ View the source code for this tutorial on `GitHub `__.
+
Prerequisites:
- `PyTorch Distributed Overview <../beginner/dist_overview.html>`__
@@ -13,7 +16,7 @@ This tutorial walks through a simple example of implementing a parameter server
Using the Distributed RPC Framework, we'll build an example where multiple trainers use RPC to communicate with the same parameter server and use `RRef `_ to access states on the remote parameter server instance. Each trainer will launch its dedicated backward pass in a distributed fashion through stitching of the autograd graph across multiple nodes using distributed autograd.
-**Note**\ : This tutorial covers the use of the Distributed RPC Framework, which is useful for splitting a model onto multiple machines, or for implementing a parameter-server training strategy where network trainers fetch parameters hosted on a different machine. If instead you are looking for replicating your model across many GPUs, please see the `Distributed Data Parallel tutorial `_. There is also another `RPC tutorial `_ that covers reinforcement learning and RNN use cases.
+**Note**\ : This tutorial covers the use of the Distributed RPC Framework, which is useful for splitting a model onto multiple machines, or for implementing a parameter-server training strategy where network trainers fetch parameters hosted on a different machine. If instead you are looking to replicate your model across many GPUs, please see the `Distributed Data Parallel tutorial `_. There is also another `RPC tutorial `_ that covers reinforcement learning and RNN use cases.
Let's start with the familiar: importing our required modules and defining a simple ConvNet that will train on the MNIST dataset. The below network is largely adopted from the network defined in the `pytorch/examples repo `_.
diff --git a/intermediate_source/rpc_tutorial.rst b/intermediate_source/rpc_tutorial.rst
index 9ab52c718..aaaa6022b 100644
--- a/intermediate_source/rpc_tutorial.rst
+++ b/intermediate_source/rpc_tutorial.rst
@@ -2,6 +2,8 @@ Getting Started with Distributed RPC Framework
=================================================
**Author**: `Shen Li `_
+.. note::
+ View the source code for this tutorial on `GitHub `__.
Prerequisites:
diff --git a/intermediate_source/torchserve_with_ipex.rst b/intermediate_source/torchserve_with_ipex.rst
new file mode 100644
index 000000000..caef69267
--- /dev/null
+++ b/intermediate_source/torchserve_with_ipex.rst
@@ -0,0 +1,394 @@
+Grokking PyTorch Intel CPU performance from first principles
+============================================================
+
+A case study on the TorchServe inference framework optimized with `Intel® Extension for PyTorch* `_.
+
+Authors: Min Jean Cho, Mark Saroufim
+
+Reviewers: Ashok Emani, Jiong Gong
+
+Getting strong out-of-the-box performance for deep learning on CPUs can be tricky, but it's much easier if you're aware of the main problems that affect performance, how to measure them, and how to solve them.
+
+TL;DR
+
++-----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+
+| Problem | How to measure it | Solution |
++-----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+
+| Bottlenecked GEMM execution units | - `Imbalance or Serial Spinning `_ | Avoid using logical cores by setting thread affinity to physical cores via core pinning |
+| | - `Front-End Bound `_ | |
+| | - `Core Bound `_ | |
++-----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+
+| Non Uniform Memory Access (NUMA) | - Local vs. remote memory access | Avoid cross-socket computation by setting thread affinity to a specific socket via core pinning |
+| | - `UPI Utilization `_ | |
+| | - Latency in memory accesses | |
+| | - Thread migration | |
++-----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+
+
+*GEMM (General Matrix Multiply)* operations run on fused-multiply-add (FMA) or dot-product (DP) execution units, which become a bottleneck and cause delays in thread waiting/*spinning at synchronization* barriers when *hyperthreading* is enabled - using logical cores causes insufficient concurrency for all working threads, as each logical thread *contends for the same core resources*. Instead, if we use 1 thread per physical core, we avoid this contention. So we generally recommend *avoiding logical cores* by setting CPU *thread affinity* to physical cores via *core pinning*.
+
+Multi-socket systems have *Non-Uniform Memory Access (NUMA)*, a shared memory architecture that describes the placement of main memory modules with respect to processors. If a process is not NUMA-aware, slow *remote memory* is frequently accessed when *threads migrate* across sockets via the *Intel Ultra Path Interconnect (UPI)* at runtime. We address this problem by setting CPU *thread affinity* to a specific socket via *core pinning*.
+
+With these principles in mind, proper CPU runtime configuration can significantly boost out-of-the-box performance.
+
+In this blog, we'll walk you through the important runtime configurations you should be aware of from the `CPU Performance Tuning Guide `_, explain how they work, how to profile them, and how to integrate them within a model serving framework like `TorchServe `_ via an easy-to-use `launch script `_ which we’ve `integrated `_ :superscript:`1` natively.
+
+We’ll explain all of these ideas :strong:`visually` from :strong:`first principles` with lots of :strong:`profiles`, and show you how we applied what we learned to improve out-of-the-box CPU performance on TorchServe.
+
+1. The feature has to be explicitly enabled by setting *cpu_launcher_enable=true* in *config.properties*.
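+
+For example, a minimal *config.properties* enabling the launcher integration could
+contain just the line below (a sketch; all other TorchServe settings keep their
+defaults):
+
+.. code::
+
+    cpu_launcher_enable=true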
+
+Avoid logical cores for deep learning
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Avoiding logical cores for deep learning workloads generally improves performance. To understand this, let us take a step back to GEMM.
+
+:strong:`Optimizing GEMM optimizes deep learning`
+
+The majority of time in deep learning training or inference is spent on millions of repeated GEMM operations, which are at the core of fully connected layers. Fully connected layers have been used for decades since the multi-layer perceptron (MLP) `proved to be a universal approximator of any continuous function `_. Any MLP can be entirely represented as GEMM. And even a convolution can be represented as a GEMM by using a `Toeplitz matrix `_.
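+
+To make this concrete, here is a minimal sketch (not part of the original case
+study) showing that the forward pass of a fully connected layer is exactly a
+matrix multiply plus a bias, i.e. a GEMM:
+
+.. code:: python
+
+    import torch
+    import torch.nn as nn
+
+    linear = nn.Linear(in_features=64, out_features=32)
+    x = torch.randn(8, 64)
+
+    # nn.Linear computes y = x @ W^T + b, which is a GEMM
+    y_module = linear(x)
+    y_gemm = x @ linear.weight.t() + linear.bias
+    print(torch.allclose(y_module, y_gemm, atol=1e-6))  # True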
+
+Returning to the original topic, most GEMM operators benefit from not using hyperthreading, because the majority of time in deep learning training or inference is spent on millions of repeated GEMM operations running on fused-multiply-add (FMA) or dot-product (DP) execution units shared by hyperthreading cores. With hyperthreading enabled, OpenMP threads will contend for the same GEMM execution units.
+
+.. figure:: /_static/img/torchserve-ipex-images/1_.png
+ :width: 70%
+ :align: center
+
+If 2 logical threads run GEMM at the same time, they will be sharing the same core resources and become front-end bound, such that the overhead from this front-end bound is greater than the gain from running both logical threads at the same time.
+
+Therefore we generally recommend avoiding logical cores for deep learning workloads to achieve good performance. The launch script by default uses physical cores only; however, users can easily experiment with logical vs. physical cores by simply toggling the ``--use_logical_core`` launch script knob.
+
+:strong:`Exercise`
+
+We'll use the following example of feeding ResNet50 with a dummy tensor:
+
+.. code:: python
+
+    import torch
+    import torchvision.models as models
+    import time
+
+    model = models.resnet50(pretrained=False)
+    model.eval()
+    data = torch.rand(1, 3, 224, 224)
+
+    # warm up
+    for _ in range(100):
+        model(data)
+
+    start = time.time()
+    for _ in range(100):
+        model(data)
+    end = time.time()
+    print('Inference took {:.2f} ms on average'.format((end - start) / 100 * 1000))
+
+Throughout the blog, we'll use `Intel® VTune™ Profiler `_ to profile and verify optimizations. And we'll run all exercises on a machine with two Intel(R) Xeon(R) Platinum 8180M CPUs. The CPU information is shown in Figure 2.1.
+
+The environment variable ``OMP_NUM_THREADS`` is used to set the number of threads for parallel regions. We'll compare ``OMP_NUM_THREADS=2`` with (1) use of logical cores and (2) use of physical cores only.
+
+(1) Both OpenMP threads trying to utilize the same GEMM execution units shared by hyperthreading cores (0, 56)
+
+We can visualize this by running the ``htop`` command on Linux as shown below.
+
+.. figure:: /_static/img/torchserve-ipex-images/2.png
+ :width: 100%
+ :align: center
+
+
+.. figure:: /_static/img/torchserve-ipex-images/3.png
+ :width: 100%
+ :align: center
+
+We notice that the Spin Time is flagged, and Imbalance or Serial Spinning contributed to the majority of it - 4.980 seconds out of the 8.982 seconds total. The Imbalance or Serial Spinning when using logical cores is due to insufficient concurrency of working threads as each logical thread contends for the same core resources.
+
+The Top Hotspots section of the execution summary indicates that ``__kmp_fork_barrier`` took 4.589 seconds of CPU time - during 9.33% of the CPU execution time, threads were just spinning at this barrier due to thread synchronization.
+
+(2) Each OpenMP thread utilizing GEMM execution units in respective physical cores (0,1)
+
+
+.. figure:: /_static/img/torchserve-ipex-images/4.png
+ :width: 80%
+ :align: center
+
+
+.. figure:: /_static/img/torchserve-ipex-images/5.png
+ :width: 80%
+ :align: center
+
+We first note that the execution time dropped from 32 seconds to 23 seconds by avoiding logical cores. While there's still some non-negligible Imbalance or Serial Spinning, we note a relative improvement from 4.980 seconds to 3.887 seconds.
+
+By not using logical threads (instead, using 1 thread per physical core), we avoid logical threads contending for the same core resources. The Top Hotspots section also indicates a relative improvement of ``__kmp_fork_barrier`` time from 4.589 seconds to 3.530 seconds.
+
+Local memory access is always faster than remote memory access
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We generally recommend binding a process to a local socket such that the process does not migrate across sockets. The goal of doing so is to utilize the high-speed cache on local memory and to avoid remote memory access, which can be ~2x slower.
+
+
+.. figure:: /_static/img/torchserve-ipex-images/6.png
+ :width: 80%
+ :align: center
+
+Figure 1. Two-socket configuration
+
+Figure 1 shows a typical two-socket configuration. Notice that each socket has its own local memory. Sockets are connected to each other via the Intel Ultra Path Interconnect (UPI), which allows each socket to access the local memory of the other socket, called remote memory. Local memory access is always faster than remote memory access.
+
+.. figure:: /_static/img/torchserve-ipex-images/7.png
+ :width: 50%
+ :align: center
+
+Figure 2.1. CPU information
+
+Users can get their CPU information by running the ``lscpu`` command on their Linux machine. Figure 2.1. shows an example of ``lscpu`` execution on a machine with two Intel(R) Xeon(R) Platinum 8180M CPUs. Notice that there are 28 cores per socket and 2 threads per core (i.e., hyperthreading is enabled). In other words, there are 28 logical cores in addition to the 28 physical cores, giving a total of 56 cores per socket. And there are 2 sockets, giving a total of 112 cores (``Thread(s) per core`` x ``Core(s) per socket`` x ``Socket(s)``).
+
+.. figure:: /_static/img/torchserve-ipex-images/8.png
+ :width: 100%
+ :align: center
+
+Figure 2.2. CPU information
+
+The 2 sockets are mapped to 2 NUMA nodes (NUMA node 0, NUMA node 1) respectively. Physical cores are indexed prior to logical cores. As shown in Figure 2.2., the first 28 physical cores (0-27) and the first 28 logical cores (56-83) on the first socket are on NUMA node 0. And the second 28 physical cores (28-55) and the second 28 logical cores (84-111) on the second socket are on NUMA node 1. Cores on the same socket share local memory and last level cache (LLC) which is much faster than cross-socket communication via Intel UPI.
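+
+For this machine's layout, mapping a core index to its NUMA node can be written
+as a small helper (illustrative only; it simply encodes the numbering described
+above and is not a general-purpose utility):
+
+.. code:: python
+
+    def numa_node_of(core_id, cores_per_socket=28, sockets=2):
+        # physical cores 0-27 and logical cores 56-83  -> node 0,
+        # physical cores 28-55 and logical cores 84-111 -> node 1
+        physical_total = cores_per_socket * sockets
+        return (core_id % physical_total) // cores_per_socket
+
+    print(numa_node_of(0), numa_node_of(56))    # 0 0
+    print(numa_node_of(28), numa_node_of(84))   # 1 1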
+
+Now that we understand NUMA, cross-socket (UPI) traffic, and local vs. remote memory access in multi-processor systems, let's profile and verify our understanding.
+
+:strong:`Exercise`
+
+We'll reuse the ResNet50 example above.
+
+As we did not pin threads to processor cores of a specific socket, the operating system periodically schedules threads on processor cores located in different sockets.
+
+.. figure:: /_static/img/torchserve-ipex-images/9.gif
+ :width: 100%
+ :align: center
+
+Figure 3. CPU usage of a non-NUMA-aware application. 1 main worker thread was launched, then it launched as many threads as there are physical cores (56) on all cores, including logical cores.
+
+(Aside: If the number of threads is not set by `torch.set_num_threads `_, the default number of threads is the number of physical cores in a hyperthreading enabled system. This can be verified by `torch.get_num_threads `_. Hence we see above about half of the cores busy running the example script.)
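+
+As a quick check (a minimal sketch, not part of the original exercise), the
+intra-op thread count can be inspected and overridden from Python:
+
+.. code:: python
+
+    import torch
+
+    # defaults to the number of physical cores on a hyperthreading-enabled system
+    print(torch.get_num_threads())
+
+    # explicitly use 2 intra-op threads instead
+    torch.set_num_threads(2)
+    print(torch.get_num_threads())  # 2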
+
+.. figure:: /_static/img/torchserve-ipex-images/10.png
+ :width: 100%
+ :align: center
+
+Figure 4. Non-Uniform Memory Access Analysis graph
+
+
+Figure 4 compares local vs. remote memory access over time. We verify the usage of remote memory, which could result in sub-optimal performance.
+
+:strong:`Set thread affinity to reduce remote memory access and cross-socket (UPI) traffic`
+
+Pinning threads to cores on the same socket helps maintain locality of memory access. In this example, we'll pin to the physical cores on the first NUMA node (0-27). With the launch script, users can easily experiment with NUMA node configurations by simply toggling the ``--node_id`` launch script knob.
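+
+If you want to reproduce this pinning without the launch script, one option (a
+sketch assuming a Linux machine with the core numbering shown above) is to
+restrict the process's CPU affinity before the OpenMP threads are spawned:
+
+.. code:: python
+
+    import os
+    import torch
+
+    # allow this process, and the threads it spawns, to run only on the
+    # physical cores of the first NUMA node (cores 0-27 on this machine)
+    os.sched_setaffinity(0, set(range(28)))
+    torch.set_num_threads(28)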
+
+Let's visualize the CPU usage now.
+
+.. figure:: /_static/img/torchserve-ipex-images/11.gif
+ :width: 100%
+ :align: center
+
+Figure 5. CPU usage of NUMA-aware application
+
+1 main worker thread was launched, then it launched threads on all physical cores on the first NUMA node.
+
+.. figure:: /_static/img/torchserve-ipex-images/12.png
+ :width: 100%
+ :align: center
+
+Figure 6. Non-Uniform Memory Access Analysis graph
+
+As shown in Figure 6, almost all memory accesses are now local accesses.
+
+Efficient CPU usage with core pinning for multi-worker inference
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When running multi-worker inference, cores are overlapped (or shared) between workers, causing inefficient CPU usage. To address this problem, the launch script equally divides the number of available cores by the number of workers such that each worker is pinned to its assigned cores during runtime.
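+
+Conceptually, the assignment looks like the following sketch (illustrative only;
+the actual logic lives inside the launch script):
+
+.. code:: python
+
+    num_physical_cores = 56   # 28 cores per socket x 2 sockets on this machine
+    num_workers = 4
+    cores_per_worker = num_physical_cores // num_workers  # 14
+
+    for worker_id in range(num_workers):
+        start = worker_id * cores_per_worker
+        end = start + cores_per_worker - 1
+        print(f"worker {worker_id}: cores {start}-{end}")
+    # worker 0: cores 0-13,  worker 1: cores 14-27,
+    # worker 2: cores 28-41, worker 3: cores 42-55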
+
+:strong:`Exercise with TorchServe`
+
+For this exercise, let's apply the CPU performance tuning principles and recommendations that we have discussed so far to `TorchServe apache-bench benchmarking `_.
+
+We'll use ResNet50 with 4 workers, concurrency 100, requests 10,000. All other parameters (e.g., batch_size, input, etc) are the same as the `default parameters `_.
+
+We'll compare the following three configurations:
+
+(1) default TorchServe setting (no core pinning)
+
+(2) `torch.set_num_threads `_ = ``number of physical cores / number of workers`` (no core pinning)
+
+(3) core pinning via the launch script
+
+After this exercise, we will have verified with a real TorchServe use case that avoiding logical cores and preferring local memory access via core pinning improve performance.
+
+1. Default TorchServe setting (no core pinning)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The `base_handler `_ doesn't explicitly set `torch.set_num_threads `_. Hence the default number of threads is the number of physical CPU cores, as described `here `_. Users can check the number of threads with `torch.get_num_threads `_ in the base_handler. Each of the 4 main worker threads launches as many threads as there are physical cores (56), launching a total of 56x4 = 224 threads, which is more than the total number of cores, 112. Therefore cores are guaranteed to be heavily overlapped, with high logical core utilization - multiple workers use multiple cores at the same time. Furthermore, because threads are not affinitized to specific CPU cores, the operating system periodically schedules threads to cores located in different sockets.
+
+1. CPU usage
+
+.. figure:: /_static/img/torchserve-ipex-images/13.png
+ :width: 100%
+ :align: center
+
+4 main worker threads were launched, then each launched as many threads as there are physical cores (56), on all cores including logical cores.
+
+2. Core Bound stalls
+
+.. figure:: /_static/img/torchserve-ipex-images/14.png
+ :width: 80%
+ :align: center
+
+We observe a very high Core Bound stall of 88.4%, decreasing pipeline efficiency. Core Bound stalls indicate sub-optimal use of available execution units in the CPU. For example, several GEMM instructions in a row competing for fused-multiply-add (FMA) or dot-product (DP) execution units shared by hyperthreading cores could cause Core Bound stalls. And as described in the previous section, use of logical cores amplifies this problem.
+
+
+.. figure:: /_static/img/torchserve-ipex-images/15.png
+ :width: 40%
+ :align: center
+
+.. figure:: /_static/img/torchserve-ipex-images/16.png
+ :width: 50%
+ :align: center
+
+An empty pipeline slot not filled with micro-ops (uOps) is attributed to a stall. For example, without core pinning CPU time may not be spent effectively on compute but on other operations like thread scheduling by the Linux kernel. We see above that ``__sched_yield`` contributed to the majority of the Spin Time.
+
+3. Thread Migration
+
+Without core pinning, the scheduler may migrate a thread executing on one core to a different core. Thread migration can disassociate the thread from data that has already been fetched into the caches, resulting in longer data access latencies. This problem is exacerbated in NUMA systems when a thread migrates across sockets. Data that was fetched into the high-speed cache on local memory then becomes remote memory, which is much slower.
+
+.. figure:: /_static/img/torchserve-ipex-images/17.png
+ :width: 50%
+ :align: center
+
+Generally the total number of threads should be less than or equal to the total number of threads supported by the core. In the above example, we notice a large number of threads executing on core_51 instead of the expected 2 threads (since hyperthreading is enabled in Intel(R) Xeon(R) Platinum 8180 CPUs). This indicates thread migration.
+
+.. figure:: /_static/img/torchserve-ipex-images/18.png
+ :width: 80%
+ :align: center
+
+Additionally, notice that thread (TID:97097) was executing on a large number of CPU cores, indicating CPU migration. For example, this thread was executing on cpu_81, then migrated to cpu_14, then migrated to cpu_5, and so on. Furthermore, note that this thread migrated cross socket back and forth many times, resulting in very inefficient memory access. For example, this thread executed on cpu_70 (NUMA node 0), then migrated to cpu_100 (NUMA node 1), then migrated to cpu_24 (NUMA node 0).
+
+4. Non Uniform Memory Access Analysis
+
+.. figure:: /_static/img/torchserve-ipex-images/19.png
+ :width: 100%
+ :align: center
+
+Comparing local vs. remote memory access over time, we observe that about half (51.09%) of the memory accesses were remote, indicating a sub-optimal NUMA configuration.
+
+2. torch.set_num_threads = ``number of physical cores / number of workers`` (no core pinning)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+For an apples-to-apples comparison with the launcher's core pinning, we'll set the number of threads to the number of physical cores divided by the number of workers (the launcher does this internally). Add the following code snippet to the `base_handler `_:
+
+.. code:: python
+
+    torch.set_num_threads(num_physical_cores // num_workers)
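+
+(``num_physical_cores`` and ``num_workers`` are placeholders here; the number of
+physical cores could, for example, be obtained with ``psutil.cpu_count(logical=False)``.)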
+
+As before, since core pinning is not used, these threads are not affinitized to specific CPU cores, and the operating system periodically schedules threads on cores located in different sockets.
+
+1. CPU usage
+
+.. figure:: /_static/img/torchserve-ipex-images/20.gif
+ :width: 100%
+ :align: center
+
+4 main worker threads were launched, then each launched ``num_physical_cores/num_workers`` (14) threads on all cores, including logical cores.
+
+2. Core Bound stalls
+
+.. figure:: /_static/img/torchserve-ipex-images/21.png
+ :width: 80%
+ :align: center
+
+Although the percentage of Core Bound stalls has decreased from 88.4% to 73.5%, it is still very high.
+
+.. figure:: /_static/img/torchserve-ipex-images/22.png
+ :width: 40%
+ :align: center
+
+.. figure:: /_static/img/torchserve-ipex-images/23.png
+ :width: 50%
+ :align: center
+
+3. Thread Migration
+
+.. figure:: /_static/img/torchserve-ipex-images/24.png
+ :width: 75%
+ :align: center
+
+Similar to before, without core pinning the thread (TID:94290) was executing on a large number of CPU cores, indicating CPU migration. We again notice cross-socket thread migration, resulting in very inefficient memory access. For example, this thread executed on cpu_78 (NUMA node 0), then migrated to cpu_108 (NUMA node 1).
+
+4. Non Uniform Memory Access Analysis
+
+.. figure:: /_static/img/torchserve-ipex-images/25.png
+ :width: 100%
+ :align: center
+
+Although this is an improvement over the original 51.09%, 40.45% of memory accesses are still remote, indicating a sub-optimal NUMA configuration.
+
+3. launcher core pinning
+~~~~~~~~~~~~~~~~~~~~~~~~
+The launcher internally distributes the physical cores equally among the workers and binds each worker to its share. As a reminder, the launcher by default uses physical cores only. In this example, the launcher will bind worker 0 to cores 0-13 (NUMA node 0), worker 1 to cores 14-27 (NUMA node 0), worker 2 to cores 28-41 (NUMA node 1), and worker 3 to cores 42-55 (NUMA node 1). Doing so ensures that cores are not overlapped among workers and avoids logical core usage.
+
+1. CPU usage
+
+.. figure:: /_static/img/torchserve-ipex-images/26.gif
+ :width: 100%
+ :align: center
+
+4 main worker threads were launched, then each launched ``num_physical_cores/num_workers`` (14) threads affinitized to the assigned physical cores.
+
+2. Core Bound stalls
+
+.. figure:: /_static/img/torchserve-ipex-images/27.png
+ :width: 80%
+ :align: center
+
+Core Bound stalls have decreased significantly, from the original 88.4% to 46.2% - almost a 2x improvement.
+
+.. figure:: /_static/img/torchserve-ipex-images/28.png
+ :width: 40%
+ :align: center
+
+.. figure:: /_static/img/torchserve-ipex-images/29.png
+ :width: 50%
+ :align: center
+
+We verify that with core binding, most CPU time is effectively used on compute - the Spin Time is only 0.256 s.
+
+3. Thread Migration
+
+.. figure:: /_static/img/torchserve-ipex-images/30.png
+ :width: 100%
+ :align: center
+
+We verify that `OMP Primary Thread #0` was bound to the assigned physical cores (42-55) and did not migrate across sockets.
+
+4. Non Uniform Memory Access Analysis
+
+.. figure:: /_static/img/torchserve-ipex-images/31.png
+ :width: 100%
+ :align: center
+
+Now almost all memory accesses (89.52%) are local accesses.
+
+Conclusion
+~~~~~~~~~~
+
+In this blog, we've shown that properly setting your CPU runtime configuration can significantly boost out-of-the-box CPU performance.
+
+We have walked through some general CPU performance tuning principles and recommendations:
+
+- In a hyperthreading enabled system, avoid logical cores by setting thread affinity to physical cores only via core pinning.
+- In a multi-socket system with NUMA, avoid cross-socket remote memory access by setting thread affinity to a specific socket via core pinning.
+
+We have visually explained these ideas from first principles and have verified the performance boost with profiling. Finally, we have applied all of our learnings to TorchServe to boost out-of-the-box TorchServe CPU performance.
+
+These principles can be automatically configured via an easy-to-use launch script which has already been integrated into TorchServe.
+
+For interested readers, please check out the following documents:
+
+- `CPU specific optimizations `_
+- `Maximize Performance of Intel® Software Optimization for PyTorch* on CPU `_
+- `Performance Tuning Guide `_
+- `Launch Script Usage Guide `_
+- `Top-down Microarchitecture Analysis Method `_
+- `Configuring oneDNN for Benchmarking `_
+- `Intel® VTune™ Profiler `_
+- `Intel® VTune™ Profiler User Guide `_
+
+And stay tuned for follow-up posts on optimized kernels on CPU via `Intel® Extension for PyTorch* `_ and advanced launcher configurations such as the memory allocator.
+
+Acknowledgement
+~~~~~~~~~~~~~~~
+
+We would like to thank Ashok Emani (Intel) and Jiong Gong (Intel) for their immense guidance and support, and thorough feedback and reviews throughout many steps of this blog. We would also like to thank Hamid Shojanazeri (Meta), Li Ning (AWS) and Jing Xu (Intel) for helpful feedback in code review. And Suraj Subramanian (Meta) and Geeta Chauhan (Meta) for helpful feedback on the blog.
diff --git a/prototype_source/fx_numeric_suite_tutorial.py b/prototype_source/fx_numeric_suite_tutorial.py
new file mode 100644
index 000000000..ac43ae49e
--- /dev/null
+++ b/prototype_source/fx_numeric_suite_tutorial.py
@@ -0,0 +1,231 @@
+# -*- coding: utf-8 -*-
+"""
+PyTorch FX Numeric Suite Core APIs Tutorial
+===========================================
+
+Introduction
+------------
+
+Quantization is good when it works, but it is difficult to know what is wrong
+when it does not reach the accuracy we expect. Debugging quantization accuracy
+issues is hard and time-consuming.
+
+One important step of debugging is to measure the statistics of the float model
+and its corresponding quantized model to know where they differ most.
+We built a suite of numeric tools called PyTorch FX Numeric Suite Core APIs in
+PyTorch quantization to enable the measurement of statistics between the
+quantized module and the float module and support quantization debugging efforts.
+Even for a quantized model with good accuracy, PyTorch FX Numeric Suite Core
+APIs can still be used as a profiling tool to better understand the
+quantization error within the model and provide guidance for further
+optimization.
+
+PyTorch FX Numeric Suite Core APIs currently support models quantized through
+both static quantization and dynamic quantization, with unified APIs.
+
+In this tutorial we will use MobileNetV2 as an example to show how to use
+PyTorch FX Numeric Suite Core APIs to measure the statistics between a
+statically quantized model and its float counterpart.
+
+Setup
+^^^^^
+We’ll start by doing the necessary imports:
+"""
+
+##############################################################################
+
+# Imports and util functions
+
+import copy
+import torch
+import torchvision
+import torch.quantization
+import torch.ao.ns._numeric_suite_fx as ns
+import torch.quantization.quantize_fx as quantize_fx
+
+import matplotlib.pyplot as plt
+from tabulate import tabulate
+
+torch.manual_seed(0)
+plt.style.use('seaborn-whitegrid')
+
+
+# a simple line graph
+def plot(xdata, ydata, xlabel, ylabel, title):
+ _ = plt.figure(figsize=(10, 5), dpi=100)
+ plt.xlabel(xlabel)
+ plt.ylabel(ylabel)
+ plt.title(title)
+ ax = plt.axes()
+ ax.plot(xdata, ydata)
+ plt.show()
+
+##############################################################################
+# Then we load the pretrained float MobileNetV2 model, and quantize it.
+
+
+# create float model
+mobilenetv2_float = torchvision.models.quantization.mobilenet_v2(
+ pretrained=True, quantize=False).eval()
+
+# create quantized model
+qconfig_dict = {
+ '': torch.quantization.get_default_qconfig('fbgemm'),
+ # adjust the qconfig to make the results more interesting to explore
+ 'module_name': [
+ # turn off quantization for the first couple of layers
+ ('features.0', None),
+ ('features.1', None),
+ # use MinMaxObserver for `features.17`, this should lead to worse
+ # weight SQNR
+ ('features.17', torch.quantization.default_qconfig),
+ ]
+}
+# Note: quantization APIs are inplace, so we save a copy of the float model for
+# later comparison to the quantized model. This is done throughout the
+# tutorial.
+mobilenetv2_prepared = quantize_fx.prepare_fx(
+ copy.deepcopy(mobilenetv2_float), qconfig_dict)
+datum = torch.randn(1, 3, 224, 224)
+mobilenetv2_prepared(datum)
+# Note: there is a long standing issue that we cannot copy.deepcopy a
+# quantized model. Since quantization APIs are inplace and we need to use
+# different copies of the quantized model throughout this tutorial, we call
+# `convert_fx` on a copy, so we have access to the original `prepared_model`
+# later. This is done throughout the tutorial.
+mobilenetv2_quantized = quantize_fx.convert_fx(
+ copy.deepcopy(mobilenetv2_prepared))
+
+##############################################################################
+# 1. Compare the weights of float and quantized models
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+# The first analysis we can do is comparing the weights of the fp32 model and
+# the int8 model by calculating the SQNR between each pair of weights.
+#
+# The `extract_weights` API can be used to extract weights from linear,
+# convolution and LSTM layers. It works for dynamic quantization as well as
+# PTQ/QAT.
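+#
+# As a rough intuition (a sketch, not part of the Numeric Suite API itself),
+# SQNR is commonly defined as ``20 * log10(||x|| / ||x - x_q||)`` in decibels,
+# so higher values mean the quantized tensor is closer to the original:
+#
+# .. code:: python
+#
+#     import torch
+#
+#     x = torch.randn(16)
+#     x_q = torch.dequantize(torch.quantize_per_tensor(x, 0.1, 0, torch.qint8))
+#     sqnr_db = 20 * torch.log10(torch.norm(x) / torch.norm(x - x_q))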
+
+# Note: when comparing weights in models with Conv-BN for PTQ, we need to
+# compare weights after Conv-BN fusion for a proper comparison. Because of
+# this, we use `prepared_model` instead of `float_model` when comparing
+# weights.
+
+# Extract conv and linear weights from corresponding parts of two models, and
+# save them in `wt_compare_dict`.
+mobilenetv2_wt_compare_dict = ns.extract_weights(
+ 'fp32', # string name for model A
+ mobilenetv2_prepared, # model A
+ 'int8', # string name for model B
+ mobilenetv2_quantized, # model B
+)
+
+# calculate SQNR between each pair of weights
+ns.extend_logger_results_with_comparison(
+ mobilenetv2_wt_compare_dict, # results object to modify inplace
+ 'fp32', # string name of model A (from previous step)
+ 'int8', # string name of model B (from previous step)
+ torch.ao.ns.fx.utils.compute_sqnr, # tensor comparison function
+ 'sqnr', # the name to use to store the results under
+)
+
+# massage the data into a format easy to graph and print
+mobilenetv2_wt_to_print = []
+for idx, (layer_name, v) in enumerate(mobilenetv2_wt_compare_dict.items()):
+ mobilenetv2_wt_to_print.append([
+ idx,
+ layer_name,
+ v['weight']['int8'][0]['prev_node_target_type'],
+ v['weight']['int8'][0]['values'][0].shape,
+ v['weight']['int8'][0]['sqnr'][0],
+ ])
+
+# plot the SQNR between fp32 and int8 weights for each layer
+plot(
+ [x[0] for x in mobilenetv2_wt_to_print],
+ [x[4] for x in mobilenetv2_wt_to_print],
+ 'idx',
+ 'sqnr',
+ 'weights, idx to sqnr'
+)
+
+##############################################################################
+# Also print out the SQNR, so we can inspect the layer name and type:
+
+print(tabulate(
+ mobilenetv2_wt_to_print,
+ headers=['idx', 'layer_name', 'type', 'shape', 'sqnr']
+))
+
+##############################################################################
+# 2. Compare activations API
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^
+# The second tool allows for comparison of activations between float and
+# quantized models at corresponding locations for the same input.
+#
+# .. figure:: /_static/img/compare_output.png
+#
+# The `add_loggers`/`extract_logger_info` API can be used to extract
+# activations from any layer with a `torch.Tensor` return type. It works for
+# dynamic quantization as well as PTQ/QAT.
+
+# Compare unshadowed activations
+
+# Create a new copy of the quantized model, because we cannot `copy.deepcopy`
+# a quantized model.
+mobilenetv2_quantized = quantize_fx.convert_fx(
+ copy.deepcopy(mobilenetv2_prepared))
+mobilenetv2_float_ns, mobilenetv2_quantized_ns = ns.add_loggers(
+ 'fp32', # string name for model A
+ copy.deepcopy(mobilenetv2_prepared), # model A
+ 'int8', # string name for model B
+ mobilenetv2_quantized, # model B
+ ns.OutputLogger, # logger class to use
+)
+
+# feed data through network to capture intermediate activations
+mobilenetv2_float_ns(datum)
+mobilenetv2_quantized_ns(datum)
+
+# extract intermediate activations
+mobilenetv2_act_compare_dict = ns.extract_logger_info(
+ mobilenetv2_float_ns, # model A, with loggers (from previous step)
+ mobilenetv2_quantized_ns, # model B, with loggers (from previous step)
+ ns.OutputLogger, # logger class to extract data from
+ 'int8', # string name of model to use for layer names for the output
+)
+
+# add SQNR comparison
+ns.extend_logger_results_with_comparison(
+ mobilenetv2_act_compare_dict, # results object to modify inplace
+ 'fp32', # string name of model A (from previous step)
+ 'int8', # string name of model B (from previous step)
+ torch.ao.ns.fx.utils.compute_sqnr, # tensor comparison function
+ 'sqnr', # the name to use to store the results under
+)
+
+# massage the data into a format easy to graph and print
+mobilenet_v2_act_to_print = []
+for idx, (layer_name, v) in enumerate(mobilenetv2_act_compare_dict.items()):
+ mobilenet_v2_act_to_print.append([
+ idx,
+ layer_name,
+ v['node_output']['int8'][0]['prev_node_target_type'],
+ v['node_output']['int8'][0]['values'][0].shape,
+ v['node_output']['int8'][0]['sqnr'][0]])
+
+# plot the SQNR between fp32 and int8 activations for each layer
+plot(
+ [x[0] for x in mobilenet_v2_act_to_print],
+ [x[4] for x in mobilenet_v2_act_to_print],
+ 'idx',
+ 'sqnr',
+ 'unshadowed activations, idx to sqnr',
+)
+
+##############################################################################
+# Also print out the SQNR, so we can inspect the layer name and type:
+print(tabulate(
+ mobilenet_v2_act_to_print,
+ headers=['idx', 'layer_name', 'type', 'shape', 'sqnr']
+))
diff --git a/recipes_source/recipes/loading_data_recipe.py b/recipes_source/recipes/loading_data_recipe.py
index 0442e85f1..f58bbd899 100644
--- a/recipes_source/recipes/loading_data_recipe.py
+++ b/recipes_source/recipes/loading_data_recipe.py
@@ -68,16 +68,14 @@
#
# The YesNo dataset in ``torchaudio`` consists of 60 audio clips of one person
# saying yes or no in Hebrew. Each audio clip is 8 words long.
-# (`Learn more `__).
+# ( `Learn more `__ ).
#
# Create the YesNo dataset using the ``torchaudio.datasets.YESNO`` class.
torchaudio.datasets.YESNO(
- root,
+ root='./',
url='http://www.openslr.org/resources/1/waves_yesno.tar.gz',
folder_in_archive='waves_yesno',
- download=False,
- transform=None,
- target_transform=None)
+ download=True)
###########################################################################
# Each data item is a tuple of the form (waveform, sample_rate, labels).
@@ -87,9 +85,7 @@
# The other parameters are optional, and you can see their default values in the
# example above. The following parameters are also available.
#
-# * ``download``: If True, downloads the dataset files from the internet and saves them in the root folder. Files that already exist are not downloaded again.
-# * ``transform``: Lets you transform the data so that it can be concatenated and loaded in a denormalized form for training. Each library supports a variety of transforms, and more will be added over time.
-# * ``target_transform``: A function or transform for transforming the target data.
+# * ``download``: If True, downloads the dataset files from the internet and saves them in the root folder. Files that already exist are not downloaded again.
#
# Now let's take a look at the YesNo data:
diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py
index 9094da055..86d0d4cf4 100644
--- a/recipes_source/recipes/tuning_guide.py
+++ b/recipes_source/recipes/tuning_guide.py
@@ -137,9 +137,9 @@ def fused_gelu(x):
# Support for ``channels_last`` is experimental, but it's expected to work for
# standard computer vision models (e.g. ResNet-50, SSD). To convert models to
# ``channels_last`` format follow
-# `Channels Last Memory Format Tutorial `_.
+# `Channels Last Memory Format Tutorial `_.
# The tutorial includes a section on
-# `converting existing models `_.
+# `converting existing models `_.
###############################################################################
# Checkpoint intermediate buffers
@@ -236,6 +236,43 @@ def fused_gelu(x):
# export LD_PRELOAD=:$LD_PRELOAD
+###############################################################################
+# Use oneDNN Graph with TorchScript for inference
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# oneDNN Graph can significantly boost inference performance. It fuses compute-intensive operations such as convolution and matmul with their neighboring operations.
+# Currently, it is supported as an experimental feature for the Float32 data type.
+# oneDNN Graph receives the model’s graph and identifies candidates for operator fusion with respect to the shape of the example input.
+# A model should be JIT-traced using an example input.
+# Speed-up would then be observed after a couple of warm-up iterations for inputs with the same shape as the example input.
+# The example code snippets below are for resnet50, but they can easily be extended to use oneDNN Graph with custom models as well.
+
+# Only this extra line of code is required to use oneDNN Graph
+torch.jit.enable_onednn_fusion(True)
+
+###############################################################################
+# Using the oneDNN Graph API requires just one extra line of code.
+# If you are using oneDNN Graph, please avoid calling ``torch.jit.optimize_for_inference``.
+
+# sample input should be of the same shape as expected inputs
+sample_input = [torch.rand(32, 3, 224, 224)]
+# Using resnet50 from TorchVision in this example for illustrative purposes,
+# but the line below can indeed be modified to use custom models as well.
+model = getattr(torchvision.models, "resnet50")().eval()
+# Tracing the model with example input
+traced_model = torch.jit.trace(model, sample_input)
+# Invoking torch.jit.freeze
+traced_model = torch.jit.freeze(traced_model)
+
+###############################################################################
+# Once a model is JIT-traced with a sample input, it can then be used for inference after a couple of warm-up runs.
+
+with torch.no_grad():
+ # a couple of warmup runs
+ traced_model(*sample_input)
+ traced_model(*sample_input)
+ # speedup would be observed after warmup runs
+ traced_model(*sample_input)
+
###############################################################################
# Train a model on CPU with PyTorch DistributedDataParallel(DDP) functionality
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -326,7 +363,7 @@ def fused_gelu(x):
# * native PyTorch AMP is available starting from PyTorch 1.6:
# `documentation `_,
# `examples `_,
-# `tutorial `_
+# `tutorial `_
#
#
diff --git a/requirements-noplot.txt b/requirements-noplot.txt
index 44540fc8f..4b761567c 100644
--- a/requirements-noplot.txt
+++ b/requirements-noplot.txt
@@ -15,7 +15,8 @@ torchvision
torchtext
torchaudio
torchdata
-#functorch
+# Functorch is not needed, as intermediate_source/forward_ad_usage.py is not rendered
+# functorch
# PyTorch Theme
pytorch-sphinx-theme @ https://github.com/PyTorchKorea/pytorch_sphinx_theme/archive/master.zip
diff --git a/requirements.txt b/requirements.txt
index 8c3e0b482..5a26253ab 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,7 +19,8 @@ torchvision
torchtext
torchaudio
torchdata
-functorch
+# Functorch is not needed, as intermediate_source/forward_ad_usage.py is not rendered
+# functorch
PyHamcrest
bs4
awscliv2==2.1.1
@@ -42,6 +43,6 @@ scikit-image
scipy
pillow
wget
-gym
+gym==0.24.0
gym-super-mario-bros==7.3.0
timm