Skip to content

Commit

Permalink
Update pretraining slides
Browse files Browse the repository at this point in the history
  • Loading branch information
angelxuanchang committed Feb 26, 2024
1 parent a7eb251 commit 1f52e54
Show file tree
Hide file tree
Showing 3 changed files with 207 additions and 9 deletions.
19 changes: 11 additions & 8 deletions _data/schedule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -136,18 +136,18 @@
dates:
- date: "2/26"
title: "Pretraining and fine-tuning"
lecture: "pretraining"
lecture: "peft"
leclinks:
- name: "pretraining slides"
url: "assets/lecture-slides/L20-pretraining.pdf"
url: "assets/lecture-slides/L15-pretraining.pdf"
- name: "fine-tuning slides"
url: "assets/slides/peft.pdf"
- date: "2/28"
title: "Final project tips, model debugging and analysis"
lecture: "analysis"
lecture: "fewshot"
title: "Few-shot and in-context learning"
leclinks:
- name: "slides"
url: "assets/lecture-slides/L17-project-tips-analysis.pdf"
url: "assets/slides/fewshot.pdf"
hwdue: "3"
hwout: "4"
- date: "2/29"
Expand Down Expand Up @@ -177,11 +177,11 @@
- name: "slides"
url: "assets/lecture-slides/L18-question-answering.pdf"
- date: "3/13"
title: "Text Generation"
lecture: "textgen"
title: "Final project tips, model debugging and analysis"
lecture: "analysis"
leclinks:
- name: "slides"
url: "assets/lecture-slides/L21-text-generation.pdf"
url: "assets/lecture-slides/L17-project-tips-analysis.pdf"
hwdue: "4"
- week:
num: 11
Expand All @@ -195,6 +195,9 @@
- date: "3/20"
title: "Scaling laws for LLMs"
lecture: "scaling-laws"
leclinks:
- name: "slides"
url: "assets/slides/scaling.pdf"
- date: "3/21"
project: "Project milestone"
- week:
Expand Down
197 changes: 196 additions & 1 deletion _data/syllabus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,201 @@
author: "Rogers, Kovaleva, and Rumshisky"
url: "https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00349/96482/A-Primer-in-BERTology-What-We-Know-About-How-BERT"
optional: true
- title: "Tokenization"
tag: bpe
include: true
current: false
notes:
- title: "Lecture notes"
url: "assets/slides/tokenization.pdf"
- title: "Tokenization into Sub-words with Byte-Pair Encoding"
url: "http://nbviewer.jupyter.org/github/anoopsarkar/nlp-class/blob/gh-pages/assets/notebooks/bpe.ipynb"
download: "http://github.com/anoopsarkar/nlp-class/raw/gh-pages/assets/notebooks/bpe.ipynb"
links:
- title: "BPE tutorial at Huggingface"
url: "https://huggingface.co/learn/nlp-course/chapter6/5?fw=pt"
author: "Huggingface"
optional: true
- title: "Pre-training Language Models"
tag: "pretraining"
include: true
current: false
notes:
- title: Lecture notes
url: "assets/slides/pre_training.pdf"
- title: "Improving language understanding with unsupervised learning"
author: "Alec Radford, Karthik Narasimhan, Tim Salimans, Ilya Sutskever"
url: "https://openai.com/research/language-unsupervised"
links:
- title: "Semi-supervised Sequence Learning"
author: "Andrew M. Dai, Quoc V. Le"
url: "https://arxiv.org/abs/1511.01432"
- title: "Deep contextualized word representations"
author: "Matthew E. Peters, Mark Neumann, Mohit Iyyer, Matt Gardner, Christopher Clark, Kenton Lee, Luke Zettlemoyer"
url: "https://arxiv.org/abs/1802.05365"
- title: "RoBERTa: A Robustly Optimized BERT Pretraining Approach"
author: "Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov"
url: "https://arxiv.org/abs/1907.11692"
- title: "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
author: "Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov"
url: "https://arxiv.org/abs/1901.02860"
- title: "ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators"
author: "Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning"
url: "https://arxiv.org/abs/2003.10555"
- title: "Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer"
author: "Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu"
url: "https://arxiv.org/abs/1910.10683"
- title: "ALBERT: A Lite BERT for Self-supervised Learning of Language Representations"
author: "Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut"
url: "https://arxiv.org/abs/1909.11942"
- title: "Decoding"
tag: "decoding"
include: true
current: false
notes:
- title: "Lecture notes"
url: "assets/slides/decoding.pdf"
- title: "Intro to Generation with LLMs"
author: "Huggingface"
url: "https://huggingface.co/docs/transformers/llm_tutorial"
- title: "Generation Strategies"
author: "Huggingface"
url: "https://huggingface.co/docs/transformers/generation_strategies"
- title: "Contrastive Search"
author: "Tian Lan"
url: "https://huggingface.co/blog/introducing-csearch#62-example-two---opt"
- title: "Categorical Reparameterization with Gumbel-Softmax"
author: "Eric Jang, Shixiang Gu, Ben Poole"
url: "https://arxiv.org/abs/1611.01144v5"
links:
- title: "A Contrastive Framework for Neural Text Generation"
author: "Yixuan Su, Tian Lan, Yan Wang, Dani Yogatama, Lingpeng Kong, Nigel Collier"
url: "https://arxiv.org/abs/2202.06417"
- title: "Parameter-efficient Fine Tuning"
tag: "peft"
include: true
current: false
notes:
- title: Lecture notes
url: "assets/slides/peft.pdf"
readings:
- title: "HuggingFace PEFT: Parameter-Efficient Fine-Tuning of Billion-Scale Models on Low-Resource Hardware"
author: "Sourab Mangrulkar, Sayak Paul"
url: "https://huggingface.co/blog/peft"
- title: "Prefix-Tuning: Optimizing Continuous Prompts for Generation"
author: "Xiang Lisa Li, Percy Liang"
url: "https://arxiv.org/abs/2101.00190"
optional: true
- title: "AdaMix: Mixture-of-Adaptations for Parameter-efficient Model Tuning"
author: "Yaqing Wang, Sahaj Agarwal, Subhabrata Mukherjee, Xiaodong Liu, Jing Gao, Ahmed Hassan Awadallah, Jianfeng Gao"
url: "https://arxiv.org/abs/2205.12410"
optional: true
- title: "LoRA: Low-Rank Adaptation of Large Language Models"
author: "Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen"
url: "https://arxiv.org/abs/2106.09685"
optional: true
- title: "Adapter methods"
author: "docs.adapterhub.ml"
url: "https://docs.adapterhub.ml/overview.html"
optional: true
links:
- title: "AdapterHub: A Framework for Adapting Transformers"
author: "Jonas Pfeiffer, Andreas Rücklé, Clifton Poth, Aishwarya Kamath, Ivan Vulić, Sebastian Ruder, Kyunghyun Cho, Iryna Gurevych"
url: "https://aclanthology.org/2020.emnlp-demos.7/"
- title: "Parameter-Efficient Transfer Learning for NLP"
author: "Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin de Laroussilhe, Andrea Gesmundo, Mona Attariyan, Sylvain Gelly"
url: "https://arxiv.org/abs/1902.00751"
- title: "Simple, Scalable Adaptation for Neural Machine Translation"
author: "Ankur Bapna, Naveen Arivazhagan, Orhan Firat"
url: "https://arxiv.org/abs/1909.08478"
- title: "AdapterFusion: Non-Destructive Task Composition for Transfer Learning"
author: "Jonas Pfeiffer, Aishwarya Kamath, Andreas Rücklé, Kyunghyun Cho, Iryna Gurevych"
url: "https://aclanthology.org/2021.eacl-main.39/"
- title: "Parameter-Efficient Tuning with Special Token Adaptation"
author: "Xiaocong Yang, James Y. Huang, Wenxuan Zhou, Muhao Chen"
url: "https://aclanthology.org/2023.eacl-main.60/"
- title: "Few-shot and in-context learning"
tag: "fewshot"
include: true
current: false
notes:
- title: Lecture notes
url: "assets/slides/fewshot.pdf"
readings:
- title: "Language Models are Unsupervised Multitask Learners"
author: "OpenAI"
url: "https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf"
- title: "Language Models are Few-Shot Learners"
author: "OpenAI"
url: "https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf"
- title: "GPT-4 Technical Report"
author: "OpenAI"
url: "https://arxiv.org/abs/2303.08774"
- title: "Rethinking the Role of Demonstrations: What Makes In-Context Learning Work?"
author: "Sewon Min, Xinxi Lyu, Ari Holtzman, Mikel Artetxe, Mike Lewis, Hannaneh Hajishirzi, Luke Zettlemoyer"
url: "https://arxiv.org/abs/2202.12837"
- title: "In-Context Learning Learns Label Relationships but Is Not Conventional Learning"
author: "Jannik Kossen, Yarin Gal, Tom Rainforth"
url: "https://arxiv.org/abs/2307.12375"
links:
- title: "In-context Examples Selection for Machine Translation"
author: "Sweta Agrawal, Chunting Zhou, Mike Lewis, Luke Zettlemoyer, Marjan Ghazvininejad"
url: "https://aclanthology.org/2023.findings-acl.564/"
- title: "How Good Are GPT Models at Machine Translation? A Comprehensive Evaluation"
author: "Amr Hendy, Mohamed Abdelrehim, Amr Sharaf, Vikas Raunak, Mohamed Gabr, Hitokazu Matsushita, Young Jin Kim, Mohamed Afify, Hany Hassan Awadalla"
url: "https://arxiv.org/abs/2302.09210"
- title: "Instruction tuning and Preference Optimization"
tag: "instruct"
include: true
current: false
notes:
- title: Lecture notes
url: "assets/slides/instruct_tuning.pdf"
- title: "Direct Preference Optimization: Your Language Model is Secretly a Reward Model"
author: "Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, Chelsea Finn"
url: "https://arxiv.org/abs/2305.18290"
links:
- title: "Scaling Instruction-Finetuned Language Models"
author: "Google"
url: "https://arxiv.org/abs/2210.11416"
- title: "Proximal Policy Optimization Algorithms"
author: "John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov"
url: "https://arxiv.org/abs/1707.06347"
- title: "LIMA: Less Is More for Alignment"
author: "Chunting Zhou, Pengfei Liu, Puxin Xu, Srini Iyer, Jiao Sun, Yuning Mao, Xuezhe Ma, Avia Efrat, Ping Yu, Lili Yu, Susan Zhang, Gargi Ghosh, Mike Lewis, Luke Zettlemoyer, Omer Levy"
url: "https://arxiv.org/abs/2305.11206"
- title: "Scaling Laws for LLMs"
tag: "scaling"
include: true
current: false
notes:
- title: Lecture notes
url: "assets/slides/scaling.pdf"
- title: "Scaling Laws for Neural Language Models"
author: "OpenAI"
url: "https://arxiv.org/abs/2001.08361"
- title: "Training Compute-Optimal Large Language Models"
author: "Google Deepmind"
url: "https://arxiv.org/abs/2203.15556"
- title: "GLaM: Efficient Scaling of Language Models with Mixture-of-Experts"
author: "Google"
url: "https://arxiv.org/abs/2112.06905"
- title: "PaLM: Scaling Language Modeling with Pathways"
author: "Google"
url: "https://arxiv.org/abs/2204.02311"
links:
- title: "AI and Compute"
author: "OpenAI"
url: "https://openai.com/research/ai-and-compute"
- title: "Scaling laws notebook"
author: "Andrej Karpathy"
url: "https://github.com/karpathy/nanoGPT/blob/master/scaling_laws.ipynb"
- title: "The FLOPs Calculus of Language Model Training"
author: "Dzmitry Bahdanau"
url: "https://medium.com/@dzmitrybahdanau/the-flops-calculus-of-language-model-training-3b19c1f025e4"
- title: "The AI Brick Wall – A Practical Limit For Scaling Dense Transformer Models, and How GPT 4 Will Break Past It"
author: "Dylan Patel"
url: "https://www.semianalysis.com/p/the-ai-brick-wall-a-practical-limit"
- title: Constituency Parsing, Ambiguity and Context-Free Grammars
tag: cfg
include: true
Expand Down Expand Up @@ -600,7 +795,7 @@
optional: false
- title: "Semantic Parsing"
tag: semparse
include: true
include: false
current: false
notes:
- title: "Semantic Parsing"
Expand Down
Binary file added assets/lecture-slides/L15-pretraining.pdf
Binary file not shown.

0 comments on commit 1f52e54

Please sign in to comment.