Skip to content

Commit

Permalink
Update pretraining slides
Browse files Browse the repository at this point in the history
  • Loading branch information
angelxuanchang committed Feb 26, 2024
1 parent a7eb251 commit 1f52e54
Show file tree
Hide file tree
Showing 3 changed files with 207 additions and 9 deletions.
19 changes: 11 additions & 8 deletions _data/schedule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -136,18 +136,18 @@
dates:
- date: "2/26"
title: "Pretraining and fine-tuning"
lecture: "pretraining"
lecture: "peft"
leclinks:
- name: "pretraining slides"
url: "assets/lecture-slides/L20-pretraining.pdf"
url: "assets/lecture-slides/L15-pretraining.pdf"
- name: "fine-tuning slides"
url: "assets/slides/peft.pdf"
- date: "2/28"
title: "Final project tips, model debugging and analysis"
lecture: "analysis"
lecture: "fewshot"
title: "Few-shot and in-context learning"
leclinks:
- name: "slides"
url: "assets/lecture-slides/L17-project-tips-analysis.pdf"
url: "assets/slides/fewshot.pdf"
hwdue: "3"
hwout: "4"
- date: "2/29"
Expand Down Expand Up @@ -177,11 +177,11 @@
- name: "slides"
url: "assets/lecture-slides/L18-question-answering.pdf"
- date: "3/13"
title: "Text Generation"
lecture: "textgen"
title: "Final project tips, model debugging and analysis"
lecture: "analysis"
leclinks:
- name: "slides"
url: "assets/lecture-slides/L21-text-generation.pdf"
url: "assets/lecture-slides/L17-project-tips-analysis.pdf"
hwdue: "4"
- week:
num: 11
Expand All @@ -195,6 +195,9 @@
- date: "3/20"
title: "Scaling laws for LLMs"
lecture: "scaling-laws"
leclinks:
- name: "slides"
url: "assets/slides/scaling.pdf"
- date: "3/21"
project: "Project milestone"
- week:
Expand Down
197 changes: 196 additions & 1 deletion _data/syllabus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,201 @@
author: "Rogers, Kovaleva, and Rumshisky"
url: "https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00349/96482/A-Primer-in-BERTology-What-We-Know-About-How-BERT"
optional: true
- title: "Tokenization"
tag: bpe
include: true
current: false
notes:
- title: "Lecture notes"
url: "assets/slides/tokenization.pdf"
- title: "Tokenization into Sub-words with Byte-Pair Encoding"
url: "http://nbviewer.jupyter.org/github/anoopsarkar/nlp-class/blob/gh-pages/assets/notebooks/bpe.ipynb"
download: "http://github.com/anoopsarkar/nlp-class/raw/gh-pages/assets/notebooks/bpe.ipynb"
links:
- title: "BPE tutorial at Huggingface"
url: "https://huggingface.co/learn/nlp-course/chapter6/5?fw=pt"
author: "Huggingface"
optional: true
- title: "Pre-training Language Models"
tag: "pretraining"
include: true
current: false
notes:
- title: Lecture notes
url: "assets/slides/pre_training.pdf"
- title: "Improving language understanding with unsupervised learning"
author: "Alec Radford, Karthik Narasimhan, Tim Salimans, Ilya Sutskever"
url: "https://openai.com/research/language-unsupervised"
links:
- title: "Semi-supervised Sequence Learning"
author: "Andrew M. Dai, Quoc V. Le"
url: "https://arxiv.org/abs/1511.01432"
- title: "Deep contextualized word representations"
author: "Matthew E. Peters, Mark Neumann, Mohit Iyyer, Matt Gardner, Christopher Clark, Kenton Lee, Luke Zettlemoyer"
url: "https://arxiv.org/abs/1802.05365"
- title: "RoBERTa: A Robustly Optimized BERT Pretraining Approach"
author: "Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov"
url: "https://arxiv.org/abs/1907.11692"
- title: "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
author: "Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov"
url: "https://arxiv.org/abs/1901.02860"
- title: "ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators"
author: "Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning"
url: "https://arxiv.org/abs/2003.10555"
- title: "Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer"
author: "Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu"
url: "https://arxiv.org/abs/1910.10683"
- title: "ALBERT: A Lite BERT for Self-supervised Learning of Language Representations"
author: "Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut"
url: "https://arxiv.org/abs/1909.11942"
- title: "Decoding"
tag: "decoding"
include: true
current: false
notes:
- title: "Lecture notes"
url: "assets/slides/decoding.pdf"
- title: "Intro to Generation with LLMs"
author: "Huggingface"
url: "https://huggingface.co/docs/transformers/llm_tutorial"
- title: "Generation Strategies"
author: "Huggingface"
url: "https://huggingface.co/docs/transformers/generation_strategies"
- title: "Contrastive Search"
author: "Tian Lan"
url: "https://huggingface.co/blog/introducing-csearch#62-example-two---opt"
- title: "Categorical Reparameterization with Gumbel-Softmax"
author: "Eric Jang, Shixiang Gu, Ben Poole"
url: "https://arxiv.org/abs/1611.01144v5"
links:
- title: "A Contrastive Framework for Neural Text Generation"
author: "Yixuan Su, Tian Lan, Yan Wang, Dani Yogatama, Lingpeng Kong, Nigel Collier"
url: "https://arxiv.org/abs/2202.06417"
- title: "Parameter-efficient Fine Tuning"
tag: "peft"
include: true
current: false
notes:
- title: Lecture notes
url: "assets/slides/peft.pdf"
readings:
- title: "HuggingFace PEFT: Parameter-Efficient Fine-Tuning of Billion-Scale Models on Low-Resource Hardware"
author: "Sourab Mangrulkar, Sayak Paul"
url: "https://huggingface.co/blog/peft"
- title: "Prefix-Tuning: Optimizing Continuous Prompts for Generation"
author: "Xiang Lisa Li, Percy Liang"
url: "https://arxiv.org/abs/2101.00190"
optional: true
- title: "AdaMix: Mixture-of-Adaptations for Parameter-efficient Model Tuning"
author: "Yaqing Wang, Sahaj Agarwal, Subhabrata Mukherjee, Xiaodong Liu, Jing Gao, Ahmed Hassan Awadallah, Jianfeng Gao"
url: "https://arxiv.org/abs/2205.12410"
optional: true
- title: "LoRA: Low-Rank Adaptation of Large Language Models"
author: "Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen"
url: "https://arxiv.org/abs/2106.09685"
optional: true
- title: "Adapter methods"
author: "docs.adapterhub.ml"
url: "https://docs.adapterhub.ml/overview.html"
optional: true
links:
- title: "AdapterHub: A Framework for Adapting Transformers"
author: "Jonas Pfeiffer, Andreas Rücklé, Clifton Poth, Aishwarya Kamath, Ivan Vulić, Sebastian Ruder, Kyunghyun Cho, Iryna Gurevych"
url: "https://aclanthology.org/2020.emnlp-demos.7/"
- title: "Parameter-Efficient Transfer Learning for NLP"
author: "Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin de Laroussilhe, Andrea Gesmundo, Mona Attariyan, Sylvain Gelly"
url: "https://arxiv.org/abs/1902.00751"
- title: "Simple, Scalable Adaptation for Neural Machine Translation"
author: "Ankur Bapna, Naveen Arivazhagan, Orhan Firat"
url: "https://arxiv.org/abs/1909.08478"
- title: "AdapterFusion: Non-Destructive Task Composition for Transfer Learning"
author: "Jonas Pfeiffer, Aishwarya Kamath, Andreas Rücklé, Kyunghyun Cho, Iryna Gurevych"
url: "https://aclanthology.org/2021.eacl-main.39/"
- title: "Parameter-Efficient Tuning with Special Token Adaptation"
author: "Xiaocong Yang, James Y. Huang, Wenxuan Zhou, Muhao Chen"
url: "https://aclanthology.org/2023.eacl-main.60/"
- title: "Few-shot and in-context learning"
tag: "fewshot"
include: true
current: false
notes:
- title: Lecture notes
url: "assets/slides/fewshot.pdf"
readings:
- title: "Language Models are Unsupervised Multitask Learners"
author: "OpenAI"
url: "https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf"
- title: "Language Models are Few-Shot Learners"
author: "OpenAI"
url: "https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf"
- title: "GPT-4 Technical Report"
author: "OpenAI"
url: "https://arxiv.org/abs/2303.08774"
- title: "Rethinking the Role of Demonstrations: What Makes In-Context Learning Work?"
author: "Sewon Min, Xinxi Lyu, Ari Holtzman, Mikel Artetxe, Mike Lewis, Hannaneh Hajishirzi, Luke Zettlemoyer"
url: "https://arxiv.org/abs/2202.12837"
- title: "In-Context Learning Learns Label Relationships but Is Not Conventional Learning"
author: "Jannik Kossen, Yarin Gal, Tom Rainforth"
url: "https://arxiv.org/abs/2307.12375"
links:
- title: "In-context Examples Selection for Machine Translation"
author: "Sweta Agrawal, Chunting Zhou, Mike Lewis, Luke Zettlemoyer, Marjan Ghazvininejad"
url: "https://aclanthology.org/2023.findings-acl.564/"
- title: "How Good Are GPT Models at Machine Translation? A Comprehensive Evaluation"
author: "Amr Hendy, Mohamed Abdelrehim, Amr Sharaf, Vikas Raunak, Mohamed Gabr, Hitokazu Matsushita, Young Jin Kim, Mohamed Afify, Hany Hassan Awadalla"
url: "https://arxiv.org/abs/2302.09210"
- title: "Instruction tuning and Preference Optimization"
tag: "instruct"
include: true
current: false
notes:
- title: Lecture notes
url: "assets/slides/instruct_tuning.pdf"
- title: "Direct Preference Optimization: Your Language Model is Secretly a Reward Model"
author: "Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, Chelsea Finn"
url: "https://arxiv.org/abs/2305.18290"
links:
- title: "Scaling Instruction-Finetuned Language Models"
author: "Google"
url: "https://arxiv.org/abs/2210.11416"
- title: "Proximal Policy Optimization Algorithms"
author: "John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov"
url: "https://arxiv.org/abs/1707.06347"
- title: "LIMA: Less Is More for Alignment"
author: "Chunting Zhou, Pengfei Liu, Puxin Xu, Srini Iyer, Jiao Sun, Yuning Mao, Xuezhe Ma, Avia Efrat, Ping Yu, Lili Yu, Susan Zhang, Gargi Ghosh, Mike Lewis, Luke Zettlemoyer, Omer Levy"
url: "https://arxiv.org/abs/2305.11206"
- title: "Scaling Laws for LLMs"
tag: "scaling"
include: true
current: false
notes:
- title: Lecture notes
url: "assets/slides/scaling.pdf"
- title: "Scaling Laws for Neural Language Models"
author: "OpenAI"
url: "https://arxiv.org/abs/2001.08361"
- title: "Training Compute-Optimal Large Language Models"
author: "Google Deepmind"
url: "https://arxiv.org/abs/2203.15556"
- title: "GLaM: Efficient Scaling of Language Models with Mixture-of-Experts"
author: "Google"
url: "https://arxiv.org/abs/2112.06905"
- title: "PaLM: Scaling Language Modeling with Pathways"
author: "Google"
url: "https://arxiv.org/abs/2204.02311"
links:
- title: "AI and Compute"
author: "OpenAI"
url: "https://openai.com/research/ai-and-compute"
- title: "Scaling laws notebook"
author: "Andrej Karpathy"
url: "https://github.com/karpathy/nanoGPT/blob/master/scaling_laws.ipynb"
- title: "The FLOPs Calculus of Language Model Training"
author: "Dzmitry Bahdanau"
url: "https://medium.com/@dzmitrybahdanau/the-flops-calculus-of-language-model-training-3b19c1f025e4"
- title: "The AI Brick Wall – A Practical Limit For Scaling Dense Transformer Models, and How GPT 4 Will Break Past It"
author: "Dylan Patel"
url: "https://www.semianalysis.com/p/the-ai-brick-wall-a-practical-limit"
- title: Constituency Parsing, Ambiguity and Context-Free Grammars
tag: cfg
include: true
Expand Down Expand Up @@ -600,7 +795,7 @@
optional: false
- title: "Semantic Parsing"
tag: semparse
include: true
include: false
current: false
notes:
- title: "Semantic Parsing"
Expand Down
Binary file added assets/lecture-slides/L15-pretraining.pdf
Binary file not shown.

0 comments on commit 1f52e54

Please sign in to comment.