Skip to content

Commit

Permalink
Merge branch 'Lightning-AI:main' into smollm
Browse files Browse the repository at this point in the history
  • Loading branch information
ysjprojects authored Dec 16, 2024
2 parents 0de9447 + 972dee4 commit 613eb77
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 2 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ Every model is written from scratch to maximize performance and remove layers of
| Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
| Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
| Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui, Binyuan et al. 2024](https://arxiv.org/abs/2409.12186) |
| Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [Yang, An et al. 2024](https://arxiv.org/abs/2409.12122) |
| QwQ | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
| SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) |
| Salamandra | 2B, 7B | Barcelona Supercomputing Centre | [BSC-LTC 2024](https://github.com/BSC-LTC/salamandra) |
Expand Down
68 changes: 68 additions & 0 deletions litgpt/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2033,6 +2033,74 @@ def norm_class(self) -> Type:

qwen_2_5.extend(qwen_2_5_coder)

# Qwen2.5-Math configs. The three checkpoints share every hyperparameter except
# the size-dependent ones, so each entry is built from a common template plus a
# per-size tuple taken from the corresponding HF config.json.
qwen_2_5_math = []
for size, padded_vocab_size, n_layer, n_head, n_embd, n_query_groups, intermediate_size, norm_eps in (
    # https://huggingface.co/Qwen/Qwen2.5-Math-1.5B/blob/main/config.json
    ("1.5B", 151936, 28, 12, 1536, 2, 8960, 1e-6),
    # https://huggingface.co/Qwen/Qwen2.5-Math-7B/blob/main/config.json
    ("7B", 152064, 28, 28, 3584, 4, 18944, 1e-6),
    # https://huggingface.co/Qwen/Qwen2.5-Math-72B/blob/main/config.json
    ("72B", 152064, 80, 64, 8192, 8, 29568, 1e-5),
):
    qwen_2_5_math.append(
        dict(
            # "{}" is filled in later with "" or "-Instruct" by the variant loop below.
            name=f"Qwen2.5-Math-{size}{{}}",
            hf_config=dict(org="Qwen", name=f"Qwen2.5-Math-{size}{{}}"),
            block_size=4096,
            vocab_size=151643,
            padded_vocab_size=padded_vocab_size,
            n_layer=n_layer,
            n_head=n_head,
            n_embd=n_embd,
            n_query_groups=n_query_groups,
            rotary_percentage=1.0,
            parallel_residual=False,
            bias=False,
            # Qwen2 attention uses biased QKV projections but bias-free MLP/output layers.
            attn_bias=True,
            norm_class_name="RMSNorm",
            mlp_class_name="LLaMAMLP",
            intermediate_size=intermediate_size,
            norm_eps=norm_eps,
            rope_base=10000,
        )
    )

# Register the Math variants with the main Qwen2.5 list so they participate in
# the ""/"-Instruct" variant expansion applied to all Qwen2.5 configs below.
qwen_2_5.extend(qwen_2_5_math)

for c in qwen_2_5:
for kind in ("", "-Instruct"):
copy = deepcopy(c)
Expand Down
7 changes: 7 additions & 0 deletions litgpt/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,10 @@ def apply(self, prompt: str, **kwargs: str) -> str:
system_message = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
return f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

class Qwen2_5_Math(PromptStyle):
    """ChatML prompt style for Qwen2.5-Math checkpoints.

    Identical to the base Qwen2.5 template except for the system message, which
    asks for step-by-step reasoning with the final answer in ``\\boxed{}``
    (the prompt recommended by the Qwen2.5-Math model cards).
    """

    def apply(self, prompt: str, **kwargs: str) -> str:
        system_message = "Please reason step by step, and put your final answer within \\boxed{}."
        return (
            f"<|im_start|>system\n{system_message}<|im_end|>\n"
            f"<|im_start|>user\n{prompt}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )

class QwQ(PromptStyle):
def apply(self, prompt: str, **kwargs: str) -> str:
Expand Down Expand Up @@ -326,6 +330,7 @@ def apply(self, prompt: str, **kwargs: str) -> str:
"llama3": Llama3,
"olmo": OLMo,
"qwen2.5": Qwen2_5,
"qwen2.5-math": Qwen2_5_Math,
"qwq": QwQ,
"smollm2": SmolLM2, # SmolLM uses a different template
"salamandra": Salamandra,
Expand Down Expand Up @@ -367,6 +372,8 @@ def model_name_to_prompt_style(model_name: str) -> PromptStyle:
return Gemma()
if re.search(r"OLMo.*-hf", model_name):
return OLMo()
if re.search(r"Qwen2\.5-Math-.*", model_name):
return Qwen2_5_Math()
if re.search(r"Qwen2\.5-.*", model_name):
return Qwen2_5()
if re.search(r"QwQ-.*", model_name):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_convert_lit_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,7 +524,7 @@ def test_check_conversion_supported_lora():
check_conversion_supported(lit_weights=lit_weights)

@torch.inference_mode()
@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "QwQ-32B-Preview"))
@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "Qwen2.5-Math-1.5B", "QwQ-32B-Preview"))
@pytest.mark.parametrize(
("device", "dtype"),
[
Expand Down
2 changes: 1 addition & 1 deletion tests/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,7 +792,7 @@ def test_against_original_gemma_2(model_name, device, dtype):


@torch.inference_mode()
@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "QwQ-32B-Preview"))
@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "Qwen2.5-Math-1.5B", "QwQ-32B-Preview"))
@pytest.mark.parametrize(
("device", "dtype"),
[
Expand Down
7 changes: 7 additions & 0 deletions tutorials/download_model_weights.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
| Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
| Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
| Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui, Binyuan et al. 2024](https://arxiv.org/abs/2409.12186) |
| Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [Yang, An et al. 2024](https://arxiv.org/abs/2409.12122) |
| QwQ | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
| RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) |
| SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) |
Expand Down Expand Up @@ -207,6 +208,12 @@ Qwen/Qwen2.5-Coder-14B
Qwen/Qwen2.5-Coder-14B-Instruct
Qwen/Qwen2.5-Coder-32B
Qwen/Qwen2.5-Coder-32B-Instruct
Qwen/Qwen2.5-Math-1.5B
Qwen/Qwen2.5-Math-1.5B-Instruct
Qwen/Qwen2.5-Math-7B
Qwen/Qwen2.5-Math-7B-Instruct
Qwen/Qwen2.5-Math-72B
Qwen/Qwen2.5-Math-72B-Instruct
Qwen/QwQ-32B-Preview
stabilityai/FreeWilly2
stabilityai/stable-code-3b
Expand Down

0 comments on commit 613eb77

Please sign in to comment.