
Model quantization error #11

Open
QichangZheng opened this issue Sep 6, 2023 · 0 comments
```python
from transformers import LlamaTokenizer, LlamaForCausalLM

tokenizer = LlamaTokenizer.from_pretrained("yulan-team/YuLan-Chat-2-13b-fp16")
model = LlamaForCausalLM.from_pretrained("yulan-team/YuLan-Chat-2-13b-fp16", load_in_8bit=True).cuda()
model = model.eval()
input_text = "hello"
prompt = "The following is a conversation between a human and an AI assistant namely YuLan, developed by GSAI, Renmin University of China. The AI assistant gives helpful, detailed, and polite answers to the user's questions.\n[|Human|]:{}\n[|AI|]:".format(input_text)
inputs = tokenizer(prompt, return_tensors='pt', padding="longest", max_length=8192, truncation=True, return_attention_mask=True, add_special_tokens=True)
kwargs = {'temperature': 0.8, 'top_p': 0.95, "top_k": 50, "repetition_penalty": 1.1, "no_repeat_ngram_size": 64, "max_length": 8192, "pad_token_id": tokenizer.bos_token_id, "eos_token_id": tokenizer.eos_token_id}
outputs = model.generate(inputs['input_ids'].to(model.device), attention_mask=inputs['attention_mask'].to(model.device), do_sample=True, **kwargs)
```

```
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
in <cell line: 9>()
      7 inputs = tokenizer(prompt, return_tensors='pt', padding="longest", max_length=8192, truncation=True, return_attention_mask=True, add_special_tokens=True)
      8 kwargs = {'temperature': 0.8, 'top_p': 0.95, "top_k": 50, "repetition_penalty": 1.1, "no_repeat_ngram_size": 64, "max_length": 8192, "pad_token_id": tokenizer.bos_token_id, "eos_token_id": tokenizer.eos_token_id}
----> 9 outputs = model.generate(inputs['input_ids'].to(model.device), attention_mask=inputs['attention_mask'].to(model.device), do_sample=True, **kwargs)
     10 print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[len(prompt):])

2 frames
/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
    113     def decorate_context(*args, **kwargs):
    114         with ctx_factory():
--> 115             return func(*args, **kwargs)
    116
    117     return decorate_context

/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py in generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, **kwargs)
   1586
   1587         # 13. run sample
-> 1588         return self.sample(
   1589             input_ids,
   1590             logits_processor=logits_processor,

/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py in sample(self, input_ids, logits_processor, stopping_criteria, logits_warper, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
   2676             # sample
   2677             probs = nn.functional.softmax(next_token_scores, dim=-1)
-> 2678             next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
   2679
   2680             # finished sentences should have their next token be a padding token

RuntimeError: probability tensor contains either inf, nan or element < 0
```
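A likely cause, not confirmed in this thread: the `.cuda()` call on the quantized model. With `load_in_8bit=True`, accelerate and bitsandbytes already place the model on the GPU at load time, and moving an 8-bit model afterwards with `.cuda()`/`.to()` is unsupported and can leave weights in a state that produces `inf`/`nan` logits, which is exactly what `torch.multinomial` rejects here. A minimal sketch of the usual loading pattern, assuming a recent transformers with accelerate and bitsandbytes installed (`device_map="auto"` is a standard `from_pretrained` argument; the model ID is taken from the report):

```python
# Sketch only: assumes transformers with accelerate + bitsandbytes
# installed; untested against this specific checkpoint.
from transformers import LlamaTokenizer, LlamaForCausalLM

tokenizer = LlamaTokenizer.from_pretrained("yulan-team/YuLan-Chat-2-13b-fp16")

# load_in_8bit delegates device placement to accelerate, so pass
# device_map instead of calling .cuda() on the quantized model.
model = LlamaForCausalLM.from_pretrained(
    "yulan-team/YuLan-Chat-2-13b-fp16",
    load_in_8bit=True,
    device_map="auto",
)
model.eval()
```

If the error persists with this loading path, checking the raw logits for `nan` before sampling can help separate a broken forward pass (a quantization problem) from a sampling-parameter problem.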
