diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml new file mode 100644 index 0000000000000..56d17b66cecf1 --- /dev/null +++ b/.github/workflows/python-lint.yml @@ -0,0 +1,20 @@ +name: flake8 Lint + +on: [push, pull_request] + +jobs: + flake8-lint: + runs-on: ubuntu-latest + name: Lint + steps: + - name: Check out source repository + uses: actions/checkout@v3 + - name: Set up Python environment + uses: actions/setup-python@v4 + with: + python-version: "3.11" + - name: flake8 Lint + uses: py-actions/flake8@v2 + with: + ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704" + exclude: "examples/*,examples/*/**,*/**/__init__.py" diff --git a/.gitignore b/.gitignore index 2c86617e7c3e9..c79db24a93dcf 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,7 @@ models-mnt /speculative /parallel /train-text-from-scratch +/tokenize /vdot /common/build-info.cpp arm_neon.h diff --git a/common/common.cpp b/common/common.cpp index c25bac0171b8b..5ab9f2fbfb33f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -492,6 +492,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.interactive_first = true; } else if (arg == "-ins" || arg == "--instruct") { params.instruct = true; + } else if (arg == "-cml" || arg == "--chatml") { + params.chatml = true; } else if (arg == "--infill") { params.infill = true; } else if (arg == "--multiline-input") { @@ -731,6 +733,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -i, --interactive run in interactive mode\n"); printf(" --interactive-first run in interactive mode and wait for input right away\n"); printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); + printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n"); printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); printf(" -r PROMPT, --reverse-prompt PROMPT\n"); printf(" halt generation at PROMPT, return control in interactive mode\n"); @@ -932,7 +935,7 @@ void llama_batch_add( const std::vector & seq_ids, bool logits) { batch.token [batch.n_tokens] = id; - batch.pos [batch.n_tokens] = pos, + batch.pos [batch.n_tokens] = pos; batch.n_seq_id[batch.n_tokens] = seq_ids.size(); for (size_t i = 0; i < seq_ids.size(); ++i) { batch.seq_id[batch.n_tokens][i] = seq_ids[i]; diff --git a/common/common.h b/common/common.h index 156d4b4cc63f7..9ed986ca43a00 100644 --- a/common/common.h +++ b/common/common.h @@ -110,6 +110,7 @@ struct gpt_params { bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode + bool chatml = false; // chatml mode (used for models trained on chatml syntax) bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py deleted file mode 100755 index 3785a7d265524..0000000000000 --- a/convert-baichuan-hf-to-gguf.py +++ /dev/null @@ -1,315 +0,0 @@ -#!/usr/bin/env python3 -# HF baichuan --> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import sys -from pathlib import Path -from typing import TYPE_CHECKING, Any -import numpy as np -import torch -from sentencepiece import SentencePieceProcessor # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) -import gguf - - -if TYPE_CHECKING: - from typing import TypeAlias - -NDArray: TypeAlias = 'np.ndarray[Any, Any]' - -# reverse HF permute back to original pth layout - - -def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - -def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray: - r = weights.shape[0] // 3 - return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv)) - -def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray: - r = weights.shape[0] // 3 - return weights[r * n_part : r * n_part + r, ...] - -def count_model_parts(dir_model: str) -> int: - num_parts = 0 - - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - - return num_parts - - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", type=int, choices=[0, 1], default=1, nargs='?', - help="output format - use 0 for float32, 1 for float16", - ) - parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -endianess = gguf.GGUFEndian.LITTLE -if args.bigendian: - endianess = gguf.GGUFEndian.BIG -endianess_str = "Big Endian" if args.bigendian else "Little Endian" -print(f"gguf: Conversion Endianess {endianess}") -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) -print("hello print: ",hparams["architectures"][0]) -if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit() - -# get number of model parts -num_parts = count_model_parts(dir_model) -print(f"num_parts:{num_parts}\n") -ARCH=gguf.MODEL_ARCH.BAICHUAN -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) - -print("gguf: get model metadata") - -block_count = hparams["num_hidden_layers"] -head_count = hparams["num_attention_heads"] - -if "num_key_value_heads" in hparams: - head_count_kv = hparams["num_key_value_heads"] -else: - head_count_kv = head_count - -if "_name_or_path" in hparams: - hf_repo = hparams["_name_or_path"] -else: - hf_repo = "" - -if "max_sequence_length" in hparams: - ctx_length = hparams["max_sequence_length"] -elif "max_position_embeddings" in hparams: - ctx_length = hparams["max_position_embeddings"] -elif "model_max_length" in hparams: - ctx_length = hparams["model_max_length"] -else: - print("gguf: can not find ctx length parameter.") - - sys.exit() - - -gguf_writer.add_name(dir_model.name) -gguf_writer.add_source_hf_repo(hf_repo) -gguf_writer.add_tensor_data_layout("Meta AI original pth") -gguf_writer.add_context_length(ctx_length) -gguf_writer.add_embedding_length(hparams["hidden_size"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) -gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) -gguf_writer.add_head_count(head_count) -gguf_writer.add_head_count_kv(head_count_kv) -gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - -if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: - if "type" in hparams["rope_scaling"]: - if hparams["rope_scaling"]["type"] == "linear": - gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) - - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytes] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -tokenizer_model_file = dir_model / 'tokenizer.model' -if not tokenizer_model_file.is_file(): - print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr) - sys.exit(1) - -# vocab type sentencepiece -print("gguf: get sentencepiece tokenizer vocab, scores and token types") - -tokenizer = SentencePieceProcessor(str(tokenizer_model_file)) -vocab_size = hparams.get('vocab_size') -if vocab_size is None: - vocab_size = tokenizer.vocab_size() - -for i in range(vocab_size): - text: bytes - score: float - - piece = tokenizer.id_to_piece(i) - text = piece.encode("utf-8") - score = tokenizer.get_score(i) - - toktype = 1 # defualt to normal token type - if tokenizer.is_unknown(i): - toktype = 2 - if tokenizer.is_control(i): - toktype = 3 - - # toktype = 4 is user-defined = tokens from added_tokens.json - - if tokenizer.is_unused(i): - toktype = 5 - if tokenizer.is_byte(i): - toktype = 6 - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - -added_tokens_file = dir_model / 'added_tokens.json' -if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - addtokens_json = json.load(f) - - print("gguf: get added tokens") - - for key in addtokens_json: - tokens.append( key.encode("utf-8") ) - scores.append(-1000.0) - toktypes.append(4) # user-defined token type - - -gguf_writer.add_tokenizer_model("llama") -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens)) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - tmp=model_part - for i in range(block_count): - if f"model.layers.{i}.self_attn.W_pack.weight" in model_part: - print(f"Unpacking and permuting layer {i}") - tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count) - tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv) - tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2) - del tmp[f"model.layers.{i}.self_attn.W_pack.weight"] - - for name in model_part.keys(): - data = model_part[name] - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 3a618fd4dc0f6..1105670c1ba5e 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -827,13 +827,14 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"]*(hparams["hidden_size"] // hparams["num_attention_heads"]))) + self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"]))) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) self.gguf_writer.add_layer_norm_eps(1e-5) ###### CONVERSION LOGIC ###### + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file") parser.add_argument( diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py index 0c12356707341..e359330afc51f 100755 --- a/convert-llama-ggml-to-gguf.py +++ b/convert-llama-ggml-to-gguf.py @@ -14,11 +14,13 @@ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf + class GGMLFormat(IntEnum): GGML = 0 GGMF = 1 GGJT = 2 + class GGMLFType(IntEnum): ALL_F32 = 0 MOSTLY_F16 = 1 @@ -38,6 +40,7 @@ class GGMLFType(IntEnum): MOSTLY_Q5_K_M = 17 MOSTLY_Q6_K = 18 + class Hyperparameters: def __init__(self): self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0 @@ -69,6 +72,7 @@ def load(self, data, offset): def __str__(self): return f'' + class Vocab: def __init__(self, load_scores = True): self.items = [] @@ -90,6 +94,7 @@ def load(self, data, offset, n_vocab): self.items.append((item_text, item_score)) return offset - orig_offset + class Tensor: def __init__(self, use_padding = True): self.name = None @@ -123,6 +128,7 @@ def load(self, data, offset): # print(n_dims, name_len, dtype, self.dims, self.name, pad) return offset - orig_offset + class GGMLModel: def __init__(self): self.hyperparameters = None @@ -159,8 +165,8 @@ def validate_conversion(self, ftype): if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16): err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.' elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2): - if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1, - GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0): + if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1, + GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0): err = 'Q4 and Q8 quantizations changed in GGJTv3.' if len(err) > 0: raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.') @@ -187,6 +193,7 @@ def load(self, data, offset): hp.set_n_ff(self) return offset + class GGMLToGGUF: def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None): hp = ggml_model.hyperparameters @@ -217,7 +224,7 @@ def save(self): gguf_writer = gguf.GGUFWriter( self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], - use_temp_file = False ) + use_temp_file = False) self.add_params(gguf_writer) self.add_vocab(gguf_writer) if self.special_vocab is not None: @@ -341,7 +348,8 @@ def add_tensors(self, gguf_writer): mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, - raw_dtype = tensor.dtype ) + raw_dtype = tensor.dtype) + def handle_metadata(cfg, hp): import convert @@ -365,38 +373,40 @@ def handle_metadata(cfg, hp): raise ValueError('Unable to load metadata') vocab = convert.load_vocab( cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, - cfg.vocabtype ) + cfg.vocabtype) # FIXME: Respect cfg.vocab_dir? svocab = gguf.SpecialVocab(cfg.model_metadata_dir, - load_merges = cfg.vocabtype == 'bpe', - n_vocab = vocab.vocab_size) + load_merges = cfg.vocabtype == 'bpe', + n_vocab = vocab.vocab_size) convert.check_vocab_size(params, vocab) return (params, vocab, svocab) + def handle_args(): parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF') parser.add_argument('--input', '-i', type = Path, required = True, - help = 'Input GGMLv3 filename') + help = 'Input GGMLv3 filename') parser.add_argument('--output', '-o', type = Path, required = True, - help ='Output GGUF filename') + help ='Output GGUF filename') parser.add_argument('--name', - help = 'Set model name') + help = 'Set model name') parser.add_argument('--desc', - help = 'Set model description') + help = 'Set model description') parser.add_argument('--gqa', type = int, default = 1, - help = 'grouped-query attention factor (use 8 for LLaMA2 70B)') + help = 'grouped-query attention factor (use 8 for LLaMA2 70B)') parser.add_argument('--eps', default = '5.0e-06', - help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2') + help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2') parser.add_argument('--context-length', '-c', type=int, default = 2048, - help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096') + help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096') parser.add_argument('--model-metadata-dir', '-m', type = Path, - help ='Load HuggingFace/.pth vocab and metadata from the specified directory') + help ='Load HuggingFace/.pth vocab and metadata from the specified directory') parser.add_argument("--vocab-dir", type=Path, - help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") + help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm", - help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)") + help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)") return parser.parse_args() + def main(): cfg = handle_args() print(f'* Using config: {cfg}') @@ -406,7 +416,7 @@ def main(): data = np.memmap(cfg.input, mode = 'r') model = GGMLModel() print('* Scanning GGML input file') - offset = model.load(data, 0) + offset = model.load(data, 0) # noqa print(f'* GGML model hyperparameters: {model.hyperparameters}') vocab_override = None params_override = None @@ -421,12 +431,15 @@ def main(): print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n') if model.file_format == GGMLFormat.GGML: print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!') - converter = GGMLToGGUF(model, data, cfg, + converter = GGMLToGGUF( + model, data, cfg, params_override = params_override, vocab_override = vocab_override, - special_vocab = special_vocab ) + special_vocab = special_vocab + ) converter.save() print(f'* Successful completion. Output saved to: {cfg.output}') + if __name__ == '__main__': main() diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py index 240f87306e578..206b7d5ff9e31 100644 --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -9,6 +9,7 @@ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf + def _flatten_dict(dct, tensors, prefix=None): assert isinstance(dct, dict) for key in dct.keys(): @@ -21,6 +22,7 @@ def _flatten_dict(dct, tensors, prefix=None): raise ValueError(type(dct[key])) return None + def _get_sentencepiece_tokenizer_info(dir_model: Path): tokenizer_path = dir_model / 'adept_vocab.model' print('gguf: getting sentencepiece tokenizer from', tokenizer_path) @@ -54,6 +56,7 @@ def _get_sentencepiece_tokenizer_info(dir_model: Path): pass return tokens, scores, toktypes + def main(): parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") @@ -125,6 +128,5 @@ def main(): print("") - if __name__ == '__main__': main() diff --git a/convert.py b/convert.py old mode 100755 new mode 100644 index 5b6344aa8fc85..3ad836ce0ec1d --- a/convert.py +++ b/convert.py @@ -46,6 +46,7 @@ # data types # + @dataclass(frozen=True) class DataType: name: str @@ -55,15 +56,18 @@ class DataType: def elements_to_bytes(self, n_elements: int) -> int: return n_elements * self.dtype.itemsize + @dataclass(frozen=True) class UnquantizedDataType(DataType): pass + DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0']) DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0']) DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = []) DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0']) + @dataclass(frozen=True) class QuantizedDataType(DataType): block_size: int @@ -77,6 +81,7 @@ def elements_to_bytes(self, n_elements: int) -> int: assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}' return self.quantized_dtype.itemsize * (n_elements // self.block_size) + @dataclass(frozen=True) class Q8_0QuantizedDataType(QuantizedDataType): # Mini Q8_0 quantization in Python! @@ -86,6 +91,7 @@ def quantize(self, arr: NDArray) -> NDArray: n_blocks = arr.size // self.block_size blocks = arr.reshape((n_blocks, self.block_size)) # Much faster implementation of block quantization contributed by @Cebtenzzre + def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]: d = abs(blocks).max(axis = 1) / np.float32(127) with np.errstate(divide = 'ignore'): @@ -94,10 +100,11 @@ def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]: yield from zip(d, qs) return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype) + DT_Q8_0 = Q8_0QuantizedDataType('Q8_0', - dtype = np.dtype(np.float32), valid_conversions = [], - ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32, - quantized_dtype = np.dtype([('d', ' Iterable[tuple[Any, Any]]: # TODO: match this with `llama_ftype` # TODO: rename to LLAMAFileType # TODO: move to `gguf.py` + + class GGMLFileType(enum.IntEnum): AllF32 = 0 MostlyF16 = 1 # except 1d tensors @@ -128,6 +137,7 @@ def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType: # 1D tensors are always F32. return dt if len(tensor.shape) > 1 else DT_F32 + GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = { GGMLFileType.AllF32 : DT_F32, GGMLFileType.MostlyF16 : DT_F16, @@ -138,6 +148,7 @@ def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType: # hparams loading # + @dataclass class Params: n_vocab: int @@ -167,11 +178,11 @@ def guessed(model: LazyModel) -> Params: # try transformer naming first if "model.layers.0.self_attn.q_proj.weight" in model: - n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) + n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming - n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) + n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) else: - n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) + n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) if n_layer < 1: raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n" @@ -308,7 +319,7 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No (item['content'], item['id']) for item in tokenizer_json.get('added_tokens', []) # Added tokens here can be duplicates of the main vocabulary. - if item['content'] not in self.bpe_tokenizer ) + if item['content'] not in self.bpe_tokenizer) vocab_size: int = len(self.bpe_tokenizer) expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) @@ -326,7 +337,6 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.bpe_tokenizer - from transformers.models.gpt2 import tokenization_gpt2 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()} for i, _ in enumerate(tokenizer): @@ -406,6 +416,7 @@ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def __repr__(self) -> str: return f"" + Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab' # @@ -413,13 +424,14 @@ def __repr__(self) -> str: # TODO: reuse (probably move to gguf.py?) # + def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: - #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) + # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) if n_head_kv is not None and n_head != n_head_kv: n_head = n_head_kv return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) + .swapaxes(1, 2) + .reshape(weights.shape)) class Tensor(metaclass=ABCMeta): @@ -500,7 +512,7 @@ def load(self) -> Tensor: ret = self._load() # Should be okay if it maps to the same numpy type? assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \ - (self.data_type, ret.data_type, self.description) + (self.data_type, ret.data_type, self.description) return ret def astype(self, data_type: DataType) -> LazyTensor: @@ -588,6 +600,7 @@ def load() -> Tensor: return lazy_tensor.load().permute(n_head, n_head_kv) return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) + def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor: def load() -> Tensor: return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv) @@ -595,6 +608,7 @@ def load() -> Tensor: s[0] = s[0] // 3 return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) + def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: def load() -> Tensor: return lazy_tensor.load().part(n_part) @@ -744,6 +758,7 @@ def lazy_load_file(path: Path) -> ModelPlus: In = TypeVar('In') Out = TypeVar('Out') + def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]: '''Parallel map, but with backpressure. If the caller doesn't call `next` fast enough, this will stop calling `func` at some point rather than @@ -778,6 +793,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc break yield result + def check_vocab_size(params: Params, vocab: Vocab) -> None: if params.n_vocab != vocab.vocab_size: assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab) @@ -796,7 +812,7 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None: class OutputFile: - def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None: + def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None: self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) def add_meta_arch(self, params: Params) -> None: @@ -876,7 +892,7 @@ def close(self) -> None: self.gguf.close() @staticmethod - def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None: + def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None: check_vocab_size(params, vocab) of = OutputFile(fname_out, endianess=endianess) @@ -938,8 +954,9 @@ def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyM of.close() + def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: - wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type + wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32): return GGMLFileType.AllF32 @@ -952,10 +969,12 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT raise Exception(f"Unexpected combination of types: {name_to_type}") + def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: return {name: tensor.astype(output_type.type_for_tensor(name, tensor)) for (name, tensor) in model.items()} + def convert_model_names(model: LazyModel, params: Params) -> LazyModel: tmap = gguf.TensorNameMap(ARCH, params.n_layer) should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) @@ -968,7 +987,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: print(f"Permuting layer {i}") tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head) tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv) - #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] + # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] elif f"model.layers.{i}.self_attn.W_pack.weight" in model: print(f"Unpacking and permuting layer {i}") tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head) @@ -993,6 +1012,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: return out + def nth_multifile_path(path: Path, n: int) -> Path | None: '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return the nth path in the model. @@ -1174,8 +1194,8 @@ def main(args_in: list[str] | None = None) -> None: # FIXME: Try to respect vocab_dir somehow? vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype) special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, - load_merges = args.vocabtype == 'bpe', - n_vocab = vocab.vocab_size) + load_merges = args.vocabtype == 'bpe', + n_vocab = vocab.vocab_size) outfile = args.outfile OutputFile.write_vocab_only(outfile, params, vocab, special_vocab) print(f"Wrote {outfile}") @@ -1188,8 +1208,8 @@ def main(args_in: list[str] | None = None) -> None: vocab = load_vocab(vocab_dir, args.vocabtype) # FIXME: Try to respect vocab_dir somehow? special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, - load_merges = args.vocabtype == 'bpe', - n_vocab = vocab.vocab_size) + load_merges = args.vocabtype == 'bpe', + n_vocab = vocab.vocab_size) model = model_plus.model model = convert_model_names(model, params) diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 093a9356dccef..890b75cdc62bb 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -147,6 +147,13 @@ int main(int argc, char ** argv) { return 0; } + if (params.chatml) { + printf("\n************\n"); + printf("%s: please use the 'main' tool for chatml mode\n", __func__); + printf("************\n\n"); + + return 0; + } if (!params.antiprompt.empty()) { printf("\n************\n"); printf("%s: please use the 'main' tool for antiprompt mode\n", __func__); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 0ae50635beccc..3ae7d43640b40 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -235,8 +235,11 @@ int main(int argc, char ** argv) { std::vector embd_inp; - if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { + if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) { LOG("tokenize the prompt\n"); + if (params.chatml) { + params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>"; + } embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); } else { LOG("use session tokens\n"); @@ -314,7 +317,7 @@ int main(int argc, char ** argv) { } // number of tokens to keep when resetting context - if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) { + if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) { params.n_keep = (int)embd_inp.size(); } @@ -325,11 +328,23 @@ int main(int argc, char ** argv) { LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); + // chatml prefix & suffix + const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true); + const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true); + + LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str()); + LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str()); + // in instruct mode, we inject a prefix and a suffix to each input by the user if (params.instruct) { params.interactive_first = true; params.antiprompt.push_back("### Instruction:\n\n"); } + // similar for chatml mode + else if (params.chatml) { + params.interactive_first = true; + params.antiprompt.push_back("<|im_start|>user\n"); + } // enable interactive mode if interactive start is specified if (params.interactive_first) { @@ -706,7 +721,7 @@ int main(int argc, char ** argv) { is_interacting = true; printf("\n"); - } else if (params.instruct) { + } else if (params.instruct || params.chatml) { is_interacting = true; } } @@ -714,7 +729,7 @@ int main(int argc, char ** argv) { if (n_past > 0 && is_interacting) { LOG("waiting for user input\n"); - if (params.instruct) { + if (params.instruct || params.chatml) { printf("\n> "); } @@ -761,6 +776,12 @@ int main(int argc, char ** argv) { n_consumed = embd_inp.size(); embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); } + // chatml mode: insert user chat prefix + if (params.chatml && !is_antiprompt) { + LOG("inserting chatml prefix\n"); + n_consumed = embd_inp.size(); + embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end()); + } if (params.escape) { process_escapes(buffer); } @@ -779,6 +800,11 @@ int main(int argc, char ** argv) { LOG("inserting instruction suffix\n"); embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); } + // chatml mode: insert assistant chat suffix + if (params.chatml) { + LOG("inserting chatml suffix\n"); + embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end()); + } for (size_t i = original_size; i < embd_inp.size(); ++i) { const llama_token token = embd_inp[i]; @@ -804,7 +830,7 @@ int main(int argc, char ** argv) { } // end of text token - if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) { + if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) { LOG_TEE(" [end of text]\n"); break; } diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js index 0c9bd5f1021db..b9c442509a2fa 100644 --- a/examples/server/public/completion.js +++ b/examples/server/public/completion.js @@ -94,6 +94,10 @@ export async function* llama(prompt, params = {}, config = {}) { break; } } + if (result.error) { + result.error = JSON.parse(result.error); + console.error(`llama.cpp error: ${result.error.content}`); + } } } } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9f8b4cd96d6d0..2de3a5d583792 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2369,6 +2369,17 @@ int main(int argc, char **argv) break; } } else { + const std::string str = + "error: " + + result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", { + { "to_send", str } + }); + if (!sink.write(str.c_str(), str.size())) + { + return false; + } break; } } diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index b95b846330c8c..fb1eb6b85082d 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -96,9 +96,22 @@ int main(int argc, char ** argv) { } } - // tokenize the prompt + + // Tokenize the prompt + const bool add_bos_tgt = llama_should_add_bos_token(model_tgt); + LOG("add_bos tgt: %d\n", add_bos_tgt); + + const bool add_bos_dft = llama_should_add_bos_token(model_dft); + LOG("add_bos dft: %d\n", add_bos_dft); + + if (add_bos_tgt != add_bos_dft) { + fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__); + fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt); + return 1; + } + std::vector inp; - inp = ::llama_tokenize(ctx_tgt, params.prompt, true); + inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true); const int max_context_size = llama_n_ctx(ctx_tgt); const int max_tokens_list_size = max_context_size - 4; diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index 17166836ac127..4ff8e3fa72749 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -26,7 +26,7 @@ int main(int argc, char ** argv) { llama_context_params ctx_params = llama_context_default_params(); llama_context * ctx = llama_new_context_with_model(model, ctx_params); - const bool add_bos = true; + const bool add_bos = llama_should_add_bos_token(model); std::vector tokens; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 7f93c9145e75b..8a2f60b00d9c3 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -235,7 +235,7 @@ typedef float2 dfloat2; #endif //GGML_CUDA_F16 static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) { - const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment + const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment int x32 = 0; x32 |= x16[0] << 0; @@ -245,7 +245,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const } static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) { - const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment + const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment int x32 = 0; x32 |= x16[0] << 0; @@ -255,11 +255,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con } static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) { - return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment + return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment } static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) { - return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment + return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment } template @@ -469,7 +469,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA #define MUL_MAT_SRC1_COL_STRIDE 128 #define MAX_STREAMS 8 -static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr }; +static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } }; struct ggml_tensor_extra_gpu { void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors @@ -2249,6 +2249,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; (void)x_sc; __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0]; @@ -2260,7 +2261,7 @@ template static __device__ __forceinline__ void allocate_tiles_q4_0( template static __device__ __forceinline__ void load_tiles_q4_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - + (void)x_qh; (void)x_sc; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); GGML_CUDA_ASSUME(k >= 0); @@ -2269,7 +2270,7 @@ template static __device__ __forceinlin const int kbx = k / QI4_0; const int kqsx = k % QI4_0; - const block_q4_0 * bx0 = (block_q4_0 *) vx; + const block_q4_0 * bx0 = (const block_q4_0 *) vx; float * x_dmf = (float *) x_dm; @@ -2307,9 +2308,10 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; (void)x_sc; const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const float * x_dmf = (float *) x_dm; + const float * x_dmf = (const float *) x_dm; int u[2*VDR_Q4_0_Q8_1_MMQ]; @@ -2343,6 +2345,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; (void)x_sc; __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1]; @@ -2354,6 +2357,7 @@ template static __device__ __forceinline__ void allocate_tiles_q4_1( template static __device__ __forceinline__ void load_tiles_q4_1( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; (void)x_sc; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -2363,7 +2367,7 @@ template static __device__ __forceinlin const int kbx = k / QI4_1; const int kqsx = k % QI4_1; - const block_q4_1 * bx0 = (block_q4_1 *) vx; + const block_q4_1 * bx0 = (const block_q4_1 *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -2398,6 +2402,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; (void)x_sc; const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); @@ -2435,6 +2440,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; (void)x_sc; __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0]; @@ -2446,6 +2452,7 @@ template static __device__ __forceinline__ void allocate_tiles_q5_0( template static __device__ __forceinline__ void load_tiles_q5_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; (void)x_sc; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -2455,7 +2462,7 @@ template static __device__ __forceinlin const int kbx = k / QI5_0; const int kqsx = k % QI5_0; - const block_q5_0 * bx0 = (block_q5_0 *) vx; + const block_q5_0 * bx0 = (const block_q5_0 *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -2510,6 +2517,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; (void)x_sc; const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; @@ -2549,6 +2557,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; (void)x_sc; __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1]; @@ -2560,6 +2569,7 @@ template static __device__ __forceinline__ void allocate_tiles_q5_1( template static __device__ __forceinline__ void load_tiles_q5_1( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; (void)x_sc; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -2569,7 +2579,7 @@ template static __device__ __forceinlin const int kbx = k / QI5_1; const int kqsx = k % QI5_1; - const block_q5_1 * bx0 = (block_q5_1 *) vx; + const block_q5_1 * bx0 = (const block_q5_1 *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -2621,6 +2631,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; (void)x_sc; const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; @@ -2655,6 +2666,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; (void)x_sc; __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0]; @@ -2666,6 +2678,7 @@ template static __device__ __forceinline__ void allocate_tiles_q8_0( template static __device__ __forceinline__ void load_tiles_q8_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; (void)x_sc; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -2676,7 +2689,7 @@ template static __device__ __forceinlin const int kqsx = k % QI8_0; float * x_dmf = (float *) x_dm; - const block_q8_0 * bx0 = (block_q8_0 *) vx; + const block_q8_0 * bx0 = (const block_q8_0 *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -2711,6 +2724,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; (void)x_sc; const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; @@ -2744,6 +2758,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K]; @@ -2757,6 +2772,7 @@ template static __device__ __forceinline__ void allocate_tiles_q2_K( template static __device__ __forceinline__ void load_tiles_q2_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -2766,7 +2782,7 @@ template static __device__ __forceinlin const int kbx = k / QI2_K; const int kqsx = k % QI2_K; - const block_q2_K * bx0 = (block_q2_K *) vx; + const block_q2_K * bx0 = (const block_q2_K *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -2814,6 +2830,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; const int kbx = k / QI2_K; const int ky = (k % QI2_K) * QR2_K; @@ -2887,7 +2904,7 @@ template static __device__ __forceinlin const int kbx = k / QI3_K; const int kqsx = k % QI3_K; - const block_q3_K * bx0 = (block_q3_K *) vx; + const block_q3_K * bx0 = (const block_q3_K *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -2968,7 +2985,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; - const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; + const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; @@ -3083,6 +3100,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K]; @@ -3096,6 +3114,7 @@ template static __device__ __forceinline__ void allocate_tiles_q4_K( template static __device__ __forceinline__ void load_tiles_q4_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -3105,7 +3124,7 @@ template static __device__ __forceinlin const int kbx = k / QI4_K; // == 0 if QK_K == 256 const int kqsx = k % QI4_K; // == k if QK_K == 256 - const block_q4_K * bx0 = (block_q4_K *) vx; + const block_q4_K * bx0 = (const block_q4_K *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -3150,7 +3169,7 @@ template static __device__ __forceinlin const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8); - const int * scales = (int *) bxi->scales; + const int * scales = (const int *) bxi->scales; const int ksc = k % (WARP_SIZE/8); @@ -3165,6 +3184,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); @@ -3264,6 +3284,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K]; @@ -3277,6 +3298,7 @@ template static __device__ __forceinline__ void allocate_tiles_q5_K( template static __device__ __forceinline__ void load_tiles_q5_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -3286,7 +3308,7 @@ template static __device__ __forceinlin const int kbx = k / QI5_K; // == 0 if QK_K == 256 const int kqsx = k % QI5_K; // == k if QK_K == 256 - const block_q5_K * bx0 = (block_q5_K *) vx; + const block_q5_K * bx0 = (const block_q5_K *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -3342,7 +3364,7 @@ template static __device__ __forceinlin const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8); - const int * scales = (int *) bxi->scales; + const int * scales = (const int *) bxi->scales; const int ksc = k % (WARP_SIZE/8); @@ -3357,6 +3379,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); @@ -3393,6 +3416,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K]; @@ -3406,6 +3430,7 @@ template static __device__ __forceinline__ void allocate_tiles_q6_K( template static __device__ __forceinline__ void load_tiles_q6_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -3415,7 +3440,7 @@ template static __device__ __forceinlin const int kbx = k / QI6_K; // == 0 if QK_K == 256 const int kqsx = k % QI6_K; // == k if QK_K == 256 - const block_q6_K * bx0 = (block_q6_K *) vx; + const block_q6_K * bx0 = (const block_q6_K *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -3477,6 +3502,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; @@ -3519,7 +3545,7 @@ static __device__ __forceinline__ void mul_mat_q( __shared__ int tile_y_qs[mmq_x * WARP_SIZE]; __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1]; - float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f}; + float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}}; for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { @@ -6012,18 +6038,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3; if (nb0 == ts && nb1 == ts*ne0/bs) { return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream); - } else if (nb0 == ts) { + } + if (nb0 == ts) { return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream); - } else { - for (int64_t i1 = 0; i1 < i1_diff; i1++) { - const void * rx = (const void *) ((const char *) x + i1*nb1); - void * rd = (void *) (dst_ptr + i1*ts*ne0/bs); - // pretend the row is a matrix with cols=1 - cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream); - if (r != cudaSuccess) return r; - } - return cudaSuccess; } + for (int64_t i1 = 0; i1 < i1_diff; i1++) { + const void * rx = (const void *) ((const char *) x + i1*nb1); + void * rd = (void *) (dst_ptr + i1*ts*ne0/bs); + // pretend the row is a matrix with cols=1 + cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream); + if (r != cudaSuccess) { return r; } + } + return cudaSuccess; } static void ggml_cuda_op_repeat( @@ -6978,7 +7004,7 @@ static void ggml_cuda_op_mul_mat( const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; - const int64_t nrows0 = ggml_nrows(src0); + // const int64_t nrows0 = ggml_nrows(src0); const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; @@ -7079,7 +7105,7 @@ static void ggml_cuda_op_mul_mat( if (src0_on_device && src0_is_contiguous) { src0_dd[id] = (char *) src0_extra->data_device[id]; } else { - const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0); + // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0); src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]); } @@ -7312,7 +7338,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src } bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (!g_cublas_loaded) return false; + if (!g_cublas_loaded) { return false; } const int64_t ne10 = src1->ne[0]; @@ -7390,7 +7416,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); } -__global__ void k_compute_batched_ptrs( +__global__ static void k_compute_batched_ptrs( const half * src0_as_f16, const half * src1_as_f16, half * dst_f16, const void ** ptrs_src, void ** ptrs_dst, int ne12, int ne13, @@ -8013,7 +8039,7 @@ void ggml_cuda_free_scratch() { } bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { - if (!g_cublas_loaded) return false; + if (!g_cublas_loaded) { return false; } ggml_cuda_func_t func; const bool any_on_device = tensor->backend == GGML_BACKEND_GPU @@ -8312,14 +8338,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen UNUSED(cgraph); } -static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +[[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_ASSERT(!"not implemented"); UNUSED(backend); UNUSED(plan); } -static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +[[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_ASSERT(!"not implemented"); UNUSED(backend); @@ -8335,8 +8361,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) + if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) { continue; + } assert(node->backend == GGML_BACKEND_GPU); for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 7f63361bd32bc..8bd82daca38a7 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -56,20 +56,21 @@ class Rope: SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" class Tokenizer: - MODEL = "tokenizer.ggml.model" - LIST = "tokenizer.ggml.tokens" - TOKEN_TYPE = "tokenizer.ggml.token_type" - SCORES = "tokenizer.ggml.scores" - MERGES = "tokenizer.ggml.merges" - BOS_ID = "tokenizer.ggml.bos_token_id" - EOS_ID = "tokenizer.ggml.eos_token_id" - UNK_ID = "tokenizer.ggml.unknown_token_id" - SEP_ID = "tokenizer.ggml.seperator_token_id" - PAD_ID = "tokenizer.ggml.padding_token_id" - ADD_BOS = "tokenizer.ggml.add_bos_token" - ADD_EOS = "tokenizer.ggml.add_eos_token" - HF_JSON = "tokenizer.huggingface.json" - RWKV = "tokenizer.rwkv.world" + MODEL = "tokenizer.ggml.model" + LIST = "tokenizer.ggml.tokens" + TOKEN_TYPE = "tokenizer.ggml.token_type" + SCORES = "tokenizer.ggml.scores" + MERGES = "tokenizer.ggml.merges" + BOS_ID = "tokenizer.ggml.bos_token_id" + EOS_ID = "tokenizer.ggml.eos_token_id" + UNK_ID = "tokenizer.ggml.unknown_token_id" + SEP_ID = "tokenizer.ggml.seperator_token_id" + PAD_ID = "tokenizer.ggml.padding_token_id" + ADD_BOS = "tokenizer.ggml.add_bos_token" + ADD_EOS = "tokenizer.ggml.add_eos_token" + HF_JSON = "tokenizer.huggingface.json" + RWKV = "tokenizer.rwkv.world" + CHAT_TEMPLATE = "tokenizer.chat_template" # diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index c3b8c588f17cd..b8ec977c8f3fa 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -221,7 +221,7 @@ def add_tensor( if self.endianess == GGUFEndian.BIG: tensor.byteswap(inplace=True) if self.use_temp_file and self.temp_file is None: - fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024) + fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024) fp.seek(0) self.temp_file = fp @@ -399,6 +399,9 @@ def add_add_bos_token(self, value: bool) -> None: def add_add_eos_token(self, value: bool) -> None: self.add_bool(Keys.Tokenizer.ADD_EOS, value) + def add_chat_template(self, value: str) -> None: + self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) + def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: pack_prefix = '' if not skip_pack_prefix: diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index b9f50a0afed7a..de3e5edb557d7 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -13,6 +13,7 @@ class SpecialVocab: merges: list[str] add_special_token: dict[str, bool] special_token_ids: dict[str, int] + chat_template: str | None def __init__( self, path: str | os.PathLike[str], load_merges: bool = False, @@ -24,6 +25,7 @@ def __init__( self.n_vocab = n_vocab self.load_merges = load_merges self.merges = [] + self.chat_template = None if special_token_types is not None: self.special_token_types = special_token_types else: @@ -67,6 +69,10 @@ def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None: if not quiet: print(f'gguf: Setting add_{typ}_token to {value}') add_handler(value) + if self.chat_template is not None: + if not quiet: + print(f'gguf: Setting chat_template to {self.chat_template}') + gw.add_chat_template(self.chat_template) def _load(self, path: Path) -> None: self._try_load_from_tokenizer_json(path) @@ -132,6 +138,14 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: return True with open(tokenizer_config_file, encoding = 'utf-8') as f: tokenizer_config = json.load(f) + chat_template = tokenizer_config.get('chat_template') + if chat_template is None or isinstance(chat_template, str): + self.chat_template = chat_template + else: + print( + f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring', + file = sys.stderr + ) for typ in self.special_token_types: add_entry = tokenizer_config.get(f'add_{typ}_token') if isinstance(add_entry, bool): diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index 6e3f9e85549d0..e6374bfe898a4 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.5.3" +version = "0.6.0" description = "Read and write ML models in GGUF for GGML" authors = ["GGML "] packages = [ diff --git a/llama.cpp b/llama.cpp index 235f33a914030..0310052d9e9ff 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1882,6 +1882,7 @@ struct llama_model_loader { if (value.size() > MAX_VALUE_LEN) { value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); } + replace_all(value, "\n", "\\n"); LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); }