Skip to content

Commit

Permalink
feat(llama.cpp): support lora with scale and yarn (#1277)
Browse files Browse the repository at this point in the history
* feat(llama.cpp): support lora with scale

Signed-off-by: Ettore Di Giacinto <[email protected]>

* feat(llama.cpp): support yarn

Signed-off-by: Ettore Di Giacinto <[email protected]>

---------

Signed-off-by: Ettore Di Giacinto <[email protected]>
  • Loading branch information
mudler authored Nov 11, 2023
1 parent bde87d0 commit 803a0ac
Show file tree
Hide file tree
Showing 13 changed files with 476 additions and 364 deletions.
1 change: 1 addition & 0 deletions api/backend/image.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
PipelineType: c.Diffusers.PipelineType,
CFGScale: c.Diffusers.CFGScale,
LoraAdapter: c.LoraAdapter,
LoraScale: c.LoraScale,
LoraBase: c.LoraBase,
IMG2IMG: c.Diffusers.IMG2IMG,
CLIPModel: c.Diffusers.ClipModel,
Expand Down
53 changes: 29 additions & 24 deletions api/backend/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,30 +38,35 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
}

return &pb.ModelOptions{
ContextSize: int32(c.ContextSize),
Seed: int32(c.Seed),
NBatch: int32(b),
NoMulMatQ: c.NoMulMatQ,
DraftModel: c.DraftModel,
AudioPath: c.VallE.AudioPath,
Quantization: c.Quantization,
MMProj: c.MMProj,
LoraAdapter: c.LoraAdapter,
LoraBase: c.LoraBase,
NGQA: c.NGQA,
RMSNormEps: c.RMSNormEps,
F16Memory: c.F16,
MLock: c.MMlock,
RopeFreqBase: c.RopeFreqBase,
RopeFreqScale: c.RopeFreqScale,
NUMA: c.NUMA,
Embeddings: c.Embeddings,
LowVRAM: c.LowVRAM,
NGPULayers: int32(c.NGPULayers),
MMap: c.MMap,
MainGPU: c.MainGPU,
Threads: int32(c.Threads),
TensorSplit: c.TensorSplit,
ContextSize: int32(c.ContextSize),
Seed: int32(c.Seed),
NBatch: int32(b),
NoMulMatQ: c.NoMulMatQ,
DraftModel: c.DraftModel,
AudioPath: c.VallE.AudioPath,
Quantization: c.Quantization,
MMProj: c.MMProj,
YarnExtFactor: c.YarnExtFactor,
YarnAttnFactor: c.YarnAttnFactor,
YarnBetaFast: c.YarnBetaFast,
YarnBetaSlow: c.YarnBetaSlow,
LoraAdapter: c.LoraAdapter,
LoraBase: c.LoraBase,
LoraScale: c.LoraScale,
NGQA: c.NGQA,
RMSNormEps: c.RMSNormEps,
F16Memory: c.F16,
MLock: c.MMlock,
RopeFreqBase: c.RopeFreqBase,
RopeFreqScale: c.RopeFreqScale,
NUMA: c.NUMA,
Embeddings: c.Embeddings,
LowVRAM: c.LowVRAM,
NGPULayers: int32(c.NGPULayers),
MMap: c.MMap,
MainGPU: c.MainGPU,
Threads: int32(c.Threads),
TensorSplit: c.TensorSplit,
// AutoGPTQ
ModelBaseName: c.AutoGPTQ.ModelBaseName,
Device: c.AutoGPTQ.Device,
Expand Down
7 changes: 7 additions & 0 deletions api/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,18 @@ type LLMConfig struct {
NUMA bool `yaml:"numa"`
LoraAdapter string `yaml:"lora_adapter"`
LoraBase string `yaml:"lora_base"`
LoraScale float32 `yaml:"lora_scale"`
NoMulMatQ bool `yaml:"no_mulmatq"`
DraftModel string `yaml:"draft_model"`
NDraft int32 `yaml:"n_draft"`
Quantization string `yaml:"quantization"`
MMProj string `yaml:"mmproj"`

RopeScaling string `yaml:"rope_scaling"`
YarnExtFactor float32 `yaml:"yarn_ext_factor"`
YarnAttnFactor float32 `yaml:"yarn_attn_factor"`
YarnBetaFast float32 `yaml:"yarn_beta_fast"`
YarnBetaSlow float32 `yaml:"yarn_beta_slow"`
}

type AutoGPTQ struct {
Expand Down
35 changes: 32 additions & 3 deletions backend/cpp/llama/grpc-server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2017,12 +2017,41 @@ static void params_parse(const backend::ModelOptions* request,
if (!request->maingpu().empty()) {
params.main_gpu = std::stoi(request->maingpu());
}
// TODO: lora needs also a scale factor
//params.lora_adapter = request->loraadapter();
//params.lora_base = request->lorabase();
if (!request->loraadapter().empty() && !request->lorabase().empty()) {
float scale_factor = 1.0f;
if (request->lorascale() != 0.0f) {
scale_factor = request->lorascale();
}
// get the directory of modelfile
std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
params.lora_adapter.push_back(std::make_tuple(model_dir + "/"+request->loraadapter(), scale_factor));
params.lora_base = model_dir + "/"+request->lorabase();
}
params.use_mlock = request->mlock();
params.use_mmap = request->mmap();
params.embedding = request->embeddings();

if (request->ropescaling() == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
else if (request->ropescaling() == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
else { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
if ( request->yarnextfactor() != 0.0f ) {
params.yarn_ext_factor = request->yarnextfactor();
}
if ( request->yarnattnfactor() != 0.0f ) {
params.yarn_attn_factor = request->yarnattnfactor();
}
if ( request->yarnbetafast() != 0.0f ) {
params.yarn_beta_fast = request->yarnbetafast();
}
if ( request->yarnbetaslow() != 0.0f ) {
params.yarn_beta_slow = request->yarnbetaslow();
}
if ( request->ropefreqbase() != 0.0f ) {
params.rope_freq_base = request->ropefreqbase();
}
if ( request->ropefreqscale() != 0.0f ) {
params.rope_freq_scale = request->ropefreqscale();
}
}


Expand Down
Loading

0 comments on commit 803a0ac

Please sign in to comment.