feat(llama.cpp): support lora with scale and yarn (#1277)

* feat(llama.cpp): support lora with scale

  Signed-off-by: Ettore Di Giacinto <[email protected]>

* feat(llama.cpp): support yarn

  Signed-off-by: Ettore Di Giacinto <[email protected]>

---------

Signed-off-by: Ettore Di Giacinto <[email protected]>
mudler · Nov 11, 2023 · 803a0ac · 803a0ac
1 parent bde87d0
commit 803a0ac
Show file tree

Hide file tree

Showing 13 changed files with 476 additions and 364 deletions.
diff --git a/api/backend/image.go b/api/backend/image.go
@@ -21,6 +21,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 			PipelineType:  c.Diffusers.PipelineType,
 			CFGScale:      c.Diffusers.CFGScale,
 			LoraAdapter:   c.LoraAdapter,
+			LoraScale:     c.LoraScale,
 			LoraBase:      c.LoraBase,
 			IMG2IMG:       c.Diffusers.IMG2IMG,
 			CLIPModel:     c.Diffusers.ClipModel,

diff --git a/api/backend/options.go b/api/backend/options.go
@@ -38,30 +38,35 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
 	}
 
 	return &pb.ModelOptions{
-		ContextSize:   int32(c.ContextSize),
-		Seed:          int32(c.Seed),
-		NBatch:        int32(b),
-		NoMulMatQ:     c.NoMulMatQ,
-		DraftModel:    c.DraftModel,
-		AudioPath:     c.VallE.AudioPath,
-		Quantization:  c.Quantization,
-		MMProj:        c.MMProj,
-		LoraAdapter:   c.LoraAdapter,
-		LoraBase:      c.LoraBase,
-		NGQA:          c.NGQA,
-		RMSNormEps:    c.RMSNormEps,
-		F16Memory:     c.F16,
-		MLock:         c.MMlock,
-		RopeFreqBase:  c.RopeFreqBase,
-		RopeFreqScale: c.RopeFreqScale,
-		NUMA:          c.NUMA,
-		Embeddings:    c.Embeddings,
-		LowVRAM:       c.LowVRAM,
-		NGPULayers:    int32(c.NGPULayers),
-		MMap:          c.MMap,
-		MainGPU:       c.MainGPU,
-		Threads:       int32(c.Threads),
-		TensorSplit:   c.TensorSplit,
+		ContextSize:    int32(c.ContextSize),
+		Seed:           int32(c.Seed),
+		NBatch:         int32(b),
+		NoMulMatQ:      c.NoMulMatQ,
+		DraftModel:     c.DraftModel,
+		AudioPath:      c.VallE.AudioPath,
+		Quantization:   c.Quantization,
+		MMProj:         c.MMProj,
+		YarnExtFactor:  c.YarnExtFactor,
+		YarnAttnFactor: c.YarnAttnFactor,
+		YarnBetaFast:   c.YarnBetaFast,
+		YarnBetaSlow:   c.YarnBetaSlow,
+		LoraAdapter:    c.LoraAdapter,
+		LoraBase:       c.LoraBase,
+		LoraScale:      c.LoraScale,
+		NGQA:           c.NGQA,
+		RMSNormEps:     c.RMSNormEps,
+		F16Memory:      c.F16,
+		MLock:          c.MMlock,
+		RopeFreqBase:   c.RopeFreqBase,
+		RopeFreqScale:  c.RopeFreqScale,
+		NUMA:           c.NUMA,
+		Embeddings:     c.Embeddings,
+		LowVRAM:        c.LowVRAM,
+		NGPULayers:     int32(c.NGPULayers),
+		MMap:           c.MMap,
+		MainGPU:        c.MainGPU,
+		Threads:        int32(c.Threads),
+		TensorSplit:    c.TensorSplit,
 		// AutoGPTQ
 		ModelBaseName:    c.AutoGPTQ.ModelBaseName,
 		Device:           c.AutoGPTQ.Device,

diff --git a/api/config/config.go b/api/config/config.go
@@ -100,11 +100,18 @@ type LLMConfig struct {
 	NUMA            bool     `yaml:"numa"`
 	LoraAdapter     string   `yaml:"lora_adapter"`
 	LoraBase        string   `yaml:"lora_base"`
+	LoraScale       float32  `yaml:"lora_scale"`
 	NoMulMatQ       bool     `yaml:"no_mulmatq"`
 	DraftModel      string   `yaml:"draft_model"`
 	NDraft          int32    `yaml:"n_draft"`
 	Quantization    string   `yaml:"quantization"`
 	MMProj          string   `yaml:"mmproj"`
+
+	RopeScaling    string  `yaml:"rope_scaling"`
+	YarnExtFactor  float32 `yaml:"yarn_ext_factor"`
+	YarnAttnFactor float32 `yaml:"yarn_attn_factor"`
+	YarnBetaFast   float32 `yaml:"yarn_beta_fast"`
+	YarnBetaSlow   float32 `yaml:"yarn_beta_slow"`
 }
 
 type AutoGPTQ struct {

diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp
@@ -2017,12 +2017,41 @@ static void params_parse(const backend::ModelOptions* request,
     if (!request->maingpu().empty()) {
         params.main_gpu = std::stoi(request->maingpu());
     }
-    // TODO: lora needs also a scale factor
-    //params.lora_adapter = request->loraadapter();
-    //params.lora_base = request->lorabase();
+    if (!request->loraadapter().empty() && !request->lorabase().empty()) {
+     float scale_factor = 1.0f;
+     if (request->lorascale() != 0.0f) {
+        scale_factor = request->lorascale();
+     }
+     // get the directory of modelfile
+     std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
+     params.lora_adapter.push_back(std::make_tuple(model_dir + "/"+request->loraadapter(), scale_factor));
+     params.lora_base  =  model_dir + "/"+request->lorabase();
+    }
     params.use_mlock = request->mlock();
     params.use_mmap = request->mmap();
     params.embedding = request->embeddings();
+
+    if (request->ropescaling() == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
+    else if (request->ropescaling() == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
+    else { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
+    if ( request->yarnextfactor() != 0.0f ) {
+        params.yarn_ext_factor = request->yarnextfactor();
+    }
+    if ( request->yarnattnfactor() != 0.0f ) {
+        params.yarn_attn_factor = request->yarnattnfactor();
+    }
+    if ( request->yarnbetafast() != 0.0f ) {
+        params.yarn_beta_fast = request->yarnbetafast();
+    }
+    if ( request->yarnbetaslow() != 0.0f ) {
+        params.yarn_beta_slow = request->yarnbetaslow();
+    }
+    if ( request->ropefreqbase() != 0.0f ) {
+        params.rope_freq_base = request->ropefreqbase();
+    }
+    if ( request->ropefreqscale() != 0.0f ) {
+        params.rope_freq_scale = request->ropefreqscale();
+    }
 }