From 5d486b5281d83787b54f1209d929285ac0027d85 Mon Sep 17 00:00:00 2001 From: Yuwen Hu Date: Wed, 27 Nov 2024 18:23:02 +0800 Subject: [PATCH 1/8] Further tuning Qwen2-7B int4 CW accuracy --- .../ipex_llm/transformers/npu_models/convert_mp.py | 14 +++++++++----- .../transformers/npu_models/mp_models_base.py | 5 +++-- .../ipex_llm/transformers/npu_models/qwen2_mp.py | 7 +++++-- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py index 2e98c1eb937..b9236fdee24 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py @@ -129,10 +129,14 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision, if quantization_group_size == 0: n_splits_linear = 1 if qtype == "sym_int8_rtn": - # do not split mlp down_proj for Qwen2-7B & sym_int8 + # do not split mlp down_proj for Qwen2-7B/MiniCPM-V-2_6 & sym_int8 n_splits_down_proj = 1 else: - n_splits_down_proj = 2 if model.config.intermediate_size == 18944 else 1 + if (not mixed_precision) and model.config.intermediate_size == 18944: + # For Qwen2-7B and MiniCPM-V-2_6 + n_splits_down_proj = 16 + else: + n_splits_down_proj = 1 else: invalidInputError( model.config.hidden_size % quantization_group_size == 0 and @@ -170,10 +174,10 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision, # for Qwen2-7B-Insturct and MiniCPM-V 2.6, divide lm_head into 14 parts if model.config.hidden_size == 3584 and (model.config.vocab_size == 152064 or model.config.vocab_size == 151666) and not cpu_lm_head: - # Do not split lm_head and use sym_int8 instead when mixed_precison is True if quantization_group_size == 0: - # Do not split lm_head and use sym_int8 instead when mixed_precison is True - is_split = (not mixed_precision) and qtype == "sym_int4_rtn" + # TODO: may further adjust strategy, use sym_int8 for now + # is_split = (not mixed_precision) and qtype == "sym_int4_rtn" + is_split = False split_num = 14 if is_split else 1 new_lm_head = SlicedLMHead(model.lm_head.weight, split_num=split_num, bias=model.lm_head.bias, use_split=False) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py index ccf6e242d90..da1213658d1 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py @@ -423,7 +423,7 @@ def feed_forward_sanm_decoder(self, x, w_1_bias, norm_weights, norm_bias): w_2 = self.linear(w_1_norm, 512, 2048, bias=False, wt_dtype=self.dtype) return w_2 - def mlp(self, hidden_states, seq_len=-1, mode="prefill"): + def mlp(self, hidden_states, seq_len=-1, mode="prefill", mixed_precision=False): mm1 = self.linear( hidden_states, self.intermediate_size, self.hidden_size, bias=False, wt_dtype=self.dtype, n_splits=self.n_splits_linear, @@ -438,8 +438,9 @@ def mlp(self, hidden_states, seq_len=-1, mode="prefill"): ) # type: ignore[attr-defined] mm1 = self.eltwise_mul(self.swish(mm1), mm2) # type: ignore[attr-defined] + wt_dtype = torch.int8 if mixed_precision else self.dtype hidden_states = self.linear( - mm1, self.hidden_size, self.intermediate_size, bias=False, wt_dtype=self.dtype, + mm1, self.hidden_size, self.intermediate_size, bias=False, wt_dtype=wt_dtype, n_splits=self.n_splits_down_proj, scale_factor=(self.group_size == 0), is_prefill=(mode == 
"prefill") diff --git a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py index 015efe10031..92cca4f1d52 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py @@ -97,7 +97,8 @@ def __init__( intermediate_size, n_splits_linear: int = 1, n_splits_down_proj: int = 1, - group_size: int = 0 + group_size: int = 0, + mixed_precision: bool = False ): super().__init__(max_seq_len=max_seq_len, transpose_value=transpose_value, @@ -117,6 +118,8 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.transpose_value = transpose_value self.num_layers = num_layers + self.mixed_precision = mixed_precision + cos = self.constant(self.cached_cos) self.cos = self.unsqueeze(cos, axis=0) @@ -279,7 +282,7 @@ def build_decoder( hidden_states = self.eltwise_add(residual, attn_output) residual = hidden_states hidden_states = self.layer_norm(hidden_states, post_attention_layernorm_weight) - hidden_states = self.mlp(hidden_states, self.seq_len, self.mode) + hidden_states = self.mlp(hidden_states, self.seq_len, self.mode, self.mixed_precision) hidden_states = self.eltwise_add(residual, hidden_states) hidden_states = self.convert_to_fp16(hidden_states) From 48457b276eb1dd748ce67450b022468f212a0f13 Mon Sep 17 00:00:00 2001 From: Yuwen Hu Date: Wed, 27 Nov 2024 18:45:14 +0800 Subject: [PATCH 2/8] Fix on mixed_precision of Qwen2-7B --- .../transformers/npu_models/mp_models_base.py | 1 + .../transformers/npu_models/qwen2_mp.py | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py index da1213658d1..4a6a12cb961 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py @@ -438,6 +438,7 @@ def mlp(self, hidden_states, seq_len=-1, mode="prefill", mixed_precision=False): ) # type: ignore[attr-defined] mm1 = self.eltwise_mul(self.swish(mm1), mm2) # type: ignore[attr-defined] + print(f'mixed_precision: {mixed_precision}') wt_dtype = torch.int8 if mixed_precision else self.dtype hidden_states = self.linear( mm1, self.hidden_size, self.intermediate_size, bias=False, wt_dtype=wt_dtype, diff --git a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py index 92cca4f1d52..bed9805c5cb 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py @@ -98,7 +98,7 @@ def __init__( n_splits_linear: int = 1, n_splits_down_proj: int = 1, group_size: int = 0, - mixed_precision: bool = False + mixed_precision: bool = False, ): super().__init__(max_seq_len=max_seq_len, transpose_value=transpose_value, @@ -314,6 +314,7 @@ def __init__( n_splits_linear: int = 1, n_splits_down_proj: int = 1, group_size: int = 0, + mixed_precision: bool = False, ): super().__init__() @@ -378,7 +379,8 @@ def __init__( dtype=np_dtype, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + mixed_precision=mixed_precision, ) self.backend_decoders.append(decoder) @@ -464,6 +466,7 @@ def __init__( n_splits_linear: int = 1, n_splits_down_proj: int = 1, group_size: int = 0, + mixed_precision: bool = False, ): super().__init__() self.op_parameters = 
parameters @@ -494,7 +497,8 @@ def __init__( dtype=np_dtype, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + mixed_precision=mixed_precision, ) self.layer_norm_0 = layer_norm_0 self.layer_norm_1 = layer_norm_1 @@ -574,6 +578,7 @@ def run_decode( rms_norm_eps = model.config.rms_norm_eps intermediate_size = model.config.intermediate_size group_size = getattr(model.config, "group_size", 0) + mixed_precision = getattr(model.config, "mixed_precision", False) layer_weights = [] input_layer_norm_weights = [] post_attn_layernorm_weights = [] @@ -633,7 +638,8 @@ def run_decode( do_print=False, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + mixed_precision=mixed_precision, ) dist.barrier() @@ -805,6 +811,7 @@ def run_prefill( rms_norm_eps = model.config.rms_norm_eps intermediate_size = model.config.intermediate_size group_size = getattr(model.config, "group_size", 0) + mixed_precision = getattr(model.config, "mixed_precision", False) deocderlayers = [] layer_weights = [] input_layer_norm_weights = [] @@ -853,7 +860,8 @@ def run_prefill( transpose_value=transpose_value_cache, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + mixed_precision=mixed_precision, ) layer_weights.extend(weights) From baa0715cdcd8e9f0cb56be10d3b5f43c565cec64 Mon Sep 17 00:00:00 2001 From: Yuwen Hu Date: Wed, 27 Nov 2024 18:51:31 +0800 Subject: [PATCH 3/8] Small fix --- .../llm/src/ipex_llm/transformers/npu_models/mp_models_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py index 4a6a12cb961..da1213658d1 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py @@ -438,7 +438,6 @@ def mlp(self, hidden_states, seq_len=-1, mode="prefill", mixed_precision=False): ) # type: ignore[attr-defined] mm1 = self.eltwise_mul(self.swish(mm1), mm2) # type: ignore[attr-defined] - print(f'mixed_precision: {mixed_precision}') wt_dtype = torch.int8 if mixed_precision else self.dtype hidden_states = self.linear( mm1, self.hidden_size, self.intermediate_size, bias=False, wt_dtype=wt_dtype, From a5ea9a1752d0880c467abb0473ae0b08cb921ea0 Mon Sep 17 00:00:00 2001 From: Yuwen Hu Date: Wed, 27 Nov 2024 18:59:36 +0800 Subject: [PATCH 4/8] Style fix --- python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py index bed9805c5cb..a77ac7690e0 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py @@ -120,7 +120,6 @@ def __init__( self.num_layers = num_layers self.mixed_precision = mixed_precision - cos = self.constant(self.cached_cos) self.cos = self.unsqueeze(cos, axis=0) From 117370d07d6b88febdc12d1d98fcba2a800756c9 Mon Sep 17 00:00:00 2001 From: Yuwen Hu Date: Thu, 28 Nov 2024 11:16:05 +0800 Subject: [PATCH 5/8] Fit with pipeline=True --- python/llm/src/ipex_llm/transformers/npu_model.py | 7 ++++++- .../npu_pipeline_model/convert_pipeline.py | 11 ++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git 
a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index eb684bce715..6465c01225a 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -183,6 +183,10 @@ def from_pretrained(cls, *args, **kwargs): from intel_npu_acceleration_library.compiler import create_npu_kernels if optimize_model: + #TODO: enable mixed_precision when pipeline=True + if pipeline: + mixed_precision = False + invalidInputError( max_prompt_len < max_context_len, ( @@ -282,7 +286,8 @@ def optimize_npu_model(cls, *args, **kwargs): group_size=quantization_group_size, qtype=qtype, convert_model=convert_model, - save_directory=save_directory) + save_directory=save_directory, + mixed_precision=mixed_precision) model.save_low_bit = types.MethodType(save_low_bit, model) return model diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index 50448bd684b..f1bb597e6ee 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -194,8 +194,9 @@ def convert_llm(model: torch.nn.Module, transpose_value_cache: bool, group_size: int, qtype: str, - convert_model: bool=False, - save_directory: str=None): + convert_model: bool = False, + save_directory: str = None, + mixed_precision: bool = False): # whether to set layernorm weight as const layernorm_const = os.environ.get("IPEX_LLM_LAYERNORM_CONST", "1") == "1" if group_size == 0: @@ -204,7 +205,11 @@ def convert_llm(model: torch.nn.Module, # do not split mlp down_proj for Qwen2-7B & sym_int8 n_splits_down_proj = 1 else: - n_splits_down_proj = 2 if model.config.intermediate_size == 18944 else 1 + if (not mixed_precision) and model.config.intermediate_size == 18944: + # For Qwen2-7B + n_splits_down_proj = 16 + else: + n_splits_down_proj = 1 else: n_splits_linear = model.config.hidden_size // group_size n_splits_down_proj = model.config.intermediate_size // group_size From efc37564d902d8bfebb9c930c70090b81803c8ab Mon Sep 17 00:00:00 2001 From: Yuwen Hu Date: Thu, 28 Nov 2024 11:18:16 +0800 Subject: [PATCH 6/8] Example fix --- .../NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py index d04961ece87..8f39ef2e985 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py @@ -67,7 +67,6 @@ torch_dtype=torch.float16, attn_implementation="eager", transpose_value_cache=not args.disable_transpose_value_cache, - mixed_precision=True, trust_remote_code=True) else: model = AutoModelForCausalLM.load_low_bit( From 06abd9e1bbab789c456c93532d229fb2539af39b Mon Sep 17 00:00:00 2001 From: Yuwen Hu Date: Thu, 28 Nov 2024 11:23:42 +0800 Subject: [PATCH 7/8] Example update --- .../Multimodal/README.md | 14 ++++++++++---- .../Multimodal/minicpm_v_2_6.py | 2 ++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md index 53f47df7946..1a3e277346f 100644 --- 
a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md @@ -22,7 +22,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a phi- ### 1. Install #### 1.1 Installation on Windows We suggest using conda to manage environment: -```bash +```cmd conda create -n llm python=3.10 libuv conda activate llm @@ -100,7 +100,7 @@ The examples below show how to run the **_optimized HuggingFace & FunASR model i - [Speech_Paraformer-Large](./speech_paraformer-large.py) ### 4.1 Run MiniCPM-Llama3-V-2_5 & MiniCPM-V-2_6 -```bash +```cmd # to run MiniCPM-Llama3-V-2_5 python minicpm-llama3-v2.5.py @@ -117,6 +117,12 @@ Arguments info: - `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`. - `--disable-transpose-value-cache`: Disable the optimization of transposing value cache. +For [MiniCPM-V-2_6](./minicpm_v_2_6.py), you could also try to enable mixed precision optimization when encountering output problems: + +```cmd +python minicpm_v_2_6.py --mixed-precision +``` + #### Sample Output ##### [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) @@ -131,7 +137,7 @@ The image features a young child holding and showing off a white teddy bear wear ``` ### 4.2 Run Speech_Paraformer-Large -```bash +```cmd # to run Speech_Paraformer-Large python speech_paraformer-large.py ``` @@ -154,7 +160,7 @@ rtf_avg: 0.232: 100%|███████████████████ ``` ### 4.3 Run Bce-Embedding-Base-V1 -```bash +```cmd # to run Bce-Embedding-Base-V1 python bce-embedding.py ``` diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py index 1a524a5b2dc..f25f3409f2b 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py @@ -41,6 +41,7 @@ parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--intra-pp", type=int, default=None) parser.add_argument("--inter-pp", type=int, default=None) + parser.add_argument("--mixed-precision", action='store_true') args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -57,6 +58,7 @@ intra_pp=args.intra_pp, inter_pp=args.inter_pp, transpose_value_cache=not args.disable_transpose_value_cache, + mixed_precision=args.mixed_precision, ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) From b8c60d0692c8fb0a113c7337acdaf861c6fe47ca Mon Sep 17 00:00:00 2001 From: Yuwen Hu Date: Thu, 28 Nov 2024 11:33:44 +0800 Subject: [PATCH 8/8] Style fix --- python/llm/src/ipex_llm/transformers/npu_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 6465c01225a..601bf7720f7 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -183,7 +183,7 @@ def from_pretrained(cls, *args, **kwargs): from intel_npu_acceleration_library.compiler import create_npu_kernels if optimize_model: - #TODO: enable mixed_precision when pipeline=True + # TODO: enable mixed_precision when pipeline=True if pipeline: mixed_precision = False
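
Usage sketch (not part of the patches above): taken together, the series exposes `mixed_precision` as a load-time option on the NPU path, keeping the MLP down_proj weights in sym_int8 while the rest of the model stays in channel-wise sym_int4. The snippet below shows roughly how the option is exercised from user code, modeled on the example scripts touched in patches 6 and 7; the import path, model path, and the surrounding keyword arguments are illustrative assumptions taken from those examples rather than additions made by this series.

```python
# Illustrative only: load a Qwen2-7B-style checkpoint on the NPU with the new
# mixed_precision flag. Argument names mirror the example scripts in this
# series; the model path and import path are placeholders/assumptions.
import torch
from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # assumed import path
from transformers import AutoTokenizer

model_path = "Qwen/Qwen2-7B-Instruct"  # placeholder checkpoint

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    optimize_model=True,
    torch_dtype=torch.float16,
    attn_implementation="eager",
    mixed_precision=True,   # new option: MLP down_proj is kept in sym_int8
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
```

Note that patch 5 forces `mixed_precision = False` when `pipeline=True`, so for now the flag only takes effect on the non-pipeline NPU path.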