fix block_name_to_quantize #382

Merged · 4 commits · Dec 12, 2024
4 changes: 2 additions & 2 deletions README.md
@@ -16,8 +16,7 @@ steps,
 which competes impressively against recent methods without introducing any additional inference overhead and keeping low
 tuning cost. The below
 image presents an overview of AutoRound. Check out our paper on [arxiv](https://arxiv.org/pdf/2309.05516) for more
-details and visit [low_bit_open_llm_leaderboard](https://huggingface.co/spaces/Intel/low_bit_open_llm_leaderboard) for
-more accuracy data and recipes across various models.
+details and quantized huggingface space models in [OPEA](https://huggingface.co/OPEA), [Kaitchup](https://huggingface.co/kaitchup) and [fbaldassarri](https://huggingface.co/fbaldassarri).

<div align="center">

@@ -414,3 +413,4 @@ If you find AutoRound useful for your research, please cite our paper:
```



24 changes: 10 additions & 14 deletions auto_round/export/export_to_autogptq/export.py
@@ -121,7 +121,7 @@ def save_quantized_as_autogptq(output_dir, inplace=True, backend="auto_gptq:exll
     supported_types = kwargs["supported_types"]
     safe_serialization = True if 'safe_serialization' not in kwargs.keys() else kwargs["safe_serialization"]
     to_quant_block_names = kwargs["to_quant_block_names"]
-    quant_block_list = kwargs.get("quant_block_list", None)
+    quant_block_list = kwargs.get("quant_block_list", get_block_names(model))
     logger.info("Saving quantized model to autogptq format, this may take a while...")
     tokenizer = kwargs.get("tokenizer", None)
     processor = kwargs.get("processor", None)
@@ -131,19 +131,14 @@ def save_quantized_as_autogptq(output_dir, inplace=True, backend="auto_gptq:exll
         processor.save_pretrained(output_dir)
     ##check module quantized in block, this may have bug for mixed precision quantization
     quantization_config = kwargs["serialization_dict"]
-    if bool(quant_block_list):
-        all_blocks = quant_block_list
-        flattened_list = [item for sublist in all_blocks for item in sublist]
-        common_prefix = os.path.commonprefix(flattened_list).rstrip('.')
-        if common_prefix not in BLOCK_PATTERNS:
-            logger.error(f"auto-gptq format may not support loading this quantized model")
-        quantization_config['block_name_to_quantize'] = common_prefix
-    else:
-        all_blocks = get_block_names(model)
-        flattened_list = [item for sublist in all_blocks for item in sublist]
-        common_prefix = os.path.commonprefix(flattened_list).rstrip('.')
-        if common_prefix not in BLOCK_PATTERNS:
-            quantization_config['block_name_to_quantize'] = common_prefix
+    all_blocks = quant_block_list
+    flattened_list = [item for sublist in all_blocks for item in sublist]
+    common_prefix = os.path.commonprefix(flattened_list).rstrip('.')
+    if common_prefix not in BLOCK_PATTERNS:
+        logger.error(f"auto-gptq format may not support loading this quantized model")
+    quantization_config['block_name_to_quantize'] = common_prefix
     quantization_config.pop("to_quant_block_names", None)


     all_to_quantized = True
     modules_in_block_to_quantize = []
@@ -222,3 +217,4 @@ def save(model: torch.nn.Module, save_dir: str, max_shard_size: str = "5GB", saf
json.dump(model.config.quantization_config, f, indent=2)



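To make the exporter change above concrete, here is a minimal, self-contained sketch of how `block_name_to_quantize` is derived: the nested block-name list is flattened and its common dotted prefix is taken, then checked against the block patterns AutoGPTQ can load. The `BLOCK_PATTERNS` values and the `derive_block_name_to_quantize` helper below are illustrative assumptions, not code from the repository.

```python
import os

# Illustrative stand-in for the exporter's BLOCK_PATTERNS constant (assumed
# values); these are block-name prefixes the auto-gptq loader recognizes.
BLOCK_PATTERNS = ["model.layers", "transformer.h", "model.decoder.layers"]

def derive_block_name_to_quantize(quant_block_list):
    """Sketch of the PR's logic: flatten the nested block-name list and use
    os.path.commonprefix to recover the parent module path."""
    flattened = [name for group in quant_block_list for name in group]
    common_prefix = os.path.commonprefix(flattened).rstrip('.')
    if common_prefix not in BLOCK_PATTERNS:
        print("auto-gptq format may not support loading this quantized model")
    return common_prefix

# Example: a model whose quantized blocks are model.layers.0 .. model.layers.2
blocks = [["model.layers.0", "model.layers.1", "model.layers.2"]]
print(derive_block_name_to_quantize(blocks))  # -> "model.layers"
```

Together with the other change in this file, `quant_block_list` now falls back to `get_block_names(model)` when the caller does not pass it, so the prefix derivation runs once instead of being duplicated across an if/else.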
2 changes: 2 additions & 0 deletions auto_round/mllm/eval.py
@@ -81,6 +81,7 @@
"llava_next": dict(cls="LLaVA_Next"),
"phi3_v": dict(cls="Phi3Vision"),
"mllama": dict(cls="llama_vision"),
"glm-4v-9b": dict(cls="GLM4v"),
}


@@ -409,3 +410,4 @@ class CliArgs:
json.dump(results, open(output_file, 'w'), indent=4, default=_handle_non_serializable)

return results

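The table edited above appears to map a model-type key to the evaluator class name used by the MLLM evaluation path; the PR registers `glm-4v-9b` so GLM-4V models resolve to `GLM4v`. The snippet below is a hypothetical illustration of such a lookup; the `MLLM_REGISTRY` name and `resolve_evaluator` helper are assumptions, not part of the repository.

```python
# Hypothetical registry lookup mirroring the mapping edited above.
MLLM_REGISTRY = {
    "llava_next": dict(cls="LLaVA_Next"),
    "phi3_v": dict(cls="Phi3Vision"),
    "mllama": dict(cls="llama_vision"),
    "glm-4v-9b": dict(cls="GLM4v"),  # entry added by this PR
}

def resolve_evaluator(model_type: str) -> str:
    """Return the evaluator class name registered for a model type."""
    entry = MLLM_REGISTRY.get(model_type)
    if entry is None:
        raise KeyError(f"no evaluator registered for {model_type!r}")
    return entry["cls"]

print(resolve_evaluator("glm-4v-9b"))  # -> GLM4v
```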
3 changes: 1 addition & 2 deletions auto_round/quantizer.py
@@ -94,7 +94,6 @@ def _init_tuning_params_and_quant_func(self):
self.weight_min = torch.clamp(weight_reshape.min(1)[0], max=0)
self.weight_max = torch.clamp(weight_reshape.max(1)[0], min=0)
self._init_params("value", p_dtype, weight_reshape.shape, 0, True)

# Min-max scale initialization
shape = get_scale_shape(orig_weight, orig_layer.group_size)
self._init_params("min_scale", p_dtype, shape, 1.0, self.enable_minmax_tuning)
@@ -304,7 +303,6 @@ def forward(self, x):
bias = self.orig_layer.bias
if bias is not None and bias.device.type == 'meta':
bias = self.orig_layer.get_bias().to(self.device)

if self.enable_norm_bias_tuning:
bias, _, _ = self._qdq_bias(bias, self.bias_v)

@@ -520,3 +518,4 @@ def unwrapper_block(block, best_params):
best_param = None
orig_layer = m.unwrapper(best_param)
set_module(block, n, orig_layer)
