fix block_name_to_quantize #382

Merged · 4 commits · Dec 12, 2024
4 changes: 2 additions & 2 deletions README.md
@@ -16,8 +16,7 @@ steps,
 which competes impressively against recent methods without introducing any additional inference overhead and keeping low
 tuning cost. The below
 image presents an overview of AutoRound. Check out our paper on [arxiv](https://arxiv.org/pdf/2309.05516) for more
-details and visit [low_bit_open_llm_leaderboard](https://huggingface.co/spaces/Intel/low_bit_open_llm_leaderboard) for
-more accuracy data and recipes across various models.
+details and quantized huggingface space models in [OPEA](https://huggingface.co/OPEA), [Kaitchup](https://huggingface.co/kaitchup) and [fbaldassarri](https://huggingface.co/fbaldassarri).

<div align="center">

@@ -414,3 +413,4 @@ If you find AutoRound useful for your research, please cite our paper:
```



24 changes: 10 additions & 14 deletions auto_round/export/export_to_autogptq/export.py
@@ -121,7 +121,7 @@ def save_quantized_as_autogptq(output_dir, inplace=True, backend="auto_gptq:exll
     supported_types = kwargs["supported_types"]
     safe_serialization = True if 'safe_serialization' not in kwargs.keys() else kwargs["safe_serialization"]
     to_quant_block_names = kwargs["to_quant_block_names"]
-    quant_block_list = kwargs.get("quant_block_list", None)
+    quant_block_list = kwargs.get("quant_block_list", get_block_names(model))
     logger.info("Saving quantized model to autogptq format, this may take a while...")
     tokenizer = kwargs.get("tokenizer", None)
     processor = kwargs.get("processor", None)
@@ -131,19 +131,14 @@ def save_quantized_as_autogptq(output_dir, inplace=True, backend="auto_gptq:exll
         processor.save_pretrained(output_dir)
     ##check module quantized in block, this may have bug for mixed precision quantization
     quantization_config = kwargs["serialization_dict"]
-    if bool(quant_block_list):
-        all_blocks = quant_block_list
-        flattened_list = [item for sublist in all_blocks for item in sublist]
-        common_prefix = os.path.commonprefix(flattened_list).rstrip('.')
-        if common_prefix not in BLOCK_PATTERNS:
-            logger.error(f"auto-gptq format may not support loading this quantized model")
-        quantization_config['block_name_to_quantize'] = common_prefix
-    else:
-        all_blocks = get_block_names(model)
-        flattened_list = [item for sublist in all_blocks for item in sublist]
-        common_prefix = os.path.commonprefix(flattened_list).rstrip('.')
-        if common_prefix not in BLOCK_PATTERNS:
-            quantization_config['block_name_to_quantize'] = common_prefix
+    all_blocks = quant_block_list
+    flattened_list = [item for sublist in all_blocks for item in sublist]
+    common_prefix = os.path.commonprefix(flattened_list).rstrip('.')
+    if common_prefix not in BLOCK_PATTERNS:
+        logger.error(f"auto-gptq format may not support loading this quantized model")
+    quantization_config['block_name_to_quantize'] = common_prefix
     quantization_config.pop("to_quant_block_names", None)


     all_to_quantized = True
     modules_in_block_to_quantize = []
@@ -222,3 +217,4 @@ def save(model: torch.nn.Module, save_dir: str, max_shard_size: str = "5GB", saf
json.dump(model.config.quantization_config, f, indent=2)



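To make the exporter change above concrete, here is a minimal, self-contained sketch of how `block_name_to_quantize` is derived: the nested block-name list is flattened and its common dotted prefix is taken, then checked against the block patterns AutoGPTQ can load. The `BLOCK_PATTERNS` values and the `derive_block_name_to_quantize` helper below are illustrative assumptions, not code from the repository.

```python
import os

# Illustrative stand-in for the exporter's BLOCK_PATTERNS constant (assumed
# values); these are block-name prefixes the auto-gptq loader recognizes.
BLOCK_PATTERNS = ["model.layers", "transformer.h", "model.decoder.layers"]

def derive_block_name_to_quantize(quant_block_list):
    """Sketch of the PR's logic: flatten the nested block-name list and use
    os.path.commonprefix to recover the parent module path."""
    flattened = [name for group in quant_block_list for name in group]
    common_prefix = os.path.commonprefix(flattened).rstrip('.')
    if common_prefix not in BLOCK_PATTERNS:
        print("auto-gptq format may not support loading this quantized model")
    return common_prefix

# Example: a model whose quantized blocks are model.layers.0 .. model.layers.2
blocks = [["model.layers.0", "model.layers.1", "model.layers.2"]]
print(derive_block_name_to_quantize(blocks))  # -> "model.layers"
```

Together with the other change in this file, `quant_block_list` now falls back to `get_block_names(model)` when the caller does not pass it, so the prefix derivation runs once instead of being duplicated across an if/else.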
2 changes: 2 additions & 0 deletions auto_round/mllm/eval.py
@@ -81,6 +81,7 @@
"llava_next": dict(cls="LLaVA_Next"),
"phi3_v": dict(cls="Phi3Vision"),
"mllama": dict(cls="llama_vision"),
"glm-4v-9b": dict(cls="GLM4v"),
}


@@ -409,3 +410,4 @@ class CliArgs:
json.dump(results, open(output_file, 'w'), indent=4, default=_handle_non_serializable)

return results

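The table edited above appears to map a model-type key to the evaluator class name used by the MLLM evaluation path; the PR registers `glm-4v-9b` so GLM-4V models resolve to `GLM4v`. The snippet below is a hypothetical illustration of such a lookup; the `MLLM_REGISTRY` name and `resolve_evaluator` helper are assumptions, not part of the repository.

```python
# Hypothetical registry lookup mirroring the mapping edited above.
MLLM_REGISTRY = {
    "llava_next": dict(cls="LLaVA_Next"),
    "phi3_v": dict(cls="Phi3Vision"),
    "mllama": dict(cls="llama_vision"),
    "glm-4v-9b": dict(cls="GLM4v"),  # entry added by this PR
}

def resolve_evaluator(model_type: str) -> str:
    """Return the evaluator class name registered for a model type."""
    entry = MLLM_REGISTRY.get(model_type)
    if entry is None:
        raise KeyError(f"no evaluator registered for {model_type!r}")
    return entry["cls"]

print(resolve_evaluator("glm-4v-9b"))  # -> GLM4v
```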
3 changes: 1 addition & 2 deletions auto_round/quantizer.py
@@ -94,7 +94,6 @@ def _init_tuning_params_and_quant_func(self):
self.weight_min = torch.clamp(weight_reshape.min(1)[0], max=0)
self.weight_max = torch.clamp(weight_reshape.max(1)[0], min=0)
self._init_params("value", p_dtype, weight_reshape.shape, 0, True)

# Min-max scale initialization
shape = get_scale_shape(orig_weight, orig_layer.group_size)
self._init_params("min_scale", p_dtype, shape, 1.0, self.enable_minmax_tuning)
@@ -304,7 +303,6 @@ def forward(self, x):
bias = self.orig_layer.bias
if bias is not None and bias.device.type == 'meta':
bias = self.orig_layer.get_bias().to(self.device)

if self.enable_norm_bias_tuning:
bias, _, _ = self._qdq_bias(bias, self.bias_v)

@@ -520,3 +518,4 @@ def unwrapper_block(block, best_params):
best_param = None
orig_layer = m.unwrapper(best_param)
set_module(block, n, orig_layer)
