diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml index 5b93f80dcd..ce5577309b 100644 --- a/.github/workflows/build_docker_images.yml +++ b/.github/workflows/build_docker_images.yml @@ -10,6 +10,8 @@ concurrency: group: docker-image-builds cancel-in-progress: false +permissions: {} + env: CI_SLACK_CHANNEL: ${{ secrets.CI_DOCKER_CHANNEL }} diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 1ff01d1a5e..42e7972bc2 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -7,6 +7,8 @@ on: - doc-builder* - v*-release +permissions: {} + jobs: build: uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 35ceab6e60..3fe27e8a04 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -7,6 +7,8 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true +permissions: {} + jobs: build: uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main diff --git a/.github/workflows/integrations_tests.yml b/.github/workflows/integrations_tests.yml index 38ab96246e..3d61c8d915 100644 --- a/.github/workflows/integrations_tests.yml +++ b/.github/workflows/integrations_tests.yml @@ -7,6 +7,8 @@ on: description: 'Branch to test on' required: true +permissions: {} + jobs: run_transformers_integration_tests: strategy: diff --git a/.github/workflows/nightly-bnb.yml b/.github/workflows/nightly-bnb.yml index 0fba12dfb9..bc68af80c8 100644 --- a/.github/workflows/nightly-bnb.yml +++ b/.github/workflows/nightly-bnb.yml @@ -12,6 +12,7 @@ env: NVIDIA_DISABLE_REQUIRE: "1" SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }} +permissions: {} jobs: run_all_tests_single_gpu: diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 7e6635b392..d578900489 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -12,6 +12,7 @@ env: NVIDIA_DISABLE_REQUIRE: "1" SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }} +permissions: {} jobs: run_all_tests_single_gpu: diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index fc65794663..054c4b53c4 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -4,6 +4,8 @@ on: schedule: - cron: "0 15 * * *" +permissions: {} + jobs: close_stale_issues: name: Close Stale Issues diff --git a/.github/workflows/test-docker-build.yml b/.github/workflows/test-docker-build.yml index 33a177bba2..558c5f74de 100644 --- a/.github/workflows/test-docker-build.yml +++ b/.github/workflows/test-docker-build.yml @@ -5,6 +5,9 @@ on: paths: # Run only when DockerFile files are modified - "docker/*/Dockerfile" + +permissions: {} + jobs: get_changed_files: name: "Build all modified docker images" diff --git a/.github/workflows/tests-main.yml b/.github/workflows/tests-main.yml index 1b06083e73..d614d547b7 100644 --- a/.github/workflows/tests-main.yml +++ b/.github/workflows/tests-main.yml @@ -6,6 +6,8 @@ on: paths-ignore: - 'docs/**' +permissions: {} + jobs: tests: runs-on: ubuntu-latest diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index fac7446184..36e6841f0e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -12,6 +12,8 @@ on: env: HF_HOME: .cache/huggingface +permissions: {} + 
jobs: check_code_quality: runs-on: ubuntu-latest diff --git a/.github/workflows/torch_compile_tests.yml b/.github/workflows/torch_compile_tests.yml index f93d3760d6..02243de643 100644 --- a/.github/workflows/torch_compile_tests.yml +++ b/.github/workflows/torch_compile_tests.yml @@ -17,6 +17,8 @@ env: # To be able to run tests on CUDA 12.2 NVIDIA_DISABLE_REQUIRE: "1" +permissions: {} + jobs: run_tests_with_compile: runs-on: diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml index 9a613bb5b7..bdcdac7561 100644 --- a/.github/workflows/trufflehog.yml +++ b/.github/workflows/trufflehog.yml @@ -3,6 +3,8 @@ on: name: Secret Leaks +permissions: {} + jobs: trufflehog: runs-on: ubuntu-latest diff --git a/.github/workflows/upload_pr_documentation.yml b/.github/workflows/upload_pr_documentation.yml index 380f67550d..7659af7e5c 100644 --- a/.github/workflows/upload_pr_documentation.yml +++ b/.github/workflows/upload_pr_documentation.yml @@ -6,6 +6,8 @@ on: types: - completed +permissions: {} + jobs: build: uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main diff --git a/.github/workflows/zizmor.yaml b/.github/workflows/zizmor.yaml index c9ab11998c..502c4f8d62 100644 --- a/.github/workflows/zizmor.yaml +++ b/.github/workflows/zizmor.yaml @@ -3,13 +3,13 @@ name: CI security linting on: push: branches: ["main"] - paths: - - '.github/**' pull_request: branches: ["*"] paths: - '.github/**' +permissions: {} + jobs: zizmor: name: zizmor latest via Cargo diff --git a/.github/zizmor.yml b/.github/zizmor.yml index f83c7e2b12..1746cfe25f 100644 --- a/.github/zizmor.yml +++ b/.github/zizmor.yml @@ -3,3 +3,13 @@ rules: ignore: # this workflow is only triggered after maintainer approval - upload_pr_documentation.yml:3:1 + cache-poisoning: + ignore: + # the docker buildx binary is cached and zizmor warns about a cache poisoning attack. + # OTOH this cache would make us more resilient against an intrusion on docker-buildx' side. + # There is no obvious benefit so we leave it as it is. 
+ - build_docker_images.yml:37:9 + - build_docker_images.yml:70:9 + - build_docker_images.yml:103:9 + - build_docker_images.yml:136:9 + - build_docker_images.yml:169:9 diff --git a/examples/corda_finetuning/README.md b/examples/corda_finetuning/README.md index c248e99ae1..f07672f7a5 100644 --- a/examples/corda_finetuning/README.md +++ b/examples/corda_finetuning/README.md @@ -100,7 +100,12 @@ lora_config = LoraConfig( init_lora_weights="corda", corda_config=corda_config, ) + +# Call `preprocess_corda` first to collect covariance matrix and build SVD result for model +# For more details, please refer to documentation of `preprocess_corda` preprocess_corda(model, lora_config, run_model=run_model) + +# Call `get_peft_model` after preprocessing, or else you'll encounter error peft_model = get_peft_model(model, lora_config) peft_model.print_trainable_parameters() diff --git a/examples/corda_finetuning/preprocess.py b/examples/corda_finetuning/preprocess.py index 01721d296e..15bb18cb6b 100644 --- a/examples/corda_finetuning/preprocess.py +++ b/examples/corda_finetuning/preprocess.py @@ -21,7 +21,7 @@ from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoTokenizer -from peft.mapping import get_peft_model +from peft import get_peft_model from peft.tuners.lora.config import CordaConfig, LoraConfig from peft.tuners.lora.corda import preprocess_corda diff --git a/src/peft/import_utils.py b/src/peft/import_utils.py index 1745f5bdde..7599aed35f 100644 --- a/src/peft/import_utils.py +++ b/src/peft/import_utils.py @@ -13,9 +13,11 @@ # limitations under the License. import importlib import importlib.metadata as importlib_metadata +import platform from functools import lru_cache import packaging.version +import torch @lru_cache @@ -111,3 +113,23 @@ def is_torchao_available(): f"but only versions above {TORCHAO_MINIMUM_VERSION} are supported" ) return True + + +@lru_cache +def is_xpu_available(check_device=False): + """ + Checks if XPU acceleration is available and potentially if a XPU is in the environment + """ + + system = platform.system() + if system == "Darwin": + return False + else: + if check_device: + try: + # Will raise a RuntimeError if no XPU is found + _ = torch.xpu.device_count() + return torch.xpu.is_available() + except RuntimeError: + return False + return hasattr(torch, "xpu") and torch.xpu.is_available() diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index 62061a84e8..28f715a100 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -3022,3 +3022,19 @@ def check_irrgular(vals: list[bool | Literal["irregular"]]) -> bool | Literal["i devices=devices, ) return adapter_model_status + + +def __getattr__(name): + if name == "PEFT_TYPE_TO_MODEL_MAPPING": + # This is for backwards compatibility: In #2282, PEFT_TYPE_TO_MODEL_MAPPING was removed as it was redundant with + # PEFT_TYPE_TO_TUNER_MAPPING. However, third party code could still use this mapping, e.g.: + # https://github.com/AutoGPTQ/AutoGPTQ/blob/6689349625de973b9ee3016c28c11f32acf7f02c/auto_gptq/utils/peft_utils.py#L8 + # TODO: Remove after 2026-01 + msg = ( + "PEFT_TYPE_TO_MODEL_MAPPING is deprecated, please use `from peft import PEFT_TYPE_TO_TUNER_MAPPING` instead. " + "The deprecated variable will be removed in 2026." 
+ ) + warnings.warn(msg, category=DeprecationWarning) + return PEFT_TYPE_TO_TUNER_MAPPING + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index df1fc06958..b36de0c43a 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -145,6 +145,8 @@ class CordaConfig: use_float16_for_covariance (`bool`): If true, uses float16 for the covariance matrix. This can reduce the memory usage of the covariance matrix by half, but may lead to numerical instability. Defaults to `False`. + prune_temporary_fields (`bool`): + If true, temporary fields generated in CorDA preprocessing will be pruned. Defaults to `True`. """ cache_file: Optional[str] = field( @@ -189,6 +191,9 @@ class CordaConfig: ) }, ) + prune_temporary_fields: bool = field( + default=True, metadata={"help": "If true, temporary fields generated in CorDA preprocessing will be pruned."} + ) @dataclass diff --git a/src/peft/tuners/lora/corda.py b/src/peft/tuners/lora/corda.py index 0d1d70b1a8..8b991d276e 100644 --- a/src/peft/tuners/lora/corda.py +++ b/src/peft/tuners/lora/corda.py @@ -61,6 +61,10 @@ def preprocess_corda( """ Build necessary CorDA fields for a model. + For each `M * N` linear layer, a `M * M` covariance matrix will be built temporarily during the preprocessing + process, consuming roughly another `2 * MODEL_SIZE` memory for typical LLMs if model weight is FP16 and covariance + is FP32. If that's too much, consider specifying `use_float16_for_covariance` in `lora_config.corda_config`. + Args: model (`nn.Module`): Model to preprocess. @@ -68,17 +72,16 @@ def preprocess_corda( Lora configuration of the model. `lora_config.corda_config` should be set. run_model (`Optional[Callable[[], None]]`): Callback to run the model when building covariance. Typically you should run model inference on your sample - dataset in this callback. Experiments have shown 256 samples to be a good default dataset size. `run_model` - can be `None` only if covariance file in `lora_config.corda_config` is already created. + dataset in this callback. Experiments have shown that when token count per sample is 2048, hidden dimension + is 4096, collecting 256 distinct samples is enough. If you collect too few or too repetitive samples, the + covariance matrix may be low-ranked and unstabilize preprocessing. You can estimate sample count as + `HIDDEN_DIM / TOKEN_PER_SAMPLE * 128`. `run_model` can be `None` only if covariance file in + `lora_config.corda_config` is already created. hooked_model (`Optional[nn.Module]`): Model to hook when building covariance. If none, original model will be hooked. This is only useful when you want to hook a different model than the one you are training, typically you should leave this `None`. Upon completion, the following fields are set for each target module: - corda_method (`Literal["ipm", "kpm"]`): - CorDA method to apply. "ipm" for Instruction-Previewed Mode, "kpm" for Knowledge-Preserved Mode. - rank (`int`): - Rank of CorDA to apply. eigens.S_WC (`torch.Tensor`): Singular values of the weight matrix. 
eigens.U_WC (`torch.Tensor`): @@ -90,13 +93,12 @@ def preprocess_corda( covariance_file = lora_config.corda_config.covariance_file corda_method = lora_config.corda_config.corda_method verbose = lora_config.corda_config.verbose + prune_temporary_fields = lora_config.corda_config.prune_temporary_fields # If cache exists, skip building if cache_file is not None and os.path.exists(cache_file) and os.path.getsize(cache_file) > 0: cache = torch.load(cache_file, map_location=get_model_device(model)) for name, module in target_modules(model, lora_config): - module.corda_method = cache[f"{name}.corda_method"] - module.rank = cache[f"{name}.rank"] module.eigens = CordaEigens( S_WC=cache[f"{name}.eigens.S_WC"], U_WC=cache[f"{name}.eigens.U_WC"], @@ -123,12 +125,22 @@ def preprocess_corda( # Crop CorDA eigens so that there's less to save crop_corda_eigens(model, lora_config) + # Remove redundant fields if exist + if prune_temporary_fields: + for name, module in target_modules(model, lora_config): + if hasattr(module, "sample_count"): + del module.sample_count + if hasattr(module, "covariance_matrix"): + del module.covariance_matrix + if hasattr(module, "corda_method"): + del module.corda_method + if hasattr(module, "rank"): + del module.rank + # Save cache to disk if cache_file is not None: cache: dict[str, Any] = {} for name, module in target_modules(model, lora_config): - cache[f"{name}.corda_method"] = module.corda_method - cache[f"{name}.rank"] = module.rank cache[f"{name}.eigens.S_WC"] = module.eigens.S_WC cache[f"{name}.eigens.U_WC"] = module.eigens.U_WC cache[f"{name}.eigens.V_WC"] = module.eigens.V_WC @@ -174,15 +186,9 @@ def hook(module, input, output): "Invalid value found in covariance. Please file an issue at https://github.com/huggingface/peft/issues." 
) - # calculate mean and std - mean = input.mean(0) - std = input.std(0) - # add to module module.sample_count += 1 module.covariance_matrix += covariance - module.mean += mean - module.std += std # free memory del covariance, input @@ -191,8 +197,6 @@ def hook(module, input, output): for name, module in target_modules(hooked_model, config): module.sample_count = 0 module.covariance_matrix = 0 - module.mean = 0 - module.std = 0 handles.append(module.register_forward_hook(hook)) run_model() @@ -213,14 +217,10 @@ def hook(module, input, output): if name in targets: targets[name].sample_count = module.sample_count targets[name].covariance_matrix = module.covariance_matrix - targets[name].mean = module.mean - targets[name].std = module.std # Divide by sample count for name, module in target_modules(model, config): module.covariance_matrix /= module.sample_count - module.mean /= module.sample_count - module.std /= module.sample_count # Save covariance to disk if covariance_file is not None: diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index 20bef8ed10..557fcfd188 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -349,6 +349,9 @@ def corda_init(self, adapter_name, init_lora_weights): weight = weight.to(dtype) self.get_base_layer().weight.data = weight + # Remove redundant fields + del linear.eigens + def loftq_init(self, adapter_name): from peft.utils.loftq_utils import loftq_init @@ -1402,6 +1405,33 @@ def batch_first(self) -> bool: def head_dim(self) -> int: return self.get_base_layer().head_dim + @property + def in_proj_weight(self) -> nn.Parameter: + return self.get_base_layer().in_proj_weight + + @property + def in_proj_bias(self) -> nn.Parameter: + return self.get_base_layer().in_proj_bias + + @property + def out_proj(self) -> nn.Module: + return self.get_base_layer().out_proj.get_base_layer() + + @property + def bias_k(self) -> Optional[nn.Parameter]: + return self.get_base_layer().bias_k + + @property + def bias_v(self) -> Optional[nn.Parameter]: + return self.get_base_layer().bias_v + + def merge_masks(self, *args, **kwargs) -> tuple[Optional[torch.Tensor], Optional[int]]: + return self.get_base_layer().merge_masks(*args, **kwargs) + + @property + def add_zero_attn(self) -> bool: + return self.get_base_layer().add_zero_attn + def update_layer(self, *args, **kwargs) -> None: super().update_layer(*args, **kwargs) # Note: LoRA is applied to both in_proj and out_proj. There is currently no way to only specify one of them. diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index 32631647b2..2967b8da9c 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -444,6 +444,18 @@ def _enable_peft_forward_hooks(self, *args, **kwargs): if unexpected_adapters: raise ValueError(f"Trying to infer with non-existing adapter(s): {', '.join(sorted(unexpected_adapters))}") + # deal with beam search + num_beams = kwargs.get("num_beams", None) + uses_beam_search = isinstance(num_beams, int) and (num_beams > 1) + original_adapter_names = adapter_names[:] + if uses_beam_search: + if not isinstance(adapter_names, (list, tuple)): + raise TypeError(f"Got adapter names of type {type(adapter_names)}, expected a list of str.") + # When there is beam search, the inputs are repeated n times, thus we repeat each adapter name n times and + # then flatten the nested list. For encoder-decoder models, this extended list should not be applied to the + # encoder part. 
Further below, the original argument is thus restored for the encoder. + adapter_names = sum(([n] * kwargs["num_beams"] for n in adapter_names), []) + hook_handles = [] for module in self.modules(): if isinstance(module, LoraLayer) or isinstance(module, ModulesToSaveWrapper): @@ -451,6 +463,17 @@ def _enable_peft_forward_hooks(self, *args, **kwargs): handle = module.register_forward_pre_hook(pre_forward, with_kwargs=True) hook_handles.append(handle) + if uses_beam_search and hasattr(self.model, "get_encoder"): + # For encoder-decoder models, even when applying beam search, the encoder part of the model should not use + # the extended adapter_names. This is because the encoder still uses the original, non-extended samples. + for module in self.model.get_encoder().modules(): + if isinstance(module, LoraLayer) or isinstance(module, ModulesToSaveWrapper): + # Add another hook to overwrite the kwargs with the original adapter names -- this is easier than + # trying to exclude the encoder. + pre_forward = partial(_adapter_names_pre_forward_hook, adapter_names=original_adapter_names) + handle = module.register_forward_pre_hook(pre_forward, with_kwargs=True) + hook_handles.append(handle) + yield for handle in hook_handles: diff --git a/src/peft/utils/save_and_load.py b/src/peft/utils/save_and_load.py index 5337afeb9f..e42e4a5e8c 100644 --- a/src/peft/utils/save_and_load.py +++ b/src/peft/utils/save_and_load.py @@ -335,10 +335,11 @@ def set_peft_model_state_dict( state_dict = {} if getattr(model, "modules_to_save", None) is not None: for key, value in peft_model_state_dict.items(): - if any(module_name in key for module_name in model.modules_to_save): - for module_name in model.modules_to_save: - if module_name in key: - key = key.replace(module_name, f"{module_name}.modules_to_save.{adapter_name}") + if any(f".{module_name}." in key for module_name in model.modules_to_save): + # sort to make order deterministic, but should not affect overall logic + for module_name in sorted(model.modules_to_save): + if f".{module_name}." in key: + key = key.replace(f".{module_name}.", f".{module_name}.modules_to_save.{adapter_name}.") break state_dict[key] = value else: diff --git a/tests/test_common_gpu.py b/tests/test_common_gpu.py index 09b8b4e901..a92e2c8171 100644 --- a/tests/test_common_gpu.py +++ b/tests/test_common_gpu.py @@ -53,11 +53,12 @@ get_peft_model, prepare_model_for_kbit_training, ) -from peft.import_utils import is_bnb_4bit_available, is_bnb_available +from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_xpu_available from peft.tuners.lora.config import LoraRuntimeConfig from peft.utils import infer_device from .testing_utils import ( + device_count, require_bitsandbytes, require_multi_accelerator, require_non_cpu, @@ -99,6 +100,8 @@ def tearDown(self): gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() + elif is_xpu_available(): + torch.xpu.empty_cache() gc.collect() @require_bitsandbytes @@ -563,7 +566,7 @@ def test_ia3_bnb_4bit_quantization(self): assert isinstance(whisper_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, IA3Linear4bit) @pytest.mark.multi_gpu_tests - @require_torch_multi_gpu + @require_multi_accelerator def test_lora_causal_lm_multi_gpu_inference(self): r""" Test if LORA can be used for inference on multiple GPUs. 
@@ -580,7 +583,7 @@ def test_lora_causal_lm_multi_gpu_inference(self): model = AutoModelForCausalLM.from_pretrained(self.causal_lm_model_id, device_map="balanced") tokenizer = AutoTokenizer.from_pretrained(self.seq2seq_model_id) - assert set(model.hf_device_map.values()) == set(range(torch.cuda.device_count())) + assert set(model.hf_device_map.values()) == set(range(device_count)) model = get_peft_model(model, lora_config) assert isinstance(model, PeftModel) @@ -607,7 +610,7 @@ def test_lora_seq2seq_lm_multi_gpu_inference(self): ) tokenizer = AutoTokenizer.from_pretrained(self.seq2seq_model_id) - assert set(model.hf_device_map.values()) == set(range(torch.cuda.device_count())) + assert set(model.hf_device_map.values()) == set(range(device_count)) model = get_peft_model(model, lora_config) assert isinstance(model, PeftModel) @@ -706,7 +709,7 @@ def test_print_4bit_expected(self): assert trainable_params == EXPECTED_TRAINABLE_PARAMS assert all_params == EXPECTED_ALL_PARAMS - @require_torch_gpu + @require_non_cpu @pytest.mark.single_gpu_tests @require_bitsandbytes def test_modules_to_save_grad(self): @@ -742,7 +745,7 @@ def test_modules_to_save_grad(self): assert original_module.weight.grad is None assert modules_to_save.weight.grad is not None - @require_torch_gpu + @require_non_cpu @pytest.mark.single_gpu_tests @require_bitsandbytes def test_8bit_merge_lora(self): @@ -1408,7 +1411,7 @@ def test_apply_GS_hra_inference(self): assert not torch.allclose(logits_hra, logits_hra_GS) - @require_torch_gpu + @require_non_cpu @pytest.mark.single_gpu_tests def test_apply_GS_hra_conv2d_inference(self): # check for different result with and without apply_GS @@ -1434,7 +1437,7 @@ def test_apply_GS_hra_conv2d_inference(self): assert not torch.allclose(logits_hra, logits_hra_GS) - @require_torch_gpu + @require_non_cpu @pytest.mark.single_gpu_tests def test_r_odd_hra_inference(self): # check that an untrained HRA adapter can't be initialized as an identity tranformation @@ -1456,9 +1459,13 @@ def test_r_odd_hra_inference(self): assert not torch.allclose(logits, logits_hra) -@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires a CUDA GPU") +@pytest.mark.skipif( + not (torch.cuda.is_available() or is_xpu_available()), reason="test requires a hardware accelerator" +) @pytest.mark.single_gpu_tests class TestSameAdapterDifferentDevices: + device = infer_device() + # 1639 # The original issue comes down to the following problem: If the user has a base layer on CUDA, moves the adapter to # CPU, then adds another adapter (which will automatically be moved to CUDA), then the first adapter will also be @@ -1495,29 +1502,29 @@ def __init__(self): def test_lora_one_target_add_new_adapter_does_not_change_device(self, mlp): config = LoraConfig(target_modules=["lin0"]) model = get_peft_model(mlp, config) - model = model.cuda() + model = model.to(self.device) model.lin0.lora_A.cpu() model.lin0.lora_B.cpu() # check that the adapter is indeed on CPU and the base model on GPU assert model.lin0.lora_A.default.weight.device.type == "cpu" assert model.lin0.lora_B.default.weight.device.type == "cpu" - assert model.lin0.base_layer.weight.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device model.add_adapter("other", config) # check that after adding a new adapter, the old adapter is still on CPU assert model.lin0.lora_A.default.weight.device.type == "cpu" assert model.lin0.lora_B.default.weight.device.type == "cpu" # the rest should be on GPU - assert 
model.lin0.base_layer.weight.device.type == "cuda" - assert model.lin0.lora_A.other.weight.device.type == "cuda" - assert model.lin0.lora_B.other.weight.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.lora_A.other.weight.device.type == self.device + assert model.lin0.lora_B.other.weight.device.type == self.device def test_lora_multiple_targets_add_new_adapater_does_not_change_device(self, mlp): # same as the previous test, but targeting multiple layers config = LoraConfig(target_modules=["lin0", "lin1"]) model = get_peft_model(mlp, config) - model = model.cuda() + model = model.to(self.device) # move lin1 to CPU but leave lin0 on GPU model.lin1.lora_A.cpu() model.lin1.lora_B.cpu() @@ -1525,74 +1532,74 @@ def test_lora_multiple_targets_add_new_adapater_does_not_change_device(self, mlp # check that the adapter is indeed on CPU and the base model on GPU assert model.lin1.lora_A.default.weight.device.type == "cpu" assert model.lin1.lora_B.default.weight.device.type == "cpu" - assert model.lin1.base_layer.weight.device.type == "cuda" - assert model.lin0.lora_A.default.weight.device.type == "cuda" - assert model.lin0.lora_B.default.weight.device.type == "cuda" - assert model.lin0.base_layer.weight.device.type == "cuda" + assert model.lin1.base_layer.weight.device.type == self.device + assert model.lin0.lora_A.default.weight.device.type == self.device + assert model.lin0.lora_B.default.weight.device.type == self.device + assert model.lin0.base_layer.weight.device.type == self.device model.add_adapter("other", config) # check that after adding a new adapter, the old adapter is still on CPU assert model.lin1.lora_A.default.weight.device.type == "cpu" assert model.lin1.lora_B.default.weight.device.type == "cpu" - assert model.lin1.base_layer.weight.device.type == "cuda" + assert model.lin1.base_layer.weight.device.type == self.device # the rest should be on GPU - assert model.lin0.lora_A.default.weight.device.type == "cuda" - assert model.lin0.lora_B.default.weight.device.type == "cuda" - assert model.lin0.base_layer.weight.device.type == "cuda" - assert model.lin0.lora_A.other.weight.device.type == "cuda" - assert model.lin0.lora_B.other.weight.device.type == "cuda" - assert model.lin1.lora_A.other.weight.device.type == "cuda" - assert model.lin1.lora_B.other.weight.device.type == "cuda" + assert model.lin0.lora_A.default.weight.device.type == self.device + assert model.lin0.lora_B.default.weight.device.type == self.device + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.lora_A.other.weight.device.type == self.device + assert model.lin0.lora_B.other.weight.device.type == self.device + assert model.lin1.lora_A.other.weight.device.type == self.device + assert model.lin1.lora_B.other.weight.device.type == self.device def test_lora_embedding_target_add_new_adapter_does_not_change_device(self, emb_conv1d): # same as first test, but targeting the embedding layer config = LoraConfig(target_modules=["emb"]) model = get_peft_model(emb_conv1d, config) - model = model.cuda() + model = model.to(self.device) model.emb.lora_embedding_A.cpu() model.emb.lora_embedding_B.cpu() # check that the adapter is indeed on CPU and the base model on GPU assert model.emb.lora_embedding_A.default.device.type == "cpu" assert model.emb.lora_embedding_B.default.device.type == "cpu" - assert model.emb.weight.device.type == "cuda" + assert model.emb.weight.device.type == self.device model.add_adapter("other", config) # check that after 
adding a new adapter, the old adapter is still on CPU assert model.emb.lora_embedding_A.default.device.type == "cpu" assert model.emb.lora_embedding_B.default.device.type == "cpu" # the rest should be on GPU - assert model.emb.weight.device.type == "cuda" - assert model.emb.lora_embedding_A.other.device.type == "cuda" - assert model.emb.lora_embedding_B.other.device.type == "cuda" + assert model.emb.weight.device.type == self.device + assert model.emb.lora_embedding_A.other.device.type == self.device + assert model.emb.lora_embedding_B.other.device.type == self.device def test_lora_conv1d_target_add_new_adapter_does_not_change_device(self, emb_conv1d): # same as first test, but targeting the Conv1D layer config = LoraConfig(target_modules=["conv1d"]) model = get_peft_model(emb_conv1d, config) - model = model.cuda() + model = model.to(self.device) model.conv1d.lora_A.cpu() model.conv1d.lora_B.cpu() # check that the adapter is indeed on CPU and the base model on GPU assert model.conv1d.lora_A.default.weight.device.type == "cpu" assert model.conv1d.lora_B.default.weight.device.type == "cpu" - assert model.conv1d.weight.device.type == "cuda" + assert model.conv1d.weight.device.type == self.device model.add_adapter("other", config) # check that after adding a new adapter, the old adapter is still on CPU assert model.conv1d.lora_A.default.weight.device.type == "cpu" assert model.conv1d.lora_B.default.weight.device.type == "cpu" # the rest should be on GPU - assert model.conv1d.weight.device.type == "cuda" - assert model.conv1d.lora_A.other.weight.device.type == "cuda" - assert model.conv1d.lora_B.other.weight.device.type == "cuda" + assert model.conv1d.weight.device.type == self.device + assert model.conv1d.lora_A.other.weight.device.type == self.device + assert model.conv1d.lora_B.other.weight.device.type == self.device def test_lora_dora_add_new_adapter_does_not_change_device(self, mlp): # same as first test, but also using DoRA config = LoraConfig(target_modules=["lin0"], use_dora=True) model = get_peft_model(mlp, config) - model = model.cuda() + model = model.to(self.device) model.lin0.lora_A.cpu() model.lin0.lora_B.cpu() model.lin0.lora_magnitude_vector.cpu() @@ -1601,7 +1608,7 @@ def test_lora_dora_add_new_adapter_does_not_change_device(self, mlp): assert model.lin0.lora_A.default.weight.device.type == "cpu" assert model.lin0.lora_B.default.weight.device.type == "cpu" assert model.lin0.lora_magnitude_vector.default.weight.device.type == "cpu" - assert model.lin0.base_layer.weight.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device model.add_adapter("other", config) # check that after adding a new adapter, the old adapter is still on CPU @@ -1609,182 +1616,182 @@ def test_lora_dora_add_new_adapter_does_not_change_device(self, mlp): assert model.lin0.lora_B.default.weight.device.type == "cpu" assert model.lin0.lora_magnitude_vector.default.weight.device.type == "cpu" # the rest should be on GPU - assert model.lin0.base_layer.weight.device.type == "cuda" - assert model.lin0.lora_A.other.weight.device.type == "cuda" - assert model.lin0.lora_B.other.weight.device.type == "cuda" - assert model.lin0.lora_magnitude_vector.other.weight.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.lora_A.other.weight.device.type == self.device + assert model.lin0.lora_B.other.weight.device.type == self.device + assert model.lin0.lora_magnitude_vector.other.weight.device.type == self.device def 
test_adalora_add_new_adapter_does_not_change_device(self, mlp): # same as first test, but using AdaLORA # AdaLora does not like multiple trainable adapters, hence inference_mode=True config = AdaLoraConfig(target_modules=["lin0"], inference_mode=True) model = get_peft_model(mlp, config) - model = model.cuda() + model = model.to(self.device) model.lin0.lora_A.cpu() model.lin0.lora_E.cpu() # check that the adapter is indeed on CPU and the base model on GPU assert model.lin0.lora_A.default.device.type == "cpu" assert model.lin0.lora_E.default.device.type == "cpu" - assert model.lin0.base_layer.weight.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device model.add_adapter("other", config) # check that after adding a new adapter, the old adapter is still on CPU assert model.lin0.lora_A.default.device.type == "cpu" assert model.lin0.lora_E.default.device.type == "cpu" # the rest should be on GPU - assert model.lin0.base_layer.weight.device.type == "cuda" - assert model.lin0.lora_A.other.device.type == "cuda" - assert model.lin0.lora_E.other.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.lora_A.other.device.type == self.device + assert model.lin0.lora_E.other.device.type == self.device def test_boft_add_new_adapter_does_not_change_device(self, mlp): # same as first test, but using BoFT config = BOFTConfig(target_modules=["lin0"]) model = get_peft_model(mlp, config) - model = model.cuda() + model = model.to(self.device) model.lin0.boft_R.cpu() model.lin0.boft_s.cpu() # check that the adapter is indeed on CPU and the base model on GPU assert model.lin0.boft_R.default.device.type == "cpu" assert model.lin0.boft_s.default.device.type == "cpu" - assert model.lin0.base_layer.weight.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device model.add_adapter("other", config) # check that after adding a new adapter, the old adapter is still on CPU assert model.lin0.boft_R.default.device.type == "cpu" assert model.lin0.boft_s.default.device.type == "cpu" # the rest should be on GPU - assert model.lin0.base_layer.weight.device.type == "cuda" - assert model.lin0.boft_R.other.device.type == "cuda" - assert model.lin0.boft_s.other.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.boft_R.other.device.type == self.device + assert model.lin0.boft_s.other.device.type == self.device def test_ia3_add_new_adapter_does_not_change_device(self, mlp): # same as first test, but using IA3 config = IA3Config(target_modules=["lin0"], feedforward_modules=["lin0"]) model = get_peft_model(mlp, config) - model = model.cuda() + model = model.to(self.device) model.lin0.ia3_l.cpu() # check that the adapter is indeed on CPU and the base model on GPU assert model.lin0.ia3_l.default.device.type == "cpu" - assert model.lin0.base_layer.weight.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device model.add_adapter("other", config) # check that after adding a new adapter, the old adapter is still on CPU assert model.lin0.ia3_l.default.device.type == "cpu" # the rest should be on GPU - assert model.lin0.base_layer.weight.device.type == "cuda" - assert model.lin0.ia3_l.other.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.ia3_l.other.device.type == self.device @pytest.mark.xfail(reason="LN Tuning handling of multiple adapters may not be correct", strict=True) def 
test_ln_tuning_add_new_adapter_does_not_change_device(self, mlp): # same as first test, but using LN tuning config = LNTuningConfig(target_modules=["lin0"]) model = get_peft_model(mlp, config) - model = model.cuda() + model = model.to(self.device) model.lin0.ln_tuning_layers.cpu() # check that the adapter is indeed on CPU and the base model on GPU assert model.lin0.ln_tuning_layers.default.weight.device.type == "cpu" - assert model.lin0.base_layer.weight.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device model.add_adapter("other", config) # check that after adding a new adapter, the old adapter is still on CPU assert model.lin0.ln_tuning_layers.default.weight.device.type == "cpu" # the rest should be on GPU - assert model.lin0.base_layer.weight.device.type == "cuda" - assert model.lin0.ln_tuning_layers.other.weight.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.ln_tuning_layers.other.weight.device.type == self.device def test_loha_add_new_adapter_does_not_change_device(self, mlp): # same as first test, but using LoHa config = LoHaConfig(target_modules=["lin0"]) model = get_peft_model(mlp, config) - model = model.cuda() + model = model.to(self.device) model.lin0.hada_w1_a.cpu() model.lin0.hada_w2_b.cpu() # check that the adapter is indeed on CPU and the base model on GPU assert model.lin0.hada_w1_a.default.device.type == "cpu" assert model.lin0.hada_w2_b.default.device.type == "cpu" - assert model.lin0.base_layer.weight.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device model.add_adapter("other", config) # check that after adding a new adapter, the old adapter is still on CPU assert model.lin0.hada_w1_a.default.device.type == "cpu" assert model.lin0.hada_w2_b.default.device.type == "cpu" # the rest should be on GPU - assert model.lin0.base_layer.weight.device.type == "cuda" - assert model.lin0.hada_w1_a.other.device.type == "cuda" - assert model.lin0.hada_w2_b.other.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.hada_w1_a.other.device.type == self.device + assert model.lin0.hada_w2_b.other.device.type == self.device def test_lokr_add_new_adapter_does_not_change_device(self, mlp): # same as first test, but using LoKr config = LoKrConfig(target_modules=["lin0"]) model = get_peft_model(mlp, config) - model = model.cuda() + model = model.to(self.device) model.lin0.lokr_w1.cpu() model.lin0.lokr_w2.cpu() # check that the adapter is indeed on CPU and the base model on GPU assert model.lin0.lokr_w1.default.device.type == "cpu" assert model.lin0.lokr_w2.default.device.type == "cpu" - assert model.lin0.base_layer.weight.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device model.add_adapter("other", config) # check that after adding a new adapter, the old adapter is still on CPU assert model.lin0.lokr_w1.default.device.type == "cpu" assert model.lin0.lokr_w2.default.device.type == "cpu" # the rest should be on GPU - assert model.lin0.base_layer.weight.device.type == "cuda" - assert model.lin0.lokr_w1.other.device.type == "cuda" - assert model.lin0.lokr_w2.other.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.lokr_w1.other.device.type == self.device + assert model.lin0.lokr_w2.other.device.type == self.device def test_oft_add_new_adapter_does_not_change_device(self, mlp): # same as first test, but using OFT config = 
OFTConfig(target_modules=["lin0"]) model = get_peft_model(mlp, config) - model = model.cuda() + model = model.to(self.device) model.lin0.oft_r.cpu() # check that the adapter is indeed on CPU and the base model on GPU assert model.lin0.oft_r.default.device.type == "cpu" - assert model.lin0.base_layer.weight.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device model.add_adapter("other", config) # check that after adding a new adapter, the old adapter is still on CPU assert model.lin0.oft_r.default.device.type == "cpu" # the rest should be on GPU - assert model.lin0.base_layer.weight.device.type == "cuda" - assert model.lin0.oft_r.other.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.oft_r.other.device.type == self.device def test_vera_add_new_adapter_does_not_change_device(self, mlp): # same as first test, but using VERA config = VeraConfig(target_modules=["lin0"]) model = get_peft_model(mlp, config) - model = model.cuda() + model = model.to(self.device) model.lin0.vera_A.cpu() model.lin0.vera_lambda_d.cpu() # check that the adapter is indeed on CPU and the base model on GPU assert model.lin0.vera_A.default.device.type == "cpu" assert model.lin0.vera_lambda_d.default.device.type == "cpu" - assert model.lin0.base_layer.weight.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device model.add_adapter("other", config) # check that after adding a new adapter, the old adapter is still on CPU assert model.lin0.vera_A.default.device.type == "cpu" assert model.lin0.vera_lambda_d.default.device.type == "cpu" # the rest should be on GPU - assert model.lin0.base_layer.weight.device.type == "cuda" - assert model.lin0.vera_A.other.device.type == "cuda" - assert model.lin0.vera_lambda_d.other.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.vera_A.other.device.type == self.device + assert model.lin0.vera_lambda_d.other.device.type == self.device def test_vblora_add_new_adapter_does_not_change_device(self, mlp): # same as first test, but using VBLoRA config = VBLoRAConfig(target_modules=["lin0"], vector_length=2) model = get_peft_model(mlp, config) - model = model.cuda() + model = model.to(self.device) model.lin0.vblora_logits_A.cpu() model.lin0.vblora_logits_B.cpu() model.lin0.vblora_vector_bank.cpu() @@ -1793,7 +1800,7 @@ def test_vblora_add_new_adapter_does_not_change_device(self, mlp): assert model.lin0.vblora_logits_A.default.device.type == "cpu" assert model.lin0.vblora_logits_B.default.device.type == "cpu" assert model.lin0.vblora_vector_bank.default.device.type == "cpu" - assert model.lin0.base_layer.weight.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device model.add_adapter("other", config) # check that after adding a new adapter, the old adapter is still on CPU @@ -1801,25 +1808,25 @@ def test_vblora_add_new_adapter_does_not_change_device(self, mlp): assert model.lin0.vblora_logits_B.default.device.type == "cpu" assert model.lin0.vblora_vector_bank.default.device.type == "cpu" # the rest should be on GPU - assert model.lin0.base_layer.weight.device.type == "cuda" - assert model.lin0.vblora_logits_A.other.device.type == "cuda" - assert model.lin0.vblora_logits_B.other.device.type == "cuda" - assert model.lin0.vblora_vector_bank.other.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.vblora_logits_A.other.device.type == 
self.device + assert model.lin0.vblora_logits_B.other.device.type == self.device + assert model.lin0.vblora_vector_bank.other.device.type == self.device def test_hra_add_new_adapter_does_not_change_device(self, mlp): # same as first test, but using HRA config = HRAConfig(target_modules=["lin0"]) model = get_peft_model(mlp, config) - model = model.cuda() + model = model.to(self.device) model.lin0.hra_u.cpu() # check that the adapter is indeed on CPU and the base model on GPU assert model.lin0.hra_u.default.device.type == "cpu" - assert model.lin0.base_layer.weight.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device model.add_adapter("other", config) # check that after adding a new adapter, the old adapter is still on CPU assert model.lin0.hra_u.default.device.type == "cpu" # the rest should be on GPU - assert model.lin0.base_layer.weight.device.type == "cuda" - assert model.lin0.hra_u.other.device.type == "cuda" + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.hra_u.other.device.type == self.device diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py index a1aca79b12..78b4947db6 100644 --- a/tests/test_decoder_models.py +++ b/tests/test_decoder_models.py @@ -303,6 +303,18 @@ def test_merge_layers_nan(self, test_name, model_id, config_cls, config_kwargs): def test_mixed_adapter_batches(self, test_name, model_id, config_cls, config_kwargs): self._test_mixed_adapter_batches(model_id, config_cls, config_kwargs) + @parameterized.expand( + PeftTestConfigManager.get_grid_parameters( + { + "model_ids": PEFT_DECODER_MODELS_TO_TEST, + "lora_kwargs": {"init_lora_weights": [False]}, + "task_type": "CAUSAL_LM", + }, + ) + ) + def test_generate_with_mixed_adapter_batches(self, test_name, model_id, config_cls, config_kwargs): + self._test_generate_with_mixed_adapter_batches_and_beam_search(model_id, config_cls, config_kwargs) + @parameterized.expand( PeftTestConfigManager.get_grid_parameters(FULL_GRID, filter_params_func=skip_oft_or_hra_and_gpt2) ) diff --git a/tests/test_encoder_decoder_models.py b/tests/test_encoder_decoder_models.py index e22f010089..8f8eb9c0dd 100644 --- a/tests/test_encoder_decoder_models.py +++ b/tests/test_encoder_decoder_models.py @@ -118,6 +118,18 @@ def test_merge_layers(self, test_name, model_id, config_cls, config_kwargs): def test_mixed_adapter_batches(self, test_name, model_id, config_cls, config_kwargs): self._test_mixed_adapter_batches(model_id, config_cls, config_kwargs) + @parameterized.expand( + PeftTestConfigManager.get_grid_parameters( + { + "model_ids": PEFT_ENCODER_DECODER_MODELS_TO_TEST, + "lora_kwargs": {"init_lora_weights": [False]}, + "task_type": "SEQ_2_SEQ_LM", + }, + ) + ) + def test_generate_with_mixed_adapter_batches(self, test_name, model_id, config_cls, config_kwargs): + self._test_generate_with_mixed_adapter_batches_and_beam_search(model_id, config_cls, config_kwargs) + # skip non lora models - generate does not work for prefix tuning, prompt tuning @parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID)) def test_generate(self, test_name, model_id, config_cls, config_kwargs): diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py index 1d33631a05..3b0cde1444 100644 --- a/tests/test_gpu_examples.py +++ b/tests/test_gpu_examples.py @@ -70,6 +70,7 @@ replace_lora_weights_loftq, set_peft_model_state_dict, ) +from peft.import_utils import is_xpu_available from peft.tuners import boft from peft.utils import SAFETENSORS_WEIGHTS_NAME, 
infer_device from peft.utils.loftq_utils import NFQuantizer @@ -82,6 +83,7 @@ require_bitsandbytes, require_eetq, require_hqq, + require_multi_accelerator, require_non_cpu, require_non_xpu, require_optimum, @@ -2073,7 +2075,10 @@ def test_olora_with_quantized_model(self, bits): assert torch.isfinite(logits).all() -@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires a GPU") +@pytest.mark.skipif( + not (torch.cuda.is_available() or is_xpu_available()), reason="test requires a hardware accelerator" +) +@require_bitsandbytes class TestLoftQ: r""" Tests for LoftQ to ensure that it reduces the quantization error compared to normal LoRA quantization. @@ -2083,19 +2088,18 @@ class TestLoftQ: # quantization without LoftQ. Thus 1.03 means that the error should be decreased by 3% at least. This is a very # conservative value to prevent flakiness, in practice most gains are > 1.5 error_factor = 1.03 + device = infer_device() def get_input(self, model_id, device): tokenizer = AutoTokenizer.from_pretrained(model_id) inputs = tokenizer("All I want is", padding=True, return_tensors="pt") - if device == "cuda": - inputs = inputs.to("cuda") + inputs = inputs.to(self.device) return inputs def get_base_model(self, model_id, device, **kwargs): cls = AutoModelForSeq2SeqLM if "t5" in str(model_id) else AutoModelForCausalLM model = cls.from_pretrained(model_id, **kwargs).eval() - if device == "cuda": - model = model.to("cuda") + model = model.to(self.device) return model def get_logits(self, model, inputs): @@ -3808,28 +3812,30 @@ class TestBOFT: Test that we can correctly use half-precision models with BOFT. """ - @require_torch_gpu + device = infer_device() + + @require_non_cpu @pytest.mark.single_gpu_tests def test_boft_half_linear(self): # Check that we can use BoFT with model loaded in half precision - layer = torch.nn.Linear(160, 160).cuda() + layer = torch.nn.Linear(160, 160).to(self.device) layer = boft.layer.Linear(layer, "layer", boft_n_butterfly_factor=2).to(dtype=torch.bfloat16) - x = torch.randn(160, 160, device="cuda", dtype=torch.bfloat16) + x = torch.randn(160, 160, device=self.device, dtype=torch.bfloat16) layer(x) # does not raise - @require_torch_gpu + @require_non_cpu @pytest.mark.single_gpu_tests def test_boft_half_conv(self): - conv = torch.nn.Conv2d(1, 1, 4).cuda() + conv = torch.nn.Conv2d(1, 1, 4).to(self.device) conv = boft.layer.Conv2d(conv, "conv", boft_n_butterfly_factor=2).to(dtype=torch.bfloat16) - x = torch.randn(1, 160, 160, device="cuda", dtype=torch.bfloat16) + x = torch.randn(1, 160, 160, device=self.device, dtype=torch.bfloat16) conv(x) # does not raise -@require_torch_gpu class TestPTuningReproducibility: device = infer_device() + @require_non_cpu def test_p_tuning_exactly_reproducible_after_loading(self, tmp_path): # See: https://github.com/huggingface/peft/issues/2043#issuecomment-2321522577 # Ensure that after loading a p-tuning checkpoint, results are exactly reproducible (before the patch, they were @@ -3865,7 +3871,6 @@ def test_p_tuning_exactly_reproducible_after_loading(self, tmp_path): torch.testing.assert_close(gen_loaded, gen_peft) -@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires a GPU") @pytest.mark.single_gpu_tests class TestLowCpuMemUsageDifferentDevices: """Test for the low CPU memory usage option for loading PEFT models. 
@@ -3878,7 +3883,8 @@ class TestLowCpuMemUsageDifferentDevices: model_id = "hf-internal-testing/tiny-random-OPTForCausalLM" device = infer_device() - @pytest.mark.parametrize("device_model, device_sd", [("cpu", "cuda"), ("cuda", "cpu")]) + @require_non_cpu + @pytest.mark.parametrize("device_model, device_sd", [("cpu", infer_device()), (infer_device(), "cpu")]) def test_low_cpu_mem_usage_model_model_on_gpu_state_dict_on_cpu_works(self, device_model, device_sd): # specifically test diverging devices for the model and state_dict inputs = {"input_ids": torch.randint(0, 100, (1, 10)), "attention_mask": torch.ones(1, 10)} @@ -3914,6 +3920,7 @@ def test_low_cpu_mem_usage_model_model_on_gpu_state_dict_on_cpu_works(self, devi assert torch.allclose(logits_low_cpu_mem, logits_not_low_cpu_mem) assert {p.device.type for p in model.parameters()} == {device_model} + @require_bitsandbytes @pytest.mark.parametrize("quantization_method", ["bnb-4bit", "bnb-8bit"]) def test_low_cpu_mem_usage_with_quantization(self, quantization_method): # Ensure that low_cpu_mem_usage works with quantization @@ -3953,7 +3960,7 @@ class TestEvaInitializationGPU: MAX_LENGTH = 256 LORA_DIM = 8 LORA_ALPHA = 1 - DEVICE = "cuda" + DEVICE = infer_device() @pytest.fixture def tokenizer(self): @@ -4021,7 +4028,8 @@ def is_bnb_model(self, model): def collate_fn(examples): return {k: torch.stack([v[k] for v in examples], dim=0) for k in examples[0].keys()} - @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires a GPU") + @require_non_cpu + @require_bitsandbytes @pytest.mark.single_gpu_tests @pytest.mark.parametrize("model_fixture", ["model", "model_bnb"], indirect=True) def test_eva_initialization_consistency(self, model_fixture, dataset, peft_config): @@ -4059,14 +4067,16 @@ def test_eva_initialization_consistency(self, model_fixture, dataset, peft_confi ) -@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires a GPU") @pytest.mark.multi_gpu_tests class TestPrefixTuning: + device = infer_device() + + @require_multi_accelerator def test_prefix_tuning_multiple_devices_decoder_model(self): # See issue 2134 model_id = "hf-internal-testing/tiny-random-MistralForCausalLM" tokenizer = AutoTokenizer.from_pretrained(model_id, padding="left") - inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda") + inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to(self.device) device_map = { "model.embed_tokens": 0, @@ -4086,11 +4096,12 @@ def test_prefix_tuning_multiple_devices_decoder_model(self): model = get_peft_model(model, peft_config) model.generate(**inputs) # does not raise + @require_multi_accelerator def test_prefix_tuning_multiple_devices_encoder_decoder_model(self): # See issue 2134 model_id = "hf-internal-testing/tiny-random-T5Model" tokenizer = AutoTokenizer.from_pretrained(model_id, padding="left") - inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda") + inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to(self.device) device_map = { "shared": 0, "encoder.embed_tokens": 0, @@ -4120,7 +4131,9 @@ def test_prefix_tuning_multiple_devices_encoder_decoder_model(self): model.generate(**inputs) # does not raise -@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires a GPU") +@pytest.mark.skipif( + not (torch.cuda.is_available() or is_xpu_available()), reason="test requires a hardware accelerator" +) @pytest.mark.single_gpu_tests class TestHotSwapping: def 
test_hotswapping_compiled_model_does_not_trigger_recompilation(self): diff --git a/tests/test_initialization.py b/tests/test_initialization.py index 103fa6696d..510f12892e 100644 --- a/tests/test_initialization.py +++ b/tests/test_initialization.py @@ -1145,6 +1145,7 @@ def test_mha_with_dora_raises(self, mha_cls): get_peft_model(model, config) def test_mha_exposes_attributes(self, mha_cls): + # MHA requires a bunch of attributes to be exposed, try to check them exhaustively here model = mha_cls() embed_dim = model.mha.embed_dim kdim = model.mha.kdim @@ -1154,6 +1155,12 @@ def test_mha_exposes_attributes(self, mha_cls): dropout = model.mha.dropout batch_first = model.mha.batch_first head_dim = model.mha.head_dim + in_proj_weight = model.mha.in_proj_weight + in_proj_bias = model.mha.in_proj_bias + out_proj = model.mha.out_proj + bias_k = model.mha.bias_k + bias_v = model.mha.bias_v + add_zero_attn = model.mha.add_zero_attn config = LoraConfig(target_modules=["mha"]) peft_model = get_peft_model(model, config) @@ -1165,6 +1172,39 @@ def test_mha_exposes_attributes(self, mha_cls): assert peft_model.base_model.mha.dropout == dropout assert peft_model.base_model.mha.batch_first == batch_first assert peft_model.base_model.mha.head_dim == head_dim + if in_proj_weight is not None: + assert torch.allclose(peft_model.base_model.mha.in_proj_weight, in_proj_weight) + else: + assert peft_model.base_model.mha.in_proj_weight is None + if in_proj_bias is not None: + assert torch.allclose(peft_model.base_model.mha.in_proj_bias, in_proj_bias) + else: + assert peft_model.base_model.mha.in_proj_bias is None + assert peft_model.base_model.mha.out_proj is out_proj + if bias_k is not None: + assert torch.allclose(peft_model.base_model.mha.bias_k, bias_k) + else: + assert peft_model.base_model.mha.bias_k is None + if bias_v is not None: + assert torch.allclose(peft_model.base_model.mha.bias_v, bias_v) + else: + assert peft_model.base_model.mha.bias_v is None + assert peft_model.base_model.mha.add_zero_attn == add_zero_attn + + def test_mha_merge_masks_method(self, mha_cls): + # MHA requires a merge_masks method to be exposed, check that it works + model = mha_cls() + config = LoraConfig(target_modules=["mha"]) + peft_model = get_peft_model(model, config) + + attn_mask = torch.randint(0, 2, (10, 10)) + key_padding_mask = torch.randint(0, 2, (10, 10)) + query = torch.rand(10, 10, 10) + merged_mask0, mask_type0 = model.mha.merge_masks(attn_mask, key_padding_mask, query) + merged_mask1, mask_type1 = peft_model.base_model.mha.merge_masks(attn_mask, key_padding_mask, query) + + assert torch.allclose(merged_mask0, merged_mask1) + assert mask_type0 == mask_type1 def test_lora_with_bias_extra_params(self): # lora with lora_bias=True @@ -1916,6 +1956,38 @@ def data(self): torch.manual_seed(233) return torch.rand(1000, 1000).to(self.torch_device) + @pytest.mark.parametrize("corda_method", ("ipm", "kpm")) + def test_lora_corda_no_redundant_fields(self, data, corda_method): + original_model = self.get_model() + model = deepcopy(original_model) + + corda_config = CordaConfig( + corda_method=corda_method, + ) + config = LoraConfig( + init_lora_weights="corda", + target_modules=["linear"], + corda_config=corda_config, + ) + preprocess_corda( + model, + config, + run_model=lambda: model(data), + hooked_model=model, + ) + peft_model = get_peft_model(model, config) + + # check if the redundant fields are removed + assert not hasattr(peft_model.base_model.linear, "sample_count") + assert not hasattr(peft_model.base_model.linear, 
"covariance_matrix") + assert not hasattr(peft_model.base_model.linear, "corda_method") + assert not hasattr(peft_model.base_model.linear, "rank") + assert not hasattr(peft_model.base_model.linear, "eigens") + + # legacy debug fields + assert not hasattr(peft_model.base_model.linear, "mean") + assert not hasattr(peft_model.base_model.linear, "std") + @pytest.mark.parametrize("corda_method", ("ipm", "kpm")) def test_lora_corda_sample_count(self, data, corda_method): original_model = self.get_model() @@ -1923,6 +1995,7 @@ def test_lora_corda_sample_count(self, data, corda_method): corda_config = CordaConfig( corda_method=corda_method, + prune_temporary_fields=False, ) config = LoraConfig( init_lora_weights="corda", @@ -1960,6 +2033,7 @@ def hook(*args): corda_config = CordaConfig( corda_method=corda_method, + prune_temporary_fields=False, ) config = LoraConfig( init_lora_weights="corda", @@ -2961,3 +3035,25 @@ def test_hotswap_extra_key_raises(self, tmp_path): msg = f"Hot swapping the adapter did not succeed. Unexpected keys: {new_key}" with pytest.raises(RuntimeError, match=msg): hotswap_adapter(model, tmp_path / "adapter1", adapter_name="default") + + +def test_import_peft_type_to_model_mapping_deprecation_warning(recwarn): + # This is for backwards compatibility: In #2282, PEFT_TYPE_TO_MODEL_MAPPING was removed as it was redundant with + # PEFT_TYPE_TO_TUNER_MAPPING. However, third party code could still use this mapping, e.g.: + # https://github.com/AutoGPTQ/AutoGPTQ/blob/6689349625de973b9ee3016c28c11f32acf7f02c/auto_gptq/utils/peft_utils.py#L8 + # TODO: Remove after 2026-01 + + # first check that there is no warning under normal circumstances + from peft.peft_model import PeftModel # noqa + + expected = ( + "PEFT_TYPE_TO_MODEL_MAPPING is deprecated, please use `from peft import PEFT_TYPE_TO_TUNER_MAPPING` instead" + ) + warnings = (w.message.args[0] for w in recwarn.list) + assert not any(w.startswith(expected) for w in warnings) + + from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING # noqa + + # check that there is a warning with this message after importing the variable + warnings = (w.message.args[0] for w in recwarn.list) + assert any(w.startswith(expected) for w in warnings) diff --git a/tests/test_other.py b/tests/test_other.py index 75d8a7565c..7ee521f1c3 100644 --- a/tests/test_other.py +++ b/tests/test_other.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. + import pytest import torch from torch import nn from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification -from peft import LoraConfig, get_peft_model +from peft import LoraConfig, PeftModel, get_peft_model from peft.utils.other import ModulesToSaveWrapper @@ -199,3 +200,100 @@ def test_transient_attribute_access_non_existing_adapter(self, mlp): model.base_model.model.lin1._active_adapter = "does-not-exist" with pytest.raises(AttributeError, match="has no attribute 'weight'"): model.lin1.weight + + +class TestModulesToSaveNameSubstringBug: + """Test a bug that could occur with multiple modules to save where one adapter's name is a substring of another + adapter's name. + + This bug was the result of an error in the logic of modifying the state_dict for modules_to_save in + set_peft_model_state_dict. The error in the logic was that it was checked if an entry from modules_to_save (a set + of strings) is a substring of a key of the state_dict. 
If it was, a new name was assigned to that key in the
+    state_dict, which would allow the weight to be loaded later.
+
+    The issue that stems from the substring check occurs if there are multiple modules_to_save, and one of them has a
+    name that is a substring of another. So, e.g., if one is named "classifier" and the other is named "classifier2",
+    there could be a false match.
+
+    This bug was reported in #2289.
+
+    """
+
+    def get_model(self):
+        class MyModule(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.lin = nn.Linear(5, 4)
+                # important: "classifier" is a substring of "classifier2", "classifier3", "classifier4"
+                self.classifier = nn.Linear(4, 2)
+                self.classifier2 = nn.Linear(4, 2)
+                self.classifier3 = nn.Linear(4, 2)
+                self.classifier4 = nn.Linear(4, 2)
+
+            def forward(self, x):
+                x = self.lin(x)
+                return self.classifier(x) + self.classifier2(x) + self.classifier3(x) + self.classifier4(x)
+
+        torch.manual_seed(0)
+        return MyModule()
+
+    @pytest.fixture
+    def path_merged_and_unmerged(self, tmp_path):
+        # Create 2 checkpoints:
+        # 1. merged: the model after calling merge_and_unload
+        # 2. unmerged: the PEFT model saved without calling merge_and_unload
+        lora_config = LoraConfig(
+            target_modules=["lin"],
+            # important: "classifier" is a substring of "classifier2", "classifier3", "classifier4"
+            modules_to_save=["classifier", "classifier2", "classifier3", "classifier4"],
+        )
+        model = get_peft_model(self.get_model(), lora_config)
+        # mock training
+        for _ in range(5):
+            optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
+            output = model(torch.randn(10, 5))
+            loss = output.sum()
+            loss.backward()
+            optimizer.step()
+
+        # save the peft model without merging
+        path_unmerged = tmp_path / "unmerged"
+        model.save_pretrained(path_unmerged)
+
+        # merge the model and save the state_dict
+        path_merged = tmp_path / "merged"
+        merged = model.merge_and_unload()
+        state_dict = merged.state_dict()
+        torch.save(state_dict, path_merged)
+
+        return path_merged, path_unmerged
+
+    def test_load_merged_and_unmerged_same_weights(self, path_merged_and_unmerged):
+        # Note that this test is quasi-flaky: it has a 1 in 4 chance of passing even without the bugfix. It passes
+        # when "classifier" happens to be the last element of the set model.modules_to_save, and the order of that
+        # set is random. It is not possible to just run this test multiple times to minimize the probability of this
+        # happening, because within the same process, the hash order is consistent. With the bug fix, this doesn't
+        # matter, as the test will always pass, but if there is a regression, there is a 1 in 4 chance of not
+        # catching it. Since the CI runs many tests, it is overall very unlikely that none of them will catch it.
+        # Thus, if you see this test failing in CI, be aware that some of the passing tests may just pass owing to
+        # randomness.
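+        # As an illustration of the underlying bug (a simplified sketch, not the actual PEFT code): a substring check
+        # like `any(name in key for name in modules_to_save)` matches "classifier" against a key such as
+        # "classifier2.weight", so the wrong module could be targeted. Matching on module boundaries instead, e.g.
+        # `key.startswith(name + ".")`, avoids this false positive.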
+        path_merged, path_unmerged = path_merged_and_unmerged
+
+        # load the merged model directly
+        state_dict = torch.load(path_merged, weights_only=True)
+        model = self.get_model()
+        model.load_state_dict(state_dict)
+        sd_merged = model.state_dict()
+        del model
+
+        # load the unmerged model and merge it
+        unmerged = PeftModel.from_pretrained(self.get_model(), path_unmerged)
+        sd_unmerged = unmerged.merge_and_unload().state_dict()
+
+        assert sd_merged.keys() == sd_unmerged.keys()
+        for key in sd_merged.keys():
+            param_merged = sd_merged[key]
+            param_unmerged = sd_unmerged[key]
+            assert torch.allclose(param_merged, param_unmerged)
diff --git a/tests/testing_common.py b/tests/testing_common.py
index fec265812b..a553b24747 100644
--- a/tests/testing_common.py
+++ b/tests/testing_common.py
@@ -941,6 +941,87 @@ def _test_mixed_adapter_batches(self, model_id, config_cls, config_kwargs):
         assert torch.allclose(logits_adapter0[1::3], logits_mixed[1::3], atol=atol, rtol=rtol)
         assert torch.allclose(logits_adapter1[2::3], logits_mixed[2::3], atol=atol, rtol=rtol)
 
+    def _test_generate_with_mixed_adapter_batches_and_beam_search(self, model_id, config_cls, config_kwargs):
+        # Test generating with beam search and mixing different adapters in a single batch by passing the
+        # adapter_names argument. See #2283.
+        if config_cls not in (LoraConfig,):
+            return pytest.skip(f"Mixed adapter batches not supported for {config_cls}")
+
+        config = config_cls(
+            base_model_name_or_path=model_id,
+            **config_kwargs,
+        )
+
+        torch.manual_seed(0)
+        model = self.transformers_class.from_pretrained(model_id)
+        model = get_peft_model(model, config, adapter_name="adapter0").eval()
+        model.add_adapter("adapter1", config)
+
+        # In contrast to forward, for generate, it can sometimes happen that we get the same results as the base model
+        # even with LoRA applied because the impact of LoRA is not big enough. Therefore, use this "trick" to make LoRA
+        # stronger.
+        for name, param in model.named_parameters():
+            if model.base_model.prefix in name:
+                param.data.mul_(10.0)
+
+        model = model.to(self.torch_device).eval()
+
+        dummy_input = self.prepare_inputs_for_testing()
+        # ensure that we have at least 3 samples for this test
+        dummy_input = {k: torch.cat([v for _ in range(3)]) for k, v in dummy_input.items()}
+
+        gen_kwargs = {**dummy_input, "max_length": 20, "num_beams": 10, "early_stopping": True}
+        with torch.inference_mode():
+            with model.disable_adapter():
+                gen_base = model.generate(**gen_kwargs)
+
+        model.set_adapter("adapter0")
+        with torch.inference_mode():
+            gen_adapter0 = model.generate(**gen_kwargs)
+
+        model.set_adapter("adapter1")
+        with torch.inference_mode():
+            gen_adapter1 = model.generate(**gen_kwargs)
+
+        def remove_padding(seq, pad_value):
+            lst = list(seq)
+            while lst and (lst[-1] == pad_value):
+                lst.pop()
+            return lst
+
+        def gens_are_same(gen0, gen1):
+            # Special function to compare generations. We cannot use torch.allclose because it would raise an error
+            # when the sequence lengths differ. Moreover, we need to remove the padding from the sequences. This is
+            # because, even though identical sequences should normally have the same length, when we do mixed adapter
+            # batches, each sample will be padded to the longest sequence in that mixed batch, which can be different
+            # from the longest sequence without mixed adapter batches.
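+            # For example (illustrative token ids, assuming 0 is the eos/pad id): [5, 7, 2, 0, 0] and [5, 7, 2, 0]
+            # count as the same generation once the trailing padding is stripped.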
+ pad_value = model.config.eos_token_id + for sample0, sample1 in zip(gen0, gen1): + sample0 = remove_padding(sample0, pad_value) + sample1 = remove_padding(sample1, pad_value) + if (len(sample0) != len(sample1)) or (sample0 != sample1): + # at least one sample differs, the generations are not identical + return False + return True + + # sanity check that there are enough outputs and that they are different + assert len(gen_base) == len(gen_adapter0) == len(gen_adapter1) + assert len(gen_adapter1) >= 3 + assert not gens_are_same(gen_base, gen_adapter0) + assert not gens_are_same(gen_base, gen_adapter1) + assert not gens_are_same(gen_adapter0, gen_adapter1) + + # alternate between base model, adapter0, and adapter1 + adapters = ["__base__", "adapter0", "adapter1"] + gen_kwargs["adapter_names"] = [adapters[i % 3] for i in (range(len(dummy_input["input_ids"])))] + + with torch.inference_mode(): + gen_mixed = model.generate(**gen_kwargs) + + assert gens_are_same(gen_base[::3], gen_mixed[::3]) + assert gens_are_same(gen_adapter0[1::3], gen_mixed[1::3]) + assert gens_are_same(gen_adapter1[2::3], gen_mixed[2::3]) + def _test_generate(self, model_id, config_cls, config_kwargs): model = self.transformers_class.from_pretrained(model_id) config = config_cls(