diff --git a/.github/workflows/test-coverage-coveralls.yml b/.github/workflows/test-coverage-coveralls.yml index 76cda636b..5de79dbc7 100644 --- a/.github/workflows/test-coverage-coveralls.yml +++ b/.github/workflows/test-coverage-coveralls.yml @@ -46,14 +46,32 @@ jobs: - name: install coverage.py and coverralls run: | pip install coverage coveralls + - name: install pytest-xdist and pytest-cov + run: | + pip install pytest-xdist pytest-cov + - name: download datasets + env: + FAST_TEST: "True" + USE_GPU: "False" + PYTHONPATH: ${{ github.workspace }} + run: | + pytest ./tests/benchmarks/scenarios/deprecated/test_high_level_generators.py::HighLevelGeneratorTests::test_filelist_benchmark ./tests/benchmarks/scenarios/deprecated/test_nc_mt_scenario.py::MultiTaskTests::test_mt_single_dataset - name: python unit test id: unittest env: FAST_TEST: "True" USE_GPU: "False" + PYTHONPATH: ${{ github.workspace }} run: | mkdir coverage && - coverage run -m unittest discover tests && + pytest --cov=. -n 4 tests && + mv .coverage coverage_unit_tests && + coverage run examples/eval_plugin.py && + mv .coverage coverage_eval_plugin && + RUN_COVERAGE=True bash ./tests/checkpointing/test_checkpointing.sh && + coverage run tests/run_dist_tests.py && + mv .coverage coverage_dist_tests && + coverage combine coverage_* && coverage lcov -o coverage/lcov.info --omit '**/config-3.py,**/config.py' - name: Upload coverage data to coveralls.io uses: coverallsapp/github-action@master diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 769968d09..f831a3f18 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -51,16 +51,15 @@ jobs: env: FAST_TEST: "True" USE_GPU: "False" + PYTHONPATH: ${{ github.workspace }} run: | python -m unittest discover tests && echo "Checking that optional dependencies are not needed" && pip uninstall -y higher ctrl-benchmark torchaudio gym pycocotools lvis && - PYTHONPATH=. python examples/eval_plugin.py && + python examples/eval_plugin.py && echo "Running checkpointing tests..." && bash ./tests/checkpointing/test_checkpointing.sh && echo "Running distributed training tests..." && - cd tests && - PYTHONPATH=.. python run_dist_tests.py && - cd .. && + python tests/run_dist_tests.py && echo "While running unit tests, the following datasets were downloaded:" && ls ~/.avalanche/data diff --git a/avalanche/_annotations.py b/avalanche/_annotations.py index 7a6b12df6..1820ae811 100644 --- a/avalanche/_annotations.py +++ b/avalanche/_annotations.py @@ -4,6 +4,7 @@ only for internal use in the library. """ + import inspect import warnings import functools diff --git a/avalanche/benchmarks/scenarios/deprecated/classification_scenario.py b/avalanche/benchmarks/scenarios/deprecated/classification_scenario.py index ecdabacaa..b195a27f6 100644 --- a/avalanche/benchmarks/scenarios/deprecated/classification_scenario.py +++ b/avalanche/benchmarks/scenarios/deprecated/classification_scenario.py @@ -257,12 +257,10 @@ def __len__(self) -> int: return len(self._benchmark.streams[self._stream]) @overload - def __getitem__(self, exp_id: int) -> Optional[Set[int]]: - ... + def __getitem__(self, exp_id: int) -> Optional[Set[int]]: ... @overload - def __getitem__(self, exp_id: slice) -> Tuple[Optional[Set[int]], ...]: - ... + def __getitem__(self, exp_id: slice) -> Tuple[Optional[Set[int]], ...]: ... 
def __getitem__(self, exp_id: Union[int, slice]) -> LazyClassesInExpsRet: indexing_collate = _LazyClassesInClassificationExps._slice_collate diff --git a/avalanche/benchmarks/scenarios/deprecated/dataset_scenario.py b/avalanche/benchmarks/scenarios/deprecated/dataset_scenario.py index 5547a23b5..5af73eb9b 100644 --- a/avalanche/benchmarks/scenarios/deprecated/dataset_scenario.py +++ b/avalanche/benchmarks/scenarios/deprecated/dataset_scenario.py @@ -184,17 +184,17 @@ def __init__( invoking the super constructor) to specialize the experience class. """ - self.experience_factory: Callable[ - [TCLStream, int], TDatasetExperience - ] = experience_factory + self.experience_factory: Callable[[TCLStream, int], TDatasetExperience] = ( + experience_factory + ) - self.stream_factory: Callable[ - [str, TDatasetScenario], TCLStream - ] = stream_factory + self.stream_factory: Callable[[str, TDatasetScenario], TCLStream] = ( + stream_factory + ) - self.stream_definitions: Dict[ - str, StreamDef[TCLDataset] - ] = DatasetScenario._check_stream_definitions(stream_definitions) + self.stream_definitions: Dict[str, StreamDef[TCLDataset]] = ( + DatasetScenario._check_stream_definitions(stream_definitions) + ) """ A structure containing the definition of the streams. """ diff --git a/avalanche/benchmarks/scenarios/deprecated/generic_benchmark_creation.py b/avalanche/benchmarks/scenarios/deprecated/generic_benchmark_creation.py index 9bf1e9b6e..351d7b6f3 100644 --- a/avalanche/benchmarks/scenarios/deprecated/generic_benchmark_creation.py +++ b/avalanche/benchmarks/scenarios/deprecated/generic_benchmark_creation.py @@ -143,9 +143,9 @@ def create_multi_dataset_generic_benchmark( "complete_test_set_only is True" ) - stream_definitions: Dict[ - str, Tuple[Iterable[TaskAwareClassificationDataset]] - ] = dict() + stream_definitions: Dict[str, Tuple[Iterable[TaskAwareClassificationDataset]]] = ( + dict() + ) for stream_name, dataset_list in input_streams.items(): initial_transform_group = "train" diff --git a/avalanche/benchmarks/scenarios/deprecated/lazy_dataset_sequence.py b/avalanche/benchmarks/scenarios/deprecated/lazy_dataset_sequence.py index 1d20fb774..82251d0da 100644 --- a/avalanche/benchmarks/scenarios/deprecated/lazy_dataset_sequence.py +++ b/avalanche/benchmarks/scenarios/deprecated/lazy_dataset_sequence.py @@ -99,9 +99,9 @@ def __init__( now, including the ones of dropped experiences. """ - self.task_labels_field_sequence: Dict[ - int, Optional[Sequence[int]] - ] = defaultdict(lambda: None) + self.task_labels_field_sequence: Dict[int, Optional[Sequence[int]]] = ( + defaultdict(lambda: None) + ) """ A dictionary mapping each experience to its `targets_task_labels` field. @@ -118,12 +118,10 @@ def __len__(self) -> int: return self._stream_length @overload - def __getitem__(self, exp_idx: int) -> TCLDataset: - ... + def __getitem__(self, exp_idx: int) -> TCLDataset: ... @overload - def __getitem__(self, exp_idx: slice) -> Sequence[TCLDataset]: - ... + def __getitem__(self, exp_idx: slice) -> Sequence[TCLDataset]: ... def __getitem__( self, exp_idx: Union[int, slice] @@ -135,9 +133,9 @@ def __getitem__( :return: The dataset associated to the experience. 
""" # A lot of unuseful lines needed for MyPy -_- - indexing_collate: Callable[ - [Iterable[TCLDataset]], Sequence[TCLDataset] - ] = lambda x: list(x) + indexing_collate: Callable[[Iterable[TCLDataset]], Sequence[TCLDataset]] = ( + lambda x: list(x) + ) result = manage_advanced_indexing( exp_idx, self._get_experience_and_load_if_needed, diff --git a/avalanche/benchmarks/scenarios/deprecated/new_classes/nc_scenario.py b/avalanche/benchmarks/scenarios/deprecated/new_classes/nc_scenario.py index 8c400ef9d..a5509b18d 100644 --- a/avalanche/benchmarks/scenarios/deprecated/new_classes/nc_scenario.py +++ b/avalanche/benchmarks/scenarios/deprecated/new_classes/nc_scenario.py @@ -34,7 +34,6 @@ class NCScenario( "NCStream", "NCExperience", TaskAwareSupervisedClassificationDataset ] ): - """ This class defines a "New Classes" scenario. Once created, an instance of this class can be iterated in order to obtain the experience sequence @@ -328,9 +327,9 @@ class "34" will be mapped to "1", class "11" to "2" and so on. # used, the user may have defined an amount of classes less than # the overall amount of classes in the dataset. if class_id in self.classes_order_original_ids: - self.class_mapping[ - class_id - ] = self.classes_order_original_ids.index(class_id) + self.class_mapping[class_id] = ( + self.classes_order_original_ids.index(class_id) + ) elif self.class_ids_from_zero_in_each_exp: # Method 2: remap class IDs so that they appear in range [0, N] in # each experience diff --git a/avalanche/benchmarks/scenarios/detection_scenario.py b/avalanche/benchmarks/scenarios/detection_scenario.py index 8b16be027..111c5ac11 100644 --- a/avalanche/benchmarks/scenarios/detection_scenario.py +++ b/avalanche/benchmarks/scenarios/detection_scenario.py @@ -254,12 +254,10 @@ def __len__(self): return len(self._benchmark.streams[self._stream]) @overload - def __getitem__(self, exp_id: int) -> Optional[Set[int]]: - ... + def __getitem__(self, exp_id: int) -> Optional[Set[int]]: ... @overload - def __getitem__(self, exp_id: slice) -> Tuple[Optional[Set[int]], ...]: - ... + def __getitem__(self, exp_id: slice) -> Tuple[Optional[Set[int]], ...]: ... def __getitem__(self, exp_id: Union[int, slice]) -> LazyClassesInExpsRet: indexing_collate = _LazyClassesInDetectionExps._slice_collate diff --git a/avalanche/benchmarks/scenarios/generic_scenario.py b/avalanche/benchmarks/scenarios/generic_scenario.py index d7b5ba09c..34da0d249 100644 --- a/avalanche/benchmarks/scenarios/generic_scenario.py +++ b/avalanche/benchmarks/scenarios/generic_scenario.py @@ -427,12 +427,10 @@ def __iter__(self) -> Iterator[TCLExperience]: yield exp @overload - def __getitem__(self, item: int) -> TCLExperience: - ... + def __getitem__(self, item: int) -> TCLExperience: ... @overload - def __getitem__(self: TSequenceCLStream, item: slice) -> TSequenceCLStream: - ... + def __getitem__(self: TSequenceCLStream, item: slice) -> TSequenceCLStream: ... @final def __getitem__( diff --git a/avalanche/benchmarks/utils/classification_dataset.py b/avalanche/benchmarks/utils/classification_dataset.py index acb4f880b..ad6db47f6 100644 --- a/avalanche/benchmarks/utils/classification_dataset.py +++ b/avalanche/benchmarks/utils/classification_dataset.py @@ -175,8 +175,7 @@ def _make_taskaware_classification_dataset( task_labels: Optional[Union[int, Sequence[int]]] = None, targets: Optional[Sequence[TTargetType]] = None, collate_fn: Optional[Callable[[List], Any]] = None -) -> TaskAwareSupervisedClassificationDataset: - ... 
+) -> TaskAwareSupervisedClassificationDataset: ... @overload @@ -190,8 +189,7 @@ def _make_taskaware_classification_dataset( task_labels: Union[int, Sequence[int]], targets: Sequence[TTargetType], collate_fn: Optional[Callable[[List], Any]] = None -) -> TaskAwareSupervisedClassificationDataset: - ... +) -> TaskAwareSupervisedClassificationDataset: ... @overload @@ -205,8 +203,7 @@ def _make_taskaware_classification_dataset( task_labels: Optional[Union[int, Sequence[int]]] = None, targets: Optional[Sequence[TTargetType]] = None, collate_fn: Optional[Callable[[List], Any]] = None -) -> TaskAwareClassificationDataset: - ... +) -> TaskAwareClassificationDataset: ... def _make_taskaware_classification_dataset( @@ -386,8 +383,7 @@ def _taskaware_classification_subset( task_labels: Optional[Union[int, Sequence[int]]] = None, targets: Optional[Sequence[TTargetType]] = None, collate_fn: Optional[Callable[[List], Any]] = None -) -> TaskAwareSupervisedClassificationDataset: - ... +) -> TaskAwareSupervisedClassificationDataset: ... @overload @@ -403,8 +399,7 @@ def _taskaware_classification_subset( task_labels: Union[int, Sequence[int]], targets: Sequence[TTargetType], collate_fn: Optional[Callable[[List], Any]] = None -) -> TaskAwareSupervisedClassificationDataset: - ... +) -> TaskAwareSupervisedClassificationDataset: ... @overload @@ -420,8 +415,7 @@ def _taskaware_classification_subset( task_labels: Optional[Union[int, Sequence[int]]] = None, targets: Optional[Sequence[TTargetType]] = None, collate_fn: Optional[Callable[[List], Any]] = None -) -> TaskAwareClassificationDataset: - ... +) -> TaskAwareClassificationDataset: ... def _taskaware_classification_subset( @@ -619,8 +613,7 @@ def _make_taskaware_tensor_classification_dataset( task_labels: Union[int, Sequence[int]], targets: Union[Sequence[TTargetType], int], collate_fn: Optional[Callable[[List], Any]] = None -) -> TaskAwareSupervisedClassificationDataset: - ... +) -> TaskAwareSupervisedClassificationDataset: ... @overload @@ -633,8 +626,9 @@ def _make_taskaware_tensor_classification_dataset( task_labels: Optional[Union[int, Sequence[int]]] = None, targets: Optional[Union[Sequence[TTargetType], int]] = None, collate_fn: Optional[Callable[[List], Any]] = None -) -> Union[TaskAwareClassificationDataset, TaskAwareSupervisedClassificationDataset]: - ... +) -> Union[ + TaskAwareClassificationDataset, TaskAwareSupervisedClassificationDataset +]: ... def _make_taskaware_tensor_classification_dataset( @@ -759,8 +753,7 @@ def _concat_taskaware_classification_datasets( Union[Sequence[TTargetType], Sequence[Sequence[TTargetType]]] ] = None, collate_fn: Optional[Callable[[List], Any]] = None -) -> TaskAwareSupervisedClassificationDataset: - ... +) -> TaskAwareSupervisedClassificationDataset: ... @overload @@ -774,8 +767,7 @@ def _concat_taskaware_classification_datasets( task_labels: Union[int, Sequence[int], Sequence[Sequence[int]]], targets: Union[Sequence[TTargetType], Sequence[Sequence[TTargetType]]], collate_fn: Optional[Callable[[List], Any]] = None -) -> TaskAwareSupervisedClassificationDataset: - ... +) -> TaskAwareSupervisedClassificationDataset: ... @overload @@ -791,8 +783,7 @@ def _concat_taskaware_classification_datasets( Union[Sequence[TTargetType], Sequence[Sequence[TTargetType]]] ] = None, collate_fn: Optional[Callable[[List], Any]] = None -) -> TaskAwareClassificationDataset: - ... +) -> TaskAwareClassificationDataset: ... 
def _concat_taskaware_classification_datasets( diff --git a/avalanche/benchmarks/utils/data.py b/avalanche/benchmarks/utils/data.py index 76908345e..9985e19fa 100644 --- a/avalanche/benchmarks/utils/data.py +++ b/avalanche/benchmarks/utils/data.py @@ -344,12 +344,10 @@ def __eq__(self, other: object): ) @overload - def __getitem__(self, exp_id: int) -> T_co: - ... + def __getitem__(self, exp_id: int) -> T_co: ... @overload - def __getitem__(self: TAvalancheDataset, exp_id: slice) -> TAvalancheDataset: - ... + def __getitem__(self: TAvalancheDataset, exp_id: slice) -> TAvalancheDataset: ... def __getitem__( self: TAvalancheDataset, idx: Union[int, slice] diff --git a/avalanche/benchmarks/utils/data_attribute.py b/avalanche/benchmarks/utils/data_attribute.py index 3cb88e58c..d3ee4b0d9 100644 --- a/avalanche/benchmarks/utils/data_attribute.py +++ b/avalanche/benchmarks/utils/data_attribute.py @@ -62,12 +62,10 @@ def __iter__(self): yield self[i] @overload - def __getitem__(self, item: int) -> T_co: - ... + def __getitem__(self, item: int) -> T_co: ... @overload - def __getitem__(self, item: slice) -> Sequence[T_co]: - ... + def __getitem__(self, item: slice) -> Sequence[T_co]: ... def __getitem__(self, item: Union[int, slice]) -> Union[T_co, Sequence[T_co]]: return self.data[item] diff --git a/avalanche/benchmarks/utils/dataset_definitions.py b/avalanche/benchmarks/utils/dataset_definitions.py index ca8de5b54..de9fa05da 100644 --- a/avalanche/benchmarks/utils/dataset_definitions.py +++ b/avalanche/benchmarks/utils/dataset_definitions.py @@ -38,11 +38,9 @@ class IDataset(Protocol[T_co]): Note: no __add__ method is defined. """ - def __getitem__(self, index: int) -> T_co: - ... + def __getitem__(self, index: int) -> T_co: ... - def __len__(self) -> int: - ... + def __len__(self) -> int: ... class IDatasetWithTargets(IDataset[T_co], Protocol[T_co, TTargetType_co]): diff --git a/avalanche/benchmarks/utils/dataset_utils.py b/avalanche/benchmarks/utils/dataset_utils.py index 717d67e75..867ba9cc6 100644 --- a/avalanche/benchmarks/utils/dataset_utils.py +++ b/avalanche/benchmarks/utils/dataset_utils.py @@ -64,12 +64,10 @@ def __iter__(self) -> Iterator[TData]: yield el @overload - def __getitem__(self, item: int) -> TData: - ... + def __getitem__(self, item: int) -> TData: ... @overload - def __getitem__(self: TSliceSequence, item: slice) -> TSliceSequence: - ... + def __getitem__(self: TSliceSequence, item: slice) -> TSliceSequence: ... @final def __getitem__( diff --git a/avalanche/benchmarks/utils/detection_dataset.py b/avalanche/benchmarks/utils/detection_dataset.py index 7f3b3b632..6c5efb43f 100644 --- a/avalanche/benchmarks/utils/detection_dataset.py +++ b/avalanche/benchmarks/utils/detection_dataset.py @@ -151,8 +151,7 @@ def make_detection_dataset( task_labels: Optional[Union[int, Sequence[int]]] = None, targets: Optional[Sequence[TTargetType]] = None, collate_fn: Optional[Callable[[List], Any]] = None -) -> SupervisedDetectionDataset: - ... +) -> SupervisedDetectionDataset: ... @overload @@ -166,8 +165,7 @@ def make_detection_dataset( task_labels: Union[int, Sequence[int]], targets: Sequence[TTargetType], collate_fn: Optional[Callable[[List], Any]] = None -) -> SupervisedDetectionDataset: - ... +) -> SupervisedDetectionDataset: ... @overload @@ -181,8 +179,7 @@ def make_detection_dataset( task_labels: Optional[Union[int, Sequence[int]]] = None, targets: Optional[Sequence[TTargetType]] = None, collate_fn: Optional[Callable[[List], Any]] = None -) -> DetectionDataset: - ... 
+) -> DetectionDataset: ... def make_detection_dataset( @@ -373,8 +370,7 @@ def detection_subset( task_labels: Optional[Union[int, Sequence[int]]] = None, targets: Optional[Sequence[TTargetType]] = None, collate_fn: Optional[Callable[[List], Any]] = None -) -> SupervisedDetectionDataset: - ... +) -> SupervisedDetectionDataset: ... @overload @@ -390,8 +386,7 @@ def detection_subset( task_labels: Union[int, Sequence[int]], targets: Sequence[TTargetType], collate_fn: Optional[Callable[[List], Any]] = None -) -> SupervisedDetectionDataset: - ... +) -> SupervisedDetectionDataset: ... @overload @@ -407,8 +402,7 @@ def detection_subset( task_labels: Optional[Union[int, Sequence[int]]] = None, targets: Optional[Sequence[TTargetType]] = None, collate_fn: Optional[Callable[[List], Any]] = None -) -> DetectionDataset: - ... +) -> DetectionDataset: ... def detection_subset( @@ -595,8 +589,7 @@ def concat_detection_datasets( Union[Sequence[TTargetType], Sequence[Sequence[TTargetType]]] ] = None, collate_fn: Optional[Callable[[List], Any]] = None -) -> SupervisedDetectionDataset: - ... +) -> SupervisedDetectionDataset: ... @overload @@ -610,8 +603,7 @@ def concat_detection_datasets( task_labels: Union[int, Sequence[int], Sequence[Sequence[int]]], targets: Union[Sequence[TTargetType], Sequence[Sequence[TTargetType]]], collate_fn: Optional[Callable[[List], Any]] = None -) -> SupervisedDetectionDataset: - ... +) -> SupervisedDetectionDataset: ... @overload @@ -627,8 +619,7 @@ def concat_detection_datasets( Union[Sequence[TTargetType], Sequence[Sequence[TTargetType]]] ] = None, collate_fn: Optional[Callable[[List], Any]] = None -) -> DetectionDataset: - ... +) -> DetectionDataset: ... def concat_detection_datasets( diff --git a/avalanche/benchmarks/utils/ffcv_support/ffcv_epoch_iterator.py b/avalanche/benchmarks/utils/ffcv_support/ffcv_epoch_iterator.py index 7c36b633f..44fbfcedc 100644 --- a/avalanche/benchmarks/utils/ffcv_support/ffcv_epoch_iterator.py +++ b/avalanche/benchmarks/utils/ffcv_support/ffcv_epoch_iterator.py @@ -1,6 +1,7 @@ """ Custom version of the FFCV epoch iterator. """ + from threading import Thread, Event, Lock from queue import Queue from typing import List, Sequence, TYPE_CHECKING diff --git a/avalanche/benchmarks/utils/ffcv_support/ffcv_loader.py b/avalanche/benchmarks/utils/ffcv_support/ffcv_loader.py index 8f2ae9f10..1deacbb97 100644 --- a/avalanche/benchmarks/utils/ffcv_support/ffcv_loader.py +++ b/avalanche/benchmarks/utils/ffcv_support/ffcv_loader.py @@ -1,6 +1,7 @@ """ Custom version of the FFCV loader that accepts a batch sampler. 
""" + from typing import Any, Callable, List, Mapping, Optional, Sequence, Type, Union import warnings diff --git a/avalanche/benchmarks/utils/ffcv_support/ffcv_transform_utils.py b/avalanche/benchmarks/utils/ffcv_support/ffcv_transform_utils.py index f72940b88..4642ffb0f 100644 --- a/avalanche/benchmarks/utils/ffcv_support/ffcv_transform_utils.py +++ b/avalanche/benchmarks/utils/ffcv_support/ffcv_transform_utils.py @@ -12,8 +12,8 @@ Tuple, Type, Union, + Literal, ) -from typing_extensions import Literal import warnings import numpy as np diff --git a/avalanche/benchmarks/utils/flat_data.py b/avalanche/benchmarks/utils/flat_data.py index d9fd73ee0..02d3681cd 100644 --- a/avalanche/benchmarks/utils/flat_data.py +++ b/avalanche/benchmarks/utils/flat_data.py @@ -66,9 +66,9 @@ def __init__( else: new_lists.append(ll) - self._lists: Optional[ - List[Sequence[int]] - ] = new_lists # freed after eagerification + self._lists: Optional[List[Sequence[int]]] = ( + new_lists # freed after eagerification + ) self._offset: int = int(offset) self._eager_list: Optional[np.ndarray] = None """This is the list where we save indices @@ -408,12 +408,10 @@ def _get_idx(self, idx) -> Tuple[int, int]: return dataset_idx, int(idx) @overload - def __getitem__(self, item: int) -> T_co: - ... + def __getitem__(self, item: int) -> T_co: ... @overload - def __getitem__(self: TFlatData, item: slice) -> TFlatData: - ... + def __getitem__(self: TFlatData, item: slice) -> TFlatData: ... def __getitem__(self: TFlatData, item: Union[int, slice]) -> Union[T_co, TFlatData]: if isinstance(item, (int, np.integer)): @@ -470,12 +468,10 @@ def __len__(self): return self._size @overload - def __getitem__(self, index: int) -> DataT: - ... + def __getitem__(self, index: int) -> DataT: ... @overload - def __getitem__(self, index: slice) -> "ConstantSequence[DataT]": - ... + def __getitem__(self, index: slice) -> "ConstantSequence[DataT]": ... def __getitem__( self, index: Union[int, slice] diff --git a/avalanche/evaluation/metric_definitions.py b/avalanche/evaluation/metric_definitions.py index 1934d5392..c91a7122c 100644 --- a/avalanche/evaluation/metric_definitions.py +++ b/avalanche/evaluation/metric_definitions.py @@ -215,8 +215,7 @@ def __init__( ] = "experience", emit_at: Literal["iteration", "epoch", "experience", "stream"] = "experience", mode: Literal["train"] = "train", - ): - ... + ): ... @overload def __init__( @@ -225,8 +224,7 @@ def __init__( reset_at: Literal["iteration", "experience", "stream", "never"] = "experience", emit_at: Literal["iteration", "experience", "stream"] = "experience", mode: Literal["eval"] = "eval", - ): - ... + ): ... def __init__( self, metric: TMetric, reset_at="experience", emit_at="experience", mode="eval" diff --git a/avalanche/evaluation/metrics/forgetting_bwt.py b/avalanche/evaluation/metrics/forgetting_bwt.py index 9e5f79094..29d9ffd28 100644 --- a/avalanche/evaluation/metrics/forgetting_bwt.py +++ b/avalanche/evaluation/metrics/forgetting_bwt.py @@ -526,18 +526,15 @@ def forgetting_metrics(*, experience=False, stream=False) -> List[PluginMetric]: @overload -def forgetting_to_bwt(f: float) -> float: - ... +def forgetting_to_bwt(f: float) -> float: ... @overload -def forgetting_to_bwt(f: Dict[int, float]) -> Dict[int, float]: - ... +def forgetting_to_bwt(f: Dict[int, float]) -> Dict[int, float]: ... @overload -def forgetting_to_bwt(f: None) -> None: - ... +def forgetting_to_bwt(f: None) -> None: ... 
def forgetting_to_bwt(f: Optional[Union[float, Dict[int, float]]]): diff --git a/avalanche/evaluation/metrics/labels_repartition.py b/avalanche/evaluation/metrics/labels_repartition.py index 0d55ec5f3..7a5304696 100644 --- a/avalanche/evaluation/metrics/labels_repartition.py +++ b/avalanche/evaluation/metrics/labels_repartition.py @@ -85,8 +85,7 @@ def __init__( ] = default_history_repartition_image_creator, mode: Literal["train"] = "train", emit_reset_at: Literal["stream", "experience", "epoch"] = "epoch", - ): - ... + ): ... @overload def __init__( @@ -97,8 +96,7 @@ def __init__( ] = default_history_repartition_image_creator, mode: Literal["eval"] = "eval", emit_reset_at: Literal["stream", "experience"], - ): - ... + ): ... def __init__( self, @@ -171,12 +169,14 @@ def _package_result(self, strategy: "SupervisedTemplate") -> "MetricResult": f"/{self._mode}_phase" f"/{stream_type(strategy.experience)}_stream" f"/Task_{task:03}", - value=AlternativeValues( - self.image_creator(label2counts, self.steps), - label2counts, - ) - if self.image_creator is not None - else label2counts, + value=( + AlternativeValues( + self.image_creator(label2counts, self.steps), + label2counts, + ) + if self.image_creator is not None + else label2counts + ), x_plot=strategy.clock.train_iterations, ) for task, label2counts in self.task2label2counts.items() diff --git a/avalanche/evaluation/metrics/mean_scores.py b/avalanche/evaluation/metrics/mean_scores.py index a7e69276e..b0f84170f 100644 --- a/avalanche/evaluation/metrics/mean_scores.py +++ b/avalanche/evaluation/metrics/mean_scores.py @@ -9,6 +9,7 @@ It selects the scores of the true class and then average them for past and new classes. """ + from abc import ABC, abstractmethod from collections import defaultdict from typing import Callable, Dict, Set, TYPE_CHECKING, List, Optional, TypeVar, Literal diff --git a/avalanche/logging/text_logging.py b/avalanche/logging/text_logging.py index 7222410bc..17451e344 100644 --- a/avalanche/logging/text_logging.py +++ b/avalanche/logging/text_logging.py @@ -253,10 +253,10 @@ def _fobj_serialize(file_object) -> Optional[str]: out_file_path = TextLogger._file_get_real_path(file_object) stream_name = TextLogger._file_get_stream(file_object) - if out_file_path is not None: - return "path:" + str(out_file_path) - elif stream_name is not None: + if stream_name is not None: return "stream:" + stream_name + elif out_file_path is not None: + return "path:" + str(out_file_path) else: return None @@ -285,8 +285,15 @@ def _file_get_real_path(file_object) -> Optional[str]: # Manage files created by tempfile file_object = file_object.file fobject_path = file_object.name + if fobject_path in ["", ""]: + # Standard output / error return None + + if isinstance(fobject_path, int): + # File descriptor + return None + return fobject_path except AttributeError: return None diff --git a/avalanche/models/slim_resnet18.py b/avalanche/models/slim_resnet18.py index f5cd0fed6..a27135d44 100644 --- a/avalanche/models/slim_resnet18.py +++ b/avalanche/models/slim_resnet18.py @@ -1,4 +1,5 @@ """This is the slimmed ResNet as used by Lopez et al. 
in the GEM paper.""" + import torch.nn as nn from torch.nn.functional import relu, avg_pool2d from avalanche.models import MultiHeadClassifier, MultiTaskModule, DynamicModule diff --git a/avalanche/models/timm_vit.py b/avalanche/models/timm_vit.py index c2e0bf45a..21dcc677a 100644 --- a/avalanche/models/timm_vit.py +++ b/avalanche/models/timm_vit.py @@ -23,6 +23,7 @@ # -- Jaeho Lee, dlwogh9344@khu.ac.kr # ------------------------------------------ """ + import math import re @@ -442,9 +443,11 @@ def checkpoint_filter_fn(state_dict, model, adapt_layer_scale=False): v = resize_pos_embed( v, model.pos_embed, - 0 - if getattr(model, "no_embed_class") - else getattr(model, "num_prefix_tokens", 1), + ( + 0 + if getattr(model, "no_embed_class") + else getattr(model, "num_prefix_tokens", 1) + ), model.patch_embed.grid_size, ) elif adapt_layer_scale and "gamma_" in k: diff --git a/avalanche/training/__init__.py b/avalanche/training/__init__.py index 0a995200a..7780e066f 100644 --- a/avalanche/training/__init__.py +++ b/avalanche/training/__init__.py @@ -5,6 +5,7 @@ class (:py:class:`BaseTemplate`) and implementations of the most common :py:mod:`training.supervised` or as plugins (:py:mod:`training.plugins`) that can be easily combined with your own strategy. """ + from .regularization import * from .supervised import * from .storage_policy import * diff --git a/avalanche/training/plugins/agem.py b/avalanche/training/plugins/agem.py index db1cfd7a5..9cdb219d9 100644 --- a/avalanche/training/plugins/agem.py +++ b/avalanche/training/plugins/agem.py @@ -61,9 +61,11 @@ def before_training_iteration(self, strategy, **kwargs): loss.backward() # gradient can be None for some head on multi-headed models reference_gradients_list = [ - p.grad.view(-1) - if p.grad is not None - else torch.zeros(p.numel(), device=strategy.device) + ( + p.grad.view(-1) + if p.grad is not None + else torch.zeros(p.numel(), device=strategy.device) + ) for n, p in strategy.model.named_parameters() ] self.reference_gradients = torch.cat(reference_gradients_list) @@ -76,9 +78,11 @@ def after_backward(self, strategy, **kwargs): """ if len(self.buffers) > 0: current_gradients_list = [ - p.grad.view(-1) - if p.grad is not None - else torch.zeros(p.numel(), device=strategy.device) + ( + p.grad.view(-1) + if p.grad is not None + else torch.zeros(p.numel(), device=strategy.device) + ) for n, p in strategy.model.named_parameters() ] current_gradients = torch.cat(current_gradients_list) diff --git a/avalanche/training/plugins/bic.py b/avalanche/training/plugins/bic.py index 61535fd37..caef605b9 100644 --- a/avalanche/training/plugins/bic.py +++ b/avalanche/training/plugins/bic.py @@ -454,9 +454,9 @@ def _classes_groups(self, strategy: SupervisedTemplate): # - "current" classes: seen in current_experience # "initial" classes - initial_classes: Set[ - int - ] = set() # pre_initial_cl in the original implementation + initial_classes: Set[int] = ( + set() + ) # pre_initial_cl in the original implementation previous_classes: Set[int] = set() # pre_new_cl in the original implementation current_classes: Set[int] = set() # new_cl in the original implementation # Note: pre_initial_cl + pre_new_cl is "initial_cl" in the original implementation diff --git a/avalanche/training/plugins/gem.py b/avalanche/training/plugins/gem.py index f0b01128f..26eccdad6 100644 --- a/avalanche/training/plugins/gem.py +++ b/avalanche/training/plugins/gem.py @@ -59,9 +59,11 @@ def before_training_iteration(self, strategy, **kwargs): G.append( torch.cat( [ - 
p.grad.flatten() - if p.grad is not None - else torch.zeros(p.numel(), device=strategy.device) + ( + p.grad.flatten() + if p.grad is not None + else torch.zeros(p.numel(), device=strategy.device) + ) for p in strategy.model.parameters() ], dim=0, @@ -79,9 +81,11 @@ def after_backward(self, strategy, **kwargs): if strategy.clock.train_exp_counter > 0: g = torch.cat( [ - p.grad.flatten() - if p.grad is not None - else torch.zeros(p.numel(), device=strategy.device) + ( + p.grad.flatten() + if p.grad is not None + else torch.zeros(p.numel(), device=strategy.device) + ) for p in strategy.model.parameters() ], dim=0, diff --git a/avalanche/training/regularization.py b/avalanche/training/regularization.py index 2c15d3392..277dc1ee3 100644 --- a/avalanche/training/regularization.py +++ b/avalanche/training/regularization.py @@ -1,4 +1,5 @@ """Regularization methods.""" + import copy from collections import defaultdict from typing import List diff --git a/avalanche/training/supervised/__init__.py b/avalanche/training/supervised/__init__.py index a956677b3..b35bdbf78 100644 --- a/avalanche/training/supervised/__init__.py +++ b/avalanche/training/supervised/__init__.py @@ -2,6 +2,7 @@ This module contains """ + from .joint_training import * from .ar1 import AR1 from .cumulative import Cumulative diff --git a/avalanche/training/supervised/ar1.py b/avalanche/training/supervised/ar1.py index 3a2f52075..41040b9b0 100644 --- a/avalanche/training/supervised/ar1.py +++ b/avalanche/training/supervised/ar1.py @@ -26,6 +26,7 @@ LayerAndParameter, ) from avalanche.training.plugins.evaluation import default_evaluator +from avalanche.training.templates.strategy_mixin_protocol import CriterionType class AR1(SupervisedTemplate): @@ -42,7 +43,8 @@ class AR1(SupervisedTemplate): def __init__( self, - criterion=None, + *, + criterion: CriterionType = CrossEntropyLoss(), lr: float = 0.001, inc_lr: float = 5e-5, momentum=0.9, @@ -65,6 +67,7 @@ def __init__( EvaluationPlugin, Callable[[], EvaluationPlugin] ] = default_evaluator, eval_every=-1, + **kwargs ): """ Creates an instance of the AR1 strategy. 
@@ -146,9 +149,6 @@ def __init__( optimizer = SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=l2) - if criterion is None: - criterion = CrossEntropyLoss() - self.ewc_lambda = ewc_lambda self.freeze_below_layer = freeze_below_layer self.rm_sz = rm_sz @@ -166,9 +166,9 @@ def __init__( self.replay_mb_size = 0 super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -176,6 +176,7 @@ def __init__( plugins=plugins, evaluator=evaluator, eval_every=eval_every, + **kwargs ) def _before_training_exp(self, **kwargs): diff --git a/avalanche/training/supervised/cumulative.py b/avalanche/training/supervised/cumulative.py index 88d1cc626..0e073456c 100644 --- a/avalanche/training/supervised/cumulative.py +++ b/avalanche/training/supervised/cumulative.py @@ -3,13 +3,12 @@ from torch.nn import Module from torch.optim import Optimizer -from torch.utils.data import ConcatDataset -from avalanche.benchmarks.utils import _concat_taskaware_classification_datasets from avalanche.benchmarks.utils.utils import concat_datasets from avalanche.training.plugins.evaluation import default_evaluator from avalanche.training.plugins import SupervisedPlugin, EvaluationPlugin from avalanche.training.templates import SupervisedTemplate +from avalanche.training.templates.strategy_mixin_protocol import CriterionType class Cumulative(SupervisedTemplate): @@ -21,9 +20,10 @@ class Cumulative(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion, + criterion: CriterionType, train_mb_size: int = 1, train_epochs: int = 1, eval_mb_size: Optional[int] = None, @@ -33,6 +33,7 @@ def __init__( EvaluationPlugin, Callable[[], EvaluationPlugin] ] = default_evaluator, eval_every=-1, + **kwargs ): """Init. 
@@ -54,9 +55,9 @@ def __init__( """ super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -64,6 +65,7 @@ def __init__( plugins=plugins, evaluator=evaluator, eval_every=eval_every, + **kwargs ) self.dataset = None # cumulative dataset @@ -79,3 +81,6 @@ def train_dataset_adaptation(self, **kwargs): else: self.dataset = concat_datasets([self.dataset, exp.dataset]) self.adapted_dataset = self.dataset + + +__all__ = ["Cumulative"] diff --git a/avalanche/training/supervised/deep_slda.py b/avalanche/training/supervised/deep_slda.py index 83cfec37c..c234c44c9 100644 --- a/avalanche/training/supervised/deep_slda.py +++ b/avalanche/training/supervised/deep_slda.py @@ -3,6 +3,7 @@ import os import torch +from torch.nn import Module from avalanche.training.plugins import SupervisedPlugin from avalanche.training.templates import SupervisedTemplate @@ -12,6 +13,7 @@ ) from avalanche.models.dynamic_modules import MultiTaskModule from avalanche.models import FeatureExtractorBackbone +from avalanche.training.templates.strategy_mixin_protocol import CriterionType class StreamingLDA(SupervisedTemplate): @@ -28,11 +30,12 @@ class StreamingLDA(SupervisedTemplate): def __init__( self, - slda_model, - criterion, - input_size, - num_classes, - output_layer_name=None, + *, + slda_model: Module, + criterion: CriterionType, + input_size: int, + num_classes: int, + output_layer_name: Optional[str] = None, shrinkage_param=1e-4, streaming_update_sigma=True, train_epochs: int = 1, @@ -44,10 +47,11 @@ def __init__( EvaluationPlugin, Callable[[], EvaluationPlugin] ] = default_evaluator, eval_every=-1, + **kwargs, ): """Init function for the SLDA model. - :param slda_model: a PyTorch model + :param model: a PyTorch model :param criterion: loss function :param output_layer_name: if not None, wrap model to retrieve only the `output_layer_name` output. If None, the strategy @@ -78,16 +82,17 @@ def __init__( ).eval() super(StreamingLDA, self).__init__( - slda_model, - None, # type: ignore - criterion, - train_mb_size, - train_epochs, - eval_mb_size, + model=slda_model, + optimizer=None, # type: ignore + criterion=criterion, + train_mb_size=train_mb_size, + train_epochs=train_epochs, + eval_mb_size=eval_mb_size, device=device, plugins=plugins, evaluator=evaluator, eval_every=eval_every, + **kwargs, ) # SLDA parameters diff --git a/avalanche/training/supervised/der.py b/avalanche/training/supervised/der.py index 0dcc151a4..d9263f2a2 100644 --- a/avalanche/training/supervised/der.py +++ b/avalanche/training/supervised/der.py @@ -19,6 +19,7 @@ from avalanche.benchmarks.utils.data import AvalancheDataset from avalanche.benchmarks.utils.data_attribute import TensorDataAttribute from avalanche.benchmarks.utils.flat_data import FlatData +from avalanche.training.templates.strategy_mixin_protocol import CriterionType from avalanche.training.utils import cycle from avalanche.core import SupervisedPlugin from avalanche.training.plugins.evaluation import ( @@ -156,9 +157,10 @@ class DER(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion=CrossEntropyLoss(), + criterion: CriterionType = CrossEntropyLoss(), mem_size: int = 200, batch_size_mem: Optional[int] = None, alpha: float = 0.1, @@ -173,6 +175,7 @@ def __init__( ] = default_evaluator, eval_every=-1, peval_mode="epoch", + **kwargs ): """ :param model: PyTorch model. 
@@ -219,6 +222,7 @@ def __init__( evaluator=evaluator, eval_every=eval_every, peval_mode=peval_mode, + **kwargs ) if batch_size_mem is None: self.batch_size_mem = train_mb_size @@ -328,3 +332,6 @@ def training_epoch(self, **kwargs): self._after_update(**kwargs) self._after_training_iteration(**kwargs) + + +__all__ = ["DER"] diff --git a/avalanche/training/supervised/er_ace.py b/avalanche/training/supervised/er_ace.py index 25b90d370..01146c432 100644 --- a/avalanche/training/supervised/er_ace.py +++ b/avalanche/training/supervised/er_ace.py @@ -13,6 +13,7 @@ ) from avalanche.training.storage_policy import ClassBalancedBuffer from avalanche.training.templates import SupervisedTemplate +from avalanche.training.templates.strategy_mixin_protocol import CriterionType from avalanche.training.utils import cycle @@ -32,9 +33,10 @@ class ER_ACE(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion=CrossEntropyLoss(), + criterion: CriterionType = CrossEntropyLoss(), mem_size: int = 200, batch_size_mem: int = 10, train_mb_size: int = 1, @@ -47,6 +49,7 @@ def __init__( ] = default_evaluator, eval_every=-1, peval_mode="epoch", + **kwargs ): """ :param model: PyTorch model. @@ -71,17 +74,18 @@ def __init__( `eval_every` epochs or iterations (Default='epoch'). """ super().__init__( - model, - optimizer, - criterion, - train_mb_size, - train_epochs, - eval_mb_size, - device, - plugins, - evaluator, - eval_every, - peval_mode, + model=model, + optimizer=optimizer, + criterion=criterion, + train_mb_size=train_mb_size, + train_epochs=train_epochs, + eval_mb_size=eval_mb_size, + device=device, + plugins=plugins, + evaluator=evaluator, + eval_every=eval_every, + peval_mode=peval_mode, + **kwargs ) self.mem_size = mem_size self.batch_size_mem = batch_size_mem @@ -170,3 +174,8 @@ def _train_cleanup(self): super()._train_cleanup() # reset the value to avoid serialization failures self.replay_loader = None + + +__all__ = [ + "ER_ACE", +] diff --git a/avalanche/training/supervised/er_aml.py b/avalanche/training/supervised/er_aml.py index 0b89aefd4..9d44f1f1f 100644 --- a/avalanche/training/supervised/er_aml.py +++ b/avalanche/training/supervised/er_aml.py @@ -13,6 +13,7 @@ ) from avalanche.training.storage_policy import ClassBalancedBuffer from avalanche.training.templates import SupervisedTemplate +from avalanche.training.templates.strategy_mixin_protocol import CriterionType from avalanche.training.utils import cycle @@ -27,10 +28,11 @@ class ER_AML(SupervisedTemplate): def __init__( self, + *, model: Module, feature_extractor: Module, optimizer: Optimizer, - criterion=CrossEntropyLoss(), + criterion: CriterionType = CrossEntropyLoss(), temp: float = 0.1, base_temp: float = 0.07, same_task_neg: bool = True, @@ -46,6 +48,7 @@ def __init__( ] = default_evaluator, eval_every=-1, peval_mode="epoch", + **kwargs ): """ :param model: PyTorch model. @@ -74,17 +77,18 @@ def __init__( `eval_every` epochs or iterations (Default='epoch'). 
""" super().__init__( - model, - optimizer, - criterion, - train_mb_size, - train_epochs, - eval_mb_size, - device, - plugins, - evaluator, - eval_every, - peval_mode, + model=model, + optimizer=optimizer, + criterion=criterion, + train_mb_size=train_mb_size, + train_epochs=train_epochs, + eval_mb_size=eval_mb_size, + device=device, + plugins=plugins, + evaluator=evaluator, + eval_every=eval_every, + peval_mode=peval_mode, + **kwargs ) self.mem_size = mem_size self.batch_size_mem = batch_size_mem @@ -198,3 +202,8 @@ def _train_cleanup(self): super()._train_cleanup() # reset the value to avoid serialization failures self.replay_loader = None + + +__all__ = [ + "ER_AML", +] diff --git a/avalanche/training/supervised/expert_gate.py b/avalanche/training/supervised/expert_gate.py index 223e488e9..abb249216 100644 --- a/avalanche/training/supervised/expert_gate.py +++ b/avalanche/training/supervised/expert_gate.py @@ -12,19 +12,17 @@ from collections import OrderedDict import warnings -import torch from torch.nn import Module, CrossEntropyLoss -from torch.optim import Optimizer, SGD, Adam -from torch.nn.functional import log_softmax +from torch.optim import Optimizer, SGD from typing import Optional, List from avalanche.models.expert_gate import ExpertAutoencoder, ExpertModel, ExpertGate -from avalanche.models.dynamic_optimizers import reset_optimizer from avalanche.training.supervised import AETraining from avalanche.training.templates import SupervisedTemplate from avalanche.training.plugins import SupervisedPlugin, EvaluationPlugin, LwFPlugin from avalanche.training.plugins.evaluation import default_evaluator +from avalanche.training.templates.strategy_mixin_protocol import CriterionType class ExpertGateStrategy(SupervisedTemplate): @@ -44,9 +42,10 @@ class ExpertGateStrategy(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion=CrossEntropyLoss(), + criterion: CriterionType = CrossEntropyLoss(), train_mb_size: int = 1, train_epochs: int = 1, eval_mb_size: Optional[int] = None, @@ -88,10 +87,6 @@ def __init__( expert during the forward method :class:`~avalanche.training.BaseTemplate` constructor arguments. """ - # Check that the model has the correct architecture. - assert isinstance( - model, ExpertGate - ), "ExpertGateStrategy requires an ExpertGate model." expertgate = _ExpertGatePlugin() @@ -105,7 +100,6 @@ def __init__( self.ae_lr = ae_lr self.ae_latent_dim = ae_latent_dim self.rel_thresh = rel_thresh - model.temp = temp warnings.warn( "This strategy is currently in the alpha stage and we are still " @@ -128,6 +122,12 @@ def __init__( **base_kwargs ) + # Check that the model has the correct architecture. + assert isinstance( + self.model, ExpertGate + ), "ExpertGateStrategy requires an ExpertGate model." + self.model.temp = temp + class _ExpertGatePlugin(SupervisedPlugin): """The ExpertGate algorithm is a dynamic architecture algorithm. 
@@ -239,9 +239,9 @@ def _select_expert(self, strategy: "SupervisedTemplate", task_label): # Iterate through all autoencoders to get error values for autoencoder_id in strategy.model.autoencoder_dict: - error_dict[ - str(autoencoder_id) - ] = self._get_average_reconstruction_error(strategy, autoencoder_id) + error_dict[str(autoencoder_id)] = ( + self._get_average_reconstruction_error(strategy, autoencoder_id) + ) # Send error dictionary to get most relevant autoencoder relatedness_dict = self._task_relatedness(strategy, error_dict, task_label) @@ -367,3 +367,6 @@ def _train_autoencoder(self, strategy: "SupervisedTemplate", autoencoder): # Train with autoencoder strategy ae_strategy.train(strategy.experience) print("FINISHED TRAINING NEW AUTOENCODER\n") + + +__all__ = ["ExpertGateStrategy"] diff --git a/avalanche/training/supervised/feature_replay.py b/avalanche/training/supervised/feature_replay.py index a170b3340..e723b1515 100644 --- a/avalanche/training/supervised/feature_replay.py +++ b/avalanche/training/supervised/feature_replay.py @@ -11,6 +11,7 @@ from avalanche.training.plugins.evaluation import EvaluationPlugin, default_evaluator from avalanche.training.storage_policy import ClassBalancedBuffer from avalanche.training.templates import SupervisedTemplate +from avalanche.training.templates.strategy_mixin_protocol import CriterionType from avalanche.training.utils import cycle from avalanche.training.losses import MaskedCrossEntropy @@ -46,9 +47,10 @@ class FeatureReplay(SupervisedTemplate): def __init__( self, + *, model: nn.Module, optimizer: Optimizer, - criterion=MaskedCrossEntropy(), + criterion: CriterionType = MaskedCrossEntropy(), last_layer_name: str = "classifier", mem_size: int = 200, batch_size_mem: int = 10, @@ -62,19 +64,21 @@ def __init__( ] = default_evaluator, eval_every=-1, peval_mode="epoch", + **kwargs ): super().__init__( - model, - optimizer, - criterion, - train_mb_size, - train_epochs, - eval_mb_size, - device, - plugins, - evaluator, - eval_every, - peval_mode, + model=model, + optimizer=optimizer, + criterion=criterion, + train_mb_size=train_mb_size, + train_epochs=train_epochs, + eval_mb_size=eval_mb_size, + device=device, + plugins=plugins, + evaluator=evaluator, + eval_every=eval_every, + peval_mode=peval_mode, + **kwargs ) self.mem_size = mem_size self.batch_size_mem = batch_size_mem diff --git a/avalanche/training/supervised/icarl.py b/avalanche/training/supervised/icarl.py index 9e2ca25d1..fa292ba73 100644 --- a/avalanche/training/supervised/icarl.py +++ b/avalanche/training/supervised/icarl.py @@ -28,12 +28,13 @@ class ICaRL(SupervisedTemplate): def __init__( self, + *, feature_extractor: Module, classifier: Module, optimizer: Optimizer, - memory_size, + memory_size: int, buffer_transform, - fixed_memory, + fixed_memory: bool, train_mb_size: int = 1, train_epochs: int = 1, eval_mb_size: Optional[int] = None, @@ -78,7 +79,11 @@ def __init__( ) criterion = ICaRLLossPlugin() # iCaRL requires this specific loss (#966) - icarl = _ICaRLPlugin(memory_size, buffer_transform, fixed_memory) + icarl = _ICaRLPlugin( + memory_size, + buffer_transform, + fixed_memory, + ) if plugins is None: plugins = [icarl] @@ -89,8 +94,8 @@ def __init__( plugins += [criterion] super().__init__( - model, - optimizer, + model=model, + optimizer=optimizer, criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, diff --git a/avalanche/training/supervised/joint_training.py b/avalanche/training/supervised/joint_training.py index 2bb151d80..08294cfc0 100644 --- 
a/avalanche/training/supervised/joint_training.py +++ b/avalanche/training/supervised/joint_training.py @@ -28,6 +28,7 @@ _experiences_parameter_as_iterable, _group_experiences_by_stream, ) +from avalanche.training.templates.strategy_mixin_protocol import CriterionType class AlreadyTrainedError(Exception): @@ -57,9 +58,10 @@ class JointTraining(SupervisedTemplate[TDatasetExperience, TMBInput, TMBOutput]) def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion, + criterion: CriterionType, train_mb_size: int = 1, train_epochs: int = 1, eval_mb_size: int = 1, @@ -69,6 +71,7 @@ def __init__( EvaluationPlugin, Callable[[], EvaluationPlugin] ] = default_evaluator, eval_every=-1, + **kwargs ): """Init. @@ -98,6 +101,7 @@ def __init__( plugins=plugins, evaluator=evaluator, eval_every=eval_every, + **kwargs ) # JointTraining can be trained only once. self._is_fitted = False @@ -135,9 +139,9 @@ def train( ) # Normalize training and eval data. - experiences_list: Iterable[ - TDatasetExperience - ] = _experiences_parameter_as_iterable(experiences) + experiences_list: Iterable[TDatasetExperience] = ( + _experiences_parameter_as_iterable(experiences) + ) if eval_streams is None: eval_streams = [experiences_list] diff --git a/avalanche/training/supervised/l2p.py b/avalanche/training/supervised/l2p.py index 715498c20..2d519991a 100644 --- a/avalanche/training/supervised/l2p.py +++ b/avalanche/training/supervised/l2p.py @@ -31,6 +31,7 @@ class LearningToPrompt(SupervisedTemplate): def __init__( self, + *, model_name: str, criterion: nn.Module = nn.CrossEntropyLoss(), train_mb_size: int = 1, @@ -130,17 +131,18 @@ def __init__( ) super().__init__( - model, - optimizer, - criterion, - train_mb_size, - train_epochs, - eval_mb_size, - device, - plugins, - evaluator, - eval_every, - peval_mode, + model=model, + optimizer=optimizer, + criterion=criterion, + train_mb_size=train_mb_size, + train_epochs=train_epochs, + eval_mb_size=eval_mb_size, + device=device, + plugins=plugins, + evaluator=evaluator, + eval_every=eval_every, + peval_mode=peval_mode, + **kwargs, ) self._criterion = criterion diff --git a/avalanche/training/supervised/lamaml.py b/avalanche/training/supervised/lamaml.py index 81a9866cd..d880df347 100644 --- a/avalanche/training/supervised/lamaml.py +++ b/avalanche/training/supervised/lamaml.py @@ -8,6 +8,8 @@ from torch.optim import Optimizer import math +from avalanche.training.templates.strategy_mixin_protocol import CriterionType + try: import higher except ImportError: @@ -27,9 +29,10 @@ class LaMAML(SupervisedMetaLearningTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion=CrossEntropyLoss(), + criterion: CriterionType = CrossEntropyLoss(), n_inner_updates: int = 5, second_order: bool = True, grad_clip_norm: float = 1.0, @@ -183,9 +186,11 @@ def inner_update_step(self, fast_model, x, y, t): # Clip grad norms grads = [ - torch.clamp(g, min=-self.grad_clip_norm, max=self.grad_clip_norm) - if g is not None - else g + ( + torch.clamp(g, min=-self.grad_clip_norm, max=self.grad_clip_norm) + if g is not None + else g + ) for g in grads ] diff --git a/avalanche/training/supervised/lamaml_v2.py b/avalanche/training/supervised/lamaml_v2.py index 9301a14c5..36905cb80 100644 --- a/avalanche/training/supervised/lamaml_v2.py +++ b/avalanche/training/supervised/lamaml_v2.py @@ -3,6 +3,8 @@ import warnings import torch +from avalanche.training.templates.strategy_mixin_protocol import CriterionType + if parse(torch.__version__) < parse("2.0.0"): 
warnings.warn(f"LaMAML requires torch >= 2.0.0.") @@ -24,9 +26,10 @@ class LaMAML(SupervisedMetaLearningTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion=CrossEntropyLoss(), + criterion: CriterionType = CrossEntropyLoss(), n_inner_updates: int = 5, second_order: bool = True, grad_clip_norm: float = 1.0, @@ -185,9 +188,11 @@ def inner_update_step(self, fast_params, x, y, t): # Clip grad norms grads = [ - torch.clamp(g, min=-self.grad_clip_norm, max=self.grad_clip_norm) - if g is not None - else g + ( + torch.clamp(g, min=-self.grad_clip_norm, max=self.grad_clip_norm) + if g is not None + else g + ) for g in grads ] diff --git a/avalanche/training/supervised/mer.py b/avalanche/training/supervised/mer.py index b1a100cdf..d4d709d17 100644 --- a/avalanche/training/supervised/mer.py +++ b/avalanche/training/supervised/mer.py @@ -11,6 +11,7 @@ from avalanche.training.plugins.evaluation import default_evaluator from avalanche.training.storage_policy import ReservoirSamplingBuffer from avalanche.training.templates import SupervisedMetaLearningTemplate +from avalanche.training.templates.strategy_mixin_protocol import CriterionType class MERBuffer: @@ -51,9 +52,10 @@ def get_batch(self, x, y, t): class MER(SupervisedMetaLearningTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion=CrossEntropyLoss(), + criterion: CriterionType = CrossEntropyLoss(), mem_size=200, batch_size_mem=10, n_inner_steps=5, @@ -69,6 +71,7 @@ def __init__( ] = default_evaluator, eval_every=-1, peval_mode="epoch", + **kwargs ): """Implementation of Look-ahead MAML (LaMAML) algorithm in Avalanche using Higher library for applying fast updates. @@ -85,17 +88,18 @@ def __init__( """ super().__init__( - model, - optimizer, - criterion, - train_mb_size, - train_epochs, - eval_mb_size, - device, - plugins, - evaluator, - eval_every, - peval_mode, + model=model, + optimizer=optimizer, + criterion=criterion, + train_mb_size=train_mb_size, + train_epochs=train_epochs, + eval_mb_size=eval_mb_size, + device=device, + plugins=plugins, + evaluator=evaluator, + eval_every=eval_every, + peval_mode=peval_mode, + **kwargs ) self.buffer = MERBuffer( diff --git a/avalanche/training/supervised/strategy_wrappers.py b/avalanche/training/supervised/strategy_wrappers.py index 30a4484a0..541d500b2 100644 --- a/avalanche/training/supervised/strategy_wrappers.py +++ b/avalanche/training/supervised/strategy_wrappers.py @@ -45,10 +45,11 @@ ) from avalanche.training.templates.base import BaseTemplate from avalanche.training.templates import SupervisedTemplate -from avalanche.evaluation.metrics import accuracy_metrics, loss_metrics +from avalanche.evaluation.metrics import loss_metrics from avalanche.models.generator import MlpVAE, VAE_loss from avalanche.models.expert_gate import AE_loss from avalanche.logging import InteractiveLogger +from avalanche.training.templates.strategy_mixin_protocol import CriterionType class Naive(SupervisedTemplate): @@ -65,9 +66,10 @@ class Naive(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion=CrossEntropyLoss(), + criterion: CriterionType = CrossEntropyLoss(), train_mb_size: int = 1, train_epochs: int = 1, eval_mb_size: Optional[int] = None, @@ -100,10 +102,11 @@ def __init__( :param base_kwargs: any additional :class:`~avalanche.training.BaseTemplate` constructor arguments. 
""" + super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -123,6 +126,7 @@ class PNNStrategy(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, criterion=CrossEntropyLoss(), @@ -157,8 +161,7 @@ def __init__( :param base_kwargs: any additional :class:`~avalanche.training.BaseTemplate` constructor arguments. """ - # Check that the model has the correct architecture. - assert isinstance(model, PNN), "PNNStrategy requires a PNN model." + super().__init__( model=model, optimizer=optimizer, @@ -173,6 +176,9 @@ def __init__( **base_kwargs ) + # Check that the model has the correct architecture. + assert isinstance(self.model, PNN), "PNNStrategy requires a PNN model." + class PackNet(SupervisedTemplate): """Task-incremental fixed-network parameter isolation with PackNet. @@ -195,6 +201,7 @@ class PackNet(SupervisedTemplate): def __init__( self, + *, model: Union[PackNetModule, PackNetModel], optimizer: Optimizer, post_prune_epochs: int, @@ -240,13 +247,10 @@ def __init__( :class:`~avalanche.training.BaseTemplate` constructor arguments. """ - if not isinstance(model, PackNetModule): - raise ValueError("PackNet requires a model that implements PackNetModule.") - super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -257,15 +261,19 @@ def __init__( **base_kwargs ) + if not isinstance(self.model, PackNetModule): + raise ValueError("PackNet requires a model that implements PackNetModule.") + class CWRStar(SupervisedTemplate): """CWR* Strategy.""" def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion, + criterion: CriterionType, cwr_layer_name: str, train_mb_size: int = 1, train_epochs: int = 1, @@ -297,18 +305,23 @@ def __init__( only at the end of the learning experience. Values >0 mean that `eval` is called every `eval_every` epochs and at the end of the learning experience. - :param \*\*base_kwargs: any additional + :param **base_kwargs: any additional :class:`~avalanche.training.BaseTemplate` constructor arguments. """ - cwsp = CWRStarPlugin(model, cwr_layer_name, freeze_remaining_model=True) + + cwsp = CWRStarPlugin( + model, + cwr_layer_name, + freeze_remaining_model=True, + ) if plugins is None: plugins = [cwsp] else: plugins.append(cwsp) super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -329,9 +342,10 @@ class Replay(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion, + criterion: CriterionType, mem_size: int = 200, train_mb_size: int = 1, train_epochs: int = 1, @@ -362,7 +376,7 @@ def __init__( only at the end of the learning experience. Values >0 mean that `eval` is called every `eval_every` epochs and at the end of the learning experience. - :param \*\*base_kwargs: any additional + :param **base_kwargs: any additional :class:`~avalanche.training.BaseTemplate` constructor arguments. 
""" @@ -372,9 +386,9 @@ def __init__( else: plugins.append(rp) super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -407,9 +421,10 @@ class GenerativeReplay(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion=CrossEntropyLoss(), + criterion: CriterionType = CrossEntropyLoss(), train_mb_size: int = 1, train_epochs: int = 1, eval_mb_size: Optional[int] = None, @@ -445,7 +460,7 @@ def __init__( learning experience. :param generator_strategy: A trainable strategy with a generative model, which employs GenerativeReplayPlugin. Defaults to None. - :param \*\*base_kwargs: any additional + :param **base_kwargs: any additional :class:`~avalanche.training.BaseTemplate` constructor arguments. """ @@ -503,9 +518,9 @@ def __init__( plugins.append(rp) super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -536,6 +551,7 @@ class AETraining(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, device, @@ -566,14 +582,14 @@ def __init__( only at the end of the learning experience. Values >0 mean that `eval` is called every `eval_every` epochs and at the end of the learning experience. - :param \*\*base_kwargs: any additional + :param **base_kwargs: any additional :class:`~avalanche.training.BaseTemplate` constructor arguments. """ super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -608,6 +624,7 @@ class VAETraining(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, criterion=VAE_loss, @@ -640,14 +657,14 @@ def __init__( only at the end of the learning experience. Values >0 mean that `eval` is called every `eval_every` epochs and at the end of the learning experience. - :param \*\*base_kwargs: any additional + :param **base_kwargs: any additional :class:`~avalanche.training.BaseTemplate` constructor arguments. """ super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -673,9 +690,10 @@ class GSS_greedy(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion, + criterion: CriterionType, mem_size: int = 200, mem_strength=1, input_size=[], @@ -709,7 +727,7 @@ def __init__( only at the end of the learning experience. Values >0 mean that `eval` is called every `eval_every` epochs and at the end of the learning experience. - :param \*\*base_kwargs: any additional + :param **base_kwargs: any additional :class:`~avalanche.training.BaseTemplate` constructor arguments. 
""" rp = GSS_greedyPlugin( @@ -720,9 +738,9 @@ def __init__( else: plugins.append(rp) super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -743,9 +761,10 @@ class GDumb(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion, + criterion: CriterionType, mem_size: int = 200, train_mb_size: int = 1, train_epochs: int = 1, @@ -787,9 +806,9 @@ def __init__( plugins.append(gdumb) super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -809,9 +828,10 @@ class LwF(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion, + criterion: CriterionType, alpha: Union[float, Sequence[float]], temperature: float, train_mb_size: int = 1, @@ -856,9 +876,9 @@ def __init__( plugins.append(lwf) super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -879,9 +899,10 @@ class AGEM(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion, + criterion: CriterionType, patterns_per_exp: int, sample_size: int = 64, train_mb_size: int = 1, @@ -926,9 +947,9 @@ def __init__( plugins.append(agem) super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -949,9 +970,10 @@ class GEM(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion, + criterion: CriterionType, patterns_per_exp: int, memory_strength: float = 0.5, train_mb_size: int = 1, @@ -996,9 +1018,9 @@ def __init__( plugins.append(gem) super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -1019,9 +1041,10 @@ class EWC(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion, + criterion: CriterionType, ewc_lambda: float, mode: str = "separate", decay_factor: Optional[float] = None, @@ -1077,9 +1100,9 @@ def __init__( plugins.append(ewc) super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -1109,9 +1132,10 @@ class SynapticIntelligence(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion, + criterion: CriterionType, si_lambda: Union[float, Sequence[float]], eps: float = 0.0000001, train_mb_size: int = 1, @@ -1161,13 +1185,13 @@ def __init__( # entire implementation of the strategy! 
plugins.append(SynapticIntelligencePlugin(si_lambda=si_lambda, eps=eps)) - super(SynapticIntelligence, self).__init__( - model, - optimizer, - criterion, - train_mb_size, - train_epochs, - eval_mb_size, + super().__init__( + model=model, + optimizer=optimizer, + criterion=criterion, + train_mb_size=train_mb_size, + train_epochs=train_epochs, + eval_mb_size=eval_mb_size, device=device, plugins=plugins, evaluator=evaluator, @@ -1185,9 +1209,10 @@ class CoPE(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion, + criterion: CriterionType, mem_size: int = 200, n_classes: int = 10, p_size: int = 100, @@ -1240,9 +1265,9 @@ def __init__( else: plugins.append(copep) super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -1264,9 +1289,10 @@ class LFL(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion, + criterion: CriterionType, lambda_e: Union[float, Sequence[float]], train_mb_size: int = 1, train_epochs: int = 1, @@ -1308,9 +1334,9 @@ def __init__( plugins.append(lfl) super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -1331,9 +1357,10 @@ class MAS(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion, + criterion: CriterionType, lambda_reg: float = 1.0, alpha: float = 0.5, verbose: bool = False, @@ -1385,9 +1412,9 @@ def __init__( plugins.append(mas) super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -1408,9 +1435,10 @@ class BiC(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion, + criterion: CriterionType, mem_size: int = 200, val_percentage: float = 0.1, T: int = 2, @@ -1456,7 +1484,7 @@ def __init__( only at the end of the learning experience. Values >0 mean that `eval` is called every `eval_every` epochs and at the end of the learning experience. - :param \*\*base_kwargs: any additional + :param **base_kwargs: any additional :class:`~avalanche.training.BaseTemplate` constructor arguments. 
""" @@ -1477,9 +1505,9 @@ def __init__( plugins.append(bic) super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -1498,9 +1526,10 @@ class MIR(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion, + criterion: CriterionType, mem_size: int, subsample: int, batch_size_mem: int = 1, @@ -1542,7 +1571,9 @@ def __init__( # Instantiate plugin mir = MIRPlugin( - mem_size=mem_size, subsample=subsample, batch_size_mem=batch_size_mem + mem_size=mem_size, + subsample=subsample, + batch_size_mem=batch_size_mem, ) # Add plugin to the strategy @@ -1552,9 +1583,9 @@ def __init__( plugins.append(mir) super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -1577,9 +1608,10 @@ class FromScratchTraining(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion, + criterion: CriterionType, reset_optimizer: bool = True, train_mb_size: int = 1, train_epochs: int = 1, @@ -1611,7 +1643,7 @@ def __init__( only at the end of the learning experience. Values >0 mean that `eval` is called every `eval_every` epochs and at the end of the learning experience. - :param \*\*base_kwargs: any additional + :param **base_kwargs: any additional :class:`~avalanche.training.BaseTemplate` constructor arguments. """ @@ -1621,9 +1653,9 @@ def __init__( else: plugins.append(fstp) super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, diff --git a/avalanche/training/supervised/strategy_wrappers_online.py b/avalanche/training/supervised/strategy_wrappers_online.py index 640f5abb4..ffe6f1575 100644 --- a/avalanche/training/supervised/strategy_wrappers_online.py +++ b/avalanche/training/supervised/strategy_wrappers_online.py @@ -21,6 +21,7 @@ SupervisedTemplate, ) from avalanche._annotations import deprecated +from avalanche.training.templates.strategy_mixin_protocol import CriterionType @deprecated( @@ -43,9 +44,10 @@ class OnlineNaive(SupervisedTemplate): def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion=CrossEntropyLoss(), + criterion: CriterionType = CrossEntropyLoss(), train_passes: int = 1, train_mb_size: int = 1, eval_mb_size: Optional[int] = None, diff --git a/avalanche/training/supervised/supervised_contrastive_replay.py b/avalanche/training/supervised/supervised_contrastive_replay.py index 6f32cb33b..8eba2c6bc 100644 --- a/avalanche/training/supervised/supervised_contrastive_replay.py +++ b/avalanche/training/supervised/supervised_contrastive_replay.py @@ -48,6 +48,7 @@ def __init__( evaluator=default_evaluator, eval_every=-1, peval_mode="epoch", + **kwargs ): """ :param model: an Avalanche model like the avalanche.models.SCRModel, @@ -111,17 +112,17 @@ def __init__( else: raise ValueError("`plugins` parameter needs to be a list.") super().__init__( - model, - optimizer, - SCRLoss(temperature=self.temperature), - train_mb_size, - train_epochs, - eval_mb_size, - device, - plugins, - evaluator, - eval_every, - peval_mode, + model=model, + optimizer=optimizer, + criterion=SCRLoss(temperature=self.temperature), + train_mb_size=train_mb_size, + train_epochs=train_epochs, + eval_mb_size=eval_mb_size, + 
device=device, + plugins=plugins, + evaluator=evaluator, + eval_every=eval_every, + peval_mode=peval_mode, ) def criterion(self): @@ -189,3 +190,6 @@ def compute_class_means(self): class_means[label] /= class_means[label].norm() self.model.eval_classifier.update_class_means_dict(class_means) + + +__all__ = ["SCR"] diff --git a/avalanche/training/templates/__init__.py b/avalanche/training/templates/__init__.py index 1d3308113..84a482291 100644 --- a/avalanche/training/templates/__init__.py +++ b/avalanche/training/templates/__init__.py @@ -9,6 +9,7 @@ Templates are the backbone that supports the plugin systems. """ + from .base import BaseTemplate from .base_sgd import BaseSGDTemplate from .common_templates import ( diff --git a/avalanche/training/templates/base.py b/avalanche/training/templates/base.py index 7cd34c6a6..2ec228291 100644 --- a/avalanche/training/templates/base.py +++ b/avalanche/training/templates/base.py @@ -1,7 +1,18 @@ import sys import warnings from collections import defaultdict -from typing import Iterable, Sequence, Optional, TypeVar, Union, List +from typing import ( + Any, + Dict, + Iterable, + OrderedDict, + Sequence, + Optional, + Type, + TypeVar, + Union, + List, +) import torch from torch.nn import Module @@ -11,7 +22,8 @@ from avalanche.distributed.distributed_helper import DistributedHelper from avalanche.training.templates.strategy_mixin_protocol import BaseStrategyProtocol from avalanche.training.utils import trigger_plugins - +import functools +import inspect TExperienceType = TypeVar("TExperienceType", bound=CLExperience) TPluginType = TypeVar("TPluginType", bound=BasePlugin, contravariant=True) @@ -34,17 +46,24 @@ class BaseTemplate(BaseStrategyProtocol[TExperienceType]): """ + def __init_subclass__(cls, **kwargs): + # This is needed to manage the transition to keyword-only arguments. + cls.__init__ = _support_legacy_strategy_positional_args(cls) + super().__init_subclass__(**kwargs) + # we need this only for type checking PLUGIN_CLASS = BasePlugin def __init__( self, + *, model: Module, device: Union[str, torch.device] = "cpu", plugins: Optional[Sequence[BasePlugin]] = None, + **kwargs, ): - super().__init__() """Init.""" + super().__init__(model=model, device=device, plugins=plugins, **kwargs) self.model: Module = model """ PyTorch model. """ @@ -128,9 +147,9 @@ def train( self.model.to(self.device) # Normalize training and eval data. 
- experiences_list: Iterable[ - TExperienceType - ] = _experiences_parameter_as_iterable(experiences) + experiences_list: Iterable[TExperienceType] = ( + _experiences_parameter_as_iterable(experiences) + ) if eval_streams is None: eval_streams = [experiences_list] @@ -182,9 +201,9 @@ def eval( self.is_training = False self.model.eval() - experiences_list: Iterable[ - TExperienceType - ] = _experiences_parameter_as_iterable(experiences) + experiences_list: Iterable[TExperienceType] = ( + _experiences_parameter_as_iterable(experiences) + ) self.current_eval_stream = experiences_list self._before_eval(**kwargs) @@ -350,4 +369,246 @@ def _experiences_parameter_as_iterable( return [experiences] +class PositionalArgumentsDeprecatedWarning(UserWarning): + pass + + +def _warn_init_has_positional_args(init_method, class_name): + init_args = inspect.signature(init_method).parameters + positional_args = [ + k + for k, v in init_args.items() + if v.kind + in { + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.VAR_POSITIONAL, + } + ] + if len(positional_args) > 1: # self is always present + warnings.warn( + f"Avalanche is transitioning to strategy constructors that accept named (keyword) arguments only. " + f"This is done to ensure that there is no confusion regarding the meaning of each argument (strategies can have many arguments). " + f"Your strategy {class_name}.__init__ method still has some positional-only or " + f"positional-or-keyword arguments. Consider removing them. Offending arguments: {positional_args}. " + f"This can be achieved by adding a * in the argument list of your __init__ method just after 'self'. " + f"More info: https://peps.python.org/pep-3102/#specification" + ) + + +def _merge_legacy_positional_arguments( + init_method, + class_name: str, + args: Sequence[Any], + kwargs: Dict[str, Any], + allow_pos_args=True, +): + """ + Manage the legacy positional constructor parameters. + + Used to warn the user when passing positional parameters to strategy constructors + (which is deprecated). + + To allow for a smooth transition, we allow the user to pass positional + arguments to the constructor. However, we warn the user that + this soft transition mechanism will be removed in the future. + """ + + if len(args) == 1: + # No positional argument has been passed (good!) + return args, kwargs + elif len(args) == 0: + # This should never happen and will fail later. + # assert len(args) == 0, "At least the 'self' argument should be passed" + return args, kwargs + + all_init_args = dict(inspect.signature(init_method).parameters) + + # Remove 'self' from the list of arguments + all_init_args.pop("self") + + # Divide parameters in groups + pos_only_args = [ + (k, v) + for k, v in all_init_args.items() + if v.kind == inspect.Parameter.POSITIONAL_ONLY + ] + pos_or_keyword = [ + (k, v) + for k, v in all_init_args.items() + if v.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD + ] + var_pos = [ + (k, v) + for k, v in all_init_args.items() + if v.kind == inspect.Parameter.VAR_POSITIONAL + ] + keyword_only_args = [ + (k, v) + for k, v in all_init_args.items() + if v.kind == inspect.Parameter.KEYWORD_ONLY + ] + var_keyword_args = [ + (k, v) + for k, v in all_init_args.items() + if v.kind == inspect.Parameter.VAR_KEYWORD + ] + + error_str = ( + f"Avalanche is transitioning to strategy constructors that accept named (keyword) arguments only. 
" + f"This is done to ensure that there is no confusion regarding the meaning of each argument (strategies can have many arguments). " + f"Your are passing {len(args) - 1} positional arguments to the {class_name}.__init__ method. " + f"Consider passing them as names arguments." + ) + + if allow_pos_args: + error_str += ( + " The ability to pass positional arguments will be removed in the future." + ) + warnings.warn(error_str, category=PositionalArgumentsDeprecatedWarning) + else: + raise PositionalArgumentsDeprecatedWarning(error_str) + + args_to_manage = list(args) + kwargs_to_manage = dict(kwargs) + + result_args = [args_to_manage.pop(0)] # Add self + result_kwargs = OrderedDict() + + unset_arguments = set(all_init_args.keys()) + + for argument_name, arg_def in pos_only_args: + if len(args_to_manage) > 0: + result_args.append(args_to_manage.pop(0)) + elif arg_def.default is inspect.Parameter.empty: + raise ValueError( + f"Positional-only argument {argument_name} is not set (and no default set)." + ) + unset_arguments.remove(argument_name) + + for argument_name, arg_def in pos_or_keyword: + if len(args_to_manage) > 0: + result_args.append(args_to_manage.pop(0)) + elif argument_name in kwargs_to_manage: + result_kwargs[argument_name] = kwargs_to_manage.pop(argument_name) + elif arg_def.default is inspect.Parameter.empty: + raise ValueError( + f"Parameter {argument_name} is not set (and no default provided)." + ) + + if argument_name not in unset_arguments: + # This is the same error and message raised by Python when passing + # multiple values for an argument. + raise TypeError(f"Got multiple values for argument '{argument_name}'") + unset_arguments.remove(argument_name) + + if len(var_pos) > 0: + # assert len(var_pos) == 1, "Only one var-positional argument is supported" + argument_name = var_pos[0][0] + if len(args_to_manage) > 0: + result_args.extend(args_to_manage) + args_to_manage = list() + + if argument_name not in unset_arguments: + # This is the same error and message raised by Python when passing + # multiple values for an argument. + raise TypeError(f"Got multiple values for argument '{argument_name}'") + + unset_arguments.remove(argument_name) + + for argument_name, arg_def in keyword_only_args: + if len(args_to_manage) > 0 and argument_name in kwargs_to_manage: + raise TypeError( + f"Got multiple values for argument '{argument_name}' (passed as both positional and named parameter)" + ) + + if len(args_to_manage) > 0: + # This is where the soft transition mechanism is implemented. + # The legacy positional arguments are transformed to keyword arguments. + result_kwargs[argument_name] = args_to_manage.pop(0) + elif argument_name in kwargs_to_manage: + result_kwargs[argument_name] = kwargs_to_manage.pop(argument_name) + elif arg_def.default is inspect.Parameter.empty: + raise ValueError( + f"Keyword-only parameter {argument_name} is not set (and no default set)." + ) + + if argument_name not in unset_arguments: + # This is the same error and message raised by Python when passing + # multiple values for an argument. 
+ raise TypeError(f"Got multiple values for argument '{argument_name}'") + + unset_arguments.remove(argument_name) + + if len(var_keyword_args) > 0: + # assert len(var_keyword_args) == 1, "Only one var-keyword argument is supported" + argument_name = var_keyword_args[0][0] + result_kwargs.update(kwargs_to_manage) + kwargs_to_manage = dict() + + if argument_name not in unset_arguments: + # This is the same error and message raised by Python when passing + # multiple values for an argument. + raise TypeError(f"Got multiple values for argument '{argument_name}'") + unset_arguments.remove(argument_name) + + assert len(unset_arguments) == 0 + + return result_args, result_kwargs + + +def _check_mispelled_kwargs(cls: Type, kwargs: Dict[str, Any]): + # First: we gather all parameter names of inits of all the classes in the mro + all_init_args = set() + for c in cls.mro(): + # We then consider only positional_or_keyword and keyword_only arguments + # Also, it does not make sense to include self + all_init_args.update( + k + for k, v in inspect.signature(c.__init__).parameters.items() + if v.kind + in { + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.KEYWORD_ONLY, + } + and k != "self" + ) + + passed_parameters = set(kwargs.keys()) + passed_parameters.discard( + "self" + ) # self should not be in kwargs, but it's better to be safe + + # Then we check if there are any mispelled/unexpected arguments + unexpected_args = list(passed_parameters - all_init_args) + + if len(unexpected_args) == 1: + raise TypeError( + f"{cls.__name__}.__init__ got an unexpected keyword argument: {unexpected_args[0]}. " + "This parameter is not accepted by the strategy class or any of its super classes. " + "Please check if you have mispelled the parameter name." + ) + elif len(unexpected_args) > 1: + raise TypeError( + f"{cls.__name__}.__init__ got unexpected keyword arguments: {unexpected_args}. " + "Those parameters are not accepted by the strategy class or any of its super classes. " + "Please check if you have mispelled any parameter name." 
+ ) + + +def _support_legacy_strategy_positional_args(cls): + init_method, cls_name = cls.__init__, cls.__name__ + + @functools.wraps(init_method) + def wrap_init(*args, **kwargs): + _warn_init_has_positional_args(init_method, cls_name) + args, kwargs = _merge_legacy_positional_arguments( + init_method, cls_name, args, kwargs, allow_pos_args=True + ) + _check_mispelled_kwargs(cls, kwargs) + return init_method(*args, **kwargs) + + return wrap_init + + __all__ = ["BaseTemplate"] diff --git a/avalanche/training/templates/base_sgd.py b/avalanche/training/templates/base_sgd.py index 4bcaec94c..e54f402a6 100644 --- a/avalanche/training/templates/base_sgd.py +++ b/avalanche/training/templates/base_sgd.py @@ -1,4 +1,6 @@ -from typing import Any, Callable, Iterable, Sequence, Optional, TypeVar, Union +import sys +from typing import Any, Callable, Generic, Iterable, Sequence, Optional, TypeVar, Union +from typing_extensions import TypeAlias from packaging.version import parse import torch @@ -7,7 +9,7 @@ from torch import Tensor from avalanche.benchmarks import CLExperience, CLStream -from avalanche.benchmarks.scenarios.deprecated import DatasetExperience +from avalanche.benchmarks import DatasetExperience from avalanche.benchmarks.utils.data import AvalancheDataset from avalanche.core import BasePlugin, BaseSGDPlugin from avalanche.training.plugins import EvaluationPlugin @@ -20,7 +22,10 @@ collate_from_data_or_kwargs, ) -from avalanche.training.templates.strategy_mixin_protocol import SGDStrategyProtocol +from avalanche.training.templates.strategy_mixin_protocol import ( + CriterionType, + SGDStrategyProtocol, +) from avalanche.training.utils import trigger_plugins @@ -30,6 +35,7 @@ class BaseSGDTemplate( + Generic[TDatasetExperience, TMBInput, TMBOutput], SGDStrategyProtocol[TDatasetExperience, TMBInput, TMBOutput], BaseTemplate[TDatasetExperience], ): @@ -53,9 +59,10 @@ class BaseSGDTemplate( def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion=CrossEntropyLoss(), + criterion: CriterionType = CrossEntropyLoss(), train_mb_size: int = 1, train_epochs: int = 1, eval_mb_size: Optional[int] = 1, @@ -66,6 +73,7 @@ def __init__( ] = default_evaluator, eval_every=-1, peval_mode="epoch", + **kwargs ): """Init. @@ -87,8 +95,39 @@ def __init__( `eval_every` epochs or iterations (Default='epoch'). """ - super().__init__() # type: ignore - BaseTemplate.__init__(self=self, model=model, device=device, plugins=plugins) + # Call super with all args + if sys.version_info >= (3, 11): + super().__init__( + model=model, + optimizer=optimizer, + criterion=criterion, + train_mb_size=train_mb_size, + train_epochs=train_epochs, + eval_mb_size=eval_mb_size, + device=device, + plugins=plugins, + evaluator=evaluator, + eval_every=eval_every, + peval_mode=peval_mode, + **kwargs + ) + else: + super().__init__() # type: ignore + BaseTemplate.__init__( + self, + model=model, + optimizer=optimizer, + criterion=criterion, + train_mb_size=train_mb_size, + train_epochs=train_epochs, + eval_mb_size=eval_mb_size, + device=device, + plugins=plugins, + evaluator=evaluator, + eval_every=eval_every, + peval_mode=peval_mode, + **kwargs + ) self.optimizer: Optimizer = optimizer """ PyTorch optimizer. 
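For illustration only (this note and the snippet are not hunks of the patch): the user-visible effect of `__init_subclass__` installing `_support_legacy_strategy_positional_args` on every strategy class is roughly the following, assuming the rest of the patch is applied. `SimpleMLP` and the argument values are placeholders, and `trai_epochs` is a deliberately misspelled keyword.

from torch.nn import CrossEntropyLoss
from torch.optim import SGD

from avalanche.models import SimpleMLP
from avalanche.training.supervised import Naive

model = SimpleMLP(num_classes=10)
optimizer = SGD(model.parameters(), lr=0.01)

# Legacy positional call: the wrapper remaps positional arguments to keyword
# arguments and emits PositionalArgumentsDeprecatedWarning instead of failing.
strategy = Naive(model, optimizer, CrossEntropyLoss(), train_mb_size=32)

# Misspelled keyword argument: rejected with a TypeError by the kwargs check.
try:
    Naive(model=model, optimizer=optimizer, trai_epochs=2)
except TypeError as err:
    print(err)

# Recommended style going forward: keyword arguments only.
strategy = Naive(
    model=model,
    optimizer=optimizer,
    criterion=CrossEntropyLoss(),
    train_mb_size=32,
)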
""" @@ -617,3 +656,10 @@ def after_training_exp(self, strategy, **kwargs): # if self.peval_mode == "experience": # self._maybe_peval(strategy, strategy.clock.train_exp_counter, # **kwargs) + + +__all__ = [ + "CriterionType", + "BaseSGDTemplate", + "PeriodicEval", +] diff --git a/avalanche/training/templates/common_templates.py b/avalanche/training/templates/common_templates.py index 61ba13fac..8405c5a6f 100644 --- a/avalanche/training/templates/common_templates.py +++ b/avalanche/training/templates/common_templates.py @@ -1,17 +1,21 @@ +import sys from typing import Callable, Sequence, Optional, TypeVar, Union import torch from torch.nn import Module, CrossEntropyLoss from torch.optim import Optimizer -from ...benchmarks.scenarios.deprecated import DatasetExperience +from avalanche.benchmarks import DatasetExperience -from avalanche.core import BasePlugin, BaseSGDPlugin, SupervisedPlugin +from avalanche.core import BasePlugin, SupervisedPlugin from avalanche.training.plugins.evaluation import ( EvaluationPlugin, default_evaluator, ) from avalanche.training.templates.strategy_mixin_protocol import ( + CriterionType, SupervisedStrategyProtocol, + TMBOutput, + TMBInput, ) from .observation_type import * @@ -21,18 +25,15 @@ TDatasetExperience = TypeVar("TDatasetExperience", bound=DatasetExperience) -TMBInput = TypeVar("TMBInput") -TMBOutput = TypeVar("TMBOutput") class SupervisedTemplate( - BatchObservation, - SupervisedProblem, - SGDUpdate, + BatchObservation[TDatasetExperience, TMBInput, TMBInput], + SupervisedProblem[TDatasetExperience, TMBInput, TMBInput], + SGDUpdate[TDatasetExperience, TMBInput, TMBOutput], SupervisedStrategyProtocol[TDatasetExperience, TMBInput, TMBOutput], BaseSGDTemplate[TDatasetExperience, TMBInput, TMBOutput], ): - """Base class for continual learning strategies. SupervisedTemplate is the super class of all supervised task-based @@ -79,11 +80,13 @@ class SupervisedTemplate( PLUGIN_CLASS = SupervisedPlugin + # TODO: remove default values of model and optimizer when legacy positional arguments are definitively removed def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion=CrossEntropyLoss(), + criterion: CriterionType = CrossEntropyLoss(), train_mb_size: int = 1, train_epochs: int = 1, eval_mb_size: Optional[int] = 1, @@ -94,6 +97,7 @@ def __init__( ] = default_evaluator, eval_every=-1, peval_mode="epoch", + **kwargs, ): """Init. @@ -119,40 +123,29 @@ def __init__( periodic evaluation during training should execute every `eval_every` epochs or iterations (Default='epoch'). """ - super().__init__() # type: ignore - BaseSGDTemplate.__init__( - self=self, - model=model, - optimizer=optimizer, - criterion=criterion, - train_mb_size=train_mb_size, - train_epochs=train_epochs, - eval_mb_size=eval_mb_size, - device=device, - plugins=plugins, - evaluator=evaluator, - eval_every=eval_every, - peval_mode=peval_mode, - ) - ################################################################### - # State variables. These are updated during the train/eval loops. # - ################################################################### - - # self.adapted_dataset = None - # """ Data used to train. It may be modified by plugins. Plugins can - # append data to it (e.g. for replay). - # - # .. note:: - # - # This dataset may contain samples from different experiences. If you - # want the original data for the current experience - # use :attr:`.BaseTemplate.experience`. 
+ kwargs["model"] = model + kwargs["optimizer"] = optimizer + kwargs["criterion"] = criterion + kwargs["train_mb_size"] = train_mb_size + kwargs["train_epochs"] = train_epochs + kwargs["eval_mb_size"] = eval_mb_size + kwargs["device"] = device + kwargs["plugins"] = plugins + kwargs["evaluator"] = evaluator + kwargs["eval_every"] = eval_every + kwargs["peval_mode"] = peval_mode + + if sys.version_info >= (3, 11): + super().__init__(**kwargs) + else: + super().__init__() + BaseSGDTemplate.__init__(self=self, **kwargs) class SupervisedMetaLearningTemplate( - BatchObservation, - SupervisedProblem, - MetaUpdate, + BatchObservation[TDatasetExperience, TMBInput, TMBInput], + SupervisedProblem[TDatasetExperience, TMBInput, TMBInput], + MetaUpdate[TDatasetExperience, TMBInput, TMBInput], BaseSGDTemplate[TDatasetExperience, TMBInput, TMBOutput], ): """Base class for continual learning strategies. @@ -201,11 +194,13 @@ class SupervisedMetaLearningTemplate( PLUGIN_CLASS = SupervisedPlugin + # TODO: remove default values of model and optimizer when legacy positional arguments are definitively removed def __init__( self, + *, model: Module, optimizer: Optimizer, - criterion=CrossEntropyLoss(), + criterion: CriterionType = CrossEntropyLoss(), train_mb_size: int = 1, train_epochs: int = 1, eval_mb_size: Optional[int] = 1, @@ -216,6 +211,7 @@ def __init__( ] = default_evaluator, eval_every=-1, peval_mode="epoch", + **kwargs, ): """Init. @@ -241,34 +237,23 @@ def __init__( periodic evaluation during training should execute every `eval_every` epochs or iterations (Default='epoch'). """ - super().__init__() # type: ignore - BaseSGDTemplate.__init__( - self=self, - model=model, - optimizer=optimizer, - criterion=criterion, - train_mb_size=train_mb_size, - train_epochs=train_epochs, - eval_mb_size=eval_mb_size, - device=device, - plugins=plugins, - evaluator=evaluator, - eval_every=eval_every, - peval_mode=peval_mode, - ) - ################################################################### - # State variables. These are updated during the train/eval loops. # - ################################################################### - - # self.adapted_dataset = None - # """ Data used to train. It may be modified by plugins. Plugins can - # append data to it (e.g. for replay). - # - # .. note:: - # - # This dataset may contain samples from different experiences. If you - # want the original data for the current experience - # use :attr:`.BaseTemplate.experience`. + kwargs["model"] = model + kwargs["optimizer"] = optimizer + kwargs["criterion"] = criterion + kwargs["train_mb_size"] = train_mb_size + kwargs["train_epochs"] = train_epochs + kwargs["eval_mb_size"] = eval_mb_size + kwargs["device"] = device + kwargs["plugins"] = plugins + kwargs["evaluator"] = evaluator + kwargs["eval_every"] = eval_every + kwargs["peval_mode"] = peval_mode + + if sys.version_info >= (3, 11): + super().__init__(**kwargs) + else: + super().__init__() + BaseSGDTemplate.__init__(self=self, **kwargs) __all__ = [ diff --git a/avalanche/training/templates/observation_type/__init__.py b/avalanche/training/templates/observation_type/__init__.py index f98135349..9f0cf3c00 100644 --- a/avalanche/training/templates/observation_type/__init__.py +++ b/avalanche/training/templates/observation_type/__init__.py @@ -2,4 +2,5 @@ batch(multiple epochs) vs. 
online(one epoch) """ + from .batch_observation import BatchObservation diff --git a/avalanche/training/templates/observation_type/batch_observation.py b/avalanche/training/templates/observation_type/batch_observation.py index e71bcf61e..374d71c45 100644 --- a/avalanche/training/templates/observation_type/batch_observation.py +++ b/avalanche/training/templates/observation_type/batch_observation.py @@ -1,19 +1,18 @@ -import warnings -from collections import defaultdict -from typing import List, TypeVar - -from torch import Tensor - from avalanche.benchmarks import OnlineCLExperience from avalanche.models.utils import avalanche_model_adaptation -from avalanche.training.templates.strategy_mixin_protocol import SGDStrategyProtocol +from avalanche.training.templates.strategy_mixin_protocol import ( + SGDStrategyProtocol, + TSGDExperienceType, + TMBInput, + TMBOutput, +) from avalanche.models.dynamic_optimizers import reset_optimizer, update_optimizer from avalanche.training.utils import _at_task_boundary -class BatchObservation(SGDStrategyProtocol): - def __init__(self): - super().__init__() +class BatchObservation(SGDStrategyProtocol[TSGDExperienceType, TMBInput, TMBOutput]): + def __init__(self, **kwargs): + super().__init__(**kwargs) self.optimized_param_id = None def model_adaptation(self, model=None): diff --git a/avalanche/training/templates/problem_type/__init__.py b/avalanche/training/templates/problem_type/__init__.py index 0932beb4c..81cad2a6d 100644 --- a/avalanche/training/templates/problem_type/__init__.py +++ b/avalanche/training/templates/problem_type/__init__.py @@ -2,4 +2,5 @@ how inputs should be mapped to outputs. """ + from .supervised_problem import SupervisedProblem diff --git a/avalanche/training/templates/problem_type/supervised_problem.py b/avalanche/training/templates/problem_type/supervised_problem.py index b2a264b63..1de5639ab 100644 --- a/avalanche/training/templates/problem_type/supervised_problem.py +++ b/avalanche/training/templates/problem_type/supervised_problem.py @@ -1,13 +1,21 @@ from avalanche.models import avalanche_forward from avalanche.training.templates.strategy_mixin_protocol import ( SupervisedStrategyProtocol, + TSGDExperienceType, + TMBInput, + TMBOutput, ) # Types are perfectly ok for MyPy # Also confirmed here: https://stackoverflow.com/a/70907644 # PyLance just does not understand it -class SupervisedProblem(SupervisedStrategyProtocol): +class SupervisedProblem( + SupervisedStrategyProtocol[TSGDExperienceType, TMBInput, TMBOutput] +): + def __init__(self, **kwargs): + super().__init__(**kwargs) + @property def mb_x(self): """Current mini-batch input.""" diff --git a/avalanche/training/templates/strategy_mixin_protocol.py b/avalanche/training/templates/strategy_mixin_protocol.py index 217e9a1f2..4596c9d39 100644 --- a/avalanche/training/templates/strategy_mixin_protocol.py +++ b/avalanche/training/templates/strategy_mixin_protocol.py @@ -1,4 +1,5 @@ -from typing import Iterable, List, Optional, TypeVar, Protocol +from typing import Generic, Iterable, List, Optional, TypeVar, Protocol, Callable, Union +from typing_extensions import TypeAlias from torch import Tensor import torch @@ -14,11 +15,13 @@ TExperienceType = TypeVar("TExperienceType", bound=CLExperience) TSGDExperienceType = TypeVar("TSGDExperienceType", bound=DatasetExperience) -TMBinput = TypeVar("TMBinput") -TMBoutput = TypeVar("TMBoutput") +TMBInput = TypeVar("TMBInput") +TMBOutput = TypeVar("TMBOutput") +CriterionType: TypeAlias = Union[Module, Callable[[Tensor, Tensor], Tensor]] 
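Not a hunk of the patch, just a usage sketch of the new alias: `CriterionType` documents that a criterion may be either an `nn.Module` or any callable mapping (predictions, targets) to a loss tensor. The label-smoothing choice below is arbitrary and `SimpleMLP` is only a stand-in model.

import torch
from torch import Tensor
from torch.nn import CrossEntropyLoss
from torch.optim import SGD

from avalanche.models import SimpleMLP
from avalanche.training.supervised import Naive


def smoothed_ce(predictions: Tensor, targets: Tensor) -> Tensor:
    # A bare callable satisfies CriterionType just as well as an nn.Module.
    return torch.nn.functional.cross_entropy(predictions, targets, label_smoothing=0.1)


model = SimpleMLP(num_classes=10)

# Module-based criterion (the default kind)...
naive_a = Naive(
    model=model,
    optimizer=SGD(model.parameters(), lr=0.01),
    criterion=CrossEntropyLoss(),
    train_mb_size=32,
)

# ...or a plain function: both are valid values for `criterion`.
naive_b = Naive(
    model=model,
    optimizer=SGD(model.parameters(), lr=0.01),
    criterion=smoothed_ce,
    train_mb_size=32,
)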
-class BaseStrategyProtocol(Protocol[TExperienceType]): + +class BaseStrategyProtocol(Generic[TExperienceType], Protocol[TExperienceType]): model: Module device: torch.device @@ -33,18 +36,19 @@ class BaseStrategyProtocol(Protocol[TExperienceType]): class SGDStrategyProtocol( + Generic[TSGDExperienceType, TMBInput, TMBOutput], BaseStrategyProtocol[TSGDExperienceType], - Protocol[TSGDExperienceType, TMBinput, TMBoutput], + Protocol[TSGDExperienceType, TMBInput, TMBOutput], ): """ A protocol for strategies to be used for typing mixin classes. """ - mbatch: Optional[TMBinput] + mbatch: Optional[TMBInput] - mb_output: Optional[TMBoutput] + mb_output: Optional[TMBOutput] - dataloader: Iterable[TMBinput] + dataloader: Iterable[TMBInput] _stop_training: bool @@ -52,59 +56,44 @@ class SGDStrategyProtocol( loss: Tensor - _criterion: Module + _criterion: CriterionType - def forward(self) -> TMBoutput: - ... + def forward(self) -> TMBOutput: ... - def criterion(self) -> Tensor: - ... + def criterion(self) -> Tensor: ... - def backward(self) -> None: - ... + def backward(self) -> None: ... - def _make_empty_loss(self) -> Tensor: - ... + def _make_empty_loss(self) -> Tensor: ... - def make_optimizer(self, **kwargs): - ... + def make_optimizer(self, **kwargs): ... - def optimizer_step(self) -> None: - ... + def optimizer_step(self) -> None: ... - def model_adaptation(self, model: Optional[Module] = None) -> Module: - ... + def model_adaptation(self, model: Optional[Module] = None) -> Module: ... - def _unpack_minibatch(self): - ... + def _unpack_minibatch(self): ... - def _before_training_iteration(self, **kwargs): - ... + def _before_training_iteration(self, **kwargs): ... - def _before_forward(self, **kwargs): - ... + def _before_forward(self, **kwargs): ... - def _after_forward(self, **kwargs): - ... + def _after_forward(self, **kwargs): ... - def _before_backward(self, **kwargs): - ... + def _before_backward(self, **kwargs): ... - def _after_backward(self, **kwargs): - ... + def _after_backward(self, **kwargs): ... - def _before_update(self, **kwargs): - ... + def _before_update(self, **kwargs): ... - def _after_update(self, **kwargs): - ... + def _after_update(self, **kwargs): ... - def _after_training_iteration(self, **kwargs): - ... + def _after_training_iteration(self, **kwargs): ... class SupervisedStrategyProtocol( - SGDStrategyProtocol[TSGDExperienceType, TMBinput, TMBoutput], Protocol + SGDStrategyProtocol[TSGDExperienceType, TMBInput, TMBOutput], + Protocol[TSGDExperienceType, TMBInput, TMBOutput], ): mb_x: Tensor @@ -114,25 +103,20 @@ class SupervisedStrategyProtocol( class MetaLearningStrategyProtocol( - SGDStrategyProtocol[TSGDExperienceType, TMBinput, TMBoutput], Protocol + SGDStrategyProtocol[TSGDExperienceType, TMBInput, TMBOutput], + Protocol[TSGDExperienceType, TMBInput, TMBOutput], ): - def _before_inner_updates(self, **kwargs): - ... + def _before_inner_updates(self, **kwargs): ... - def _inner_updates(self, **kwargs): - ... + def _inner_updates(self, **kwargs): ... - def _after_inner_updates(self, **kwargs): - ... + def _after_inner_updates(self, **kwargs): ... - def _before_outer_update(self, **kwargs): - ... + def _before_outer_update(self, **kwargs): ... - def _outer_update(self, **kwargs): - ... + def _outer_update(self, **kwargs): ... - def _after_outer_update(self, **kwargs): - ... + def _after_outer_update(self, **kwargs): ... 
__all__ = [ diff --git a/avalanche/training/templates/update_type/meta_update.py b/avalanche/training/templates/update_type/meta_update.py index 79e8a5016..b74c0066e 100644 --- a/avalanche/training/templates/update_type/meta_update.py +++ b/avalanche/training/templates/update_type/meta_update.py @@ -1,10 +1,16 @@ from avalanche.training.templates.strategy_mixin_protocol import ( MetaLearningStrategyProtocol, + TSGDExperienceType, + TMBInput, + TMBOutput, ) from avalanche.training.utils import trigger_plugins -class MetaUpdate(MetaLearningStrategyProtocol): +class MetaUpdate(MetaLearningStrategyProtocol[TSGDExperienceType, TMBInput, TMBOutput]): + def __init__(self, **kwargs): + super().__init__(**kwargs) + def training_epoch(self, **kwargs): """Training epoch. diff --git a/avalanche/training/templates/update_type/sgd_update.py b/avalanche/training/templates/update_type/sgd_update.py index 8014dc409..934c9cfe6 100644 --- a/avalanche/training/templates/update_type/sgd_update.py +++ b/avalanche/training/templates/update_type/sgd_update.py @@ -1,7 +1,15 @@ -from avalanche.training.templates.strategy_mixin_protocol import SGDStrategyProtocol +from avalanche.training.templates.strategy_mixin_protocol import ( + SGDStrategyProtocol, + TSGDExperienceType, + TMBInput, + TMBOutput, +) -class SGDUpdate(SGDStrategyProtocol): +class SGDUpdate(SGDStrategyProtocol[TSGDExperienceType, TMBInput, TMBOutput]): + def __init__(self, **kwargs): + super().__init__(**kwargs) + def training_epoch(self, **kwargs): """Training epoch. diff --git a/examples/detection.py b/examples/detection.py index 3e7f7d94e..ec2e92119 100644 --- a/examples/detection.py +++ b/examples/detection.py @@ -160,15 +160,11 @@ def obtain_base_model(segmentation: bool): pretrain_argument["pretrained"] = True else: if segmentation: - pretrain_argument[ - "weights" - ] = ( + pretrain_argument["weights"] = ( torchvision.models.detection.mask_rcnn.MaskRCNN_ResNet50_FPN_Weights.DEFAULT ) else: - pretrain_argument[ - "weights" - ] = ( + pretrain_argument["weights"] = ( torchvision.models.detection.faster_rcnn.FasterRCNN_ResNet50_FPN_Weights.DEFAULT ) diff --git a/examples/detection_lvis.py b/examples/detection_lvis.py index ed86e5b2c..330b7a9ec 100644 --- a/examples/detection_lvis.py +++ b/examples/detection_lvis.py @@ -142,15 +142,11 @@ def obtain_base_model(segmentation: bool): pretrain_argument["pretrained"] = True else: if segmentation: - pretrain_argument[ - "weights" - ] = ( + pretrain_argument["weights"] = ( torchvision.models.detection.mask_rcnn.MaskRCNN_ResNet50_FPN_Weights.DEFAULT ) else: - pretrain_argument[ - "weights" - ] = ( + pretrain_argument["weights"] = ( torchvision.models.detection.faster_rcnn.FasterRCNN_ResNet50_FPN_Weights.DEFAULT ) diff --git a/examples/hf_datawrapper.py b/examples/hf_datawrapper.py index 892b3af9c..9b38d6690 100644 --- a/examples/hf_datawrapper.py +++ b/examples/hf_datawrapper.py @@ -7,6 +7,7 @@ You can install them by running: pip install datasets transformers """ + import datasets as ds import numpy as np import torch diff --git a/examples/nlp_qa.py b/examples/nlp_qa.py index 7a6192d43..d6714de12 100644 --- a/examples/nlp_qa.py +++ b/examples/nlp_qa.py @@ -2,6 +2,7 @@ Simple example that show how to use Avalanche for Question Answering on Squad by using T5 """ + from avalanche.benchmarks.utils import DataAttribute, ConstantSequence from avalanche.training.plugins import ReplayPlugin from transformers import DataCollatorForSeq2Seq @@ -162,7 +163,7 @@ def preprocess_function( benchmark = CLScenario( [ 
CLStream("train", train_exps), - CLStream("valid", val_exp) + CLStream("valid", val_exp), # add more stream here (validation, test, out-of-domain, ...) ] ) diff --git a/examples/tvdetection/train.py b/examples/tvdetection/train.py index e73205295..0fa536674 100644 --- a/examples/tvdetection/train.py +++ b/examples/tvdetection/train.py @@ -17,6 +17,7 @@ Because the number of images is smaller in the person keypoint subset of COCO, the number of epochs should be adapted so that we have the same number of iterations. """ + import datetime import os import time diff --git a/profiling/data_merging.py b/profiling/data_merging.py index c0574f05f..ae230aa7c 100644 --- a/profiling/data_merging.py +++ b/profiling/data_merging.py @@ -2,6 +2,7 @@ Profiling script to measure the performance of the merging process of AvalancheDatasets """ + from unittest.mock import Mock import time diff --git a/profiling/serialization.py b/profiling/serialization.py index 92ea7d873..a404d6d47 100644 --- a/profiling/serialization.py +++ b/profiling/serialization.py @@ -4,6 +4,7 @@ SAVING TIME: 0.019003629684448242 LOADING TIME: 0.016000032424926758 """ + import os import time from torch.optim import SGD diff --git a/pyproject.toml b/pyproject.toml index 192d35128..1f0d593dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,4 +59,7 @@ ignore_errors = true [[tool.mypy.overrides]] module = "profiling.*" -ignore_errors = true \ No newline at end of file +ignore_errors = true + +[tool.pytest.ini_options] +addopts="-n 4" \ No newline at end of file diff --git a/tests/benchmarks/utils/test_dataloaders.py b/tests/benchmarks/utils/test_dataloaders.py index 31a1c47cd..8a6892828 100644 --- a/tests/benchmarks/utils/test_dataloaders.py +++ b/tests/benchmarks/utils/test_dataloaders.py @@ -157,9 +157,9 @@ def test_dataloader_with_multiple_workers(self): # Continual learning strategy cl_strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=32, train_epochs=1, eval_mb_size=32, diff --git a/tests/checkpointing/test_checkpointing.sh b/tests/checkpointing/test_checkpointing.sh index eb447897d..a68b9c1b0 100755 --- a/tests/checkpointing/test_checkpointing.sh +++ b/tests/checkpointing/test_checkpointing.sh @@ -2,14 +2,19 @@ # Script used to automatically test various combinations of plugins when used with # the checkpointing functionality. set -euo pipefail -cd tests/checkpointing rm -rf logs rm -rf checkpoints rm -rf metrics_no_checkpoint rm -rf metrics_checkpoint +rm -rf checkpoint.pkl +rm -rf .coverage +rm -rf .coverage_no_checkpoint +rm -rf .coverage_checkpoint1 +rm -rf .coverage_checkpoint2 +rm -rf coverage_checkpointing_* export PYTHONUNBUFFERED=1 -export PYTHONPATH=../.. +export PYTHONPATH=. export CUDA_VISIBLE_DEVICES=0 export CUBLAS_WORKSPACE_CONFIG=:4096:8 @@ -30,6 +35,7 @@ function str_bool { RUN_FAST_TESTS=$(str_bool "${FAST_TEST:-False}") RUN_GPU_TESTS=$(str_bool "${USE_GPU:-False}") +RUN_COVERAGE=$(str_bool "${RUN_COVERAGE:-False}") GPU_PARAM="--cuda -1" @@ -38,26 +44,66 @@ then GPU_PARAM="--cuda 0" fi +BASE_RUNNER=( + "python" + "-u" +) + +if [ "$RUN_COVERAGE" = "true" ] +then + echo "Running with coverage..." 
+ BASE_RUNNER=( + "coverage" + "run" + ) +fi + +BASE_RUNNER=( + "${BASE_RUNNER[@]}" + "tests/checkpointing/task_incremental_with_checkpointing.py" + $GPU_PARAM + "--benchmark" + $BENCHMARK +) + run_and_check() { # Without checkpoints - python -u task_incremental_with_checkpointing.py $GPU_PARAM --checkpoint_at -1 \ - --plugins "$@" --benchmark $BENCHMARK --log_metrics_to './metrics_no_checkpoint' + "${BASE_RUNNER[@]}" --checkpoint_at -1 \ + --plugins "$@" --log_metrics_to './metrics_no_checkpoint' rm -r ./checkpoint.pkl + if [ "$RUN_COVERAGE" = "true" ] + then + mv .coverage .coverage_no_checkpoint + fi # Stop after experience 1 - python -u task_incremental_with_checkpointing.py $GPU_PARAM --checkpoint_at 1 \ - --plugins "$@" --benchmark $BENCHMARK --log_metrics_to './metrics_checkpoint' + "${BASE_RUNNER[@]}" --checkpoint_at 1 \ + --plugins "$@" --log_metrics_to './metrics_checkpoint' + if [ "$RUN_COVERAGE" = "true" ] + then + mv .coverage .coverage_checkpoint1 + fi echo "Running from checkpoint 1..." - python -u task_incremental_with_checkpointing.py $GPU_PARAM --checkpoint_at 1 \ - --plugins "$@" --benchmark $BENCHMARK --log_metrics_to './metrics_checkpoint' + "${BASE_RUNNER[@]}" --checkpoint_at 1 \ + --plugins "$@" --log_metrics_to './metrics_checkpoint' + if [ "$RUN_COVERAGE" = "true" ] + then + mv .coverage .coverage_checkpoint2 + coverage combine .coverage_no_checkpoint .coverage_checkpoint1 .coverage_checkpoint2 + mv .coverage "coverage_checkpointing_$@" + fi rm -r ./checkpoint.pkl - python -u check_metrics_aligned.py \ + python -u tests/checkpointing/check_metrics_aligned.py \ "./metrics_no_checkpoint" "./metrics_checkpoint" rm -r metrics_no_checkpoint rm -r metrics_checkpoint rm -r logs + + rm -rf .coverage_no_checkpoint + rm -rf .coverage_checkpoint1 + rm -rf .coverage_checkpoint2 } run_and_check "replay" diff --git a/tests/models/test_models.py b/tests/models/test_models.py index 7fdf67996..1dd49cc02 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -157,9 +157,9 @@ def test_optimizers(self): model, "SGDmom", "Adam", "SGD", "AdamW" ): strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=64, device=self.device, eval_mb_size=50, @@ -172,9 +172,9 @@ def test_checkpointing(self): model, criterion, benchmark = self.init_scenario(multi_task=True) optimizer = SGD(model.parameters(), lr=0.1, momentum=0.9) strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=64, device=self.device, eval_mb_size=50, @@ -190,9 +190,9 @@ def test_checkpointing(self): model, criterion, benchmark = self.init_scenario(multi_task=True) optimizer = SGD(model.parameters(), lr=0.1, momentum=0.9) strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=64, device=self.device, eval_mb_size=50, @@ -226,9 +226,9 @@ def test_mh_classifier(self): model, criterion, benchmark = self.init_scenario(multi_task=True) optimizer = SGD(model.parameters(), lr=0.1, momentum=0.9) strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=64, device=self.device, eval_mb_size=50, @@ -272,9 +272,9 @@ def test_incremental_classifier(self): benchmark = self.benchmark strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=100, train_epochs=1, 
eval_mb_size=100, @@ -334,9 +334,9 @@ def test_incremental_classifier_weight_update(self): benchmark = self.benchmark strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=100, train_epochs=1, eval_mb_size=100, @@ -378,9 +378,9 @@ def test_multihead_head_creation(self): benchmark = get_fast_benchmark(use_task_labels=True, shuffle=False) strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=100, train_epochs=1, eval_mb_size=100, @@ -432,9 +432,9 @@ def test_multihead_head_selection(self): benchmark = get_fast_benchmark(use_task_labels=True, shuffle=False) strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=100, train_epochs=1, eval_mb_size=100, diff --git a/tests/models/test_packnet.py b/tests/models/test_packnet.py index 1b0caff40..c80aeb402 100644 --- a/tests/models/test_packnet.py +++ b/tests/models/test_packnet.py @@ -1,6 +1,9 @@ import unittest from avalanche.models.packnet import PackNetModel, packnet_simple_mlp from avalanche.training.supervised.strategy_wrappers import PackNet +from avalanche.training.templates.base import ( + PositionalArgumentsDeprecatedWarning, +) from torch.optim import SGD, Adam import torch import os @@ -26,16 +29,17 @@ def _test_PackNetPlugin(self, expectations, optimizer_constructor, lr): ) model = construct_model() optimizer = optimizer_constructor(model.parameters(), lr=lr) - strategy = PackNet( - model, - prune_proportion=0.5, - post_prune_epochs=1, - optimizer=optimizer, - train_epochs=2, - train_mb_size=10, - eval_mb_size=10, - device="cuda" if use_gpu else "cpu", - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = PackNet( + model, + prune_proportion=0.5, + post_prune_epochs=1, + optimizer=optimizer, + train_epochs=2, + train_mb_size=10, + eval_mb_size=10, + device="cuda" if use_gpu else "cpu", + ) x_test = torch.rand(10, 6) t_test = torch.ones(10, dtype=torch.long) task_ouputs = [] diff --git a/tests/run_dist_tests.py b/tests/run_dist_tests.py index fa8d9f94d..232e98645 100644 --- a/tests/run_dist_tests.py +++ b/tests/run_dist_tests.py @@ -1,4 +1,5 @@ import os +from pathlib import Path import signal import sys import unittest @@ -34,6 +35,9 @@ def get_distributed_test_cases(suite: Union[TestCase, TestSuite]) -> Set[str]: @click.command() @click.argument("test_cases", nargs=-1) def run_distributed_suites(test_cases): + if Path.cwd().name != "tests": + os.chdir(Path.cwd() / "tests") + cases_names = get_distributed_test_cases( unittest.defaultTestLoader.discover(".") ) # Don't change the path! diff --git a/tests/test_loggers.py b/tests/test_loggers.py index 36f3403e3..4f50f67ef 100644 --- a/tests/test_loggers.py +++ b/tests/test_loggers.py @@ -3,6 +3,7 @@ Right now they don't do much but they at least check that the loggers run without errors. 
""" + import unittest from torch.optim import SGD diff --git a/tests/training/test_online_strategies.py b/tests/training/test_online_strategies.py index 520a837d2..7d95b7856 100644 --- a/tests/training/test_online_strategies.py +++ b/tests/training/test_online_strategies.py @@ -9,6 +9,9 @@ from avalanche.models import SimpleMLP from avalanche.benchmarks.scenarios.online import OnlineCLScenario from avalanche.training import OnlineNaive +from avalanche.training.templates.base import ( + PositionalArgumentsDeprecatedWarning, +) from tests.unit_tests_utils import get_fast_benchmark from avalanche.training.plugins.evaluation import default_evaluator @@ -46,24 +49,25 @@ def test_naive(self): # With task boundaries model, optimizer, criterion, _ = self.init_sit() - strategy = OnlineNaive( - model, - optimizer, - criterion, - train_mb_size=1, - device=self.device, - eval_mb_size=50, - evaluator=default_evaluator, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = OnlineNaive( + model, + optimizer, + criterion=criterion, + train_mb_size=1, + device=self.device, + eval_mb_size=50, + evaluator=default_evaluator, + ) ocl_benchmark = OnlineCLScenario(benchmark_streams, access_task_boundaries=True) self.run_strategy_boundaries(ocl_benchmark, strategy) # Without task boundaries model, optimizer, criterion, my_nc_benchmark = self.init_sit() strategy = OnlineNaive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=1, device=self.device, eval_mb_size=50, diff --git a/tests/training/test_plugins.py b/tests/training/test_plugins.py index 04e591f99..6c223c73e 100644 --- a/tests/training/test_plugins.py +++ b/tests/training/test_plugins.py @@ -118,9 +118,9 @@ def test_callback_reachability(self): plug = MockPlugin() strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=100, train_epochs=1, eval_mb_size=100, diff --git a/tests/training/test_strategies.py b/tests/training/test_strategies.py index 61c3592ff..5265d5c31 100644 --- a/tests/training/test_strategies.py +++ b/tests/training/test_strategies.py @@ -60,6 +60,9 @@ from avalanche.training.supervised.strategy_wrappers import PNNStrategy from avalanche.training.templates import SupervisedTemplate from avalanche.training.templates.base import _group_experiences_by_stream +from avalanche.training.templates.base import ( + PositionalArgumentsDeprecatedWarning, +) from avalanche.training.utils import get_last_fc_layer from tests.training.test_strategy_utils import run_strategy from tests.unit_tests_utils import get_fast_benchmark, get_device @@ -110,9 +113,9 @@ def test_periodic_eval(self): # for each eval loop. 
acc = StreamAccuracy() strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_epochs=2, eval_every=-1, evaluator=EvaluationPlugin(acc), @@ -127,9 +130,9 @@ def test_periodic_eval(self): acc = StreamAccuracy() evalp = EvaluationPlugin(acc) strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_epochs=2, eval_every=0, evaluator=evalp, @@ -144,9 +147,9 @@ def test_periodic_eval(self): ################### acc = StreamAccuracy() strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_epochs=2, eval_every=1, evaluator=EvaluationPlugin(acc), @@ -160,9 +163,9 @@ def test_periodic_eval(self): ################### acc = StreamAccuracy() strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_epochs=2, eval_every=100, evaluator=EvaluationPlugin(acc), @@ -185,9 +188,9 @@ def test_plugins_compatibility_checks(self): ) strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_epochs=2, eval_every=-1, evaluator=evalp, @@ -226,9 +229,9 @@ def after_training_iteration( optimizer = SGD(model.parameters(), lr=1) strategy = Cumulative( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=1, device=get_device(), eval_mb_size=512, @@ -271,23 +274,37 @@ def init_scenario(self, multi_task=False): def test_naive(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = Naive( - model, - optimizer, - criterion, - train_mb_size=64, - device=self.device, - eval_mb_size=50, - train_epochs=2, - ) + + with self.assertRaises(TypeError): + strategy = Naive( + model, + optimizer, + criterion, + train_mb_size=64, + device=self.device, + eval_mb_size=50, + trai_epochs=2, + ) + + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = Naive( + model, + optimizer, + criterion, + train_mb_size=64, + device=self.device, + eval_mb_size=50, + train_epochs=2, + ) + run_strategy(benchmark, strategy) # MT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=True) strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=64, device=self.device, eval_mb_size=50, @@ -313,25 +330,28 @@ def after_train_dataset_adaptation( # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = JointTraining( - model, - optimizer, - criterion, - train_mb_size=64, - device=self.device, - eval_mb_size=50, - train_epochs=2, - plugins=[JointSTestPlugin(benchmark)], - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = JointTraining( + model, + optimizer, + criterion, + train_mb_size=64, + device=self.device, + eval_mb_size=50, + train_epochs=2, + plugins=[JointSTestPlugin(benchmark)], + ) + strategy.evaluator.loggers = [TextLogger(sys.stdout)] strategy.train(benchmark.train_stream) # MT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=True) + strategy = JointTraining( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=64, device=self.device, eval_mb_size=50, @@ -351,14 +371,17 @@ def test_cwrstar(self): # SIT scenario model, optimizer, criterion, benchmark = 
self.init_scenario(multi_task=False) last_fc_name, _ = get_last_fc_layer(model) - strategy = CWRStar( - model, - optimizer, - criterion, - last_fc_name, - train_mb_size=64, - device=self.device, - ) + + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = CWRStar( + model, + optimizer, + criterion=criterion, + cwr_layer_name=last_fc_name, + train_mb_size=64, + device=self.device, + ) + run_strategy(benchmark, strategy) dict_past_j = {} @@ -378,10 +401,10 @@ def test_cwrstar(self): # MT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=True) strategy = CWRStar( - model, - optimizer, - criterion, - last_fc_name, + model=model, + optimizer=optimizer, + criterion=criterion, + cwr_layer_name=last_fc_name, train_mb_size=64, device=self.device, ) @@ -401,25 +424,27 @@ def test_cwrstar(self): def test_replay(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = Replay( - model, - optimizer, - criterion, - mem_size=10, - train_mb_size=64, - device=self.device, - eval_mb_size=50, - train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = Replay( + model, + optimizer, + criterion=criterion, + mem_size=10, + train_mb_size=64, + device=self.device, + eval_mb_size=50, + train_epochs=2, + ) + run_strategy(benchmark, strategy) # MT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=True) strategy = Replay( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, mem_size=10, train_mb_size=64, device=self.device, @@ -432,25 +457,27 @@ def test_replay(self): def test_mer(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = MER( - model, - optimizer, - criterion, - mem_size=10, - train_mb_size=64, - device=self.device, - eval_mb_size=50, - train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = MER( + model, + optimizer, + criterion=criterion, + mem_size=10, + train_mb_size=64, + device=self.device, + eval_mb_size=50, + train_epochs=2, + ) + run_strategy(benchmark, strategy) # MT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=True) strategy = MER( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, mem_size=10, train_mb_size=64, device=self.device, @@ -463,24 +490,26 @@ def test_mer(self): def test_gdumb(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = GDumb( - model, - optimizer, - criterion, - mem_size=200, - train_mb_size=64, - device=self.device, - eval_mb_size=50, - train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = GDumb( + model, + optimizer, + criterion, + mem_size=200, + train_mb_size=64, + device=self.device, + eval_mb_size=50, + train_epochs=2, + ) + run_strategy(benchmark, strategy) # MT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=True) strategy = GDumb( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, mem_size=200, train_mb_size=64, device=self.device, @@ -492,23 +521,24 @@ def test_gdumb(self): def test_cumulative(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = Cumulative( - model, - optimizer, - criterion, - train_mb_size=64, - device=self.device, - 
eval_mb_size=50, - train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = Cumulative( + model, + optimizer, + criterion, + train_mb_size=64, + device=self.device, + eval_mb_size=50, + train_epochs=2, + ) run_strategy(benchmark, strategy) # MT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=True) strategy = Cumulative( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=64, device=self.device, eval_mb_size=50, @@ -518,17 +548,19 @@ def test_cumulative(self): def test_slda(self): model, _, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = StreamingLDA( - model, - criterion, - input_size=10, - output_layer_name="features", - num_classes=10, - eval_mb_size=7, - train_epochs=1, - device=self.device, - train_mb_size=7, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = StreamingLDA( + model, + criterion, + 10, + 10, + output_layer_name="features", + eval_mb_size=7, + train_epochs=1, + device=self.device, + train_mb_size=7, + ) + run_strategy(benchmark, strategy) def test_warning_slda_lwf(self): @@ -546,25 +578,26 @@ def test_warning_slda_lwf(self): def test_lwf(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = LwF( - model, - optimizer, - criterion, - alpha=[0, 1 / 2, 2 * (2 / 3), 3 * (3 / 4), 4 * (4 / 5)], - temperature=2, - device=self.device, - train_mb_size=10, - eval_mb_size=50, - train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = LwF( + model, + optimizer, + criterion, + alpha=[0, 1 / 2, 2 * (2 / 3), 3 * (3 / 4), 4 * (4 / 5)], + temperature=2, + device=self.device, + train_mb_size=10, + eval_mb_size=50, + train_epochs=2, + ) run_strategy(benchmark, strategy) # MT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=True) strategy = LwF( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, alpha=[0, 1 / 2, 2 * (2 / 3), 3 * (3 / 4), 4 * (4 / 5)], temperature=2, device=self.device, @@ -577,24 +610,25 @@ def test_lwf(self): def test_agem(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = AGEM( - model, - optimizer, - criterion, - patterns_per_exp=25, - sample_size=25, - train_mb_size=10, - eval_mb_size=50, - train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = AGEM( + model, + optimizer, + criterion, + patterns_per_exp=25, + sample_size=25, + train_mb_size=10, + eval_mb_size=50, + train_epochs=2, + ) run_strategy(benchmark, strategy) # MT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=True) strategy = AGEM( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, patterns_per_exp=25, sample_size=25, train_mb_size=10, @@ -606,24 +640,25 @@ def test_agem(self): def test_gem(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = GEM( - model, - optimizer, - criterion, - patterns_per_exp=256, - train_mb_size=10, - eval_mb_size=50, - train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = GEM( + model, + optimizer, + criterion, + patterns_per_exp=256, + train_mb_size=10, + eval_mb_size=50, + train_epochs=2, + ) run_strategy(benchmark, strategy) # MT scenario model, 
optimizer, criterion, benchmark = self.init_scenario(multi_task=True) strategy = GEM( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, patterns_per_exp=256, train_mb_size=10, eval_mb_size=50, @@ -635,25 +670,26 @@ def test_gem(self): def test_ewc(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = EWC( - model, - optimizer, - criterion, - ewc_lambda=0.4, - mode="separate", - train_mb_size=10, - eval_mb_size=50, - train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = EWC( + model, + optimizer, + criterion, + ewc_lambda=0.4, + mode="separate", + train_mb_size=10, + eval_mb_size=50, + train_epochs=2, + ) run_strategy(benchmark, strategy) # MT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=True) strategy = EWC( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, ewc_lambda=0.4, mode="separate", train_mb_size=10, @@ -665,25 +701,26 @@ def test_ewc(self): def test_ewc_online(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = EWC( - model, - optimizer, - criterion, - ewc_lambda=0.4, - mode="online", - decay_factor=0.1, - train_mb_size=10, - eval_mb_size=50, - train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = EWC( + model, + optimizer, + criterion, + ewc_lambda=0.4, + mode="online", + decay_factor=0.1, + train_mb_size=10, + eval_mb_size=50, + train_epochs=2, + ) run_strategy(benchmark, strategy) # # MT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=True) strategy = EWC( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, ewc_lambda=0.4, mode="online", decay_factor=0.1, @@ -696,29 +733,30 @@ def test_ewc_online(self): def test_rwalk(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = Naive( - model, - optimizer, - criterion, - train_mb_size=10, - eval_mb_size=50, - train_epochs=2, - plugins=[ - RWalkPlugin( - ewc_lambda=0.1, - ewc_alpha=0.9, - delta_t=10, - ), - ], - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = Naive( + model, + optimizer, + criterion, + train_mb_size=10, + eval_mb_size=50, + train_epochs=2, + plugins=[ + RWalkPlugin( + ewc_lambda=0.1, + ewc_alpha=0.9, + delta_t=10, + ), + ], + ) run_strategy(benchmark, strategy) # # MT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=True) strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=10, eval_mb_size=50, train_epochs=2, @@ -735,23 +773,24 @@ def test_rwalk(self): def test_synaptic_intelligence(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = SynapticIntelligence( - model, - optimizer, - criterion, - si_lambda=0.0001, - train_epochs=1, - train_mb_size=10, - eval_mb_size=10, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = SynapticIntelligence( + model, + optimizer, + criterion, + si_lambda=0.0001, + train_epochs=1, + train_mb_size=10, + eval_mb_size=10, + ) run_strategy(benchmark, strategy) # MT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=True) strategy = SynapticIntelligence( - model, - optimizer, - criterion, 
+ model=model, + optimizer=optimizer, + criterion=criterion, si_lambda=0.0001, train_epochs=1, train_mb_size=10, @@ -766,18 +805,19 @@ def test_cope(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = CoPE( - model, - optimizer, - criterion, - mem_size=10, - n_classes=n_classes, - p_size=emb_size, - train_mb_size=10, - device=self.device, - eval_mb_size=50, - train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = CoPE( + model, + optimizer, + criterion, + mem_size=10, + n_classes=n_classes, + p_size=emb_size, + train_mb_size=10, + device=self.device, + eval_mb_size=50, + train_epochs=2, + ) run_strategy(benchmark, strategy) # MT scenario @@ -802,14 +842,15 @@ def test_pnn(self): # eval on future tasks is not allowed. model = PNN(num_layers=3, in_features=6, hidden_features_per_column=10) optimizer = torch.optim.SGD(model.parameters(), lr=0.1) - strategy = PNNStrategy( - model, - optimizer, - train_mb_size=10, - device=self.device, - eval_mb_size=50, - train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = PNNStrategy( + model, + optimizer, + train_mb_size=10, + device=self.device, + eval_mb_size=50, + train_epochs=2, + ) # train and test loop benchmark = self.load_benchmark(use_task_labels=True) @@ -820,17 +861,18 @@ def test_pnn(self): def test_expertgate(self): model = ExpertGate(shape=(3, 227, 227), device=self.device) optimizer = torch.optim.SGD(model.parameters(), lr=0.1) - strategy = ExpertGateStrategy( - model, - optimizer, - device=self.device, - train_mb_size=10, - train_epochs=2, - eval_mb_size=50, - ae_train_mb_size=10, - ae_train_epochs=2, - ae_lr=5e-4, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = ExpertGateStrategy( + model, + optimizer, + device=self.device, + train_mb_size=10, + train_epochs=2, + eval_mb_size=50, + ae_train_mb_size=10, + ae_train_epochs=2, + ae_lr=5e-4, + ) # Mandatory transform for AlexNet # 3 Channels and input size should be a minimum of 227 @@ -858,35 +900,37 @@ def test_icarl(self): cls = torch.nn.Sigmoid() optimizer = SGD([*fe.parameters(), *cls.parameters()], lr=0.001) - strategy = ICaRL( - fe, - cls, - optimizer, - 20, - buffer_transform=None, - fixed_memory=True, - train_mb_size=32, - train_epochs=2, - eval_mb_size=50, - device=self.device, - eval_every=-1, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = ICaRL( + fe, + cls, + optimizer, + 20, + buffer_transform=None, + fixed_memory=True, + train_mb_size=32, + train_epochs=2, + eval_mb_size=50, + device=self.device, + eval_every=-1, + ) run_strategy(benchmark, strategy) def test_lfl(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = LFL( - model, - optimizer, - criterion, - lambda_e=0.0001, - train_mb_size=10, - device=self.device, - eval_mb_size=50, - train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = LFL( + model, + optimizer, + criterion, + lambda_e=0.0001, + train_mb_size=10, + device=self.device, + eval_mb_size=50, + train_epochs=2, + ) run_strategy(benchmark, strategy) # MT scenario @@ -907,17 +951,18 @@ def test_lfl(self): def test_mas(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = MAS( - model, - optimizer, - criterion, - lambda_reg=1.0, - alpha=0.5, - train_mb_size=10, - device=self.device, - eval_mb_size=50, - 
train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = MAS( + model, + optimizer, + criterion, + lambda_reg=1.0, + alpha=0.5, + train_mb_size=10, + device=self.device, + eval_mb_size=50, + train_epochs=2, + ) run_strategy(benchmark, strategy) # MT scenario @@ -939,46 +984,48 @@ def test_mas(self): def test_bic(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = BiC( - model, - optimizer, - criterion, - mem_size=50, - val_percentage=0.1, - T=2, - stage_2_epochs=10, - lamb=-1, - lr=0.01, - train_mb_size=10, - device=self.device, - eval_mb_size=50, - train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = BiC( + model, + optimizer, + criterion, + mem_size=50, + val_percentage=0.1, + T=2, + stage_2_epochs=10, + lamb=-1, + lr=0.01, + train_mb_size=10, + device=self.device, + eval_mb_size=50, + train_epochs=2, + ) run_strategy(benchmark, strategy) def test_mir(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = MIR( - model, - optimizer, - criterion, - mem_size=1000, - batch_size_mem=10, - subsample=50, - train_mb_size=10, - device=self.device, - eval_mb_size=50, - train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = MIR( + model, + optimizer, + criterion, + mem_size=1000, + batch_size_mem=10, + subsample=50, + train_mb_size=10, + device=self.device, + eval_mb_size=50, + train_epochs=2, + ) run_strategy(benchmark, strategy) # MT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=True) strategy = MIR( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, mem_size=1000, batch_size_mem=10, subsample=50, @@ -992,70 +1039,74 @@ def test_mir(self): def test_erace(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = ER_ACE( - model, - optimizer, - criterion, - mem_size=1000, - batch_size_mem=10, - train_mb_size=10, - device=self.device, - eval_mb_size=50, - train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = ER_ACE( + model, + optimizer, + criterion, + mem_size=1000, + batch_size_mem=10, + train_mb_size=10, + device=self.device, + eval_mb_size=50, + train_epochs=2, + ) run_strategy(benchmark, strategy) def test_eraml(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = ER_AML( - model, - model.features, - optimizer, - criterion, - temp=0.1, - base_temp=0.07, - mem_size=1000, - batch_size_mem=10, - train_mb_size=10, - device=self.device, - eval_mb_size=50, - train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = ER_AML( + model, + model.features, + optimizer, + criterion, + temp=0.1, + base_temp=0.07, + mem_size=1000, + batch_size_mem=10, + train_mb_size=10, + device=self.device, + eval_mb_size=50, + train_epochs=2, + ) run_strategy(benchmark, strategy) def test_l2p(self): _, _, _, benchmark = self.init_scenario(multi_task=False) - strategy = LearningToPrompt( - model_name="simpleMLP", - criterion=CrossEntropyLoss(), - train_mb_size=10, - device=self.device, - train_epochs=1, - num_classes=10, - eval_mb_size=50, - use_cls_features=False, - use_mask=False, - use_vit=False, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = LearningToPrompt( + "simpleMLP", + 
criterion=CrossEntropyLoss(), + train_mb_size=10, + device=self.device, + train_epochs=1, + num_classes=10, + eval_mb_size=50, + use_cls_features=False, + use_mask=False, + use_vit=False, + ) run_strategy(benchmark, strategy) def test_der(self): # SIT scenario model, optimizer, criterion, benchmark = self.init_scenario(multi_task=False) - strategy = DER( - model, - optimizer, - criterion, - mem_size=1000, - batch_size_mem=10, - train_mb_size=10, - device=self.device, - eval_mb_size=50, - train_epochs=2, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = DER( + model, + optimizer, + criterion, + mem_size=1000, + batch_size_mem=10, + train_mb_size=10, + device=self.device, + eval_mb_size=50, + train_epochs=2, + ) run_strategy(benchmark, strategy) def test_feature_distillation(self): @@ -1073,9 +1124,9 @@ def test_feature_distillation(self): plugins = [feature_distillation] strategy = Naive( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, device=self.device, train_mb_size=10, eval_mb_size=50, @@ -1094,16 +1145,17 @@ def test_feature_replay(self): # We do not add MT criterion cause FeatureReplay uses # MaskedCrossEntropy as main criterion - strategy = FeatureReplay( - model, - optimizer, - device=self.device, - last_layer_name=last_fc_name, - train_mb_size=10, - eval_mb_size=50, - train_epochs=2, - plugins=plugins, - ) + with self.assertWarns(PositionalArgumentsDeprecatedWarning): + strategy = FeatureReplay( + model, + optimizer, + device=self.device, + last_layer_name=last_fc_name, + train_mb_size=10, + eval_mb_size=50, + train_epochs=2, + plugins=plugins, + ) run_strategy(benchmark, strategy) def load_benchmark( diff --git a/tests/training/test_strategies_accuracy.py b/tests/training/test_strategies_accuracy.py index 63776c7d8..481f94367 100644 --- a/tests/training/test_strategies_accuracy.py +++ b/tests/training/test_strategies_accuracy.py @@ -82,9 +82,9 @@ def test_multihead_cumulative(self): exp_acc = ExperienceAccuracy() evalp = EvaluationPlugin(main_metric, exp_acc, loggers=None) strategy = Cumulative( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=64, device=get_device(), eval_mb_size=512, @@ -111,8 +111,8 @@ def test_pnn(self): model = PNN(num_layers=1, in_features=6, hidden_features_per_column=50) optimizer = SGD(model.parameters(), lr=0.1) strategy = PNNStrategy( - model, - optimizer, + model=model, + optimizer=optimizer, train_mb_size=32, device=get_device(), eval_mb_size=512, diff --git a/tests/training/test_supervised_regression.py b/tests/training/test_supervised_regression.py index 76cff8f9b..f1af6950c 100644 --- a/tests/training/test_supervised_regression.py +++ b/tests/training/test_supervised_regression.py @@ -34,6 +34,7 @@ from avalanche.training.plugins.clock import Clock from avalanche.training.plugins import EvaluationPlugin +from avalanche.training.supervised.strategy_wrappers import Naive from avalanche.training.templates import SupervisedTemplate from avalanche.training.utils import trigger_plugins from tests.unit_tests_utils import get_fast_benchmark @@ -396,9 +397,9 @@ def __init__( eval_every=-1, ): super().__init__( - model, - optimizer, - criterion, + model=model, + optimizer=optimizer, + criterion=criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -541,9 +542,9 @@ def test_reproduce_old_base_strategy(self): # if you want to check whether the seeds are set correctly # switch 
SupervisedTemplate with OldBaseStrategy and check that # values are exactly equal. - new_strategy = SupervisedTemplate( - new_model, - SGD(new_model.parameters(), lr=0.01), + new_strategy = Naive( + model=new_model, + optimizer=SGD(new_model.parameters(), lr=0.01), train_epochs=2, plugins=[p_new], evaluator=None,
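
Note: the test changes above all hinge on `PositionalArgumentsDeprecatedWarning`, which this patch imports from `avalanche.training.templates.base`, being raised whenever a strategy constructor receives positional arguments instead of keyword arguments. The snippet below is an illustrative, self-contained sketch of that pattern only; it is not the actual Avalanche implementation, and the `warn_on_positional` decorator and `ToyStrategy` class are hypothetical names used purely to show what `self.assertWarns(PositionalArgumentsDeprecatedWarning)` is exercising.

    # Sketch (assumed, not Avalanche's code): warn on positional constructor arguments.
    import functools
    import inspect
    import warnings


    class PositionalArgumentsDeprecatedWarning(UserWarning):
        """Stand-in for the warning category defined in
        avalanche/training/templates/base.py."""


    def warn_on_positional(init):
        """Hypothetical decorator: warn when __init__ gets positional args besides self."""

        @functools.wraps(init)
        def wrapper(self, *args, **kwargs):
            if args:
                warnings.warn(
                    f"Passing {len(args)} argument(s) positionally to "
                    f"{type(self).__name__} is deprecated; use keyword arguments.",
                    PositionalArgumentsDeprecatedWarning,
                    stacklevel=2,
                )
                # Map the positional values back onto parameter names so the
                # call keeps working during the deprecation period.
                names = list(inspect.signature(init).parameters)[1 : len(args) + 1]
                kwargs.update(dict(zip(names, args)))
            return init(self, **kwargs)

        return wrapper


    class ToyStrategy:
        @warn_on_positional
        def __init__(self, model=None, optimizer=None, criterion=None, train_mb_size=1):
            self.model = model
            self.optimizer = optimizer
            self.criterion = criterion
            self.train_mb_size = train_mb_size


    if __name__ == "__main__":
        # Positional call: emits the deprecation warning (what assertWarns checks).
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            ToyStrategy("model", "optimizer", "criterion", train_mb_size=64)
        assert any(
            issubclass(w.category, PositionalArgumentsDeprecatedWarning) for w in caught
        )
        # Keyword-only call: stays silent.
        ToyStrategy(model="model", optimizer="optimizer", criterion="criterion")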