diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
index c4a8aa5b9..cbd2a565c 100644
--- a/.github/workflows/testing.yml
+++ b/.github/workflows/testing.yml
@@ -35,7 +35,7 @@ jobs:
       fail-fast: false
       matrix:
         os: ["ubuntu-latest", "macos-latest", "windows-latest"]
-        python-version: ["3.10", "3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
         # Install the min or latest dependencies for skrub
         # as defined in setup.cfg at [options.extras_require].
         #
@@ -56,9 +56,12 @@ jobs:
            python-version: "3.11"
          - dependencies-version: "dev, polars"
            python-version: "3.12"
-          - dependencies-version: "dev, min-py310"
+          - dependencies-version: "dev, min-py39"
            python-version: "3.10"
            dependencies-version-type: "minimal"
+          - dependencies-version: "dev, min-py39"
+            python-version: "3.9"
+            dependencies-version-type: "minimal"
     name: ${{ matrix.os-name }} with Python ${{ matrix.python-version }} and ${{ matrix.dependencies-version-type }} dependencies
     defaults:
       run:
diff --git a/CHANGES.rst b/CHANGES.rst
index f905921a7..7e90826d1 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -36,6 +36,9 @@ Major changes
   It now has a ``key`` parameter that allows joining main and auxiliary
   tables that share the same column names.
   :pr:`876` by :user:`Théo Jolivet `.
 
+* The minimum supported Python version is now 3.9.
+  :pr:`939` by :user:`Jérôme Dockès `.
+
 Minor changes
 -------------
diff --git a/benchmarks/bench_fuzzy_join_count_vs_hash.py b/benchmarks/bench_fuzzy_join_count_vs_hash.py
index e9fbafa8a..f693ed947 100644
--- a/benchmarks/bench_fuzzy_join_count_vs_hash.py
+++ b/benchmarks/bench_fuzzy_join_count_vs_hash.py
@@ -14,7 +14,6 @@
 import math
 from argparse import ArgumentParser
 from time import perf_counter
-from typing import Literal, Tuple, Union
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -34,21 +33,21 @@
 
 # Function kept for reference
 def fuzzy_join(
-    left: pd.DataFrame,
-    right: pd.DataFrame,
-    how: Literal["left", "right"] = "left",
-    left_on: Union[str, None] = None,
-    right_on: Union[str, None] = None,
-    on: Union[str, None] = None,
-    encoder: Literal["count", "hash"] = "count",
-    analyzer: Literal["word", "char", "char_wb"] = "char_wb",
-    ngram_range: Tuple[int, int] = (2, 4),
-    return_score: bool = False,
-    match_score: float = 0,
-    drop_unmatched: bool = False,
-    sort: bool = False,
-    suffixes: Tuple[str, str] = ("_x", "_y"),
-) -> pd.DataFrame:
+    left,
+    right,
+    how="left",
+    left_on=None,
+    right_on=None,
+    on=None,
+    encoder="count",
+    analyzer="char_wb",
+    ngram_range=(2, 4),
+    return_score=False,
+    match_score=0,
+    drop_unmatched=False,
+    sort=False,
+    suffixes=("_x", "_y"),
+):
     """
     Join two tables categorical string columns based on approximate matching
     and using morphological similarity.
@@ -317,10 +316,10 @@ def fuzzy_join( repeat=10, ) def benchmark( - encoder: Literal["hash", "count"], - dataset_name: str, - analyzer: Literal["char_wb", "char", "word"], - ngram_range: tuple, + encoder, + dataset_name, + analyzer, + ngram_range, ): left_table, right_table, gt = load_data(dataset_name) @@ -352,7 +351,7 @@ def benchmark( return res_dic -def plot(df: pd.DataFrame): +def plot(df): sns.set_theme(style="ticks", palette="pastel") n_datasets = len(np.unique(df["dataset_name"])) diff --git a/benchmarks/bench_fuzzy_join_sparse_vs_dense.py b/benchmarks/bench_fuzzy_join_sparse_vs_dense.py index 851ee95bd..ead650e93 100644 --- a/benchmarks/bench_fuzzy_join_sparse_vs_dense.py +++ b/benchmarks/bench_fuzzy_join_sparse_vs_dense.py @@ -13,9 +13,7 @@ import warnings from argparse import ArgumentParser from collections.abc import Iterable -from pathlib import Path from time import perf_counter -from typing import Literal import matplotlib.pyplot as plt import numpy as np @@ -35,11 +33,11 @@ def _numeric_encoding( - main: pd.DataFrame, - main_cols: list | str, - aux: pd.DataFrame, - aux_cols: list | str, -) -> tuple: + main, + main_cols, + aux, + aux_cols, +): """Encoding numerical columns. Parameters @@ -71,15 +69,15 @@ def _numeric_encoding( def _string_encoding( - main: pd.DataFrame, - main_cols: list | str, - aux: pd.DataFrame, - aux_cols: list | str, - analyzer: Literal["word", "char", "char_wb"], - ngram_range: int | int, - encoder: _VectorizerMixin = None, - sparse: bool = True, -) -> tuple: + main, + main_cols, + aux, + aux_cols, + analyzer, + ngram_range, + encoder=None, + sparse=True, +): """Encoding string columns. Parameters @@ -147,7 +145,7 @@ def _string_encoding( return main_enc_d, aux_enc_d -def _nearest_matches(main_array, aux_array, sparse=True) -> np.ndarray | np.ndarray: +def _nearest_matches(main_array, aux_array, sparse=True): """Find the closest matches using the nearest neighbors method. Parameters @@ -182,23 +180,23 @@ def _nearest_matches(main_array, aux_array, sparse=True) -> np.ndarray | np.ndar def fuzzy_join( - left: pd.DataFrame, - right: pd.DataFrame, - how: Literal["left", "right"] = "left", - left_on: str | list[str] | list[int] | None = None, - right_on: str | list[str] | list[int] | None = None, - on: str | list[str] | list[int] | None = None, - numerical_match: Literal["string", "number"] = "number", - encoder: _VectorizerMixin = None, - analyzer: Literal["word", "char", "char_wb"] = "char_wb", - ngram_range: tuple[int, int] = (2, 4), - return_score: bool = False, - match_score: float = 0, - drop_unmatched: bool = False, - sort: bool = False, - suffixes: tuple[str, str] = ("_x", "_y"), - sparse: bool = True, -) -> pd.DataFrame: + left, + right, + how="left", + left_on=None, + right_on=None, + on=None, + numerical_match="number", + encoder=None, + analyzer="char_wb", + ngram_range=(2, 4), + return_score=False, + match_score=0, + drop_unmatched=False, + sort=False, + suffixes=("_x", "_y"), + sparse=True, +): """ Join two tables categorical string columns based on approximate matching and using morphological similarity. 
@@ -526,12 +524,12 @@ def fuzzy_join( save_as=benchmark_name, ) def benchmark( - sparse: bool, - dataset_name: str, - analyzer: Literal["char_wb", "char", "word"], - ngram_range: tuple, - data_home: Path | str | None = None, - data_directory: str | None = "benchmarks_data", + sparse, + dataset_name, + analyzer, + ngram_range, + data_home=None, + data_directory="benchmarks_data", ): left_table, right_table, gt = fetch_big_data( dataset_name=dataset_name, data_home=data_home, data_directory=data_directory @@ -578,7 +576,7 @@ def benchmark( return res_dic -def plot(df: pd.DataFrame): +def plot(df): sns.set_theme(style="ticks", palette="pastel") n_datasets = len(np.unique(df["dataset_name"])) diff --git a/benchmarks/bench_fuzzy_join_vs_others.py b/benchmarks/bench_fuzzy_join_vs_others.py index d7c526c71..69ad95a4b 100644 --- a/benchmarks/bench_fuzzy_join_vs_others.py +++ b/benchmarks/bench_fuzzy_join_vs_others.py @@ -96,8 +96,8 @@ def autofj_merge(left, right, target=0.9): repeat=5, ) def benchmark( - dataset_name: str, - join: str, + dataset_name, + join, ): left_table, right_table, gt = load_data(dataset_name) @@ -151,7 +151,7 @@ def benchmark( return res_dic -def plot(df: pd.DataFrame): +def plot(df): sns.set_theme(style="ticks", palette="pastel") n_datasets = len(np.unique(df["dataset_name"])) diff --git a/benchmarks/bench_gap_divergence.py b/benchmarks/bench_gap_divergence.py index 6dcdc3933..f78ddfe32 100644 --- a/benchmarks/bench_gap_divergence.py +++ b/benchmarks/bench_gap_divergence.py @@ -55,10 +55,10 @@ class ModifiedGapEncoderColumn(GapEncoder): - def __init__(self, *args, column_name: str = "MISSING COLUMN", **kwargs): + def __init__(self, *args, column_name="MISSING COLUMN", **kwargs): super().__init__(*args, **kwargs) self.column_name = column_name - self.benchmark_results_: list[dict[str, np.ndarray | float]] = [] + self.benchmark_results_ = [] def fit(self, X, y=None): # Copy parameter rho @@ -123,9 +123,7 @@ def fit(self, X, y=None): class ModifiedGapEncoder(GapEncoder): - fitted_models_: list[ModifiedGapEncoderColumn] - - def _create_column_gap_encoder(self, column_name: str): + def _create_column_gap_encoder(self, column_name): return ModifiedGapEncoderColumn( column_name=column_name, ngram_range=self.ngram_range, @@ -187,7 +185,7 @@ def fit(self, X, y=None): }, save_as=benchmark_name, ) -def benchmark(max_iter_e_step: int, dataset_name: str): +def benchmark(max_iter_e_step, dataset_name): """ Cross-validate a pipeline with a modified `GapEncoder` instance for the high cardinality column. 
The rest of the columns are passed to a @@ -261,7 +259,7 @@ def benchmark(max_iter_e_step: int, dataset_name: str): return results -def plot(df: pd.DataFrame): +def plot(df): # Keep only the last outer iteration df = df[df["gap_iter"] == 5] diff --git a/benchmarks/bench_gap_encoder_hp.py b/benchmarks/bench_gap_encoder_hp.py index 1d076d798..4a6b6e73f 100644 --- a/benchmarks/bench_gap_encoder_hp.py +++ b/benchmarks/bench_gap_encoder_hp.py @@ -49,12 +49,12 @@ repeat=1, ) def benchmark( - high_card_feature: str, - batch_size: int, - max_iter_e_step: int, - max_rows: int, - max_no_improvement: int, - random_state: int, + high_card_feature, + batch_size, + max_iter_e_step, + max_rows, + max_no_improvement, + random_state, ): X = np.array(ds.X[high_card_feature]).reshape(-1, 1).astype(str) y = ds.y @@ -111,7 +111,7 @@ def benchmark( return res_dic -def plot(df: pd.DataFrame): +def plot(df): base_values = {"batch_size": 1024, "max_iter_e_step": 1, "max_no_improvement": 5} for variable in base_values.keys(): df_to_plot = df diff --git a/benchmarks/bench_gap_es_score.py b/benchmarks/bench_gap_es_score.py index bb771bb1e..be32b9ab3 100644 --- a/benchmarks/bench_gap_es_score.py +++ b/benchmarks/bench_gap_es_score.py @@ -82,7 +82,7 @@ def _minibatch_convergence(self, batch_size, batch_cost, n_samples, step, n_step return False - def fit(self, X, y=None) -> "GapEncoder": + def fit(self, X, y=None): """ Fit the GapEncoder on `X`. @@ -164,8 +164,6 @@ def fit(self, X, y=None) -> "GapEncoder": class ModifiedGapEncoder(GapEncoder): - fitted_models_: list[ModifiedGapEncoderColumn] - def _create_column_gap_encoder(self): return ModifiedGapEncoderColumn( ngram_range=self.ngram_range, @@ -219,9 +217,9 @@ def _create_column_gap_encoder(self): repeat=2, ) def benchmark( - high_card_feature: str, - max_rows: int, - modif: bool, + high_card_feature, + max_rows, + modif, ): ds = fetch_traffic_violations() X = np.array(ds.X[high_card_feature]).reshape(-1, 1).astype(str) @@ -277,7 +275,7 @@ def benchmark( return res_dic -def plot(df: pd.DataFrame): +def plot(df): sns.lineplot( x="train_size", y="time_fit", data=df, hue="high_card_feature", style="modif" ) diff --git a/benchmarks/bench_minhash_batch_number.py b/benchmarks/bench_minhash_batch_number.py index 8cf74a292..4a26d36a3 100644 --- a/benchmarks/bench_minhash_batch_number.py +++ b/benchmarks/bench_minhash_batch_number.py @@ -9,9 +9,7 @@ import pickle from argparse import ArgumentParser -from collections.abc import Callable, Collection from pathlib import Path -from typing import Literal import matplotlib.pyplot as plt import numpy as np @@ -106,20 +104,18 @@ class MinHashEncoder(BaseEstimator, TransformerMixin): """ - hash_dict_: LRUDict - - _capacity: int = 2**10 + _capacity = 2**10 def __init__( self, - n_components: int = 30, - ngram_range: tuple[int, int] = (2, 4), - hashing: Literal["fast", "murmur"] = "fast", - minmax_hash: bool = False, - handle_missing: Literal["error", "zero_impute"] = "zero_impute", - batch: bool = False, - batch_per_job: int = 1, - n_jobs: int = None, + n_components=30, + ngram_range=(2, 4), + hashing="fast", + minmax_hash=False, + handle_missing="zero_impute", + batch=False, + batch_per_job=1, + n_jobs=None, ): self.ngram_range = ngram_range self.n_components = n_components @@ -130,13 +126,13 @@ def __init__( self.batch_per_job = batch_per_job self.n_jobs = n_jobs - def _more_tags(self) -> dict[str, list[str]]: + def _more_tags(self): """ Used internally by sklearn to ease the estimator checks. 
""" return {"X_types": ["categorical"]} - def _get_murmur_hash(self, string: str) -> np.array: + def _get_murmur_hash(self, string): """ Encode a string using murmur hashing function. @@ -164,7 +160,7 @@ def _get_murmur_hash(self, string: str) -> np.array: min_hashes = np.minimum(min_hashes, hash_array) return min_hashes / (2**32 - 1) - def _get_fast_hash(self, string: str) -> np.array: + def _get_fast_hash(self, string): """ Encode a string with fast hashing function. fast hashing supports both min_hash and minmax_hash encoding. @@ -194,9 +190,7 @@ def _get_fast_hash(self, string: str) -> np.array: ] ) - def _compute_hash( - self, string: str, hash_func: Callable[[str], np.ndarray] - ) -> np.ndarray: + def _compute_hash(self, string, hash_func): """Function called to compute the hash of a string. Check if the string is in the hash dictionary, if not, scompute the hash using @@ -221,9 +215,7 @@ def _compute_hash( self.hash_dict_[string] = hash_func(string) return self.hash_dict_[string] - def _compute_hash_batched( - self, batch: Collection[str], hash_func: Callable[[str], np.ndarray] - ): + def _compute_hash_batched(self, batch, hash_func): """Function called to compute the hashes of a batch of strings. Check if the string is in the hash dictionary, if not, compute the hash using @@ -251,7 +243,7 @@ def _compute_hash_batched( res[i] = self.hash_dict_[string] return res - def fit(self, X, y=None) -> "MinHashEncoder": + def fit(self, X, y=None): """ Fit the MinHashEncoder to X. In practice, just initializes a dictionary to store encodings to speed up computation. @@ -281,7 +273,7 @@ def fit(self, X, y=None) -> "MinHashEncoder": self.hash_dict_ = LRUDict(capacity=self._capacity) return self - def transform(self, X) -> np.array: + def transform(self, X): """ Transform X using specified encoding scheme. 
@@ -388,18 +380,18 @@ def transform(self, X) -> np.array: repeat=10, ) def benchmark( - dataset_size: str, - batched: bool, - n_jobs: int, - batch_per_job: int, -) -> None: + dataset_size, + batched, + n_jobs, + batch_per_job, +): X = data[dataset_size] MinHashEncoder(batch=batched, n_jobs=n_jobs, batch_per_job=batch_per_job).fit( X ).transform(X) -def plot(df: pd.DataFrame): +def plot(df): sns.set_theme(style="ticks", palette="pastel") # Create a new columns merging batched and batch_per_job diff --git a/benchmarks/bench_tablevectorizer_tuning.py b/benchmarks/bench_tablevectorizer_tuning.py index 74ddf3ca7..ceadadf3c 100644 --- a/benchmarks/bench_tablevectorizer_tuning.py +++ b/benchmarks/bench_tablevectorizer_tuning.py @@ -53,9 +53,9 @@ repeat=3, ) def benchmark( - tv_cardinality_threshold: int, - minhash_n_components: int, - dataset_name: str, + tv_cardinality_threshold, + minhash_n_components, + dataset_name, ): tv = TableVectorizer( cardinality_threshold=tv_cardinality_threshold, @@ -84,7 +84,7 @@ def benchmark( } -def plot(df: pd.DataFrame): +def plot(df): sns.set_theme(style="ticks", palette="pastel") n_datasets = len(np.unique(df["dataset_name"])) diff --git a/benchmarks/utils/_various.py b/benchmarks/utils/_various.py index b6f50dc6c..6a592055a 100644 --- a/benchmarks/utils/_various.py +++ b/benchmarks/utils/_various.py @@ -3,7 +3,6 @@ import pandas as pd from skrub.datasets import ( - DatasetAll, fetch_drug_directory, fetch_employee_salaries, fetch_medical_charge, @@ -14,11 +13,11 @@ ) -def find_result(bench_name: str) -> Path: +def find_result(bench_name): return choose_file(find_results(bench_name)) -def find_results(bench_name: str) -> list[Path]: +def find_results(bench_name): """ Returns the list of results in the results' directory. """ @@ -30,7 +29,7 @@ def find_results(bench_name: str) -> list[Path]: ] -def choose_file(results: list[Path]) -> Path: +def choose_file(results): """ Given a list of files, chooses one based on these rules: - If there are no files to choose from, exit the program @@ -66,7 +65,7 @@ def choose_file(results: list[Path]) -> Path: return results[int(choice) - 1] -def get_classification_datasets() -> dict[str, DatasetAll]: +def get_classification_datasets(): return { "open_payments": fetch_open_payments(), "drug_directory": fetch_drug_directory(), @@ -76,7 +75,7 @@ def get_classification_datasets() -> dict[str, DatasetAll]: } -def get_regression_datasets() -> dict[str, DatasetAll]: +def get_regression_datasets(): return { "medical_charge": fetch_medical_charge(), "employee_salaries": fetch_employee_salaries(), diff --git a/benchmarks/utils/join.py b/benchmarks/utils/join.py index 54273ba80..04397711d 100644 --- a/benchmarks/utils/join.py +++ b/benchmarks/utils/join.py @@ -1,14 +1,12 @@ -from pathlib import Path - import pandas as pd from skrub.datasets._utils import get_data_dir def get_local_data( - dataset_name: str, - data_home: Path | str | None = None, - data_directory: str | None = None, + dataset_name, + data_home=None, + data_directory=None, ): """Get the path to the local datasets.""" data_directory = get_data_dir(data_directory, data_home) @@ -26,11 +24,11 @@ def get_local_data( def fetch_data( - dataset_name: str, - save: bool = True, - data_home: Path | str | None = None, - data_directory: str | None = None, -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + dataset_name, + save=True, + data_home=None, + data_directory=None, +): """Fetch datasets from [1]_. 
Parameters @@ -88,12 +86,12 @@ def fetch_data( def fetch_big_data( - dataset_name: str, - data_type: str = "Dirty", - save: bool = True, - data_home: Path | str | None = None, - data_directory: str | None = None, -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + dataset_name, + data_type="Dirty", + save=True, + data_home=None, + data_directory=None, +): """Fetch datasets from [1]_. Parameters diff --git a/benchmarks/utils/monitor.py b/benchmarks/utils/monitor.py index f40d736ec..5afb3e68b 100644 --- a/benchmarks/utils/monitor.py +++ b/benchmarks/utils/monitor.py @@ -1,7 +1,7 @@ import os import tracemalloc from collections import defaultdict -from collections.abc import Callable, Collection, Mapping +from collections.abc import Mapping from datetime import datetime from itertools import product from pathlib import Path @@ -9,7 +9,6 @@ from string import ascii_letters, digits from time import perf_counter from time import time as get_time -from typing import Any from warnings import warn import pandas as pd @@ -18,13 +17,13 @@ def monitor( *, - parametrize: Collection[Mapping[Any]] | Mapping[str, Collection[Any]] | None = None, - save_as: str | None = None, - memory: bool = True, - time: bool = True, - repeat: int = 1, - hot_load: str | None = None, -) -> Callable[..., Callable[..., pd.DataFrame]]: + parametrize=None, + save_as=None, + memory=True, + time=True, + repeat=1, + hot_load=None, +): """Decorator used to monitor the execution of a function. The decorated function should return either: @@ -128,9 +127,7 @@ def monitor( reserved_column_names = {"iter", "time", "memory"} - def decorator( - func: Callable[..., Mapping[str, Any] | list[Mapping[str, Any]] | None] - ): + def decorator(func): """ Catches the decorated function. @@ -140,7 +137,7 @@ def decorator( The decorated function callable object. """ - def wrapper(*call_args, **call_kwargs) -> pd.DataFrame: + def wrapper(*call_args, **call_kwargs): """ Catches the decorated function's call arguments. @@ -162,7 +159,7 @@ def wrapper(*call_args, **call_kwargs) -> pd.DataFrame: f"positional values: {call_args!r}" ) - def get_random_file_name() -> str: + def get_random_file_name(): """ Returns a random file name, used by hot-loading. Format is ``{time}-{random_string}.parquet``. @@ -171,7 +168,7 @@ def get_random_file_name() -> str: time = int(get_time()) return f"{time}-{name}.parquet" - def load_intermediate_results(file_name: str) -> pd.DataFrame: + def load_intermediate_results(file_name): """ Loads the results from the file passed. If the file is not found, and to avoid unexpected behavior, @@ -184,12 +181,12 @@ def load_intermediate_results(file_name: str) -> pd.DataFrame: return pd.read_parquet(file_name) - def product_map(iterables: Mapping[str, Any]): + def product_map(iterables): """``itertools.product`` with mapping support.""" for combination in product(*iterables.values()): yield dict(zip(iterables.keys(), combination)) - def exec_func(**kwargs) -> pd.DataFrame: + def exec_func(**kwargs): """ Wraps the decorated function call with a single set of parameters, and pre-process the returned values. 
@@ -274,7 +271,6 @@ def exec_func(**kwargs) -> pd.DataFrame:
 
                 return df_results
 
-            parametrization: list[Mapping]
             if parametrize is None:
                 # Use the parameters passed by the call
                 parametrization = [call_kwargs]
diff --git a/setup.cfg b/setup.cfg
index e6fbcf4e6..23a16092c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -14,6 +14,7 @@ classifiers =
     Intended Audience :: Science/Research
     License :: OSI Approved :: BSD License
     Operating System :: OS Independent
+    Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
     Programming Language :: Python :: 3.11
     Programming Language :: Python :: 3.12
@@ -32,7 +33,7 @@ install_requires =
     scipy>=1.9.3
     pandas>=1.5.3
     packaging>=23.1
-python_requires = >=3.10
+python_requires = >=3.9
 
 [options.packages.find]
 include = skrub*
@@ -79,7 +80,7 @@ benchmarks =
     loguru
 # Overwrite the previous install_requires for CI testing purposes
 # as defined in testing.yml.
-min-py310 =
+min-py39 =
     scikit-learn==1.2.1
     numpy==1.23.5
     scipy==1.9.3
diff --git a/skrub/_agg_joiner.py b/skrub/_agg_joiner.py
index 159374a46..7044aa571 100644
--- a/skrub/_agg_joiner.py
+++ b/skrub/_agg_joiner.py
@@ -5,7 +5,6 @@
 Both classes aggregate the auxiliary table first, then join this grouped
 table with the main table.
 """
-from typing import Iterable
 
 import numpy as np
 from sklearn.base import BaseEstimator, TransformerMixin
@@ -22,7 +21,7 @@
 ALL_OPS = NUM_OPERATIONS + CATEG_OPERATIONS
 
 
-def split_num_categ_operations(operations: list[str]) -> tuple[list[str], list[str]]:
+def split_num_categ_operations(operations):
    """Separate aggregator operators input by their type.
 
    Parameters
@@ -373,9 +372,9 @@ class AggTarget(TransformerMixin, BaseEstimator):
 
     def __init__(
         self,
-        main_key: str | Iterable[str],
-        operation: str | Iterable[str] | None = None,
-        suffix: str | None = None,
+        main_key,
+        operation=None,
+        suffix=None,
     ):
         self.main_key = main_key
         self.operation = operation
diff --git a/skrub/_deduplicate.py b/skrub/_deduplicate.py
index 11a56d84b..ce6426bd4 100644
--- a/skrub/_deduplicate.py
+++ b/skrub/_deduplicate.py
@@ -2,12 +2,10 @@
 Implements deduplication based on clustering string distance matrices.
 """
 
-from collections.abc import Sequence
 
 import numpy as np
 import pandas as pd
 from joblib import Parallel, delayed
-from numpy.typing import NDArray
 from scipy.cluster.hierarchy import fcluster, linkage
 from scipy.spatial.distance import pdist, squareform
 from sklearn.feature_extraction.text import TfidfVectorizer
@@ -15,10 +13,10 @@
 
 
 def compute_ngram_distance(
-    unique_words: Sequence[str] | NDArray,
-    ngram_range: tuple[int, int] = (2, 4),
-    analyzer: str = "char_wb",
-) -> NDArray:
+    unique_words,
+    ngram_range=(2, 4),
+    analyzer="char_wb",
+):
    """Compute the condensed pair-wise n-gram distance between `unique_words`.
 
    Parameters
@@ -52,15 +50,13 @@ def compute_ngram_distance(
     return distance_mat
 
 
-def _get_silhouette_avg(Z: NDArray, n_clust: int, redundant_dist: NDArray) -> float:
+def _get_silhouette_avg(Z, n_clust, redundant_dist):
     labels = fcluster(Z, n_clust, criterion="maxclust")
     silhouette_avg = silhouette_score(redundant_dist, labels, metric="precomputed")
     return silhouette_avg
 
 
-def _guess_clusters(
-    Z: NDArray, distance_mat: NDArray, n_jobs: int | None = None
-) -> int:
+def _guess_clusters(Z, distance_mat, n_jobs=None):
    """Finds the number of clusters that maximize the silhouette score when
    clustering `distance_mat`.
@@ -88,10 +84,10 @@ def _guess_clusters( def _create_spelling_correction( - unique_words: Sequence[str] | NDArray[np.str_], - counts: Sequence[int] | NDArray[np.int_], - clusters: Sequence[int], -) -> pd.Series: + unique_words, + counts, + clusters, +): """ Creates a pandas Series that map each cluster member to the most frequent cluster member. The assumption is that the most common spelling @@ -115,8 +111,8 @@ def _create_spelling_correction( corrected spelling of each word as values. """ count_series = pd.Series(counts, index=unique_words) - original_spelling: list[str] = [] - corrected_spelling: list[str] = [] + original_spelling = [] + corrected_spelling = [] for cluster in np.unique(clusters): sorted_spellings = ( count_series.loc[clusters == cluster] diff --git a/skrub/_fast_hash.py b/skrub/_fast_hash.py index 5e07edf0f..96be294ec 100644 --- a/skrub/_fast_hash.py +++ b/skrub/_fast_hash.py @@ -48,11 +48,11 @@ def gen_atom(atom_len, seed=0): def ngram_min_hash( - string: str, - ngram_range: tuple[int, int] = (2, 4), - seed: int = 0, + string, + ngram_range=(2, 4), + seed=0, return_minmax=False, -) -> int | tuple[int, int]: +): """ Compute the min/max hash of the ngrams of the string. diff --git a/skrub/_gap_encoder.py b/skrub/_gap_encoder.py index 76b964454..e3d0059db 100644 --- a/skrub/_gap_encoder.py +++ b/skrub/_gap_encoder.py @@ -3,15 +3,11 @@ """ from __future__ import annotations -from collections.abc import Generator from copy import deepcopy -from typing import Literal import numpy as np import pandas as pd import scipy.sparse as sp -from numpy.random import RandomState -from numpy.typing import ArrayLike, NDArray from scipy import sparse from sklearn.base import TransformerMixin from sklearn.cluster import KMeans, kmeans_plusplus @@ -186,30 +182,27 @@ class GapEncoder(TransformerMixin, SingleColumnTransformer): The higher the value, the bigger the correspondence with the topic. """ - rho_: float - H_dict_: dict[NDArray, NDArray] - def __init__( self, - n_components: int = 10, - batch_size: int = 1024, - gamma_shape_prior: float = 1.1, - gamma_scale_prior: float = 1.0, - rho: float = 0.95, - rescale_rho: bool = False, - hashing: bool = False, - hashing_n_features: int = 2**12, - init: Literal["k-means++", "random", "k-means"] = "k-means++", - max_iter: int = 5, - ngram_range: tuple[int, int] = (2, 4), - analyzer: Literal["word", "char", "char_wb"] = "char", - add_words: bool = False, - random_state: int | RandomState | None = None, - rescale_W: bool = True, - max_iter_e_step: int = 1, - max_no_improvement: int = 5, + n_components=10, + batch_size=1024, + gamma_shape_prior=1.1, + gamma_scale_prior=1.0, + rho=0.95, + rescale_rho=False, + hashing=False, + hashing_n_features=2**12, + init="k-means++", + max_iter=5, + ngram_range=(2, 4), + analyzer="char", + add_words=False, + random_state=None, + rescale_W=True, + max_iter_e_step=1, + max_no_improvement=5, handle_missing="zero_impute", - verbose: int = 0, + verbose=0, ): self.ngram_range = ngram_range self.n_components = n_components @@ -231,7 +224,7 @@ def __init__( self.handle_missing = handle_missing self.verbose = verbose - def _init_vars(self, X) -> tuple[NDArray, NDArray, NDArray]: + def _init_vars(self, X): """ Build the bag-of-n-grams representation `V` of `X` and initialize the topics `W`. 
@@ -286,7 +279,7 @@ def _init_vars(self, X) -> tuple[NDArray, NDArray, NDArray]: self.rho_ = self.rho ** (self.batch_size / len(X)) return unq_X, unq_V, lookup - def _get_H(self, X: NDArray) -> NDArray: + def _get_H(self, X): """ Return the bag-of-n-grams representation of `X`. """ @@ -295,7 +288,7 @@ def _get_H(self, X: NDArray) -> NDArray: h_out[:] = self.H_dict_[x] return H_out - def _init_w(self, V: NDArray, X) -> tuple[NDArray, NDArray, NDArray]: + def _init_w(self, V, X): """ Initialize the topics `W`. If `self.init='k-means++'`, we use the init method of @@ -351,11 +344,11 @@ def _init_w(self, V: NDArray, X) -> tuple[NDArray, NDArray, NDArray]: def _minibatch_convergence( self, - batch_size: int, - batch_cost: float, - n_samples: int, - step: int, - n_steps: int, + batch_size, + batch_cost, + n_samples, + step, + n_steps, ): """ Helper function to encapsulate the early stopping logic. @@ -517,9 +510,9 @@ def fit(self, X, y=None): def get_feature_names_out( self, - n_labels: int = 3, - prefix: str = "", - ) -> list[str]: + n_labels=3, + prefix="", + ): """ Return the labels that best summarize the learned components/topics. @@ -692,7 +685,7 @@ def partial_fit(self, X, y=None): self.H_dict_.update(zip(unq_X, unq_H)) return self - def _add_unseen_keys_to_H_dict(self, X) -> None: + def _add_unseen_keys_to_H_dict(self, X): """ Add activations of unseen string categories from `X` to `H_dict`. """ @@ -792,7 +785,7 @@ def _handle_missing(self, X): return X -def _rescale_W(W: NDArray, A: NDArray) -> None: +def _rescale_W(W, A): """ Rescale the topics `W` to have a L1-norm equal to 1. Note that they are modified in-place. @@ -826,14 +819,14 @@ def _special_sparse_dot(H, W, X): def _multiplicative_update_w( - Vt: NDArray, - W: NDArray, - A: NDArray, - B: NDArray, - Ht: NDArray, - rescale_W: bool, - rho: float, -) -> tuple[NDArray, NDArray, NDArray]: + Vt, + W, + A, + B, + Ht, + rescale_W, + rho, +): """ Multiplicative update step for the topics `W`. """ @@ -852,7 +845,7 @@ def _multiplicative_update_w( return W, A, B -def _rescale_h(V: NDArray, H: NDArray) -> NDArray: +def _rescale_h(V, H): """ Rescale the activations `H`. """ @@ -863,14 +856,14 @@ def _rescale_h(V: NDArray, H: NDArray) -> NDArray: def _multiplicative_update_h( - Vt: NDArray, - W: NDArray, - Ht: NDArray, - epsilon: float = 1e-3, - max_iter: int = 10, - rescale_W: bool = False, - gamma_shape_prior: float = 1.1, - gamma_scale_prior: float = 1.0, + Vt, + W, + Ht, + epsilon=1e-3, + max_iter=10, + rescale_W=False, + gamma_shape_prior=1.1, + gamma_scale_prior=1.0, ): """ Multiplicative update step for the activations `H`. @@ -900,9 +893,9 @@ def _multiplicative_update_h( def batch_lookup( - lookup: NDArray, - n: int = 1, -) -> Generator[tuple[NDArray, NDArray], None, None]: + lookup, + n=1, +): """ Make batches of the lookup array. 
""" @@ -914,15 +907,15 @@ def batch_lookup( def get_kmeans_prototypes( - X: ArrayLike, - n_prototypes: int, - analyzer: Literal["word", "char", "char_wb"] = "char", - hashing_dim: int = 128, - ngram_range: tuple[int, int] = (2, 4), - sparse: bool = False, + X, + n_prototypes, + analyzer="char", + hashing_dim=128, + ngram_range=(2, 4), + sparse=False, sample_weight=None, - random_state: int | RandomState | None = None, -) -> NDArray: + random_state=None, +): """ Computes prototypes based on: - dimensionality reduction (via hashing n-grams) diff --git a/skrub/_minhash_encoder.py b/skrub/_minhash_encoder.py index c849cc4d1..53676a5f5 100644 --- a/skrub/_minhash_encoder.py +++ b/skrub/_minhash_encoder.py @@ -4,12 +4,8 @@ """ from __future__ import annotations -from collections.abc import Callable, Collection -from typing import Literal - import numpy as np from joblib import Parallel, delayed, effective_n_jobs -from numpy.typing import NDArray from sklearn.base import TransformerMixin from sklearn.utils import gen_even_slices, murmurhash3_32 from sklearn.utils.validation import check_is_fitted @@ -115,19 +111,17 @@ class MinHashEncoder(TransformerMixin, SingleColumnTransformer): 3 -1.975829e+09 -2.095000e+09 -1.530721e+09 -1.459183e+09 -1.580988e+09 """ - hash_dict_: LRUDict - - _capacity: int = 2**10 + _capacity = 2**10 def __init__( self, *, - n_components: int = 30, - ngram_range: tuple[int, int] = (2, 4), - hashing: Literal["fast", "murmur"] = "fast", - minmax_hash: bool = False, - handle_missing: Literal["error", "zero_impute"] = "zero_impute", - n_jobs: int = None, + n_components=30, + ngram_range=(2, 4), + hashing="fast", + minmax_hash=False, + handle_missing="zero_impute", + n_jobs=None, ): self.ngram_range = ngram_range self.n_components = n_components @@ -136,7 +130,7 @@ def __init__( self.handle_missing = handle_missing self.n_jobs = n_jobs - def _get_murmur_hash(self, string: str) -> NDArray: + def _get_murmur_hash(self, string): """ Encode a string using murmur hashing function. @@ -164,7 +158,7 @@ def _get_murmur_hash(self, string: str) -> NDArray: min_hashes = np.minimum(min_hashes, hash_array) return min_hashes / (2**32 - 1) - def _get_fast_hash(self, string: str) -> NDArray: + def _get_fast_hash(self, string): """Encode a string with fast hashing function. Fast hashing supports both min_hash and minmax_hash encoding. @@ -194,9 +188,7 @@ def _get_fast_hash(self, string: str) -> NDArray: ] ) - def _compute_hash_batched( - self, batch: Collection[str], hash_func: Callable[[str], NDArray] - ) -> NDArray: + def _compute_hash_batched(self, batch, hash_func): """Function called to compute the hashes of a batch of strings. Check if the string is in the hash dictionary, if not, compute the hash @@ -224,7 +216,7 @@ def _compute_hash_batched( res[i] = self.hash_dict_[string] return res - def fit(self, X, y=None) -> "MinHashEncoder": + def fit(self, X, y=None): """Fit the MinHashEncoder to `X`. In practice, just initializes a dictionary diff --git a/skrub/_similarity_encoder.py b/skrub/_similarity_encoder.py index a183531f7..86a7fa1fd 100644 --- a/skrub/_similarity_encoder.py +++ b/skrub/_similarity_encoder.py @@ -3,13 +3,11 @@ which encodes similarity instead of equality of values. 
""" -from typing import Literal import numpy as np import pandas as pd import sklearn from joblib import Parallel, delayed -from numpy.typing import ArrayLike, NDArray from scipy import sparse from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer from sklearn.preprocessing import OneHotEncoder @@ -23,15 +21,15 @@ def _ngram_similarity_one_sample_inplace( - x_count_vector: NDArray, - vocabulary_count_matrix: NDArray, - str_x: str, - vocabulary_ngram_counts: NDArray, - se_dict: dict, - unq_X: NDArray, - i: int, - ngram_range: tuple[int, int], -) -> None: + x_count_vector, + vocabulary_count_matrix, + str_x, + vocabulary_ngram_counts, + se_dict, + unq_X, + i, + ngram_range, +): """ Update inplace a dict of similarities between a string and a vocabulary @@ -74,12 +72,12 @@ def _ngram_similarity_one_sample_inplace( def ngram_similarity_matrix( X, - cats: list[str], - ngram_range: tuple[int, int], - analyzer: Literal["word", "char", "char_wb"], - hashing_dim: int, - dtype: type = np.float64, -) -> NDArray: + cats, + ngram_range, + analyzer, + hashing_dim, + dtype=np.float64, +): """ Similarity encoding for dirty categorical variables: Given two arrays of strings, returns the similarity encoding matrix @@ -261,25 +259,17 @@ class SimilarityEncoder(OneHotEncoder): dtype=object) """ - categories_: list[NDArray] - n_features_in_: int - drop_idx_: NDArray - vectorizers_: list[CountVectorizer] - vocabulary_count_matrices_: list[NDArray] - vocabulary_ngram_counts_: list[list[int]] - _infrequent_enabled: bool - def __init__( self, *, - ngram_range: tuple[int, int] = (2, 4), - analyzer: Literal["word", "char", "char_wb"] = "char", - categories: Literal["auto"] | list[list[str]] = "auto", - dtype: type = np.float64, - handle_unknown: Literal["error", "ignore"] = "ignore", - handle_missing: Literal["error", ""] = "", - hashing_dim: int | None = None, - n_jobs: int | None = None, + ngram_range=(2, 4), + analyzer="char", + categories="auto", + dtype=np.float64, + handle_unknown="ignore", + handle_missing="", + hashing_dim=None, + n_jobs=None, ): super().__init__() self.categories = categories @@ -298,7 +288,7 @@ def __init__( "'auto' or a list of prototypes. " ) - def fit(self, X: ArrayLike, y=None) -> "SimilarityEncoder": + def fit(self, X, y=None): """Fit the instance to `X`. Parameters @@ -422,7 +412,7 @@ def fit(self, X: ArrayLike, y=None) -> "SimilarityEncoder": self._n_features_outs = list(map(len, self.categories_)) return self - def transform(self, X: ArrayLike, fast: bool = True) -> NDArray: + def transform(self, X, fast=True): """Transform `X` using specified encoding scheme. Parameters @@ -498,9 +488,9 @@ def transform(self, X: ArrayLike, fast: bool = True) -> NDArray: def _ngram_similarity_fast( self, - X: list | NDArray, - col_idx: int, - ) -> NDArray: + X, + col_idx, + ): """ Fast computation of ngram similarity. diff --git a/skrub/_string_distances.py b/skrub/_string_distances.py index eecedd689..cf454cde0 100644 --- a/skrub/_string_distances.py +++ b/skrub/_string_distances.py @@ -8,7 +8,7 @@ # TODO vectorize these functions (accept arrays) -def get_ngram_count(string: str, ngram_range: tuple[int, int]) -> int: +def get_ngram_count(string, ngram_range): """ Compute the number of ngrams in a string. @@ -30,7 +30,7 @@ def get_ngram_count(string: str, ngram_range: tuple[int, int]) -> int: return ngram_count -def preprocess(x: str) -> str: +def preprocess(x): """ Combine preprocessing done by CountVectorizer and the SimilarityEncoder. 
@@ -56,7 +56,7 @@ def preprocess(x: str) -> str:
     return _white_spaces.sub(" ", x)
 
 
-def get_unique_ngrams(string: str, ngram_range: tuple[int, int]):
+def get_unique_ngrams(string, ngram_range):
    """
    Return the set of unique n-grams of a string.
 
@@ -83,7 +83,7 @@ def get_unique_ngrams(string: str, ngram_range: tuple[int, int]):
     return ngram_set
 
 
-def get_ngrams(string: str, n: int) -> list[tuple]:
+def get_ngrams(string, n):
    """Return the set of different n-grams in a string"""
    # Pure Python implementation: no numpy
    spaces = " "  # * (n // 2 + n % 2)
diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
index bec0532ef..06931b6c6 100644
--- a/skrub/_table_vectorizer.py
+++ b/skrub/_table_vectorizer.py
@@ -514,7 +514,7 @@ def _check_specific_columns(self):
                     f"Column {c!r} used twice in 'specific_transformers', "
                     f"at indices {specific_columns[c]} and {i}."
                 )
-            specific_columns |= {c: i for c in cols}
+            specific_columns.update({c: i for c in cols})
         self._specific_columns = list(specific_columns.keys())
 
     def _make_pipeline(self):
@@ -620,7 +620,7 @@ def _store_output_to_input(self):
 
     # scikit-learn compatibility
 
-    def _more_tags(self) -> dict:
+    def _more_tags(self):
        """
        Used internally by sklearn to ease the estimator checks.
        """
diff --git a/skrub/_utils.py b/skrub/_utils.py
index 09e91697d..5ba0554d5 100644
--- a/skrub/_utils.py
+++ b/skrub/_utils.py
@@ -1,11 +1,9 @@
 import collections
 import importlib
 import secrets
-from collections.abc import Hashable
-from typing import Any, Iterable
+from typing import Iterable
 
 import numpy as np
-from numpy.typing import NDArray
 from sklearn.base import clone
 from sklearn.utils import check_array
 
@@ -18,11 +16,11 @@ class LRUDict:
     Using LRU eviction avoids memorizing a full dataset.
     """
 
-    def __init__(self, capacity: int):
+    def __init__(self, capacity):
         self.capacity = capacity
         self.cache = collections.OrderedDict()
 
-    def __getitem__(self, key: Hashable):
+    def __getitem__(self, key):
         try:
             value = self.cache.pop(key)
             self.cache[key] = value
@@ -30,7 +28,7 @@ def __getitem__(self, key: Hashable):
         except KeyError:
             return -1
 
-    def __setitem__(self, key: Hashable, value: Any):
+    def __setitem__(self, key, value):
         try:
             self.cache.pop(key)
         except KeyError:
@@ -38,11 +36,11 @@ def __setitem__(self, key: Hashable, value: Any):
                 self.cache.popitem(last=False)
         self.cache[key] = value
 
-    def __contains__(self, key: Hashable):
+    def __contains__(self, key):
         return key in self.cache
 
 
-def check_input(X) -> NDArray:
+def check_input(X):
    """Check input with sklearn standards.
 
    Also converts X to a numpy array if not already.
@@ -69,7 +67,7 @@ def check_input(X) -> NDArray:
     return X_
 
 
-def import_optional_dependency(name: str, extra: str = ""):
+def import_optional_dependency(name, extra=""):
    """Import an optional dependency.
 
    By default, if a dependency is missing an ImportError with a nice
diff --git a/skrub/datasets/_generating.py b/skrub/datasets/_generating.py
index 8514c732d..0f1aefc9c 100644
--- a/skrub/datasets/_generating.py
+++ b/skrub/datasets/_generating.py
@@ -11,11 +11,11 @@
 
 
 def make_deduplication_data(
-    examples: list[str],
-    entries_per_example: list[int],
-    prob_mistake_per_letter: float = 0.2,
-    random_state: int | np.random.RandomState | None = None,
-) -> list[str]:
+    examples,
+    entries_per_example,
+    prob_mistake_per_letter=0.2,
+    random_state=None,
+):
    """Duplicates examples with spelling mistakes.
 
    Characters are misspelled with probability `prob_mistake_per_letter`.
diff --git a/skrub/datasets/_ken_embeddings.py b/skrub/datasets/_ken_embeddings.py index 0a447edb0..4d6f174c9 100644 --- a/skrub/datasets/_ken_embeddings.py +++ b/skrub/datasets/_ken_embeddings.py @@ -17,7 +17,7 @@ ) -def fetch_ken_table_aliases() -> set[str]: +def fetch_ken_table_aliases(): """Get the supported aliases of embedded KEN entities tables. These aliases can be using in subsequent functions (see section *See Also*). @@ -50,11 +50,11 @@ def fetch_ken_table_aliases() -> set[str]: def fetch_ken_types( - search: str = None, + search=None, *, - exclude: str | None = None, - embedding_table_id: str = "all_entities", -) -> pd.DataFrame: + exclude=None, + embedding_table_id="all_entities", +): """Helper function to search for KEN entity types. The result can then be used with fetch_ken_embeddings. @@ -136,14 +136,14 @@ def fetch_ken_types( def fetch_ken_embeddings( - search_types: str | None = None, + search_types=None, *, - exclude: str | None = None, - embedding_table_id: str = "all_entities", - embedding_type_id: str | None = None, - pca_components: int | None = None, - suffix: str = "", -) -> pd.DataFrame: + exclude=None, + embedding_table_id="all_entities", + embedding_type_id=None, + pca_components=None, + suffix="", +): """Download Wikipedia embeddings by type. More details on the embeddings can be found on diff --git a/skrub/datasets/_utils.py b/skrub/datasets/_utils.py index 7b9a57ab3..aba51f5e6 100644 --- a/skrub/datasets/_utils.py +++ b/skrub/datasets/_utils.py @@ -1,7 +1,7 @@ from pathlib import Path -def get_data_home(data_home: Path | str | None = None) -> Path: +def get_data_home(data_home=None): """Returns the path of the skrub data directory. This folder is used by some large dataset loaders to avoid downloading the @@ -35,7 +35,7 @@ def get_data_home(data_home: Path | str | None = None) -> Path: return data_home -def get_data_dir(name: str | None = None, data_home: Path | str | None = None) -> Path: +def get_data_dir(name=None, data_home=None): """ Returns the directory in which skrub looks for data. diff --git a/skrub/datasets/tests/test_fetching.py b/skrub/datasets/tests/test_fetching.py index 61c79dab6..523622276 100644 --- a/skrub/datasets/tests/test_fetching.py +++ b/skrub/datasets/tests/test_fetching.py @@ -9,7 +9,7 @@ from skrub.datasets import _fetching -def _has_data_id(call, data_id: int) -> bool: +def _has_data_id(call, data_id): # Unpacking copied from `mock._Call.__eq__` if len(call) == 2: args, kwargs = call @@ -22,7 +22,7 @@ def _has_data_id(call, data_id: int) -> bool: "skrub.datasets._fetching.fetch_openml", side_effect=_fetching.fetch_openml, ) -def test_openml_fetching(fetch_openml_mock: mock.Mock): +def test_openml_fetching(fetch_openml_mock): """ Downloads a small dataset (midwest survey) and performs a bunch of tests that asserts the fetching function works correctly. @@ -79,7 +79,7 @@ def test_openml_datasets_exist(): @mock.patch("skrub.datasets._fetching.fetch_openml") -def test_openml_datasets_calls(fetch_openml_mock: mock.Mock): +def test_openml_datasets_calls(fetch_openml_mock): """ Checks that calling the fetching functions actually calls `sklearn.datasets.fetch_openml`. 
diff --git a/skrub/tests/test_check_input.py b/skrub/tests/test_check_input.py index 6b802b33f..764dbbe52 100644 --- a/skrub/tests/test_check_input.py +++ b/skrub/tests/test_check_input.py @@ -53,11 +53,9 @@ def test_column_names_to_unique_strings(): df = pd.DataFrame(np.ones((2, 4)), columns=["a", 0, "0", "a"]) assert df.columns.tolist() == ["a", 0, "0", "a"] check = CheckInputDataFrame() - with ( - pytest.warns(UserWarning, match="Some column names are not strings"), - pytest.warns(UserWarning, match="Found duplicated column names"), - ): - out = check.fit_transform(df) + with pytest.warns(UserWarning, match="Some column names are not strings"): + with pytest.warns(UserWarning, match="Found duplicated column names"): + out = check.fit_transform(df) assert out.shape == (2, 4) out_cols = out.columns.tolist() assert out_cols[:2] == ["a", "0"] diff --git a/skrub/tests/test_deduplicate.py b/skrub/tests/test_deduplicate.py index 9e3323de4..16042989f 100644 --- a/skrub/tests/test_deduplicate.py +++ b/skrub/tests/test_deduplicate.py @@ -1,5 +1,3 @@ -from functools import cache - import joblib import numpy as np import pandas as pd @@ -22,9 +20,9 @@ [([500, 100, 1500], 0.05), ([100, 100], 0.02), ([200, 50, 30, 200, 800], 0.01)], ) def test_deduplicate( - entries_per_category: list[int], - prob_mistake_per_letter: float, - seed: int = 123, + entries_per_category, + prob_mistake_per_letter, + seed=123, ): rng = np.random.RandomState(seed) @@ -78,7 +76,7 @@ def test__guess_clusters(): assert n_clusters == len(np.unique(words)) -def test__create_spelling_correction(seed: int = 123): +def test__create_spelling_correction(seed=123): rng = np.random.RandomState(seed) n_clusters = 3 samples_per_cluster = 10 @@ -101,8 +99,7 @@ def test__create_spelling_correction(seed: int = 123): ).all() -@cache -def default_deduplicate(n: int = 500, random_state=0): +def default_deduplicate(n=500, random_state=0): """ Create a default deduplication dataset. """ diff --git a/skrub/tests/test_docstrings.py b/skrub/tests/test_docstrings.py index 22c594a07..7ee463b1e 100644 --- a/skrub/tests/test_docstrings.py +++ b/skrub/tests/test_docstrings.py @@ -10,7 +10,6 @@ import inspect import re -from collections.abc import Callable from importlib import import_module import pytest @@ -68,10 +67,10 @@ def get_functions_to_validate(): def repr_errors( - res: dict, - estimator: type | None = None, - method: str | None = None, -) -> str: + res, + estimator=None, + method=None, +): """ Pretty print original docstring and the obtained errors @@ -126,7 +125,7 @@ def repr_errors( return msg -def filter_errors(errors, method: Callable, estimator_cls: type | None = None): +def filter_errors(errors, method, estimator_cls=None): """ Ignore some errors based on the method type. 
""" @@ -163,7 +162,7 @@ def filter_errors(errors, method: Callable, estimator_cls: type | None = None): ["estimator_cls", "method"], get_methods_to_validate(), ) -def test_estimator_docstrings(estimator_cls: type, method: str, request): +def test_estimator_docstrings(estimator_cls, method, request): base_import_path = estimator_cls.__module__ import_path = [base_import_path, estimator_cls.__name__] if method is not None: @@ -192,7 +191,7 @@ def test_estimator_docstrings(estimator_cls: type, method: str, request): ["func", "name"], get_functions_to_validate(), ) -def test_function_docstrings(func: Callable, name: str, request): +def test_function_docstrings(func, name, request): import_path = ".".join([func.__module__, name]) print(import_path) diff --git a/skrub/tests/test_fast_hash.py b/skrub/tests/test_fast_hash.py index 3cf251964..2e16a9d55 100644 --- a/skrub/tests/test_fast_hash.py +++ b/skrub/tests/test_fast_hash.py @@ -2,7 +2,7 @@ from skrub.tests.utils import generate_data -def test_fast_hash() -> None: +def test_fast_hash(): data = generate_data(100, as_list=True) a = data[0] diff --git a/skrub/tests/test_fuzzy_join.py b/skrub/tests/test_fuzzy_join.py index 27ebae3e5..624e9f2e4 100644 --- a/skrub/tests/test_fuzzy_join.py +++ b/skrub/tests/test_fuzzy_join.py @@ -1,5 +1,4 @@ import warnings -from typing import Literal import numpy as np import pandas as pd @@ -16,7 +15,7 @@ "analyzer", ["char", "char_wb", "word"], ) -def test_fuzzy_join(df_module, analyzer: Literal["char", "char_wb", "word"]): +def test_fuzzy_join(df_module, analyzer): """ Testing if ``fuzzy_join`` results are as expected. """ diff --git a/skrub/tests/test_gap_encoder.py b/skrub/tests/test_gap_encoder.py index 9c9dd98aa..3cd3e67f1 100644 --- a/skrub/tests/test_gap_encoder.py +++ b/skrub/tests/test_gap_encoder.py @@ -32,11 +32,11 @@ def generate(*args, as_list=True, **kwargs): ], ) def test_analyzer( - hashing: bool, - init: str, - rescale_W: bool, - add_words: bool, - rescale_rho: bool, + hashing, + init, + rescale_W, + add_words, + rescale_rho, generate_data, ): """ @@ -90,11 +90,11 @@ def test_analyzer( ], ) def test_gap_encoder( - hashing: bool, - init: str, - analyzer: str, - add_words: bool, - verbose: bool, + hashing, + init, + analyzer, + add_words, + verbose, generate_data, ): n_samples = 70 @@ -137,7 +137,7 @@ def test_gap_encoder( "add_words", [True, False], ) -def test_partial_fit(df_module, add_words: bool, generate_data): +def test_partial_fit(df_module, add_words, generate_data): n_samples = 70 X = generate_data(n_samples, random_state=0) X2 = generate_data(n_samples - 10, random_state=1) @@ -217,7 +217,7 @@ def test_score(generate_data): "missing", ["zero_impute", "error", "aaa"], ) -def test_missing_values(df_module, missing: str): +def test_missing_values(df_module, missing): """Test what happens when missing values are in the data.""" if df_module.name == "polars": pytest.xfail( diff --git a/skrub/tests/test_minhash_encoder.py b/skrub/tests/test_minhash_encoder.py index 8d5036749..630c1ee46 100644 --- a/skrub/tests/test_minhash_encoder.py +++ b/skrub/tests/test_minhash_encoder.py @@ -77,7 +77,7 @@ def test_encoder_params(generate_data, hashing, minmax_hash): @pytest.mark.parametrize("missing", ["error", "zero_impute", "aaa"]) @pytest.mark.parametrize("hashing", ["fast", "murmur", "aaa"]) -def test_missing_values(df_module, missing: str, hashing: str): +def test_missing_values(df_module, missing, hashing): X = df_module.make_column( "", ["Red", None, "green", "blue", "green", "green", "blue", None] ) 
diff --git a/skrub/tests/test_similarity_encoder.py b/skrub/tests/test_similarity_encoder.py index a11d27211..ce82e4449 100644 --- a/skrub/tests/test_similarity_encoder.py +++ b/skrub/tests/test_similarity_encoder.py @@ -1,5 +1,3 @@ -from collections.abc import Callable - import numpy as np import numpy.testing import pandas as pd @@ -76,7 +74,7 @@ def test_parameters(): sim.transform(X2) -def _test_missing_values(input_type: str, missing: str): +def _test_missing_values(input_type, missing): observations = [["a", "b"], ["b", "a"], ["b", None], ["a", "c"], [np.nan, "a"]] encoded = np.array( [ @@ -108,7 +106,7 @@ def _test_missing_values(input_type: str, missing: str): return -def _test_missing_values_transform(input_type: str, missing: str): +def _test_missing_values_transform(input_type, missing): observations = [["a", "b"], ["b", "a"], ["b", "b"], ["a", "c"], ["c", "a"]] test_observations = [ ["a", "b"], @@ -146,9 +144,9 @@ def _test_missing_values_transform(input_type: str, missing: str): def _test_similarity( - similarity_f: Callable, - hashing_dim: int | None = None, - categories: str = "auto", + similarity_f, + hashing_dim=None, + categories="auto", ): X = np.array(["aa", "aaa", "aaab"]).reshape(-1, 1) X_test = np.array([["Aa", "aAa", "aaa", "aaab", " aaa c"]]).reshape(-1, 1) diff --git a/skrub/tests/test_string_distances.py b/skrub/tests/test_string_distances.py index e56b15876..08dd2346a 100644 --- a/skrub/tests/test_string_distances.py +++ b/skrub/tests/test_string_distances.py @@ -3,7 +3,7 @@ from skrub import _string_distances -def test_get_unique_ngrams() -> None: +def test_get_unique_ngrams(): string = "test" true_ngrams = { (" ", "t"), @@ -24,7 +24,7 @@ def test_get_unique_ngrams() -> None: assert ngrams == true_ngrams -def _random_string_pairs(n_pairs=50, seed=1) -> list[tuple[str, str]]: +def _random_string_pairs(n_pairs=50, seed=1): rng = np.random.RandomState(seed) characters = list(map(chr, range(10000))) pairs = [] @@ -37,12 +37,12 @@ def _random_string_pairs(n_pairs=50, seed=1) -> list[tuple[str, str]]: return pairs -def _check_symmetry(dist_func, *args, **kwargs) -> None: +def _check_symmetry(dist_func, *args, **kwargs): for a, b in _random_string_pairs(): assert dist_func(a, b, *args, **kwargs) == dist_func(b, a, *args, **kwargs) -def test_ngram_similarity() -> None: +def test_ngram_similarity(): # TODO # assert ... for n in range(1, 4): diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index 01e5a3f89..b32232653 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -514,7 +514,7 @@ def test_changing_types(X_train, X_test, expected_X_out): assert (X_out.dropna() == expected_X_out.dropna()).all().all() -def test_changing_types_int_float() -> None: +def test_changing_types_int_float(): """ The TableVectorizer shouldn't cast floats to ints even if only ints were seen during fit. 
@@ -684,6 +684,8 @@ def test_accept_pipeline():
 
 def test_clean_null_downcast_warning():
     # non-regression test for https://github.com/skrub-data/skrub/issues/894
+    if parse_version(sklearn.__version__) < parse_version("1.4"):
+        pytest.skip("polars not supported for old scikit-learn versions")
     pl = pytest.importorskip("polars")
     df = pl.DataFrame(dict(a=[0, 1], b=["a", "b"]))
     with warnings.catch_warnings():
diff --git a/skrub/tests/utils.py b/skrub/tests/utils.py
index 6c3708e63..8cc179948 100644
--- a/skrub/tests/utils.py
+++ b/skrub/tests/utils.py
@@ -1,15 +1,14 @@
 import random
 
 import numpy as np
-from numpy.typing import NDArray
 
 
 def generate_data(
-    n_samples: int,
-    as_list: bool = False,
-    random_state: int | float | str | bytes | bytearray | None = None,
-    sample_length: int = 100,
-) -> NDArray:
+    n_samples,
+    as_list=False,
+    random_state=None,
+    sample_length=100,
+):
     if random_state is not None:
         random.seed(random_state)
     MAX_LIMIT = 255  # extended ASCII Character set