diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
index c4a8aa5b9..cbd2a565c 100644
--- a/.github/workflows/testing.yml
+++ b/.github/workflows/testing.yml
@@ -35,7 +35,7 @@ jobs:
       fail-fast: false
       matrix:
         os: ["ubuntu-latest", "macos-latest", "windows-latest"]
-        python-version: ["3.10", "3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
         # Install the min or latest dependencies for skrub
         # as defined in setup.cfg at [options.extras_require].
         #
@@ -56,9 +56,12 @@ jobs:
            python-version: "3.11"
          - dependencies-version: "dev, polars"
            python-version: "3.12"
-          - dependencies-version: "dev, min-py310"
+          - dependencies-version: "dev, min-py39"
            python-version: "3.10"
            dependencies-version-type: "minimal"
+          - dependencies-version: "dev, min-py39"
+            python-version: "3.9"
+            dependencies-version-type: "minimal"
     name: ${{ matrix.os-name }} with Python ${{ matrix.python-version }} and ${{ matrix.dependencies-version-type }} dependencies
     defaults:
       run:
diff --git a/CHANGES.rst b/CHANGES.rst
index f905921a7..7e90826d1 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -36,6 +36,9 @@ Major changes
   It now has a ``key`` parameter that allows joining main and auxiliary
   tables that share the same column names.
   :pr:`876` by :user:`Théo Jolivet `.
 
+* The minimum supported Python version is now 3.9.
+  :pr:`939` by :user:`Jérôme Dockès `.
+
 Minor changes
 -------------
diff --git a/benchmarks/bench_fuzzy_join_count_vs_hash.py b/benchmarks/bench_fuzzy_join_count_vs_hash.py
index e9fbafa8a..f693ed947 100644
--- a/benchmarks/bench_fuzzy_join_count_vs_hash.py
+++ b/benchmarks/bench_fuzzy_join_count_vs_hash.py
@@ -14,7 +14,6 @@
 import math
 from argparse import ArgumentParser
 from time import perf_counter
-from typing import Literal, Tuple, Union
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -34,21 +33,21 @@
 
 # Function kept for reference
 def fuzzy_join(
-    left: pd.DataFrame,
-    right: pd.DataFrame,
-    how: Literal["left", "right"] = "left",
-    left_on: Union[str, None] = None,
-    right_on: Union[str, None] = None,
-    on: Union[str, None] = None,
-    encoder: Literal["count", "hash"] = "count",
-    analyzer: Literal["word", "char", "char_wb"] = "char_wb",
-    ngram_range: Tuple[int, int] = (2, 4),
-    return_score: bool = False,
-    match_score: float = 0,
-    drop_unmatched: bool = False,
-    sort: bool = False,
-    suffixes: Tuple[str, str] = ("_x", "_y"),
-) -> pd.DataFrame:
+    left,
+    right,
+    how="left",
+    left_on=None,
+    right_on=None,
+    on=None,
+    encoder="count",
+    analyzer="char_wb",
+    ngram_range=(2, 4),
+    return_score=False,
+    match_score=0,
+    drop_unmatched=False,
+    sort=False,
+    suffixes=("_x", "_y"),
+):
     """
     Join two tables categorical string columns based on approximate matching
     and using morphological similarity.
@@ -317,10 +316,10 @@ def fuzzy_join( repeat=10, ) def benchmark( - encoder: Literal["hash", "count"], - dataset_name: str, - analyzer: Literal["char_wb", "char", "word"], - ngram_range: tuple, + encoder, + dataset_name, + analyzer, + ngram_range, ): left_table, right_table, gt = load_data(dataset_name) @@ -352,7 +351,7 @@ def benchmark( return res_dic -def plot(df: pd.DataFrame): +def plot(df): sns.set_theme(style="ticks", palette="pastel") n_datasets = len(np.unique(df["dataset_name"])) diff --git a/benchmarks/bench_fuzzy_join_sparse_vs_dense.py b/benchmarks/bench_fuzzy_join_sparse_vs_dense.py index 851ee95bd..ead650e93 100644 --- a/benchmarks/bench_fuzzy_join_sparse_vs_dense.py +++ b/benchmarks/bench_fuzzy_join_sparse_vs_dense.py @@ -13,9 +13,7 @@ import warnings from argparse import ArgumentParser from collections.abc import Iterable -from pathlib import Path from time import perf_counter -from typing import Literal import matplotlib.pyplot as plt import numpy as np @@ -35,11 +33,11 @@ def _numeric_encoding( - main: pd.DataFrame, - main_cols: list | str, - aux: pd.DataFrame, - aux_cols: list | str, -) -> tuple: + main, + main_cols, + aux, + aux_cols, +): """Encoding numerical columns. Parameters @@ -71,15 +69,15 @@ def _numeric_encoding( def _string_encoding( - main: pd.DataFrame, - main_cols: list | str, - aux: pd.DataFrame, - aux_cols: list | str, - analyzer: Literal["word", "char", "char_wb"], - ngram_range: int | int, - encoder: _VectorizerMixin = None, - sparse: bool = True, -) -> tuple: + main, + main_cols, + aux, + aux_cols, + analyzer, + ngram_range, + encoder=None, + sparse=True, +): """Encoding string columns. Parameters @@ -147,7 +145,7 @@ def _string_encoding( return main_enc_d, aux_enc_d -def _nearest_matches(main_array, aux_array, sparse=True) -> np.ndarray | np.ndarray: +def _nearest_matches(main_array, aux_array, sparse=True): """Find the closest matches using the nearest neighbors method. Parameters @@ -182,23 +180,23 @@ def _nearest_matches(main_array, aux_array, sparse=True) -> np.ndarray | np.ndar def fuzzy_join( - left: pd.DataFrame, - right: pd.DataFrame, - how: Literal["left", "right"] = "left", - left_on: str | list[str] | list[int] | None = None, - right_on: str | list[str] | list[int] | None = None, - on: str | list[str] | list[int] | None = None, - numerical_match: Literal["string", "number"] = "number", - encoder: _VectorizerMixin = None, - analyzer: Literal["word", "char", "char_wb"] = "char_wb", - ngram_range: tuple[int, int] = (2, 4), - return_score: bool = False, - match_score: float = 0, - drop_unmatched: bool = False, - sort: bool = False, - suffixes: tuple[str, str] = ("_x", "_y"), - sparse: bool = True, -) -> pd.DataFrame: + left, + right, + how="left", + left_on=None, + right_on=None, + on=None, + numerical_match="number", + encoder=None, + analyzer="char_wb", + ngram_range=(2, 4), + return_score=False, + match_score=0, + drop_unmatched=False, + sort=False, + suffixes=("_x", "_y"), + sparse=True, +): """ Join two tables categorical string columns based on approximate matching and using morphological similarity. 
@@ -526,12 +524,12 @@ def fuzzy_join( save_as=benchmark_name, ) def benchmark( - sparse: bool, - dataset_name: str, - analyzer: Literal["char_wb", "char", "word"], - ngram_range: tuple, - data_home: Path | str | None = None, - data_directory: str | None = "benchmarks_data", + sparse, + dataset_name, + analyzer, + ngram_range, + data_home=None, + data_directory="benchmarks_data", ): left_table, right_table, gt = fetch_big_data( dataset_name=dataset_name, data_home=data_home, data_directory=data_directory @@ -578,7 +576,7 @@ def benchmark( return res_dic -def plot(df: pd.DataFrame): +def plot(df): sns.set_theme(style="ticks", palette="pastel") n_datasets = len(np.unique(df["dataset_name"])) diff --git a/benchmarks/bench_fuzzy_join_vs_others.py b/benchmarks/bench_fuzzy_join_vs_others.py index d7c526c71..69ad95a4b 100644 --- a/benchmarks/bench_fuzzy_join_vs_others.py +++ b/benchmarks/bench_fuzzy_join_vs_others.py @@ -96,8 +96,8 @@ def autofj_merge(left, right, target=0.9): repeat=5, ) def benchmark( - dataset_name: str, - join: str, + dataset_name, + join, ): left_table, right_table, gt = load_data(dataset_name) @@ -151,7 +151,7 @@ def benchmark( return res_dic -def plot(df: pd.DataFrame): +def plot(df): sns.set_theme(style="ticks", palette="pastel") n_datasets = len(np.unique(df["dataset_name"])) diff --git a/benchmarks/bench_gap_divergence.py b/benchmarks/bench_gap_divergence.py index 6dcdc3933..f78ddfe32 100644 --- a/benchmarks/bench_gap_divergence.py +++ b/benchmarks/bench_gap_divergence.py @@ -55,10 +55,10 @@ class ModifiedGapEncoderColumn(GapEncoder): - def __init__(self, *args, column_name: str = "MISSING COLUMN", **kwargs): + def __init__(self, *args, column_name="MISSING COLUMN", **kwargs): super().__init__(*args, **kwargs) self.column_name = column_name - self.benchmark_results_: list[dict[str, np.ndarray | float]] = [] + self.benchmark_results_ = [] def fit(self, X, y=None): # Copy parameter rho @@ -123,9 +123,7 @@ def fit(self, X, y=None): class ModifiedGapEncoder(GapEncoder): - fitted_models_: list[ModifiedGapEncoderColumn] - - def _create_column_gap_encoder(self, column_name: str): + def _create_column_gap_encoder(self, column_name): return ModifiedGapEncoderColumn( column_name=column_name, ngram_range=self.ngram_range, @@ -187,7 +185,7 @@ def fit(self, X, y=None): }, save_as=benchmark_name, ) -def benchmark(max_iter_e_step: int, dataset_name: str): +def benchmark(max_iter_e_step, dataset_name): """ Cross-validate a pipeline with a modified `GapEncoder` instance for the high cardinality column. 
The rest of the columns are passed to a @@ -261,7 +259,7 @@ def benchmark(max_iter_e_step: int, dataset_name: str): return results -def plot(df: pd.DataFrame): +def plot(df): # Keep only the last outer iteration df = df[df["gap_iter"] == 5] diff --git a/benchmarks/bench_gap_encoder_hp.py b/benchmarks/bench_gap_encoder_hp.py index 1d076d798..4a6b6e73f 100644 --- a/benchmarks/bench_gap_encoder_hp.py +++ b/benchmarks/bench_gap_encoder_hp.py @@ -49,12 +49,12 @@ repeat=1, ) def benchmark( - high_card_feature: str, - batch_size: int, - max_iter_e_step: int, - max_rows: int, - max_no_improvement: int, - random_state: int, + high_card_feature, + batch_size, + max_iter_e_step, + max_rows, + max_no_improvement, + random_state, ): X = np.array(ds.X[high_card_feature]).reshape(-1, 1).astype(str) y = ds.y @@ -111,7 +111,7 @@ def benchmark( return res_dic -def plot(df: pd.DataFrame): +def plot(df): base_values = {"batch_size": 1024, "max_iter_e_step": 1, "max_no_improvement": 5} for variable in base_values.keys(): df_to_plot = df diff --git a/benchmarks/bench_gap_es_score.py b/benchmarks/bench_gap_es_score.py index bb771bb1e..be32b9ab3 100644 --- a/benchmarks/bench_gap_es_score.py +++ b/benchmarks/bench_gap_es_score.py @@ -82,7 +82,7 @@ def _minibatch_convergence(self, batch_size, batch_cost, n_samples, step, n_step return False - def fit(self, X, y=None) -> "GapEncoder": + def fit(self, X, y=None): """ Fit the GapEncoder on `X`. @@ -164,8 +164,6 @@ def fit(self, X, y=None) -> "GapEncoder": class ModifiedGapEncoder(GapEncoder): - fitted_models_: list[ModifiedGapEncoderColumn] - def _create_column_gap_encoder(self): return ModifiedGapEncoderColumn( ngram_range=self.ngram_range, @@ -219,9 +217,9 @@ def _create_column_gap_encoder(self): repeat=2, ) def benchmark( - high_card_feature: str, - max_rows: int, - modif: bool, + high_card_feature, + max_rows, + modif, ): ds = fetch_traffic_violations() X = np.array(ds.X[high_card_feature]).reshape(-1, 1).astype(str) @@ -277,7 +275,7 @@ def benchmark( return res_dic -def plot(df: pd.DataFrame): +def plot(df): sns.lineplot( x="train_size", y="time_fit", data=df, hue="high_card_feature", style="modif" ) diff --git a/benchmarks/bench_minhash_batch_number.py b/benchmarks/bench_minhash_batch_number.py index 8cf74a292..4a26d36a3 100644 --- a/benchmarks/bench_minhash_batch_number.py +++ b/benchmarks/bench_minhash_batch_number.py @@ -9,9 +9,7 @@ import pickle from argparse import ArgumentParser -from collections.abc import Callable, Collection from pathlib import Path -from typing import Literal import matplotlib.pyplot as plt import numpy as np @@ -106,20 +104,18 @@ class MinHashEncoder(BaseEstimator, TransformerMixin): """ - hash_dict_: LRUDict - - _capacity: int = 2**10 + _capacity = 2**10 def __init__( self, - n_components: int = 30, - ngram_range: tuple[int, int] = (2, 4), - hashing: Literal["fast", "murmur"] = "fast", - minmax_hash: bool = False, - handle_missing: Literal["error", "zero_impute"] = "zero_impute", - batch: bool = False, - batch_per_job: int = 1, - n_jobs: int = None, + n_components=30, + ngram_range=(2, 4), + hashing="fast", + minmax_hash=False, + handle_missing="zero_impute", + batch=False, + batch_per_job=1, + n_jobs=None, ): self.ngram_range = ngram_range self.n_components = n_components @@ -130,13 +126,13 @@ def __init__( self.batch_per_job = batch_per_job self.n_jobs = n_jobs - def _more_tags(self) -> dict[str, list[str]]: + def _more_tags(self): """ Used internally by sklearn to ease the estimator checks. 
""" return {"X_types": ["categorical"]} - def _get_murmur_hash(self, string: str) -> np.array: + def _get_murmur_hash(self, string): """ Encode a string using murmur hashing function. @@ -164,7 +160,7 @@ def _get_murmur_hash(self, string: str) -> np.array: min_hashes = np.minimum(min_hashes, hash_array) return min_hashes / (2**32 - 1) - def _get_fast_hash(self, string: str) -> np.array: + def _get_fast_hash(self, string): """ Encode a string with fast hashing function. fast hashing supports both min_hash and minmax_hash encoding. @@ -194,9 +190,7 @@ def _get_fast_hash(self, string: str) -> np.array: ] ) - def _compute_hash( - self, string: str, hash_func: Callable[[str], np.ndarray] - ) -> np.ndarray: + def _compute_hash(self, string, hash_func): """Function called to compute the hash of a string. Check if the string is in the hash dictionary, if not, scompute the hash using @@ -221,9 +215,7 @@ def _compute_hash( self.hash_dict_[string] = hash_func(string) return self.hash_dict_[string] - def _compute_hash_batched( - self, batch: Collection[str], hash_func: Callable[[str], np.ndarray] - ): + def _compute_hash_batched(self, batch, hash_func): """Function called to compute the hashes of a batch of strings. Check if the string is in the hash dictionary, if not, compute the hash using @@ -251,7 +243,7 @@ def _compute_hash_batched( res[i] = self.hash_dict_[string] return res - def fit(self, X, y=None) -> "MinHashEncoder": + def fit(self, X, y=None): """ Fit the MinHashEncoder to X. In practice, just initializes a dictionary to store encodings to speed up computation. @@ -281,7 +273,7 @@ def fit(self, X, y=None) -> "MinHashEncoder": self.hash_dict_ = LRUDict(capacity=self._capacity) return self - def transform(self, X) -> np.array: + def transform(self, X): """ Transform X using specified encoding scheme. 
@@ -388,18 +380,18 @@ def transform(self, X) -> np.array: repeat=10, ) def benchmark( - dataset_size: str, - batched: bool, - n_jobs: int, - batch_per_job: int, -) -> None: + dataset_size, + batched, + n_jobs, + batch_per_job, +): X = data[dataset_size] MinHashEncoder(batch=batched, n_jobs=n_jobs, batch_per_job=batch_per_job).fit( X ).transform(X) -def plot(df: pd.DataFrame): +def plot(df): sns.set_theme(style="ticks", palette="pastel") # Create a new columns merging batched and batch_per_job diff --git a/benchmarks/bench_tablevectorizer_tuning.py b/benchmarks/bench_tablevectorizer_tuning.py index 74ddf3ca7..ceadadf3c 100644 --- a/benchmarks/bench_tablevectorizer_tuning.py +++ b/benchmarks/bench_tablevectorizer_tuning.py @@ -53,9 +53,9 @@ repeat=3, ) def benchmark( - tv_cardinality_threshold: int, - minhash_n_components: int, - dataset_name: str, + tv_cardinality_threshold, + minhash_n_components, + dataset_name, ): tv = TableVectorizer( cardinality_threshold=tv_cardinality_threshold, @@ -84,7 +84,7 @@ def benchmark( } -def plot(df: pd.DataFrame): +def plot(df): sns.set_theme(style="ticks", palette="pastel") n_datasets = len(np.unique(df["dataset_name"])) diff --git a/benchmarks/utils/_various.py b/benchmarks/utils/_various.py index b6f50dc6c..6a592055a 100644 --- a/benchmarks/utils/_various.py +++ b/benchmarks/utils/_various.py @@ -3,7 +3,6 @@ import pandas as pd from skrub.datasets import ( - DatasetAll, fetch_drug_directory, fetch_employee_salaries, fetch_medical_charge, @@ -14,11 +13,11 @@ ) -def find_result(bench_name: str) -> Path: +def find_result(bench_name): return choose_file(find_results(bench_name)) -def find_results(bench_name: str) -> list[Path]: +def find_results(bench_name): """ Returns the list of results in the results' directory. """ @@ -30,7 +29,7 @@ def find_results(bench_name: str) -> list[Path]: ] -def choose_file(results: list[Path]) -> Path: +def choose_file(results): """ Given a list of files, chooses one based on these rules: - If there are no files to choose from, exit the program @@ -66,7 +65,7 @@ def choose_file(results: list[Path]) -> Path: return results[int(choice) - 1] -def get_classification_datasets() -> dict[str, DatasetAll]: +def get_classification_datasets(): return { "open_payments": fetch_open_payments(), "drug_directory": fetch_drug_directory(), @@ -76,7 +75,7 @@ def get_classification_datasets() -> dict[str, DatasetAll]: } -def get_regression_datasets() -> dict[str, DatasetAll]: +def get_regression_datasets(): return { "medical_charge": fetch_medical_charge(), "employee_salaries": fetch_employee_salaries(), diff --git a/benchmarks/utils/join.py b/benchmarks/utils/join.py index 54273ba80..04397711d 100644 --- a/benchmarks/utils/join.py +++ b/benchmarks/utils/join.py @@ -1,14 +1,12 @@ -from pathlib import Path - import pandas as pd from skrub.datasets._utils import get_data_dir def get_local_data( - dataset_name: str, - data_home: Path | str | None = None, - data_directory: str | None = None, + dataset_name, + data_home=None, + data_directory=None, ): """Get the path to the local datasets.""" data_directory = get_data_dir(data_directory, data_home) @@ -26,11 +24,11 @@ def get_local_data( def fetch_data( - dataset_name: str, - save: bool = True, - data_home: Path | str | None = None, - data_directory: str | None = None, -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + dataset_name, + save=True, + data_home=None, + data_directory=None, +): """Fetch datasets from [1]_. 
Parameters @@ -88,12 +86,12 @@ def fetch_data( def fetch_big_data( - dataset_name: str, - data_type: str = "Dirty", - save: bool = True, - data_home: Path | str | None = None, - data_directory: str | None = None, -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + dataset_name, + data_type="Dirty", + save=True, + data_home=None, + data_directory=None, +): """Fetch datasets from [1]_. Parameters diff --git a/benchmarks/utils/monitor.py b/benchmarks/utils/monitor.py index f40d736ec..5afb3e68b 100644 --- a/benchmarks/utils/monitor.py +++ b/benchmarks/utils/monitor.py @@ -1,7 +1,7 @@ import os import tracemalloc from collections import defaultdict -from collections.abc import Callable, Collection, Mapping +from collections.abc import Mapping from datetime import datetime from itertools import product from pathlib import Path @@ -9,7 +9,6 @@ from string import ascii_letters, digits from time import perf_counter from time import time as get_time -from typing import Any from warnings import warn import pandas as pd @@ -18,13 +17,13 @@ def monitor( *, - parametrize: Collection[Mapping[Any]] | Mapping[str, Collection[Any]] | None = None, - save_as: str | None = None, - memory: bool = True, - time: bool = True, - repeat: int = 1, - hot_load: str | None = None, -) -> Callable[..., Callable[..., pd.DataFrame]]: + parametrize=None, + save_as=None, + memory=True, + time=True, + repeat=1, + hot_load=None, +): """Decorator used to monitor the execution of a function. The decorated function should return either: @@ -128,9 +127,7 @@ def monitor( reserved_column_names = {"iter", "time", "memory"} - def decorator( - func: Callable[..., Mapping[str, Any] | list[Mapping[str, Any]] | None] - ): + def decorator(func): """ Catches the decorated function. @@ -140,7 +137,7 @@ def decorator( The decorated function callable object. """ - def wrapper(*call_args, **call_kwargs) -> pd.DataFrame: + def wrapper(*call_args, **call_kwargs): """ Catches the decorated function's call arguments. @@ -162,7 +159,7 @@ def wrapper(*call_args, **call_kwargs) -> pd.DataFrame: f"positional values: {call_args!r}" ) - def get_random_file_name() -> str: + def get_random_file_name(): """ Returns a random file name, used by hot-loading. Format is ``{time}-{random_string}.parquet``. @@ -171,7 +168,7 @@ def get_random_file_name() -> str: time = int(get_time()) return f"{time}-{name}.parquet" - def load_intermediate_results(file_name: str) -> pd.DataFrame: + def load_intermediate_results(file_name): """ Loads the results from the file passed. If the file is not found, and to avoid unexpected behavior, @@ -184,12 +181,12 @@ def load_intermediate_results(file_name: str) -> pd.DataFrame: return pd.read_parquet(file_name) - def product_map(iterables: Mapping[str, Any]): + def product_map(iterables): """``itertools.product`` with mapping support.""" for combination in product(*iterables.values()): yield dict(zip(iterables.keys(), combination)) - def exec_func(**kwargs) -> pd.DataFrame: + def exec_func(**kwargs): """ Wraps the decorated function call with a single set of parameters, and pre-process the returned values. 
@@ -274,7 +271,6 @@ def exec_func(**kwargs) -> pd.DataFrame:
 
                 return df_results
 
-            parametrization: list[Mapping]
             if parametrize is None:
                 # Use the parameters passed by the call
                 parametrization = [call_kwargs]
diff --git a/setup.cfg b/setup.cfg
index e6fbcf4e6..23a16092c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -14,6 +14,7 @@ classifiers =
     Intended Audience :: Science/Research
     License :: OSI Approved :: BSD License
     Operating System :: OS Independent
+    Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
     Programming Language :: Python :: 3.11
     Programming Language :: Python :: 3.12
@@ -32,7 +33,7 @@ install_requires =
     scipy>=1.9.3
     pandas>=1.5.3
     packaging>=23.1
-python_requires = >=3.10
+python_requires = >=3.9
 
 [options.packages.find]
 include = skrub*
@@ -79,7 +80,7 @@ benchmarks =
     loguru
 # Overwrite the previous install_requires for CI testing purposes
 # as defined in testing.yml.
-min-py310 =
+min-py39 =
     scikit-learn==1.2.1
     numpy==1.23.5
     scipy==1.9.3
diff --git a/skrub/_agg_joiner.py b/skrub/_agg_joiner.py
index 159374a46..7044aa571 100644
--- a/skrub/_agg_joiner.py
+++ b/skrub/_agg_joiner.py
@@ -5,7 +5,6 @@
 Both classes aggregate the auxiliary table first, then join this grouped
 table with the main table.
 """
-from typing import Iterable
 
 import numpy as np
 from sklearn.base import BaseEstimator, TransformerMixin
@@ -22,7 +21,7 @@
 ALL_OPS = NUM_OPERATIONS + CATEG_OPERATIONS
 
 
-def split_num_categ_operations(operations: list[str]) -> tuple[list[str], list[str]]:
+def split_num_categ_operations(operations):
    """Separate aggregator operators input by their type.
 
    Parameters
@@ -373,9 +372,9 @@ class AggTarget(TransformerMixin, BaseEstimator):
 
     def __init__(
         self,
-        main_key: str | Iterable[str],
-        operation: str | Iterable[str] | None = None,
-        suffix: str | None = None,
+        main_key,
+        operation=None,
+        suffix=None,
     ):
         self.main_key = main_key
         self.operation = operation
diff --git a/skrub/_deduplicate.py b/skrub/_deduplicate.py
index 11a56d84b..ce6426bd4 100644
--- a/skrub/_deduplicate.py
+++ b/skrub/_deduplicate.py
@@ -2,12 +2,10 @@
 Implements deduplication based on clustering string distance matrices.
 """
 
-from collections.abc import Sequence
 
 import numpy as np
 import pandas as pd
 from joblib import Parallel, delayed
-from numpy.typing import NDArray
 from scipy.cluster.hierarchy import fcluster, linkage
 from scipy.spatial.distance import pdist, squareform
 from sklearn.feature_extraction.text import TfidfVectorizer
@@ -15,10 +13,10 @@
 
 
 def compute_ngram_distance(
-    unique_words: Sequence[str] | NDArray,
-    ngram_range: tuple[int, int] = (2, 4),
-    analyzer: str = "char_wb",
-) -> NDArray:
+    unique_words,
+    ngram_range=(2, 4),
+    analyzer="char_wb",
+):
    """Compute the condensed pair-wise n-gram distance between `unique_words`.
 
    Parameters
@@ -52,15 +50,13 @@ def compute_ngram_distance(
     return distance_mat
 
 
-def _get_silhouette_avg(Z: NDArray, n_clust: int, redundant_dist: NDArray) -> float:
+def _get_silhouette_avg(Z, n_clust, redundant_dist):
     labels = fcluster(Z, n_clust, criterion="maxclust")
     silhouette_avg = silhouette_score(redundant_dist, labels, metric="precomputed")
     return silhouette_avg
 
 
-def _guess_clusters(
-    Z: NDArray, distance_mat: NDArray, n_jobs: int | None = None
-) -> int:
+def _guess_clusters(Z, distance_mat, n_jobs=None):
    """Finds the number of clusters that maximize the silhouette score when
    clustering `distance_mat`.
@@ -88,10 +84,10 @@ def _guess_clusters( def _create_spelling_correction( - unique_words: Sequence[str] | NDArray[np.str_], - counts: Sequence[int] | NDArray[np.int_], - clusters: Sequence[int], -) -> pd.Series: + unique_words, + counts, + clusters, +): """ Creates a pandas Series that map each cluster member to the most frequent cluster member. The assumption is that the most common spelling @@ -115,8 +111,8 @@ def _create_spelling_correction( corrected spelling of each word as values. """ count_series = pd.Series(counts, index=unique_words) - original_spelling: list[str] = [] - corrected_spelling: list[str] = [] + original_spelling = [] + corrected_spelling = [] for cluster in np.unique(clusters): sorted_spellings = ( count_series.loc[clusters == cluster] diff --git a/skrub/_fast_hash.py b/skrub/_fast_hash.py index 5e07edf0f..96be294ec 100644 --- a/skrub/_fast_hash.py +++ b/skrub/_fast_hash.py @@ -48,11 +48,11 @@ def gen_atom(atom_len, seed=0): def ngram_min_hash( - string: str, - ngram_range: tuple[int, int] = (2, 4), - seed: int = 0, + string, + ngram_range=(2, 4), + seed=0, return_minmax=False, -) -> int | tuple[int, int]: +): """ Compute the min/max hash of the ngrams of the string. diff --git a/skrub/_gap_encoder.py b/skrub/_gap_encoder.py index 76b964454..e3d0059db 100644 --- a/skrub/_gap_encoder.py +++ b/skrub/_gap_encoder.py @@ -3,15 +3,11 @@ """ from __future__ import annotations -from collections.abc import Generator from copy import deepcopy -from typing import Literal import numpy as np import pandas as pd import scipy.sparse as sp -from numpy.random import RandomState -from numpy.typing import ArrayLike, NDArray from scipy import sparse from sklearn.base import TransformerMixin from sklearn.cluster import KMeans, kmeans_plusplus @@ -186,30 +182,27 @@ class GapEncoder(TransformerMixin, SingleColumnTransformer): The higher the value, the bigger the correspondence with the topic. """ - rho_: float - H_dict_: dict[NDArray, NDArray] - def __init__( self, - n_components: int = 10, - batch_size: int = 1024, - gamma_shape_prior: float = 1.1, - gamma_scale_prior: float = 1.0, - rho: float = 0.95, - rescale_rho: bool = False, - hashing: bool = False, - hashing_n_features: int = 2**12, - init: Literal["k-means++", "random", "k-means"] = "k-means++", - max_iter: int = 5, - ngram_range: tuple[int, int] = (2, 4), - analyzer: Literal["word", "char", "char_wb"] = "char", - add_words: bool = False, - random_state: int | RandomState | None = None, - rescale_W: bool = True, - max_iter_e_step: int = 1, - max_no_improvement: int = 5, + n_components=10, + batch_size=1024, + gamma_shape_prior=1.1, + gamma_scale_prior=1.0, + rho=0.95, + rescale_rho=False, + hashing=False, + hashing_n_features=2**12, + init="k-means++", + max_iter=5, + ngram_range=(2, 4), + analyzer="char", + add_words=False, + random_state=None, + rescale_W=True, + max_iter_e_step=1, + max_no_improvement=5, handle_missing="zero_impute", - verbose: int = 0, + verbose=0, ): self.ngram_range = ngram_range self.n_components = n_components @@ -231,7 +224,7 @@ def __init__( self.handle_missing = handle_missing self.verbose = verbose - def _init_vars(self, X) -> tuple[NDArray, NDArray, NDArray]: + def _init_vars(self, X): """ Build the bag-of-n-grams representation `V` of `X` and initialize the topics `W`. 
@@ -286,7 +279,7 @@ def _init_vars(self, X) -> tuple[NDArray, NDArray, NDArray]: self.rho_ = self.rho ** (self.batch_size / len(X)) return unq_X, unq_V, lookup - def _get_H(self, X: NDArray) -> NDArray: + def _get_H(self, X): """ Return the bag-of-n-grams representation of `X`. """ @@ -295,7 +288,7 @@ def _get_H(self, X: NDArray) -> NDArray: h_out[:] = self.H_dict_[x] return H_out - def _init_w(self, V: NDArray, X) -> tuple[NDArray, NDArray, NDArray]: + def _init_w(self, V, X): """ Initialize the topics `W`. If `self.init='k-means++'`, we use the init method of @@ -351,11 +344,11 @@ def _init_w(self, V: NDArray, X) -> tuple[NDArray, NDArray, NDArray]: def _minibatch_convergence( self, - batch_size: int, - batch_cost: float, - n_samples: int, - step: int, - n_steps: int, + batch_size, + batch_cost, + n_samples, + step, + n_steps, ): """ Helper function to encapsulate the early stopping logic. @@ -517,9 +510,9 @@ def fit(self, X, y=None): def get_feature_names_out( self, - n_labels: int = 3, - prefix: str = "", - ) -> list[str]: + n_labels=3, + prefix="", + ): """ Return the labels that best summarize the learned components/topics. @@ -692,7 +685,7 @@ def partial_fit(self, X, y=None): self.H_dict_.update(zip(unq_X, unq_H)) return self - def _add_unseen_keys_to_H_dict(self, X) -> None: + def _add_unseen_keys_to_H_dict(self, X): """ Add activations of unseen string categories from `X` to `H_dict`. """ @@ -792,7 +785,7 @@ def _handle_missing(self, X): return X -def _rescale_W(W: NDArray, A: NDArray) -> None: +def _rescale_W(W, A): """ Rescale the topics `W` to have a L1-norm equal to 1. Note that they are modified in-place. @@ -826,14 +819,14 @@ def _special_sparse_dot(H, W, X): def _multiplicative_update_w( - Vt: NDArray, - W: NDArray, - A: NDArray, - B: NDArray, - Ht: NDArray, - rescale_W: bool, - rho: float, -) -> tuple[NDArray, NDArray, NDArray]: + Vt, + W, + A, + B, + Ht, + rescale_W, + rho, +): """ Multiplicative update step for the topics `W`. """ @@ -852,7 +845,7 @@ def _multiplicative_update_w( return W, A, B -def _rescale_h(V: NDArray, H: NDArray) -> NDArray: +def _rescale_h(V, H): """ Rescale the activations `H`. """ @@ -863,14 +856,14 @@ def _rescale_h(V: NDArray, H: NDArray) -> NDArray: def _multiplicative_update_h( - Vt: NDArray, - W: NDArray, - Ht: NDArray, - epsilon: float = 1e-3, - max_iter: int = 10, - rescale_W: bool = False, - gamma_shape_prior: float = 1.1, - gamma_scale_prior: float = 1.0, + Vt, + W, + Ht, + epsilon=1e-3, + max_iter=10, + rescale_W=False, + gamma_shape_prior=1.1, + gamma_scale_prior=1.0, ): """ Multiplicative update step for the activations `H`. @@ -900,9 +893,9 @@ def _multiplicative_update_h( def batch_lookup( - lookup: NDArray, - n: int = 1, -) -> Generator[tuple[NDArray, NDArray], None, None]: + lookup, + n=1, +): """ Make batches of the lookup array. 
""" @@ -914,15 +907,15 @@ def batch_lookup( def get_kmeans_prototypes( - X: ArrayLike, - n_prototypes: int, - analyzer: Literal["word", "char", "char_wb"] = "char", - hashing_dim: int = 128, - ngram_range: tuple[int, int] = (2, 4), - sparse: bool = False, + X, + n_prototypes, + analyzer="char", + hashing_dim=128, + ngram_range=(2, 4), + sparse=False, sample_weight=None, - random_state: int | RandomState | None = None, -) -> NDArray: + random_state=None, +): """ Computes prototypes based on: - dimensionality reduction (via hashing n-grams) diff --git a/skrub/_minhash_encoder.py b/skrub/_minhash_encoder.py index c849cc4d1..53676a5f5 100644 --- a/skrub/_minhash_encoder.py +++ b/skrub/_minhash_encoder.py @@ -4,12 +4,8 @@ """ from __future__ import annotations -from collections.abc import Callable, Collection -from typing import Literal - import numpy as np from joblib import Parallel, delayed, effective_n_jobs -from numpy.typing import NDArray from sklearn.base import TransformerMixin from sklearn.utils import gen_even_slices, murmurhash3_32 from sklearn.utils.validation import check_is_fitted @@ -115,19 +111,17 @@ class MinHashEncoder(TransformerMixin, SingleColumnTransformer): 3 -1.975829e+09 -2.095000e+09 -1.530721e+09 -1.459183e+09 -1.580988e+09 """ - hash_dict_: LRUDict - - _capacity: int = 2**10 + _capacity = 2**10 def __init__( self, *, - n_components: int = 30, - ngram_range: tuple[int, int] = (2, 4), - hashing: Literal["fast", "murmur"] = "fast", - minmax_hash: bool = False, - handle_missing: Literal["error", "zero_impute"] = "zero_impute", - n_jobs: int = None, + n_components=30, + ngram_range=(2, 4), + hashing="fast", + minmax_hash=False, + handle_missing="zero_impute", + n_jobs=None, ): self.ngram_range = ngram_range self.n_components = n_components @@ -136,7 +130,7 @@ def __init__( self.handle_missing = handle_missing self.n_jobs = n_jobs - def _get_murmur_hash(self, string: str) -> NDArray: + def _get_murmur_hash(self, string): """ Encode a string using murmur hashing function. @@ -164,7 +158,7 @@ def _get_murmur_hash(self, string: str) -> NDArray: min_hashes = np.minimum(min_hashes, hash_array) return min_hashes / (2**32 - 1) - def _get_fast_hash(self, string: str) -> NDArray: + def _get_fast_hash(self, string): """Encode a string with fast hashing function. Fast hashing supports both min_hash and minmax_hash encoding. @@ -194,9 +188,7 @@ def _get_fast_hash(self, string: str) -> NDArray: ] ) - def _compute_hash_batched( - self, batch: Collection[str], hash_func: Callable[[str], NDArray] - ) -> NDArray: + def _compute_hash_batched(self, batch, hash_func): """Function called to compute the hashes of a batch of strings. Check if the string is in the hash dictionary, if not, compute the hash @@ -224,7 +216,7 @@ def _compute_hash_batched( res[i] = self.hash_dict_[string] return res - def fit(self, X, y=None) -> "MinHashEncoder": + def fit(self, X, y=None): """Fit the MinHashEncoder to `X`. In practice, just initializes a dictionary diff --git a/skrub/_similarity_encoder.py b/skrub/_similarity_encoder.py index a183531f7..86a7fa1fd 100644 --- a/skrub/_similarity_encoder.py +++ b/skrub/_similarity_encoder.py @@ -3,13 +3,11 @@ which encodes similarity instead of equality of values. 
""" -from typing import Literal import numpy as np import pandas as pd import sklearn from joblib import Parallel, delayed -from numpy.typing import ArrayLike, NDArray from scipy import sparse from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer from sklearn.preprocessing import OneHotEncoder @@ -23,15 +21,15 @@ def _ngram_similarity_one_sample_inplace( - x_count_vector: NDArray, - vocabulary_count_matrix: NDArray, - str_x: str, - vocabulary_ngram_counts: NDArray, - se_dict: dict, - unq_X: NDArray, - i: int, - ngram_range: tuple[int, int], -) -> None: + x_count_vector, + vocabulary_count_matrix, + str_x, + vocabulary_ngram_counts, + se_dict, + unq_X, + i, + ngram_range, +): """ Update inplace a dict of similarities between a string and a vocabulary @@ -74,12 +72,12 @@ def _ngram_similarity_one_sample_inplace( def ngram_similarity_matrix( X, - cats: list[str], - ngram_range: tuple[int, int], - analyzer: Literal["word", "char", "char_wb"], - hashing_dim: int, - dtype: type = np.float64, -) -> NDArray: + cats, + ngram_range, + analyzer, + hashing_dim, + dtype=np.float64, +): """ Similarity encoding for dirty categorical variables: Given two arrays of strings, returns the similarity encoding matrix @@ -261,25 +259,17 @@ class SimilarityEncoder(OneHotEncoder): dtype=object) """ - categories_: list[NDArray] - n_features_in_: int - drop_idx_: NDArray - vectorizers_: list[CountVectorizer] - vocabulary_count_matrices_: list[NDArray] - vocabulary_ngram_counts_: list[list[int]] - _infrequent_enabled: bool - def __init__( self, *, - ngram_range: tuple[int, int] = (2, 4), - analyzer: Literal["word", "char", "char_wb"] = "char", - categories: Literal["auto"] | list[list[str]] = "auto", - dtype: type = np.float64, - handle_unknown: Literal["error", "ignore"] = "ignore", - handle_missing: Literal["error", ""] = "", - hashing_dim: int | None = None, - n_jobs: int | None = None, + ngram_range=(2, 4), + analyzer="char", + categories="auto", + dtype=np.float64, + handle_unknown="ignore", + handle_missing="", + hashing_dim=None, + n_jobs=None, ): super().__init__() self.categories = categories @@ -298,7 +288,7 @@ def __init__( "'auto' or a list of prototypes. " ) - def fit(self, X: ArrayLike, y=None) -> "SimilarityEncoder": + def fit(self, X, y=None): """Fit the instance to `X`. Parameters @@ -422,7 +412,7 @@ def fit(self, X: ArrayLike, y=None) -> "SimilarityEncoder": self._n_features_outs = list(map(len, self.categories_)) return self - def transform(self, X: ArrayLike, fast: bool = True) -> NDArray: + def transform(self, X, fast=True): """Transform `X` using specified encoding scheme. Parameters @@ -498,9 +488,9 @@ def transform(self, X: ArrayLike, fast: bool = True) -> NDArray: def _ngram_similarity_fast( self, - X: list | NDArray, - col_idx: int, - ) -> NDArray: + X, + col_idx, + ): """ Fast computation of ngram similarity. diff --git a/skrub/_string_distances.py b/skrub/_string_distances.py index eecedd689..cf454cde0 100644 --- a/skrub/_string_distances.py +++ b/skrub/_string_distances.py @@ -8,7 +8,7 @@ # TODO vectorize these functions (accept arrays) -def get_ngram_count(string: str, ngram_range: tuple[int, int]) -> int: +def get_ngram_count(string, ngram_range): """ Compute the number of ngrams in a string. @@ -30,7 +30,7 @@ def get_ngram_count(string: str, ngram_range: tuple[int, int]) -> int: return ngram_count -def preprocess(x: str) -> str: +def preprocess(x): """ Combine preprocessing done by CountVectorizer and the SimilarityEncoder. 
@@ -56,7 +56,7 @@ def preprocess(x: str) -> str:
     return _white_spaces.sub(" ", x)
 
 
-def get_unique_ngrams(string: str, ngram_range: tuple[int, int]):
+def get_unique_ngrams(string, ngram_range):
    """
    Return the set of unique n-grams of a string.
 
@@ -83,7 +83,7 @@ def get_unique_ngrams(string: str, ngram_range: tuple[int, int]):
     return ngram_set
 
 
-def get_ngrams(string: str, n: int) -> list[tuple]:
+def get_ngrams(string, n):
    """Return the set of different n-grams in a string"""
    # Pure Python implementation: no numpy
    spaces = " "  # * (n // 2 + n % 2)
diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
index bec0532ef..06931b6c6 100644
--- a/skrub/_table_vectorizer.py
+++ b/skrub/_table_vectorizer.py
@@ -514,7 +514,7 @@ def _check_specific_columns(self):
                     f"Column {c!r} used twice in 'specific_transformers', "
                     f"at indices {specific_columns[c]} and {i}."
                 )
-            specific_columns |= {c: i for c in cols}
+            specific_columns.update({c: i for c in cols})
         self._specific_columns = list(specific_columns.keys())
 
     def _make_pipeline(self):
@@ -620,7 +620,7 @@ def _store_output_to_input(self):
 
     # scikit-learn compatibility
 
-    def _more_tags(self) -> dict:
+    def _more_tags(self):
        """
        Used internally by sklearn to ease the estimator checks.
        """
diff --git a/skrub/_utils.py b/skrub/_utils.py
index 09e91697d..5ba0554d5 100644
--- a/skrub/_utils.py
+++ b/skrub/_utils.py
@@ -1,11 +1,9 @@
 import collections
 import importlib
 import secrets
-from collections.abc import Hashable
-from typing import Any, Iterable
+from typing import Iterable
 
 import numpy as np
-from numpy.typing import NDArray
 from sklearn.base import clone
 from sklearn.utils import check_array
 
@@ -18,11 +16,11 @@ class LRUDict:
     Using LRU eviction avoids memorizing a full dataset.
     """
 
-    def __init__(self, capacity: int):
+    def __init__(self, capacity):
         self.capacity = capacity
         self.cache = collections.OrderedDict()
 
-    def __getitem__(self, key: Hashable):
+    def __getitem__(self, key):
         try:
             value = self.cache.pop(key)
             self.cache[key] = value
@@ -30,7 +28,7 @@ def __getitem__(self, key: Hashable):
         except KeyError:
             return -1
 
-    def __setitem__(self, key: Hashable, value: Any):
+    def __setitem__(self, key, value):
         try:
             self.cache.pop(key)
         except KeyError:
@@ -38,11 +36,11 @@ def __setitem__(self, key: Hashable, value: Any):
                 self.cache.popitem(last=False)
         self.cache[key] = value
 
-    def __contains__(self, key: Hashable):
+    def __contains__(self, key):
         return key in self.cache
 
 
-def check_input(X) -> NDArray:
+def check_input(X):
    """Check input with sklearn standards.
 
    Also converts X to a numpy array if not already.
@@ -69,7 +67,7 @@ def check_input(X) -> NDArray:
     return X_
 
 
-def import_optional_dependency(name: str, extra: str = ""):
+def import_optional_dependency(name, extra=""):
    """Import an optional dependency.
 
    By default, if a dependency is missing an ImportError with a nice
diff --git a/skrub/datasets/_generating.py b/skrub/datasets/_generating.py
index 8514c732d..0f1aefc9c 100644
--- a/skrub/datasets/_generating.py
+++ b/skrub/datasets/_generating.py
@@ -11,11 +11,11 @@
 
 
 def make_deduplication_data(
-    examples: list[str],
-    entries_per_example: list[int],
-    prob_mistake_per_letter: float = 0.2,
-    random_state: int | np.random.RandomState | None = None,
-) -> list[str]:
+    examples,
+    entries_per_example,
+    prob_mistake_per_letter=0.2,
+    random_state=None,
+):
    """Duplicates examples with spelling mistakes.
 
    Characters are misspelled with probability `prob_mistake_per_letter`.
diff --git a/skrub/datasets/_ken_embeddings.py b/skrub/datasets/_ken_embeddings.py index 0a447edb0..4d6f174c9 100644 --- a/skrub/datasets/_ken_embeddings.py +++ b/skrub/datasets/_ken_embeddings.py @@ -17,7 +17,7 @@ ) -def fetch_ken_table_aliases() -> set[str]: +def fetch_ken_table_aliases(): """Get the supported aliases of embedded KEN entities tables. These aliases can be using in subsequent functions (see section *See Also*). @@ -50,11 +50,11 @@ def fetch_ken_table_aliases() -> set[str]: def fetch_ken_types( - search: str = None, + search=None, *, - exclude: str | None = None, - embedding_table_id: str = "all_entities", -) -> pd.DataFrame: + exclude=None, + embedding_table_id="all_entities", +): """Helper function to search for KEN entity types. The result can then be used with fetch_ken_embeddings. @@ -136,14 +136,14 @@ def fetch_ken_types( def fetch_ken_embeddings( - search_types: str | None = None, + search_types=None, *, - exclude: str | None = None, - embedding_table_id: str = "all_entities", - embedding_type_id: str | None = None, - pca_components: int | None = None, - suffix: str = "", -) -> pd.DataFrame: + exclude=None, + embedding_table_id="all_entities", + embedding_type_id=None, + pca_components=None, + suffix="", +): """Download Wikipedia embeddings by type. More details on the embeddings can be found on diff --git a/skrub/datasets/_utils.py b/skrub/datasets/_utils.py index 7b9a57ab3..aba51f5e6 100644 --- a/skrub/datasets/_utils.py +++ b/skrub/datasets/_utils.py @@ -1,7 +1,7 @@ from pathlib import Path -def get_data_home(data_home: Path | str | None = None) -> Path: +def get_data_home(data_home=None): """Returns the path of the skrub data directory. This folder is used by some large dataset loaders to avoid downloading the @@ -35,7 +35,7 @@ def get_data_home(data_home: Path | str | None = None) -> Path: return data_home -def get_data_dir(name: str | None = None, data_home: Path | str | None = None) -> Path: +def get_data_dir(name=None, data_home=None): """ Returns the directory in which skrub looks for data. diff --git a/skrub/datasets/tests/test_fetching.py b/skrub/datasets/tests/test_fetching.py index 61c79dab6..523622276 100644 --- a/skrub/datasets/tests/test_fetching.py +++ b/skrub/datasets/tests/test_fetching.py @@ -9,7 +9,7 @@ from skrub.datasets import _fetching -def _has_data_id(call, data_id: int) -> bool: +def _has_data_id(call, data_id): # Unpacking copied from `mock._Call.__eq__` if len(call) == 2: args, kwargs = call @@ -22,7 +22,7 @@ def _has_data_id(call, data_id: int) -> bool: "skrub.datasets._fetching.fetch_openml", side_effect=_fetching.fetch_openml, ) -def test_openml_fetching(fetch_openml_mock: mock.Mock): +def test_openml_fetching(fetch_openml_mock): """ Downloads a small dataset (midwest survey) and performs a bunch of tests that asserts the fetching function works correctly. @@ -79,7 +79,7 @@ def test_openml_datasets_exist(): @mock.patch("skrub.datasets._fetching.fetch_openml") -def test_openml_datasets_calls(fetch_openml_mock: mock.Mock): +def test_openml_datasets_calls(fetch_openml_mock): """ Checks that calling the fetching functions actually calls `sklearn.datasets.fetch_openml`. 
diff --git a/skrub/tests/test_check_input.py b/skrub/tests/test_check_input.py index 6b802b33f..764dbbe52 100644 --- a/skrub/tests/test_check_input.py +++ b/skrub/tests/test_check_input.py @@ -53,11 +53,9 @@ def test_column_names_to_unique_strings(): df = pd.DataFrame(np.ones((2, 4)), columns=["a", 0, "0", "a"]) assert df.columns.tolist() == ["a", 0, "0", "a"] check = CheckInputDataFrame() - with ( - pytest.warns(UserWarning, match="Some column names are not strings"), - pytest.warns(UserWarning, match="Found duplicated column names"), - ): - out = check.fit_transform(df) + with pytest.warns(UserWarning, match="Some column names are not strings"): + with pytest.warns(UserWarning, match="Found duplicated column names"): + out = check.fit_transform(df) assert out.shape == (2, 4) out_cols = out.columns.tolist() assert out_cols[:2] == ["a", "0"] diff --git a/skrub/tests/test_deduplicate.py b/skrub/tests/test_deduplicate.py index 9e3323de4..16042989f 100644 --- a/skrub/tests/test_deduplicate.py +++ b/skrub/tests/test_deduplicate.py @@ -1,5 +1,3 @@ -from functools import cache - import joblib import numpy as np import pandas as pd @@ -22,9 +20,9 @@ [([500, 100, 1500], 0.05), ([100, 100], 0.02), ([200, 50, 30, 200, 800], 0.01)], ) def test_deduplicate( - entries_per_category: list[int], - prob_mistake_per_letter: float, - seed: int = 123, + entries_per_category, + prob_mistake_per_letter, + seed=123, ): rng = np.random.RandomState(seed) @@ -78,7 +76,7 @@ def test__guess_clusters(): assert n_clusters == len(np.unique(words)) -def test__create_spelling_correction(seed: int = 123): +def test__create_spelling_correction(seed=123): rng = np.random.RandomState(seed) n_clusters = 3 samples_per_cluster = 10 @@ -101,8 +99,7 @@ def test__create_spelling_correction(seed: int = 123): ).all() -@cache -def default_deduplicate(n: int = 500, random_state=0): +def default_deduplicate(n=500, random_state=0): """ Create a default deduplication dataset. """ diff --git a/skrub/tests/test_docstrings.py b/skrub/tests/test_docstrings.py index 22c594a07..7ee463b1e 100644 --- a/skrub/tests/test_docstrings.py +++ b/skrub/tests/test_docstrings.py @@ -10,7 +10,6 @@ import inspect import re -from collections.abc import Callable from importlib import import_module import pytest @@ -68,10 +67,10 @@ def get_functions_to_validate(): def repr_errors( - res: dict, - estimator: type | None = None, - method: str | None = None, -) -> str: + res, + estimator=None, + method=None, +): """ Pretty print original docstring and the obtained errors @@ -126,7 +125,7 @@ def repr_errors( return msg -def filter_errors(errors, method: Callable, estimator_cls: type | None = None): +def filter_errors(errors, method, estimator_cls=None): """ Ignore some errors based on the method type. 
""" @@ -163,7 +162,7 @@ def filter_errors(errors, method: Callable, estimator_cls: type | None = None): ["estimator_cls", "method"], get_methods_to_validate(), ) -def test_estimator_docstrings(estimator_cls: type, method: str, request): +def test_estimator_docstrings(estimator_cls, method, request): base_import_path = estimator_cls.__module__ import_path = [base_import_path, estimator_cls.__name__] if method is not None: @@ -192,7 +191,7 @@ def test_estimator_docstrings(estimator_cls: type, method: str, request): ["func", "name"], get_functions_to_validate(), ) -def test_function_docstrings(func: Callable, name: str, request): +def test_function_docstrings(func, name, request): import_path = ".".join([func.__module__, name]) print(import_path) diff --git a/skrub/tests/test_fast_hash.py b/skrub/tests/test_fast_hash.py index 3cf251964..2e16a9d55 100644 --- a/skrub/tests/test_fast_hash.py +++ b/skrub/tests/test_fast_hash.py @@ -2,7 +2,7 @@ from skrub.tests.utils import generate_data -def test_fast_hash() -> None: +def test_fast_hash(): data = generate_data(100, as_list=True) a = data[0] diff --git a/skrub/tests/test_fuzzy_join.py b/skrub/tests/test_fuzzy_join.py index 27ebae3e5..624e9f2e4 100644 --- a/skrub/tests/test_fuzzy_join.py +++ b/skrub/tests/test_fuzzy_join.py @@ -1,5 +1,4 @@ import warnings -from typing import Literal import numpy as np import pandas as pd @@ -16,7 +15,7 @@ "analyzer", ["char", "char_wb", "word"], ) -def test_fuzzy_join(df_module, analyzer: Literal["char", "char_wb", "word"]): +def test_fuzzy_join(df_module, analyzer): """ Testing if ``fuzzy_join`` results are as expected. """ diff --git a/skrub/tests/test_gap_encoder.py b/skrub/tests/test_gap_encoder.py index 9c9dd98aa..3cd3e67f1 100644 --- a/skrub/tests/test_gap_encoder.py +++ b/skrub/tests/test_gap_encoder.py @@ -32,11 +32,11 @@ def generate(*args, as_list=True, **kwargs): ], ) def test_analyzer( - hashing: bool, - init: str, - rescale_W: bool, - add_words: bool, - rescale_rho: bool, + hashing, + init, + rescale_W, + add_words, + rescale_rho, generate_data, ): """ @@ -90,11 +90,11 @@ def test_analyzer( ], ) def test_gap_encoder( - hashing: bool, - init: str, - analyzer: str, - add_words: bool, - verbose: bool, + hashing, + init, + analyzer, + add_words, + verbose, generate_data, ): n_samples = 70 @@ -137,7 +137,7 @@ def test_gap_encoder( "add_words", [True, False], ) -def test_partial_fit(df_module, add_words: bool, generate_data): +def test_partial_fit(df_module, add_words, generate_data): n_samples = 70 X = generate_data(n_samples, random_state=0) X2 = generate_data(n_samples - 10, random_state=1) @@ -217,7 +217,7 @@ def test_score(generate_data): "missing", ["zero_impute", "error", "aaa"], ) -def test_missing_values(df_module, missing: str): +def test_missing_values(df_module, missing): """Test what happens when missing values are in the data.""" if df_module.name == "polars": pytest.xfail( diff --git a/skrub/tests/test_minhash_encoder.py b/skrub/tests/test_minhash_encoder.py index 8d5036749..630c1ee46 100644 --- a/skrub/tests/test_minhash_encoder.py +++ b/skrub/tests/test_minhash_encoder.py @@ -77,7 +77,7 @@ def test_encoder_params(generate_data, hashing, minmax_hash): @pytest.mark.parametrize("missing", ["error", "zero_impute", "aaa"]) @pytest.mark.parametrize("hashing", ["fast", "murmur", "aaa"]) -def test_missing_values(df_module, missing: str, hashing: str): +def test_missing_values(df_module, missing, hashing): X = df_module.make_column( "", ["Red", None, "green", "blue", "green", "green", "blue", None] ) 
diff --git a/skrub/tests/test_similarity_encoder.py b/skrub/tests/test_similarity_encoder.py index a11d27211..ce82e4449 100644 --- a/skrub/tests/test_similarity_encoder.py +++ b/skrub/tests/test_similarity_encoder.py @@ -1,5 +1,3 @@ -from collections.abc import Callable - import numpy as np import numpy.testing import pandas as pd @@ -76,7 +74,7 @@ def test_parameters(): sim.transform(X2) -def _test_missing_values(input_type: str, missing: str): +def _test_missing_values(input_type, missing): observations = [["a", "b"], ["b", "a"], ["b", None], ["a", "c"], [np.nan, "a"]] encoded = np.array( [ @@ -108,7 +106,7 @@ def _test_missing_values(input_type: str, missing: str): return -def _test_missing_values_transform(input_type: str, missing: str): +def _test_missing_values_transform(input_type, missing): observations = [["a", "b"], ["b", "a"], ["b", "b"], ["a", "c"], ["c", "a"]] test_observations = [ ["a", "b"], @@ -146,9 +144,9 @@ def _test_missing_values_transform(input_type: str, missing: str): def _test_similarity( - similarity_f: Callable, - hashing_dim: int | None = None, - categories: str = "auto", + similarity_f, + hashing_dim=None, + categories="auto", ): X = np.array(["aa", "aaa", "aaab"]).reshape(-1, 1) X_test = np.array([["Aa", "aAa", "aaa", "aaab", " aaa c"]]).reshape(-1, 1) diff --git a/skrub/tests/test_string_distances.py b/skrub/tests/test_string_distances.py index e56b15876..08dd2346a 100644 --- a/skrub/tests/test_string_distances.py +++ b/skrub/tests/test_string_distances.py @@ -3,7 +3,7 @@ from skrub import _string_distances -def test_get_unique_ngrams() -> None: +def test_get_unique_ngrams(): string = "test" true_ngrams = { (" ", "t"), @@ -24,7 +24,7 @@ def test_get_unique_ngrams() -> None: assert ngrams == true_ngrams -def _random_string_pairs(n_pairs=50, seed=1) -> list[tuple[str, str]]: +def _random_string_pairs(n_pairs=50, seed=1): rng = np.random.RandomState(seed) characters = list(map(chr, range(10000))) pairs = [] @@ -37,12 +37,12 @@ def _random_string_pairs(n_pairs=50, seed=1) -> list[tuple[str, str]]: return pairs -def _check_symmetry(dist_func, *args, **kwargs) -> None: +def _check_symmetry(dist_func, *args, **kwargs): for a, b in _random_string_pairs(): assert dist_func(a, b, *args, **kwargs) == dist_func(b, a, *args, **kwargs) -def test_ngram_similarity() -> None: +def test_ngram_similarity(): # TODO # assert ... for n in range(1, 4): diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index 01e5a3f89..b32232653 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -514,7 +514,7 @@ def test_changing_types(X_train, X_test, expected_X_out): assert (X_out.dropna() == expected_X_out.dropna()).all().all() -def test_changing_types_int_float() -> None: +def test_changing_types_int_float(): """ The TableVectorizer shouldn't cast floats to ints even if only ints were seen during fit. 
@@ -684,6 +684,8 @@ def test_accept_pipeline():
 
 def test_clean_null_downcast_warning():
     # non-regression test for https://github.com/skrub-data/skrub/issues/894
+    if parse_version(sklearn.__version__) < parse_version("1.4"):
+        pytest.skip("polars not supported for old scikit-learn versions")
     pl = pytest.importorskip("polars")
     df = pl.DataFrame(dict(a=[0, 1], b=["a", "b"]))
     with warnings.catch_warnings():
diff --git a/skrub/tests/utils.py b/skrub/tests/utils.py
index 6c3708e63..8cc179948 100644
--- a/skrub/tests/utils.py
+++ b/skrub/tests/utils.py
@@ -1,15 +1,14 @@
 import random
 
 import numpy as np
-from numpy.typing import NDArray
 
 
 def generate_data(
-    n_samples: int,
-    as_list: bool = False,
-    random_state: int | float | str | bytes | bytearray | None = None,
-    sample_length: int = 100,
-) -> NDArray:
+    n_samples,
+    as_list=False,
+    random_state=None,
+    sample_length=100,
+):
     if random_state is not None:
         random.seed(random_state)
     MAX_LIMIT = 255  # extended ASCII Character set