Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MAINT add support for Python 3.9 #939

Merged
merged 10 commits into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions .github/workflows/testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ jobs:
fail-fast: false
matrix:
os: ["ubuntu-latest", "macos-latest", "windows-latest"]
python-version: ["3.10", "3.11", "3.12"]
python-version: ["3.9", "3.10", "3.11", "3.12"]
# Install the min or latest dependencies for skrub
# as defined in setup.cfg at [options.extra_require].
#
Expand All @@ -56,9 +56,12 @@ jobs:
python-version: "3.11"
- dependencies-version: "dev, polars"
python-version: "3.12"
- dependencies-version: "dev, min-py310"
- dependencies-version: "dev, min-py39"
python-version: "3.10"
dependencies-version-type: "minimal"
- dependencies-version: "dev, min-py39"
python-version: "3.9"
dependencies-version-type: "minimal"
name: ${{ matrix.os-name }} with Python ${{ matrix.python-version }} and ${{ matrix.dependencies-version-type }} dependencies
defaults:
run:
Expand Down
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ Major changes
It now has a ``key`` parameter that allows joining main and auxiliary tables that share
the same column names. :pr:`876` by :user:`Théo Jolivet <TheooJ>`.

* The minimum supported Python version is now 3.9.
  :pr:`939` by :user:`Jérôme Dockès <jeromedockes>`.

Minor changes
-------------

Expand Down
41 changes: 20 additions & 21 deletions benchmarks/bench_fuzzy_join_count_vs_hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import math
from argparse import ArgumentParser
from time import perf_counter
from typing import Literal, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
Expand All @@ -34,21 +33,21 @@

# Function kept for reference
def fuzzy_join(
left: pd.DataFrame,
right: pd.DataFrame,
how: Literal["left", "right"] = "left",
left_on: Union[str, None] = None,
right_on: Union[str, None] = None,
on: Union[str, None] = None,
encoder: Literal["count", "hash"] = "count",
analyzer: Literal["word", "char", "char_wb"] = "char_wb",
ngram_range: Tuple[int, int] = (2, 4),
return_score: bool = False,
match_score: float = 0,
drop_unmatched: bool = False,
sort: bool = False,
suffixes: Tuple[str, str] = ("_x", "_y"),
) -> pd.DataFrame:
left,
right,
how="left",
left_on=None,
right_on=None,
on=None,
encoder="count",
analyzer="char_wb",
ngram_range=(2, 4),
return_score=False,
match_score=0,
drop_unmatched=False,
sort=False,
suffixes=("_x", "_y"),
):
"""
Join two tables' categorical string columns based on approximate
matching and using morphological similarity.
Expand Down Expand Up @@ -317,10 +316,10 @@ def fuzzy_join(
repeat=10,
)
def benchmark(
encoder: Literal["hash", "count"],
dataset_name: str,
analyzer: Literal["char_wb", "char", "word"],
ngram_range: tuple,
encoder,
dataset_name,
analyzer,
ngram_range,
):
left_table, right_table, gt = load_data(dataset_name)

Expand Down Expand Up @@ -352,7 +351,7 @@ def benchmark(
return res_dic


def plot(df: pd.DataFrame):
def plot(df):
sns.set_theme(style="ticks", palette="pastel")

n_datasets = len(np.unique(df["dataset_name"]))
Expand Down
80 changes: 39 additions & 41 deletions benchmarks/bench_fuzzy_join_sparse_vs_dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@
import warnings
from argparse import ArgumentParser
from collections.abc import Iterable
from pathlib import Path
from time import perf_counter
from typing import Literal

import matplotlib.pyplot as plt
import numpy as np
Expand All @@ -35,11 +33,11 @@


def _numeric_encoding(
main: pd.DataFrame,
main_cols: list | str,
aux: pd.DataFrame,
aux_cols: list | str,
) -> tuple:
main,
main_cols,
aux,
aux_cols,
):
"""Encoding numerical columns.

Parameters
Expand Down Expand Up @@ -71,15 +69,15 @@ def _numeric_encoding(


def _string_encoding(
main: pd.DataFrame,
main_cols: list | str,
aux: pd.DataFrame,
aux_cols: list | str,
analyzer: Literal["word", "char", "char_wb"],
ngram_range: int | int,
encoder: _VectorizerMixin = None,
sparse: bool = True,
) -> tuple:
main,
main_cols,
aux,
aux_cols,
analyzer,
ngram_range,
encoder=None,
sparse=True,
):
"""Encoding string columns.

Parameters
Expand Down Expand Up @@ -147,7 +145,7 @@ def _string_encoding(
return main_enc_d, aux_enc_d


def _nearest_matches(main_array, aux_array, sparse=True) -> np.ndarray | np.ndarray:
def _nearest_matches(main_array, aux_array, sparse=True):
"""Find the closest matches using the nearest neighbors method.

Parameters
Expand Down Expand Up @@ -182,23 +180,23 @@ def _nearest_matches(main_array, aux_array, sparse=True) -> np.ndarray | np.ndar


def fuzzy_join(
left: pd.DataFrame,
right: pd.DataFrame,
how: Literal["left", "right"] = "left",
left_on: str | list[str] | list[int] | None = None,
right_on: str | list[str] | list[int] | None = None,
on: str | list[str] | list[int] | None = None,
numerical_match: Literal["string", "number"] = "number",
encoder: _VectorizerMixin = None,
analyzer: Literal["word", "char", "char_wb"] = "char_wb",
ngram_range: tuple[int, int] = (2, 4),
return_score: bool = False,
match_score: float = 0,
drop_unmatched: bool = False,
sort: bool = False,
suffixes: tuple[str, str] = ("_x", "_y"),
sparse: bool = True,
) -> pd.DataFrame:
left,
right,
how="left",
left_on=None,
right_on=None,
on=None,
numerical_match="number",
encoder=None,
analyzer="char_wb",
ngram_range=(2, 4),
return_score=False,
match_score=0,
drop_unmatched=False,
sort=False,
suffixes=("_x", "_y"),
sparse=True,
):
"""
Join two tables' categorical string columns based on approximate
matching and using morphological similarity.
Expand Down Expand Up @@ -526,12 +524,12 @@ def fuzzy_join(
save_as=benchmark_name,
)
def benchmark(
sparse: bool,
dataset_name: str,
analyzer: Literal["char_wb", "char", "word"],
ngram_range: tuple,
data_home: Path | str | None = None,
data_directory: str | None = "benchmarks_data",
sparse,
dataset_name,
analyzer,
ngram_range,
data_home=None,
data_directory="benchmarks_data",
):
left_table, right_table, gt = fetch_big_data(
dataset_name=dataset_name, data_home=data_home, data_directory=data_directory
Expand Down Expand Up @@ -578,7 +576,7 @@ def benchmark(
return res_dic


def plot(df: pd.DataFrame):
def plot(df):
sns.set_theme(style="ticks", palette="pastel")

n_datasets = len(np.unique(df["dataset_name"]))
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/bench_fuzzy_join_vs_others.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ def autofj_merge(left, right, target=0.9):
repeat=5,
)
def benchmark(
dataset_name: str,
join: str,
dataset_name,
join,
):
left_table, right_table, gt = load_data(dataset_name)

Expand Down Expand Up @@ -151,7 +151,7 @@ def benchmark(
return res_dic


def plot(df: pd.DataFrame):
def plot(df):
sns.set_theme(style="ticks", palette="pastel")

n_datasets = len(np.unique(df["dataset_name"]))
Expand Down
12 changes: 5 additions & 7 deletions benchmarks/bench_gap_divergence.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@


class ModifiedGapEncoderColumn(GapEncoder):
def __init__(self, *args, column_name: str = "MISSING COLUMN", **kwargs):
def __init__(self, *args, column_name="MISSING COLUMN", **kwargs):
super().__init__(*args, **kwargs)
self.column_name = column_name
self.benchmark_results_: list[dict[str, np.ndarray | float]] = []
self.benchmark_results_ = []

def fit(self, X, y=None):
# Copy parameter rho
Expand Down Expand Up @@ -123,9 +123,7 @@ def fit(self, X, y=None):


class ModifiedGapEncoder(GapEncoder):
fitted_models_: list[ModifiedGapEncoderColumn]

def _create_column_gap_encoder(self, column_name: str):
def _create_column_gap_encoder(self, column_name):
return ModifiedGapEncoderColumn(
column_name=column_name,
ngram_range=self.ngram_range,
Expand Down Expand Up @@ -187,7 +185,7 @@ def fit(self, X, y=None):
},
save_as=benchmark_name,
)
def benchmark(max_iter_e_step: int, dataset_name: str):
def benchmark(max_iter_e_step, dataset_name):
"""
Cross-validate a pipeline with a modified `GapEncoder` instance for the
high cardinality column. The rest of the columns are passed to a
Expand Down Expand Up @@ -261,7 +259,7 @@ def benchmark(max_iter_e_step: int, dataset_name: str):
return results


def plot(df: pd.DataFrame):
def plot(df):
# Keep only the last outer iteration
df = df[df["gap_iter"] == 5]

Expand Down
14 changes: 7 additions & 7 deletions benchmarks/bench_gap_encoder_hp.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,12 @@
repeat=1,
)
def benchmark(
high_card_feature: str,
batch_size: int,
max_iter_e_step: int,
max_rows: int,
max_no_improvement: int,
random_state: int,
high_card_feature,
batch_size,
max_iter_e_step,
max_rows,
max_no_improvement,
random_state,
):
X = np.array(ds.X[high_card_feature]).reshape(-1, 1).astype(str)
y = ds.y
Expand Down Expand Up @@ -111,7 +111,7 @@ def benchmark(
return res_dic


def plot(df: pd.DataFrame):
def plot(df):
base_values = {"batch_size": 1024, "max_iter_e_step": 1, "max_no_improvement": 5}
for variable in base_values.keys():
df_to_plot = df
Expand Down
12 changes: 5 additions & 7 deletions benchmarks/bench_gap_es_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def _minibatch_convergence(self, batch_size, batch_cost, n_samples, step, n_step

return False

def fit(self, X, y=None) -> "GapEncoder":
def fit(self, X, y=None):
"""
Fit the GapEncoder on `X`.

Expand Down Expand Up @@ -164,8 +164,6 @@ def fit(self, X, y=None) -> "GapEncoder":


class ModifiedGapEncoder(GapEncoder):
fitted_models_: list[ModifiedGapEncoderColumn]

def _create_column_gap_encoder(self):
return ModifiedGapEncoderColumn(
ngram_range=self.ngram_range,
Expand Down Expand Up @@ -219,9 +217,9 @@ def _create_column_gap_encoder(self):
repeat=2,
)
def benchmark(
high_card_feature: str,
max_rows: int,
modif: bool,
high_card_feature,
max_rows,
modif,
):
ds = fetch_traffic_violations()
X = np.array(ds.X[high_card_feature]).reshape(-1, 1).astype(str)
Expand Down Expand Up @@ -277,7 +275,7 @@ def benchmark(
return res_dic


def plot(df: pd.DataFrame):
def plot(df):
sns.lineplot(
x="train_size", y="time_fit", data=df, hue="high_card_feature", style="modif"
)
Expand Down
Loading
Loading