diff --git a/docs/api/client/index.md b/docs/api/client/index.md index 74372a98..aba77a91 100644 --- a/docs/api/client/index.md +++ b/docs/api/client/index.md @@ -2,6 +2,8 @@ `matchbox.client` is the client used to interact with the [Matchbox server](../../server/install.md). +All names in `matchbox.client` are also accessible from the top-level `matchbox` module. + ::: matchbox.client options: show_root_heading: true diff --git a/docs/client/explore_resolutions.md b/docs/client/explore_resolutions.md new file mode 100644 index 00000000..e7eb2180 --- /dev/null +++ b/docs/client/explore_resolutions.md @@ -0,0 +1,12 @@ +Matchbox lets you link many sources of data in many different ways. But when you query it, which way should you choose? + +A *resolution*, or *point of resolution*, represents a queryable state describing how to cluster entities from one or more data sources. A resolution can represent an original data source, a deduplicated data source, or the result of linking two or more resolutions. 
+ +In order to explore which resolutions are stored on Matchbox, you can use the following client method: + +=== "Example" + ```python + from matchbox import draw_resolution_graph + + draw_resolution_graph() + ``` diff --git a/docs/client/query-data.md b/docs/client/query-data.md index 2a10b873..f562707c 100644 --- a/docs/client/query-data.md +++ b/docs/client/query-data.md @@ -7,16 +7,16 @@ Given a primary key and a source dataset, retrieves all primary keys that share === "Example" ```python import matchbox as mb - from matchbox.client.helpers import selector + from matchbox import select import sqlalchemy engine = sqlalchemy.create_engine('postgresql://') mb.match( + select("datahub_companies", engine=engine), + source=select("companies_house", engine=engine), source_pk="8534735", - source="dbt.companieshouse", - target="hmrc.exporters", - resolution="companies", + resolution_name="last_linker", ) ``` @@ -44,13 +44,13 @@ Retrieves entire data sources along with a unique entity identifier according to === "Example" ```python import matchbox as mb - from matchbox.client.helpers import selector + from matchbox import select import sqlalchemy engine = sqlalchemy.create_engine('postgresql://') mb.query( - selector( + select( { "dbt.companieshouse": ["company_name"], "hmrc.exporters": ["year", "commodity_codes"], @@ -68,4 +68,6 @@ Retrieves entire data sources along with a unique entity identifier according to 122 Acme Ltd. 2024 ['72142', '72143'] 5 Gamma Exports 2023 ['90328', '90329'] ... - ``` \ No newline at end of file + ``` + +For more information on how to use the functions on this page, please check out the relevant examples in the [client API docs](../../api/client/). \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index a63badfc..64e0a54b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -9,9 +9,12 @@ hide: --- - Learn how to quickly install and use Matchbox. 
+ Learn how to quickly install and use Matchbox: + + * The **client** lets you query and link/dedupe data + * The **server** is for setting up a new Matchbox instance for your organisation. - [:octicons-download-16: Install client](./client/install.md){ .md-button .md-button--primary } [:octicons-download-16: Install server](./server/install.md){ .md-button .md-button--primary } + [:octicons-zap-16: Get started with the client](./client/install.md){ .md-button .md-button--primary } [:octicons-download-16: Deploy server in your org](./server/install.md){ .md-button .md-button--primary } diff --git a/mkdocs.yml b/mkdocs.yml index 3825f9bb..c043c5ad 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -9,6 +9,7 @@ nav: - Use cases: use-cases.md - Client: - Installation: client/install.md + - Explore resolutions: client/explore_resolutions.md - Retrieve: client/query-data.md - Link and deduplicate: client/link-data.md - API: diff --git a/src/matchbox/__init__.py b/src/matchbox/__init__.py index 31a6b8a6..4436d639 100644 --- a/src/matchbox/__init__.py +++ b/src/matchbox/__init__.py @@ -4,10 +4,4 @@ load_dotenv(dotenv_path) # Environment variables must be loaded first for other imports to work - -from matchbox.client.helpers.cleaner import process # NoQA: E402 -from matchbox.client.helpers.index import index # NoQA: E402 -from matchbox.client.helpers.selector import match, query, select # NoQA: E402 -from matchbox.client.models.models import make_model # NoQA: E402 - -__all__ = ("make_model", "process", "select", "query", "match", "index") +from matchbox.client import * # noqa: E402, F403 diff --git a/src/matchbox/client/__init__.py b/src/matchbox/client/__init__.py index af602727..f3442138 100644 --- a/src/matchbox/client/__init__.py +++ b/src/matchbox/client/__init__.py @@ -1,6 +1,9 @@ +"""All client-side functionalities of Matchbox.""" + +from matchbox.client.helpers.cleaner import process +from matchbox.client.helpers.index import index +from matchbox.client.helpers.selector 
import match, query, select +from matchbox.client.models.models import make_model from matchbox.client.visualisation import draw_resolution_graph -__all__ = ( - # Visualisation - "draw_resolution_graph", -) +__all__ = ("process", "index", "match", "query", "select", "make_model", "draw_resolution_graph") diff --git a/src/matchbox/client/_handler.py b/src/matchbox/client/_handler.py index e5e2ddfb..e2dd37a4 100644 --- a/src/matchbox/client/_handler.py +++ b/src/matchbox/client/_handler.py @@ -1,3 +1,5 @@ +"""Functions abstracting the interaction with the server API.""" + import time from collections.abc import Iterable from io import BytesIO diff --git a/src/matchbox/client/_logging.py b/src/matchbox/client/_logging.py index 87c95234..907bf10b 100644 --- a/src/matchbox/client/_logging.py +++ b/src/matchbox/client/_logging.py @@ -1,3 +1,5 @@ +"""Client-side logging utilities.""" + import logging import sys diff --git a/src/matchbox/client/clean/.gitkeep b/src/matchbox/client/clean/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/src/matchbox/client/clean/__init__.py b/src/matchbox/client/clean/__init__.py index b25b302c..531b02d5 100644 --- a/src/matchbox/client/clean/__init__.py +++ b/src/matchbox/client/clean/__init__.py @@ -1,3 +1,5 @@ +"""Library of default cleaning functions.""" + from matchbox.client.clean.lib import ( company_name, company_number, diff --git a/src/matchbox/client/clean/lib.py b/src/matchbox/client/clean/lib.py index f796a6a0..36b5c46d 100644 --- a/src/matchbox/client/clean/lib.py +++ b/src/matchbox/client/clean/lib.py @@ -1,3 +1,5 @@ +"""Implementation of default cleaning functions.""" + from functools import partial from pandas import DataFrame diff --git a/src/matchbox/client/clean/steps/__init__.py b/src/matchbox/client/clean/steps/__init__.py index 211f74dd..37b98040 100644 --- a/src/matchbox/client/clean/steps/__init__.py +++ b/src/matchbox/client/clean/steps/__init__.py @@ -1,3 +1,5 @@ +"""Low-level components of default cleaning 
functions.""" + from matchbox.client.clean.steps.clean_basic import ( array_except, array_intersect, diff --git a/src/matchbox/client/clean/steps/clean_basic.py b/src/matchbox/client/clean/steps/clean_basic.py index 96f530f7..3fc75bc9 100644 --- a/src/matchbox/client/clean/steps/clean_basic.py +++ b/src/matchbox/client/clean/steps/clean_basic.py @@ -1,3 +1,5 @@ +"""Low-level primitives supporting default cleaning functions.""" + from typing import Dict, List from matchbox.client.clean.utils import ABBREVIATIONS, STOPWORDS diff --git a/src/matchbox/client/clean/steps/clean_basic_original.py b/src/matchbox/client/clean/steps/clean_basic_original.py index 8288e628..27430aa7 100644 --- a/src/matchbox/client/clean/steps/clean_basic_original.py +++ b/src/matchbox/client/clean/steps/clean_basic_original.py @@ -1,3 +1,6 @@ +"""Legacy cleaning rules inherited from the Company Matching Service.""" + + def cms_original_clean_company_name_general(column): """Replicates the original Company Matching Service company name cleaning regex exactly. Intended to help replicate the methodology for comparison. 
diff --git a/src/matchbox/client/clean/utils.py b/src/matchbox/client/clean/utils.py index f04d5c8e..a0c99d84 100644 --- a/src/matchbox/client/clean/utils.py +++ b/src/matchbox/client/clean/utils.py @@ -1,3 +1,5 @@ +"""Generic utilities for default cleaning functions.""" + from typing import Callable import duckdb diff --git a/src/matchbox/client/helpers/__init__.py b/src/matchbox/client/helpers/__init__.py index da86c0df..8f9a5d5e 100644 --- a/src/matchbox/client/helpers/__init__.py +++ b/src/matchbox/client/helpers/__init__.py @@ -1,3 +1,5 @@ +"""Core functionalities of the Matchbox client.""" + from matchbox.client.helpers.cleaner import cleaner, cleaners from matchbox.client.helpers.comparison import comparison from matchbox.client.helpers.selector import select diff --git a/src/matchbox/client/helpers/cleaner.py b/src/matchbox/client/helpers/cleaner.py index 98b01487..9718512b 100644 --- a/src/matchbox/client/helpers/cleaner.py +++ b/src/matchbox/client/helpers/cleaner.py @@ -1,17 +1,60 @@ +"""Functions to pre-process data sources.""" + from typing import Any, Callable, Dict from pandas import DataFrame def cleaner(function: Callable, arguments: Dict) -> Dict[str, Dict[str, Any]]: + """Define a function to clean a dataset. 
+ + Args: + function: The callable implementing the cleaning behaviour + arguments: A dictionary of keyword arguments to pass to the cleaning function + + Returns: + A representation of the cleaner ready to be passed to the `cleaners()` function + + """ return {function.__name__: {"function": function, "arguments": arguments}} def cleaners(*cleaner: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]: + """Combine multiple cleaners into a single object to pass to `process()`. + + Args: + cleaner: One or more outputs of the `cleaner()` function + + Returns: + A representation of multiple cleaners to be passed to the `process()` function + + Examples: + ```python + clean_pipeline = cleaners( + cleaner( + normalise_company_number, + {"column": "company_number"}, + ), + cleaner( + normalise_postcode, + {"column": "postcode"}, + ), + ) + ``` + """ return {k: v for d in cleaner for k, v in d.items()} def process(data: DataFrame, pipeline: Dict[str, Dict[str, Any]]) -> DataFrame: + """Apply cleaners to input dataframe. 
+ + Args: + data: The dataframe to process + pipeline: Output of the `cleaners()` function + + Returns: + The processed dataset + """ curr = data for func in pipeline.keys(): curr = pipeline[func]["function"](curr, **pipeline[func]["arguments"]) diff --git a/src/matchbox/client/helpers/comparison.py b/src/matchbox/client/helpers/comparison.py index d9409003..310c5304 100644 --- a/src/matchbox/client/helpers/comparison.py +++ b/src/matchbox/client/helpers/comparison.py @@ -1,3 +1,5 @@ +"""Functions to compare fields in different datasets.""" + import sqlglot.expressions as exp from sqlglot import parse_one from sqlglot.errors import ParseError diff --git a/src/matchbox/client/helpers/index.py b/src/matchbox/client/helpers/index.py index 92b75803..435f32a7 100644 --- a/src/matchbox/client/helpers/index.py +++ b/src/matchbox/client/helpers/index.py @@ -1,3 +1,5 @@ +"""Functions to index data sources to the Matchbox server.""" + from sqlalchemy import Engine from matchbox.client import _handler diff --git a/src/matchbox/client/helpers/selector.py b/src/matchbox/client/helpers/selector.py index eac16b76..4195b9fc 100644 --- a/src/matchbox/client/helpers/selector.py +++ b/src/matchbox/client/helpers/selector.py @@ -1,3 +1,5 @@ +"""Functions to select and retrieve data from the Matchbox server.""" + import itertools from os import getenv from typing import Literal @@ -216,6 +218,16 @@ def match( If None, uses the resolutions' default threshold If an integer, uses that threshold for the specified resolution, and the resolution's cached thresholds for its ancestors + + Examples: + ```python + mb.match( + select("datahub_companies", engine=engine), + source=select("companies_house", engine=engine), + source_pk="8534735", + resolution_name="last_linker", + ) + ``` """ if len(source) > 1: raise ValueError("Only one source can be matched at one time") diff --git a/src/matchbox/client/models/__init__.py b/src/matchbox/client/models/__init__.py index e69de29b..cc7df612 100644 --- 
a/src/matchbox/client/models/__init__.py +++ b/src/matchbox/client/models/__init__.py @@ -0,0 +1 @@ +"""Deduplication and linking methodologies.""" diff --git a/src/matchbox/client/models/dedupers/__init__.py b/src/matchbox/client/models/dedupers/__init__.py index 3cc93a39..f94bf336 100644 --- a/src/matchbox/client/models/dedupers/__init__.py +++ b/src/matchbox/client/models/dedupers/__init__.py @@ -1,3 +1,5 @@ +"""Deduplication methodologies.""" + from matchbox.client.models.dedupers.naive import NaiveDeduper __all__ = ("NaiveDeduper",) diff --git a/src/matchbox/client/models/dedupers/base.py b/src/matchbox/client/models/dedupers/base.py index 8adbbb9c..239112f0 100644 --- a/src/matchbox/client/models/dedupers/base.py +++ b/src/matchbox/client/models/dedupers/base.py @@ -1,3 +1,5 @@ +"""Base class for deduplication methodologies.""" + import warnings from abc import ABC, abstractmethod diff --git a/src/matchbox/client/models/dedupers/naive.py b/src/matchbox/client/models/dedupers/naive.py index 3514dbd7..70bf8b58 100644 --- a/src/matchbox/client/models/dedupers/naive.py +++ b/src/matchbox/client/models/dedupers/naive.py @@ -1,3 +1,5 @@ +"""A deduplication methodology based on a deterministic set of conditions.""" + from typing import List, Type import duckdb diff --git a/src/matchbox/client/models/linkers/__init__.py b/src/matchbox/client/models/linkers/__init__.py index 0d224cc9..72823324 100644 --- a/src/matchbox/client/models/linkers/__init__.py +++ b/src/matchbox/client/models/linkers/__init__.py @@ -1,3 +1,5 @@ +"""Linking methodologies.""" + from matchbox.client.models.linkers.deterministic import DeterministicLinker from matchbox.client.models.linkers.splinklinker import SplinkLinker from matchbox.client.models.linkers.weighteddeterministic import ( diff --git a/src/matchbox/client/models/linkers/base.py b/src/matchbox/client/models/linkers/base.py index 4c073d9e..87b53b4b 100644 --- a/src/matchbox/client/models/linkers/base.py +++ 
b/src/matchbox/client/models/linkers/base.py @@ -1,3 +1,5 @@ +"""Base class for linkers.""" + import warnings from abc import ABC, abstractmethod diff --git a/src/matchbox/client/models/linkers/deterministic.py b/src/matchbox/client/models/linkers/deterministic.py index fa767138..e497ad71 100644 --- a/src/matchbox/client/models/linkers/deterministic.py +++ b/src/matchbox/client/models/linkers/deterministic.py @@ -1,3 +1,5 @@ +"""A linking methodology based on a deterministic set of conditions.""" + from typing import Type import duckdb diff --git a/src/matchbox/client/models/linkers/splinklinker.py b/src/matchbox/client/models/linkers/splinklinker.py index 8715a7ea..6407b925 100644 --- a/src/matchbox/client/models/linkers/splinklinker.py +++ b/src/matchbox/client/models/linkers/splinklinker.py @@ -1,3 +1,5 @@ +"""A linking methodology leveraging Splink.""" + import ast import inspect import logging diff --git a/src/matchbox/client/models/linkers/weighteddeterministic.py b/src/matchbox/client/models/linkers/weighteddeterministic.py index a73b69ca..d4caa235 100644 --- a/src/matchbox/client/models/linkers/weighteddeterministic.py +++ b/src/matchbox/client/models/linkers/weighteddeterministic.py @@ -1,3 +1,5 @@ +"""A linking methodology that applies different weights to field comparisons.""" + from typing import List, Type import duckdb diff --git a/src/matchbox/client/models/models.py b/src/matchbox/client/models/models.py index 48312f50..721f1070 100644 --- a/src/matchbox/client/models/models.py +++ b/src/matchbox/client/models/models.py @@ -1,3 +1,5 @@ +"""Functions and classes to define, run and register models.""" + from typing import Any, ParamSpec, TypeVar from pandas import DataFrame diff --git a/src/matchbox/client/results.py b/src/matchbox/client/results.py index c98c484e..f0680925 100644 --- a/src/matchbox/client/results.py +++ b/src/matchbox/client/results.py @@ -1,3 +1,5 @@ +"""Objects representing the results of running a model client-side.""" + import 
logging from functools import wraps from typing import TYPE_CHECKING, Any, Callable, Hashable, ParamSpec, TypeVar diff --git a/src/matchbox/client/visualisation.py b/src/matchbox/client/visualisation.py index 55819eb5..d48830a6 100644 --- a/src/matchbox/client/visualisation.py +++ b/src/matchbox/client/visualisation.py @@ -1,3 +1,5 @@ +"""Visualisation utilities.""" + import rustworkx as rx from matplotlib.figure import Figure from rustworkx.visualization import mpl_draw