Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Centralise logging #80

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions src/matchbox/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
import logging

from matchbox.common.exceptions import MatchboxClientSettingsException

logic_logger = logging.getLogger("mb_logic")
from matchbox.common.logging import mb_server_logger

try:
# Environment variables must be loaded first for other imports to work
from matchbox.client import * # noqa: E402, F403
except MatchboxClientSettingsException:
logic_logger.warning(
mb_server_logger.warning(
"Impossible to initialise client. "
"Please ignore if running in server mode. Otherwise, check your .env file.",
)
13 changes: 0 additions & 13 deletions src/matchbox/client/_logging.py

This file was deleted.

4 changes: 2 additions & 2 deletions src/matchbox/client/helpers/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
from sqlalchemy import Engine, create_engine

from matchbox.client import _handler
from matchbox.client._logging import client_logger
from matchbox.client._settings import settings
from matchbox.common.graph import DEFAULT_RESOLUTION
from matchbox.common.logging import mb_client_logger
from matchbox.common.sources import Match, Source, SourceAddress


Expand Down Expand Up @@ -47,7 +47,7 @@ def select(
if not engine:
if default_engine := settings.default_warehouse:
engine = create_engine(default_engine)
client_logger.warning("Using default engine")
mb_client_logger.warning("Using default engine")
else:
raise ValueError(
"An engine needs to be provided if "
Expand Down
6 changes: 2 additions & 4 deletions src/matchbox/client/models/linkers/splinklinker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import ast
import inspect
import logging
from typing import Any, Dict, List, Optional, Type

import pyarrow as pa
Expand All @@ -13,8 +12,7 @@
from splink.internals.linker_components.training import LinkerTraining

from matchbox.client.models.linkers.base import Linker, LinkerSettings

logic_logger = logging.getLogger("mb_logic")
from matchbox.common.logging import mb_client_logger


class SplinkLinkerFunction(BaseModel):
Expand Down Expand Up @@ -219,7 +217,7 @@ def prepare(self, left: DataFrame, right: DataFrame) -> None:

def link(self, left: DataFrame = None, right: DataFrame = None) -> DataFrame:
if left is not None or right is not None:
logic_logger.warning(
mb_client_logger.warning(
"Left and right data are declared in .prepare() for SplinkLinker. "
"These values will be ignored"
)
Expand Down
3 changes: 0 additions & 3 deletions src/matchbox/client/results.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Objects representing the results of running a model client-side."""

import logging
from functools import wraps
from typing import TYPE_CHECKING, Any, Callable, Hashable, ParamSpec, TypeVar

Expand All @@ -22,8 +21,6 @@
P = ParamSpec("P")
R = TypeVar("R")

logic_logger = logging.getLogger("mb_logic")


def calculate_clusters(func: Callable[P, R]) -> Callable[P, R]:
"""Decorator to calculate clusters if it hasn't been already."""
Expand Down
49 changes: 49 additions & 0 deletions src/matchbox/common/logging.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import logging

from rich.console import Console
from rich.logging import RichHandler
from rich.progress import (
BarColumn,
Progress,
Expand All @@ -8,6 +11,52 @@
TimeRemainingColumn,
)

CRITICAL = logging.CRITICAL
FATAL = logging.FATAL
ERROR = logging.ERROR
WARNING = logging.WARNING
WARN = logging.WARN
INFO = logging.INFO
DEBUG = logging.DEBUG
NOTSET = logging.NOTSET


logging.basicConfig(
level=INFO,
format="%(message)s",
handlers=[RichHandler(rich_tracebacks=True)],
)


def get_logger(name: str, custom_format: str = None) -> logging.Logger:
logger = logging.getLogger(name)

if custom_format:
# Remove existing handlers
logger.handlers.clear()

# Create a custom handler
custom_handler = RichHandler(rich_tracebacks=True)

# Create a custom formatter
formatter = logging.Formatter(custom_format)

# Set formatter for the custom handler
custom_handler.setFormatter(formatter)

# Add custom handler to the logger
logger.addHandler(custom_handler)

# Prevent the logger from propagating messages to the root logger
logger.propagate = False

return logger


mb_server_logger = get_logger("mb_server_logger")
mb_client_logger = get_logger("mb_client_logger")
mb_test_logger = get_logger("mb_test_logger")


def get_console():
"""Get the console instance."""
Expand Down
19 changes: 10 additions & 9 deletions src/matchbox/common/transform.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import logging
import multiprocessing
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor
Expand All @@ -10,12 +9,10 @@
import rustworkx as rx

from matchbox.common.hash import hash_values
from matchbox.common.logging import build_progress_bar
from matchbox.common.logging import build_progress_bar, mb_server_logger

T = TypeVar("T", bound=Hashable)

logic_logger = logging.getLogger("mb_logic")


def to_clusters(
results: pa.Table,
Expand Down Expand Up @@ -322,7 +319,9 @@ def to_hierarchical_clusters(
n_cores = multiprocessing.cpu_count()
n_components = len(components)

logic_logger.info(f"Processing {n_components:,} components using {n_cores} workers")
mb_server_logger.info(
f"Processing {n_components:,} components using {n_cores} workers"
)

# Split table into separate component tables
component_col = probabilities["component"]
Expand Down Expand Up @@ -370,12 +369,12 @@ def to_hierarchical_clusters(
results.append(result)
progress.update(process_task, advance=1)
except TimeoutError:
logic_logger.error(
mb_server_logger.error(
f"Component processing timed out after {timeout} seconds"
)
raise
except Exception as e:
logic_logger.error(f"Error processing component: {str(e)}")
mb_server_logger.error(f"Error processing component: {str(e)}")
raise
else:
results = [
Expand All @@ -384,11 +383,13 @@ def to_hierarchical_clusters(
if not progress.update(process_task, advance=1)
]

logic_logger.info(f"Completed processing {len(results):,} components successfully")
mb_server_logger.info(
f"Completed processing {len(results):,} components successfully"
)

# Create empty table if no results
if not results:
logic_logger.warning("No results to concatenate")
mb_server_logger.warning("No results to concatenate")
return pa.table(
{
"parent": pa.array([], type=dtype()),
Expand Down
10 changes: 2 additions & 8 deletions src/matchbox/server/postgresql/benchmark/cluster_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,20 @@
import logging
import time
from contextlib import contextmanager
from pathlib import Path

import pyarrow as pa
from rich.logging import RichHandler

from matchbox.common.factories.models import generate_dummy_probabilities
from matchbox.common.hash import hash_data, hash_values
from matchbox.common.logging import get_logger
from matchbox.common.transform import (
attach_components_to_probabilities,
to_hierarchical_clusters,
)
from matchbox.server.postgresql.benchmark.generate_tables import PRESETS
from matchbox.server.postgresql.utils.insert import HashIDMap

logging.basicConfig(
level=logging.INFO,
format="%(message)s",
handlers=[RichHandler(rich_tracebacks=True)],
)
pipeline_logger = logging.getLogger("mb_pipeline")
pipeline_logger = get_logger("mb_pipeline")

ROOT = Path(__file__).parent.parent

Expand Down
Loading