Commit

Refactored to get things working in GitHub
Will Langdale committed Sep 30, 2024
1 parent d62c8c8 commit 511b896
Showing 15 changed files with 194 additions and 192 deletions.
22 changes: 22 additions & 0 deletions .github/pull_request_template.md
@@ -0,0 +1,22 @@
# Pull Request Description

## Changes Made
- [List the main changes you've made]

## Reason for Changes
[Explain why you've made these changes]

## Testing Done
[Describe the testing you've done to validate your changes]

## Screenshots (if applicable)
[Add screenshots here if your changes include visual elements]

## Checklist:
- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my own code
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] New and existing unit tests pass locally with my changes
23 changes: 23 additions & 0 deletions .github/workflows/pytest.yml
@@ -0,0 +1,23 @@
name: Unit tests

on: [push, pull_request]

jobs:
uv-example:
name: python
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v2

- name: Set up Python
run: uv python install

- name: Set up PostgreSQL
run: |
docker compose up db -d --wait
- name: Run pytest
run: |
uv run pytest
5 changes: 1 addition & 4 deletions Makefile
@@ -35,10 +35,7 @@ format:

## Run Python tests
test:
@echo ">>> Dropping and recreating the test database"
docker-compose exec db psql -U testuser -c "DROP DATABASE IF EXISTS testdb;"
docker-compose exec db psql -U testuser -c "CREATE DATABASE testdb;"
@echo ">>> Running tests"
docker compose up db -d --wait
uv run pytest


49 changes: 10 additions & 39 deletions README.md
@@ -1,46 +1,17 @@
# 🔗 Company matching framework
# 🔥 Matchbox (née Company Matching Framework)

A match orchestration framework to allow the comparison, validation, and orchestration of the best match methods for the company matching job.
Record matching is a chore. We aim to:

We envisage this forming one of three repos in the Company Matching Framework:
* Make it an iterative, collaborative, measurable problem
* Allow organisations to know they have matching records without having to share the data
* Allow matching pipelines to run iteratively

* `company-matching-framework`, this repo. A Python library for creating data linkage and deduplication pipelines over a shared relational database
* `company-matching-framework-dash`, or https://matching.data.trade.gov.uk/. A dashboard for verifying links and deduplications, and comparing the performance metrics of different approaches. Uses `company-matching-framework`
* `company-matching-framework-pipeline`. The live pipeline of matching and deduping methods, running in production. Uses `company-matching-framework`
## Development

## Coverage
This project is managed by [uv](https://docs.astral.sh/uv/), linted and formatted with [ruff](https://docs.astral.sh/ruff/), and tested with [pytest](https://docs.pytest.org/en/stable/).

* [Companies House](https://data.trade.gov.uk/datasets/a777d199-53a4-4d0a-bbbb-1559a86f8c4c#companies-house-company-data)
* [Data Hub companies](https://data.trade.gov.uk/datasets/32918f3e-a727-42e6-8359-9efc61c93aa4#data-hub-companies-master)
* [Export Wins](https://data.trade.gov.uk/datasets/0738396f-d1fd-46f1-a53f-5d8641d032af#export-wins-master-datasets)
* [HMRC UK exporters](https://data.trade.gov.uk/datasets/76fb2db3-ab32-4af8-ae87-d41d36b31265#uk-exporters)
Task running is done with [make](https://www.gnu.org/software/make/). To see all available commands:

## Quickstart

Clone the repo, then run:

```bash
. setup.sh
```

Create a `.env` with your development schema to write tables into. Copy the sample with `cp .env.sample .env` then fill it in.

* `SCHEMA` is where any tables the service creates will be written by default

To set up the database in your specified schema, run:

```bash
make cmf
```console
make
```

## Usage

See [the aspirational README](references/README_aspitational.md) for how we envisage the finished version of this Python library will be used.

## Release metrics

🛠 Coming soon!

--------

<p><small>Project based on the <a target="_blank" href="https://drivendata.github.io/cookiecutter-data-science/">cookiecutter data science project template</a>.</small></p>
5 changes: 2 additions & 3 deletions docker-compose.yml
@@ -1,8 +1,7 @@
version: '3.8'

services:
db:
image: postgres:13
image: postgres:14
restart: always
environment:
POSTGRES_USER: testuser
POSTGRES_PASSWORD: testpassword
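The `docker compose up db -d --wait` command used in the Makefile and CI only blocks until the `db` service reports *healthy*, which requires the service to define a healthcheck; without one, `--wait` merely waits for the container to be running. The compose file is shown only in part above; a minimal sketch of the kind of healthcheck this setup assumes (service, image, and credentials taken from the diff; the healthcheck block itself is an assumption, not the repo's actual file):

```yaml
# Illustrative only: the repo's real docker-compose.yml is not fully shown.
services:
  db:
    image: postgres:14
    restart: always
    environment:
      POSTGRES_USER: testuser
      POSTGRES_PASSWORD: testpassword
      POSTGRES_DB: testdb            # assumed, per the Makefile's testdb
    healthcheck:                     # assumed; needed for `up --wait` to gate on readiness
      test: ["CMD-SHELL", "pg_isready -U testuser"]
      interval: 2s
      timeout: 5s
      retries: 15
```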
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -18,7 +18,7 @@ dependencies = [
"pydantic>=2.9.2",
"python-dotenv>=1.0.1",
"rustworkx>=0.15.1",
"splink>=4.0.3",
"splink<4",
"sqlalchemy>=2.0.35",
"tomli>=2.0.1",
]
@@ -94,6 +94,8 @@ skip-magic-trailing-comma = false
line-ending = "auto"

[tool.pytest.ini_options]
testpaths = ["test"]
pythonpath = ["."]
addopts = "-s -vv --cov=cmf test/ --log-disable=pg_bulk_ingest"
log_cli = false
log_cli_level = "INFO"
Empty file added test/__init__.py
Empty file.
Empty file added test/fixtures/__init__.py
Empty file.
51 changes: 24 additions & 27 deletions test/fixtures/data.py
@@ -6,8 +6,9 @@
import numpy as np
import pandas as pd
import pytest
import testing.postgresql
from dotenv import find_dotenv, load_dotenv
from pandas import DataFrame
from sqlalchemy.engine import Engine

import cmf.locations as loc
from cmf import process, query
@@ -19,11 +20,9 @@

LOGGER = logging.getLogger(__name__)

CMF_POSTGRES = testing.postgresql.PostgresqlFactory(cache_initialized_db=True)


@pytest.fixture(scope="session")
def all_companies():
def all_companies() -> DataFrame:
"""
Raw, correct company data. Uses UUID as ID to replicate Data Workspace.
1,000 entries.
@@ -36,7 +35,7 @@ def all_companies():


@pytest.fixture(scope="session")
def crn_companies(all_companies):
def crn_companies(all_companies: DataFrame) -> DataFrame:
"""
Company data split into CRN version.
@@ -64,7 +63,7 @@


@pytest.fixture(scope="session")
def duns_companies(all_companies):
def duns_companies(all_companies: DataFrame) -> DataFrame:
"""
Company data split into DUNS version.
@@ -87,7 +86,7 @@


@pytest.fixture(scope="session")
def cdms_companies(all_companies):
def cdms_companies(all_companies: DataFrame) -> DataFrame:
"""
Company data split into CDMS version.
@@ -111,17 +110,15 @@


@pytest.fixture(scope="function")
def query_clean_crn(db_engine):
def query_clean_crn(db_engine: Engine) -> DataFrame:
# Select
select_crn = selector(
table=f"{os.getenv('SCHEMA')}.crn",
fields=["crn", "company_name"],
engine=db_engine[1],
engine=db_engine,
)

crn = query(
selector=select_crn, model=None, return_type="pandas", engine=db_engine[1]
)
crn = query(selector=select_crn, model=None, return_type="pandas", engine=db_engine)

# Clean
col_prefix = f"{os.getenv('SCHEMA')}_crn_"
@@ -136,16 +133,16 @@ def query_clean_crn(db_engine):


@pytest.fixture(scope="function")
def query_clean_duns(db_engine):
def query_clean_duns(db_engine: Engine) -> DataFrame:
# Select
select_duns = selector(
table=f"{os.getenv('SCHEMA')}.duns",
fields=["duns", "company_name"],
engine=db_engine[1],
engine=db_engine,
)

duns = query(
selector=select_duns, model=None, return_type="pandas", engine=db_engine[1]
selector=select_duns, model=None, return_type="pandas", engine=db_engine
)

# Clean
@@ -161,36 +158,36 @@ def query_clean_duns(db_engine):


@pytest.fixture(scope="function")
def query_clean_cdms(db_engine):
def query_clean_cdms(db_engine: Engine) -> DataFrame:
# Select
select_cdms = selector(
table=f"{os.getenv('SCHEMA')}.cdms",
fields=["crn", "cdms"],
engine=db_engine[1],
engine=db_engine,
)

cdms = query(
selector=select_cdms, model=None, return_type="pandas", engine=db_engine[1]
selector=select_cdms, model=None, return_type="pandas", engine=db_engine
)

# No cleaning needed, see original data
return cdms


@pytest.fixture(scope="function")
def query_clean_crn_deduped(db_engine):
def query_clean_crn_deduped(db_engine: Engine) -> DataFrame:
# Select
select_crn = selector(
table=f"{os.getenv('SCHEMA')}.crn",
fields=["crn", "company_name"],
engine=db_engine[1],
engine=db_engine,
)

crn = query(
selector=select_crn,
model=f"naive_{os.getenv('SCHEMA')}.crn",
return_type="pandas",
engine=db_engine[1],
engine=db_engine,
)

# Clean
@@ -206,19 +203,19 @@ def query_clean_crn_deduped(db_engine):


@pytest.fixture(scope="function")
def query_clean_duns_deduped(db_engine):
def query_clean_duns_deduped(db_engine: Engine) -> DataFrame:
# Select
select_duns = selector(
table=f"{os.getenv('SCHEMA')}.duns",
fields=["duns", "company_name"],
engine=db_engine[1],
engine=db_engine,
)

duns = query(
selector=select_duns,
model=f"naive_{os.getenv('SCHEMA')}.duns",
return_type="pandas",
engine=db_engine[1],
engine=db_engine,
)

# Clean
@@ -234,19 +231,19 @@ def query_clean_duns_deduped(db_engine):


@pytest.fixture(scope="function")
def query_clean_cdms_deduped(db_engine):
def query_clean_cdms_deduped(db_engine: Engine) -> DataFrame:
# Select
select_cdms = selector(
table=f"{os.getenv('SCHEMA')}.cdms",
fields=["crn", "cdms"],
engine=db_engine[1],
engine=db_engine,
)

cdms = query(
selector=select_cdms,
model=f"naive_{os.getenv('SCHEMA')}.cdms",
return_type="pandas",
engine=db_engine[1],
engine=db_engine,
)

# No cleaning needed, see original data
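The repeated `engine=db_engine[1]` → `engine=db_engine` change throughout this file reflects the `db_engine` fixture now providing a SQLAlchemy `Engine` directly rather than a tuple that call sites had to index into, which is what makes the new `def query_clean_crn(db_engine: Engine)` type hints possible. A hypothetical before/after sketch of that refactor (the stand-in class and URL are illustrative, not the repo's code):

```python
class FakeEngine:
    """Stand-in for sqlalchemy.engine.Engine (illustrative only)."""

    def __init__(self, url: str) -> None:
        self.url = url


# Before: the fixture returned a (config, engine) tuple, so every call
# site had to write db_engine[1] to reach the engine.
def db_engine_old() -> tuple[dict, FakeEngine]:
    return {"echo": False}, FakeEngine("postgresql://testuser@localhost/testdb")


# After: the fixture returns the engine itself, so call sites read
# naturally and can be annotated with the Engine type.
def db_engine_new() -> FakeEngine:
    return FakeEngine("postgresql://testuser@localhost/testdb")


# Both shapes expose the same engine; only the access pattern differs.
assert db_engine_old()[1].url == db_engine_new().url
```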
