Merge pull request #2280 from moj-analytical-services/new_docs_workflow
improve docs workflow and pin req
RobinL authored Jul 24, 2024
2 parents 6a8264e + 92885a6 commit 0b659d7
Showing 9 changed files with 52 additions and 123 deletions.
115 changes: 15 additions & 100 deletions .github/workflows/documentation.yml
@@ -17,108 +17,37 @@ on:
workflow_dispatch:

jobs:
generate-files:
runs-on: ubuntu-20.04
# https://docs.github.com/en/actions/creating-actions/creating-a-composite-action
steps:
#----------------------------------------------
# check-out repo and set-up python
#----------------------------------------------
- name: Check out repository
uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.ref }}
repository: ${{ github.event.pull_request.head.repo.full_name }}
- name: Set up python
id: setup-python
uses: actions/setup-python@v4
with:
python-version: 3.9.10

#----------------------------------------------
# -- save a few seconds by caching poetry --
#----------------------------------------------
- name: Load cached Poetry installation
uses: actions/cache@v2
with:
path: ~/.local # the path depends on the OS
key: poetry-0 # increment to reset cache
#----------------------------------------------
# ----- install & configure poetry -----
#----------------------------------------------
- name: Install Poetry
uses: snok/install-poetry@v1
with:
version: '1.7.0'
virtualenvs-create: true
virtualenvs-in-project: true
installer-parallel: true

#----------------------------------------------
# load cached venv if cache exists
#----------------------------------------------
- name: Load cached venv
id: cached-poetry-dependencies
uses: actions/cache@v2
with:
path: .venv
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-00
#----------------------------------------------
# install dependencies if cache does not exist
#----------------------------------------------
- name: Install dependencies
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction --no-root
#----------------------------------------------
# install your root project, if required
#----------------------------------------------
- name: Install library
run: poetry install --no-interaction

- name: Generate comparison/level library dialect + dataset tables
run: |
source .venv/bin/activate
mv scripts/generate_dialect_comparison_docs.py generate_dialect_comparison_docs.py
mv scripts/generate_dataset_docs.py generate_dataset_docs.py
python generate_dialect_comparison_docs.py
python generate_dataset_docs.py
- name: Upload generated docs files
uses: actions/upload-artifact@v3
with:
name: generated_files
path: docs/includes/generated_files

build:
runs-on: ubuntu-latest
needs: generate-files
steps:
- uses: actions/setup-python@v4
with:
python-version: 3.9

- uses: actions/checkout@v3
with:
token: ${{ secrets.GITHUB_TOKEN }}

- uses: actions/download-artifact@v3
- name: Cache pip dependencies
uses: actions/cache@v3
with:
name: generated_files
path: docs/includes/generated_files
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('scripts/docs-requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
- run: tree docs/
- name: install docs dependencies

- name: Install docs dependencies
run: |
pip install --upgrade pip
pip install -r scripts/docs-requirements.txt
- name: build documentation
- name: Build documentation
run: mkdocs build

- name: check links
- name: Check links
uses: lycheeverse/[email protected]
with:
fail: true
fail: false
args: --offline site/ --verbose './**/*.html'

- name: Upload built site
@@ -127,33 +56,19 @@ jobs:
name: built_site
path: site

# some environment info:
# some environment info
- run: pip freeze
- run: mkdocs --version

deploy:
runs-on: ubuntu-latest
needs: build
# we only deploy on push to master
# this job doesn't run if this is triggered by a PR
if: github.event_name == 'push'
if: github.event_name == 'push' && github.ref == 'refs/heads/master'
steps:
- uses: actions/setup-python@v4
with:
python-version: 3.9

- uses: actions/checkout@v3
with:
token: ${{ secrets.GITHUB_TOKEN }}

- uses: actions/download-artifact@v3
with:
name: built_site
path: site

- name: commit and force-push to gh-pages branch
- name: print hello
run: |
pip install ghp-import==2.1.0
git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com"
git config --global user.name "github-actions[bot]"
ghp-import -opfm "Update docs from commit ${{ github.sha }}" site
echo "Hello, world!"
3 changes: 0 additions & 3 deletions README.md
@@ -6,9 +6,6 @@
[![Downloads](https://static.pepy.tech/badge/splink/month)](https://pepy.tech/project/splink)
[![Documentation](https://img.shields.io/badge/API-documentation-blue)](https://moj-analytical-services.github.io/splink/)

> [!IMPORTANT]
> Development has begun on Splink 4 on the `splink4_dev` branch. Splink 3 is in maintenance mode and we are no longer accepting new features. We welcome contributions to Splink 4. Read more on our latest [blog](https://moj-analytical-services.github.io/splink/blog/2024/03/19/splink4.html).
# Fast, accurate and scalable probabilistic data linkage

Splink is a Python package for probabilistic record linkage (entity resolution) that allows you to deduplicate and link records from datasets that lack unique identifiers.
2 changes: 1 addition & 1 deletion docs/blog/posts/2024-07-10-splink4_release.md
@@ -51,7 +51,7 @@ Conceptually, there are no major changes in Splink 4. Splink 4 code follows the

That said, there have been significant changes to the syntax and a reorganisation of functions.

For users wishing to familiarise themselves with Splink 4, we recommend the easiest way is to compare and contrast the new [examples](../../demos/examples/examples_index.md) with their [Splink 3 equivalents](TODO).
For users wishing to familiarise themselves with Splink 4, we recommend the easiest way is to compare and contrast the new [examples](../../demos/examples/examples_index.md) with their [Splink 3 equivalents](https://moj-analytical-services.github.io/splink3_legacy_docs/demos/examples/examples_index.html).

You may also find the following screenshot useful, which shows the diff of a fairly standard Splink 3 workflow that has been rewritten in Splink 4.

3 changes: 3 additions & 0 deletions docs/includes/generated_files/dataset_labels_table.md
@@ -0,0 +1,3 @@
|dataset name|description|rows|unique entities|link to source|
|-|-|-|-|-|
|`fake_1000_labels`|Clerical labels for fake_1000 |3,176|NA|[source](https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/fake_1000_labels.csv)|
9 changes: 9 additions & 0 deletions docs/includes/generated_files/datasets_table.md
@@ -0,0 +1,9 @@
|dataset name|description|rows|unique entities|link to source|
|-|-|-|-|-|
|`fake_1000`|Fake 1000 from splink demos. Records are 250 simulated people, with different numbers of duplicates, labelled.|1,000|250|[source](https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/fake_1000.csv)|
|`historical_50k`|The data is based on historical persons scraped from wikidata. Duplicate records are introduced with a variety of errors.|50,000|5,156|[source](https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/historical_figures_with_errors_50k.parquet)|
|`febrl3`|The Freely Extensible Biomedical Record Linkage (FEBRL) datasets consist of comparison patterns from an epidemiological cancer study in Germany. The FEBRL3 data set contains 5000 records (2000 originals and 3000 duplicates), with a maximum of 5 duplicates based on one original record.|5,000|2,000|[source](https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/febrl/dataset3.csv)|
|`febrl4a`|The Freely Extensible Biomedical Record Linkage (FEBRL) datasets consist of comparison patterns from an epidemiological cancer study in Germany. FEBRL4a contains 5000 original records.|5,000|5,000|[source](https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/febrl/dataset4a.csv)|
|`febrl4b`|The Freely Extensible Biomedical Record Linkage (FEBRL) datasets consist of comparison patterns from an epidemiological cancer study in Germany. FEBRL4b contains 5000 duplicate records, one for each record in FEBRL4a.|5,000|5,000|[source](https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/febrl/dataset4b.csv)|
|`transactions_origin`|This data has been generated to resemble bank transactions leaving an account. There are no duplicates within the dataset and each transaction is designed to have a counterpart arriving in 'transactions_destination'. Memo is sometimes truncated or missing.|45,326|45,326|[source](https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/transactions_origin.parquet)|
|`transactions_destination`|This data has been generated to resemble bank transactions arriving in an account. There are no duplicates within the dataset and each transaction is designed to have a counterpart sent from 'transactions_origin'. There may be a delay between the source and destination account, and the amount may vary due to hidden fees and foreign exchange rates. Memo is sometimes truncated or missing.|45,326|45,326|[source](https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/transactions_destination.parquet)|
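
As a quick illustration, the datasets listed above can be read directly from their source URLs; the following is a minimal sketch, assuming pandas is installed and the URLs remain reachable:

```python
import pandas as pd

# Read one of the datasets listed above straight from its source URL
# (here `fake_1000`, with `fake_1000_labels` as its clerical labels).
fake_1000 = pd.read_csv(
    "https://raw.githubusercontent.com/moj-analytical-services/"
    "splink_datasets/master/data/fake_1000.csv"
)
labels = pd.read_csv(
    "https://raw.githubusercontent.com/moj-analytical-services/"
    "splink_datasets/master/data/fake_1000_labels.csv"
)

print(len(fake_1000))  # expected: 1,000 rows
print(len(labels))     # expected: 3,176 rows
```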
5 changes: 5 additions & 0 deletions docs/overrides/main.html
@@ -1,2 +1,7 @@
{% extends "base.html" %}

{% block announce %}

<center>Still using Splink 3 and looking for the old docs? You can find them <a href="https://moj-analytical-services.github.io/splink3_legacy_docs/index.html">here</a></center>

{% endblock %}
2 changes: 1 addition & 1 deletion docs/topic_guides/blocking/blocking_rules.md
@@ -20,7 +20,7 @@ The number of pairs of records to compare grows using the formula $\frac{n\left(n-1\right)}{2}$

For example, a dataset of 1 million input records would generate around 500 billion pairwise record comparisons.

So, when datasets get bigger the amount of computational resource gets extremely large (and costly). In reality, we try and reduce the amount of computation required using **blocking**.
So, when datasets get bigger, the computation can become infeasibly large. We use **blocking** to reduce the scale of the computation to something more tractable.
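
To make the scale concrete, here is a small self-contained sketch of the comparison counts with and without a blocking key; the per-surname record counts are invented for illustration:

```python
from collections import Counter

def n_pairwise_comparisons(n: int) -> int:
    """Number of unique record pairs: n(n-1)/2."""
    return n * (n - 1) // 2

# Without blocking, comparisons grow quadratically with the number of records.
print(f"{n_pairwise_comparisons(1_000_000):,}")  # 499,999,500,000 (~500 billion)

# With blocking, we only compare records that share a blocking key value,
# e.g. the same surname. Illustrative counts of records per surname:
records_per_surname = Counter({"Smith": 10_000, "Jones": 8_000, "Taylor": 6_000})
blocked = sum(n_pairwise_comparisons(c) for c in records_per_surname.values())
print(f"{blocked:,}")  # far fewer comparisons than the unblocked total
```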

## Blocking

32 changes: 16 additions & 16 deletions scripts/docs-requirements.txt
@@ -1,18 +1,18 @@
mkdocs
mknotebooks
mkdocs-schema-reader
mkdocs==1.6.0
mknotebooks==0.8.0
mkdocs-schema-reader==0.11.1
mkdocs-material==9.5.2
mkdocs-gen-files
mkdocs-autorefs
mkdocs-material-extensions
mkdocs-mermaid2-plugin
mkdocs-monorepo-plugin
mkdocstrings
mkdocstrings-python
mkdocstrings-python-legacy
mkdocs-click
mkdocs-gen-files==0.5.0
mkdocs-autorefs==1.0.1
mkdocs-material-extensions==1.3.1
mkdocs-mermaid2-plugin==1.1.1
mkdocs-monorepo-plugin==1.1.0
mkdocstrings==0.25.1
mkdocstrings-python==1.10.5
mkdocstrings-python-legacy==0.2.3
mkdocs-click==0.8.1
jinja2==3.0.3
mkdocs-charts-plugin
neoteroi-mkdocs
mkdocs-video
mkdocs-rss-plugin
mkdocs-charts-plugin==0.0.10
neoteroi-mkdocs==1.0.5
mkdocs-video==1.5.0
mkdocs-rss-plugin==1.15.0
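
Since the point of pinning is a reproducible docs build, a small hypothetical helper (not part of this change) could check an installed environment against these pins, for example:

```python
from importlib.metadata import version, PackageNotFoundError
from pathlib import Path

# Compare installed package versions against the pins in docs-requirements.txt.
for line in Path("scripts/docs-requirements.txt").read_text().splitlines():
    line = line.strip()
    if not line or line.startswith("#") or "==" not in line:
        continue
    name, pinned = line.split("==", 1)
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = None
    status = "OK" if installed == pinned else "MISMATCH"
    print(f"{name}: pinned {pinned}, installed {installed} -> {status}")
```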
4 changes: 2 additions & 2 deletions splink/internals/linker_components/evaluation.py
@@ -310,8 +310,8 @@ def prediction_errors_from_labels_column(
label_colname (str): Name of labels column in input data
include_false_positives (bool, optional): Defaults to True.
include_false_negatives (bool, optional): Defaults to True.
threshold (float, optional): Threshold above which a score is considered
to be a match. Defaults to 0.5.
threshold_match_probability (float, optional): Threshold above which a score
is considered to be a match. Defaults to 0.5.
Examples:
```py
