diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 0f166c665a..8c97e0dd87 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -17,108 +17,37 @@ on: workflow_dispatch: jobs: - generate-files: - runs-on: ubuntu-20.04 - # https://docs.github.com/en/actions/creating-actions/creating-a-composite-action - steps: - #---------------------------------------------- - # check-out repo and set-up python - #---------------------------------------------- - - name: Check out repository - uses: actions/checkout@v3 - with: - ref: ${{ github.event.pull_request.head.ref }} - repository: ${{ github.event.pull_request.head.repo.full_name }} - - name: Set up python - id: setup-python - uses: actions/setup-python@v4 - with: - python-version: 3.9.10 - - #---------------------------------------------- - # -- save a few section by caching poetry -- - #---------------------------------------------- - - name: Load cached Poetry installation - uses: actions/cache@v2 - with: - path: ~/.local # the path depends on the OS - key: poetry-0 # increment to reset cache - #---------------------------------------------- - # ----- install & configure poetry ----- - #---------------------------------------------- - - name: Install Poetry - uses: snok/install-poetry@v1 - with: - version: '1.7.0' - virtualenvs-create: true - virtualenvs-in-project: true - installer-parallel: true - - #---------------------------------------------- - # load cached venv if cache exists - #---------------------------------------------- - - name: Load cached venv - id: cached-poetry-dependencies - uses: actions/cache@v2 - with: - path: .venv - key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-00 - #---------------------------------------------- - # install dependencies if cache does not exist - #---------------------------------------------- - - name: Install dependencies - if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction --no-root - #---------------------------------------------- - # install your root project, if required - #---------------------------------------------- - - name: Install library - run: poetry install --no-interaction - - - name: Generate comparison/level library dialect + dataset tables - run: | - source .venv/bin/activate - mv scripts/generate_dialect_comparison_docs.py generate_dialect_comparison_docs.py - mv scripts/generate_dataset_docs.py generate_dataset_docs.py - python generate_dialect_comparison_docs.py - python generate_dataset_docs.py - - - name: Upload generated docs files - uses: actions/upload-artifact@v3 - with: - name: generated_files - path: docs/includes/generated_files - build: runs-on: ubuntu-latest - needs: generate-files steps: - uses: actions/setup-python@v4 with: python-version: 3.9 - - uses: actions/checkout@v3 with: token: ${{ secrets.GITHUB_TOKEN }} - - - uses: actions/download-artifact@v3 + - name: Cache pip dependencies + uses: actions/cache@v3 with: - name: generated_files - path: docs/includes/generated_files + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('scripts/docs-requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- - run: tree docs/ - - name: install docs dependencies + + - name: Install docs dependencies run: | pip install --upgrade pip pip install -r scripts/docs-requirements.txt - - name: build documentation + - name: Build documentation run: mkdocs build - - name: check links + - name: Check links uses: lycheeverse/lychee-action@v1.8.0 with: - fail: true + fail: false args: --offline site/ --verbose './**/*.html' - name: Upload built site @@ -127,33 +56,19 @@ jobs: name: built_site path: site - # some environment info: + # some environment info - run: pip freeze - run: mkdocs --version deploy: runs-on: ubuntu-latest needs: build - # we only deplot on push to master - # this job doesn't run if this is triggered by a PR - if: github.event_name == 'push' + if: github.event_name == 'push' && github.ref == 'refs/heads/master' steps: - uses: actions/setup-python@v4 with: python-version: 3.9 - - uses: actions/checkout@v3 - with: - token: ${{ secrets.GITHUB_TOKEN }} - - - uses: actions/download-artifact@v3 - with: - name: built_site - path: site - - - name: commit and force-push to gh-pages branch + - name: print hello run: | - pip install ghp-import==2.1.0 - git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" - git config --global user.name "github-actions[bot]" - ghp-import -opfm "Update docs from commit ${{ github.sha }}" site + echo "Hello, world!" diff --git a/README.md b/README.md index ab1537dfdb..433a9f26ee 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,6 @@ [![Downloads](https://static.pepy.tech/badge/splink/month)](https://pepy.tech/project/splink) [![Documentation](https://img.shields.io/badge/API-documentation-blue)](https://moj-analytical-services.github.io/splink/) -> [!IMPORTANT] -> Development has begun on Splink 4 on the `splink4_dev` branch. Splink 3 is in maintenance mode and we are no longer accepting new features. We welcome contributions to Splink 4. Read more on our latest [blog](https://moj-analytical-services.github.io/splink/blog/2024/03/19/splink4.html). - # Fast, accurate and scalable probabilistic data linkage Splink is a Python package for probabilistic record linkage (entity resolution) that allows you to deduplicate and link records from datasets that lack unique identifiers. diff --git a/docs/blog/posts/2024-07-10-splink4_release.md b/docs/blog/posts/2024-07-10-splink4_release.md index 68dbccfa66..eda53dbc7a 100644 --- a/docs/blog/posts/2024-07-10-splink4_release.md +++ b/docs/blog/posts/2024-07-10-splink4_release.md @@ -51,7 +51,7 @@ Conceptually, there are no major changes in Splink 4. Splink 4 code follows the That said, there have been significant changes to the syntax and a reorganisation of functions. -For users wishing to familiarise themselves with Splink 4, we recommend the easiest way is to compare and contrast the new [examples](../../demos/examples/examples_index.md) with their [Splink 3 equivalents](TODO). +For users wishing to familiarise themselves with Splink 4, we recommend the easiest way is to compare and contrast the new [examples](../../demos/examples/examples_index.md) with their [Splink 3 equivalents](https://moj-analytical-services.github.io/splink3_legacy_docs/demos/examples/examples_index.html). You may also find the following screenshot useful, which shows the diff of a fairly standard Splink 3 workflow that has been rewritten in Splink 4. diff --git a/docs/includes/generated_files/dataset_labels_table.md b/docs/includes/generated_files/dataset_labels_table.md new file mode 100644 index 0000000000..daca6bd241 --- /dev/null +++ b/docs/includes/generated_files/dataset_labels_table.md @@ -0,0 +1,3 @@ +|dataset name|description|rows|unique entities|link to source| +|-|-|-|-|-| +|`fake_1000_labels`|Clerical labels for fake_1000 |3,176|NA|[source](https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/fake_1000_labels.csv)| diff --git a/docs/includes/generated_files/datasets_table.md b/docs/includes/generated_files/datasets_table.md new file mode 100644 index 0000000000..90c5c4ab7f --- /dev/null +++ b/docs/includes/generated_files/datasets_table.md @@ -0,0 +1,9 @@ +|dataset name|description|rows|unique entities|link to source| +|-|-|-|-|-| +|`fake_1000`|Fake 1000 from splink demos. Records are 250 simulated people, with different numbers of duplicates, labelled.|1,000|250|[source](https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/fake_1000.csv)| +|`historical_50k`|The data is based on historical persons scraped from wikidata. Duplicate records are introduced with a variety of errors.|50,000|5,156|[source](https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/historical_figures_with_errors_50k.parquet)| +|`febrl3`|The Freely Extensible Biomedical Record Linkage (FEBRL) datasets consist of comparison patterns from an epidemiological cancer study in Germany.FEBRL3 data set contains 5000 records (2000 originals and 3000 duplicates), with a maximum of 5 duplicates based on one original record.|5,000|2,000|[source](https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/febrl/dataset3.csv)| +|`febrl4a`|The Freely Extensible Biomedical Record Linkage (FEBRL) datasets consist of comparison patterns from an epidemiological cancer study in Germany.FEBRL4a contains 5000 original records.|5,000|5,000|[source](https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/febrl/dataset4a.csv)| +|`febrl4b`|The Freely Extensible Biomedical Record Linkage (FEBRL) datasets consist of comparison patterns from an epidemiological cancer study in Germany.FEBRL4b contains 5000 duplicate records, one for each record in FEBRL4a.|5,000|5,000|[source](https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/febrl/dataset4b.csv)| +|`transactions_origin`|This data has been generated to resemble bank transactions leaving an account. There are no duplicates within the dataset and each transaction is designed to have a counterpart arriving in 'transactions_destination'. Memo is sometimes truncated or missing.|45,326|45,326|[source](https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/transactions_origin.parquet)| +|`transactions_destination`|This data has been generated to resemble bank transactions arriving in an account. There are no duplicates within the dataset and each transaction is designed to have a counterpart sent from 'transactions_origin'. There may be a delay between the source and destination account, and the amount may vary due to hidden fees and foreign exchange rates. Memo is sometimes truncated or missing.|45,326|45,326|[source](https://raw.githubusercontent.com/moj-analytical-services/splink_datasets/master/data/transactions_destination.parquet)| diff --git a/docs/overrides/main.html b/docs/overrides/main.html index 5cb6467e68..ace7546cad 100644 --- a/docs/overrides/main.html +++ b/docs/overrides/main.html @@ -1,2 +1,7 @@ {% extends "base.html" %} +{% block announce %} + +