Merge pull request #474 from moj-analytical-services/dev_12
dev12
RobinL authored May 18, 2022
2 parents d66fd9e + 7bd15c9 commit d9515d0
Showing 9 changed files with 151 additions and 162 deletions.
11 changes: 2 additions & 9 deletions .github/workflows/pytest_benchmark_comment.yml
@@ -53,19 +53,12 @@ jobs:
       #----------------------------------------------
       - name: Install dependencies
         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
-        run: |
-          sed -i 's/sqlglot = "1.23.1"/sqlglot = "1.23.1" \npyspark = "^3.2.1"/' ./pyproject.toml
-          poetry install --no-interaction --no-root
+        run: poetry install --no-interaction --no-root
       #----------------------------------------------
       # install your root project, if required
       #----------------------------------------------
       - name: Install library
-        run: |
-          if ! grep "pyspark" ./pyproject.toml
-          then
-            sed -i 's/sqlglot = "1.23.1"/sqlglot = "1.23.1" \npyspark = "^3.2.1"/' ./pyproject.toml
-          fi
-          poetry install --no-interaction
+        run: poetry install --no-interaction
       #----------------------------------------------
       # run benchmarks and comment
       #----------------------------------------------
11 changes: 2 additions & 9 deletions .github/workflows/pytest_benchmark_commit.yml
@@ -53,19 +53,12 @@ jobs:
       #----------------------------------------------
       - name: Install dependencies
         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
-        run: |
-          sed -i 's/sqlglot = "1.23.1"/sqlglot = "1.23.1" \npyspark = "^3.2.1"/' ./pyproject.toml
-          poetry install --no-interaction --no-root
+        run: poetry install --no-interaction --no-root
       #----------------------------------------------
       # install your root project, if required
       #----------------------------------------------
       - name: Install library
-        run: |
-          if ! grep "pyspark" ./pyproject.toml
-          then
-            sed -i 's/sqlglot = "1.23.1"/sqlglot = "1.23.1" \npyspark = "^3.2.1"/' ./pyproject.toml
-          fi
-          poetry install --no-interaction
+        run: poetry install --no-interaction
      #----------------------------------------------
       # run benchmarks and comment
       #----------------------------------------------
11 changes: 2 additions & 9 deletions .github/workflows/pytest_run_tests_with_cache.yml
@@ -50,19 +50,12 @@ jobs:
       #----------------------------------------------
       - name: Install dependencies
         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
-        run: |
-          sed -i 's/sqlglot = "1.23.1"/sqlglot = "1.23.1" \npyspark = "^3.2.1"/' ./pyproject.toml
-          poetry install --no-interaction --no-root
+        run: poetry install --no-interaction --no-root
       #----------------------------------------------
       # install your root project, if required
       #----------------------------------------------
       - name: Install library
-        run: |
-          if ! grep "pyspark" ./pyproject.toml
-          then
-            sed -i 's/sqlglot = "1.23.1"/sqlglot = "1.23.1" \npyspark = "^3.2.1"/' ./pyproject.toml
-          fi
-          poetry install --no-interaction
+        run: poetry install --no-interaction
       #----------------------------------------------
       # run test suite
       #----------------------------------------------
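
The run steps deleted in the three workflows above worked around pyspark not being a declared dependency: sed patched pyproject.toml in place so poetry would install pyspark anyway. With pyspark now listed under dev-dependencies (see the pyproject.toml diff below), a plain poetry install suffices. A rough Python equivalent of that sed one-liner, for illustration only:

# Illustration only: roughly what the deleted sed step did. It inserted a
# pyspark requirement immediately after the sqlglot pin in pyproject.toml
# so CI could install pyspark despite it not being a declared dependency.
from pathlib import Path

pyproject = Path("./pyproject.toml")
text = pyproject.read_text()
text = text.replace(
    'sqlglot = "1.23.1"',
    'sqlglot = "1.23.1" \npyspark = "^3.2.1"',
)
pyproject.write_text(text)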
4 changes: 4 additions & 0 deletions benchmarking/test_performance.py
@@ -1,4 +1,6 @@
 # python3 -m pytest benchmarking/test_performance.py
+from rapidfuzz.distance.Levenshtein import distance
+
 from splink.duckdb.duckdb_linker import DuckDBLinker
 from splink.spark.spark_linker import SparkLinker
 from splink.sqlite.sqlite_linker import SQLiteLinker
@@ -303,6 +305,7 @@ def test_2_rounds_1k_sqlite(benchmark):
 
     def setup():
         con = sqlite3.connect(":memory:")
+        con.create_function("levenshtein", 2, distance)
         df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
         df.to_sql("input_df_tablename", con)
         return (con,), {"target_rows": 1e6}
@@ -321,6 +324,7 @@ def test_10_rounds_20k_sqlite(benchmark):
 
     def setup():
         con = sqlite3.connect(":memory:")
+        con.create_function("levenshtein", 2, distance)
         df = pd.read_csv("./benchmarking/fake_20000_from_splink_demos.csv")
         df.to_sql("input_df_tablename", con)
         return (con,), {"target_rows": 3e6}
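
The create_function calls added to the benchmark setup register rapidfuzz's Levenshtein distance as a SQL function on the in-memory connection. A minimal self-contained sketch of the pattern, independent of splink:

import sqlite3

from rapidfuzz.distance.Levenshtein import distance

con = sqlite3.connect(":memory:")
# Register the Python callable as a 2-argument SQL function named "levenshtein"
con.create_function("levenshtein", 2, distance)

# The UDF is now callable from any SQL executed on this connection
row = con.execute("SELECT levenshtein('chris', 'christopher')").fetchone()
print(row[0])  # 6: six insertions turn 'chris' into 'christopher'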
263 changes: 133 additions & 130 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "splink"
-version = "3.0.0.dev11"
+version = "3.0.0.dev12"
 description = "Implementation of Fellegi-Sunter's canonical model of record linkage in Apache Spark, including EM algorithm to estimate parameters"
 authors = ["Robin Linacre <[email protected]>", "Sam Lindsay", "Theodore Manassis"]
 license = "MIT"
@@ -15,7 +15,6 @@ pandas = "^1.0.0"
 duckdb = "0.3.2"
 sqlglot = "1.23.1"
 altair = "^4.2.0"
-rapidfuzz = "^2.0.3"
 Jinja2 = "^3.0.3"
 
 [tool.poetry.dev-dependencies]
@@ -27,7 +26,8 @@ black = "^22.1.0"
 flake8 = "^4.0.1"
 pyarrow = "^7.0.0"
 networkx = "2.5.1"
-
+pyspark = "^3.2.1"
+rapidfuzz = "^2.0.3"
 
 [build-system]
 requires = ["poetry>=0.12"]
2 changes: 0 additions & 2 deletions splink/sqlite/sqlite_linker.py
@@ -2,7 +2,6 @@
 from typing import Union
 import logging
 from math import pow, log2
-from rapidfuzz.distance.Levenshtein import distance
 
 
 from ..logging_messages import execute_sql_logging_message_info, log_sql
@@ -95,7 +94,6 @@ def __init__(
         self.con.row_factory = dict_factory
         self.con.create_function("log2", 1, log2)
         self.con.create_function("pow", 2, pow)
-        self.con.create_function("levenshtein", 2, distance)
 
         super().__init__(
             input_table_or_tables,
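
Because SQLiteLinker no longer imports rapidfuzz or registers levenshtein itself, a caller whose comparison levels use levenshtein(...) must register the UDF on the connection before the linker runs any SQL, which is exactly what the updated tests below do. A hedged sketch of the caller-side setup only; the subsequent SQLiteLinker construction is omitted, so see the tests in this commit for working invocations:

import sqlite3

import pandas as pd
from rapidfuzz.distance.Levenshtein import distance

con = sqlite3.connect(":memory:")
# Register the UDF before splink executes any SQL that calls levenshtein(...)
con.create_function("levenshtein", 2, distance)

df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
df.to_sql("input_df_tablename", con)
# `con` is then handed to SQLiteLinker, as in the tests changed below.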
2 changes: 2 additions & 0 deletions tests/test_compare_splink2.py
@@ -51,8 +51,10 @@ def test_splink_2_predict_spark(df_spark):
 def test_splink_2_predict_sqlite():
 
     import sqlite3
+    from rapidfuzz.distance.Levenshtein import distance
 
     con = sqlite3.connect(":memory:")
+    con.create_function("levenshtein", 2, distance)
     df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
     df.to_sql("fake_data_1", con, if_exists="replace")
     settings_dict = get_settings_dict()
3 changes: 3 additions & 0 deletions tests/test_full_example_sqlite.py
@@ -9,7 +9,10 @@
 
 def test_full_example_sqlite(tmp_path):
 
+    from rapidfuzz.distance.Levenshtein import distance
+
     con = sqlite3.connect(":memory:")
+    con.create_function("levenshtein", 2, distance)
     df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
 
     df.to_sql("input_df_tablename", con)
