Merge pull request #474 from moj-analytical-services/dev_12
dev12
RobinL authored May 18, 2022
2 parents d66fd9e + 7bd15c9 commit d9515d0
Showing 9 changed files with 151 additions and 162 deletions.
11 changes: 2 additions & 9 deletions .github/workflows/pytest_benchmark_comment.yml
@@ -53,19 +53,12 @@ jobs:
       #----------------------------------------------
       - name: Install dependencies
         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
-        run: |
-          sed -i 's/sqlglot = "1.23.1"/sqlglot = "1.23.1" \npyspark = "^3.2.1"/' ./pyproject.toml
-          poetry install --no-interaction --no-root
+        run: poetry install --no-interaction --no-root
       #----------------------------------------------
       # install your root project, if required
       #----------------------------------------------
       - name: Install library
-        run: |
-          if ! grep "pyspark" ./pyproject.toml
-          then
-            sed -i 's/sqlglot = "1.23.1"/sqlglot = "1.23.1" \npyspark = "^3.2.1"/' ./pyproject.toml
-          fi
-          poetry install --no-interaction
+        run: poetry install --no-interaction
       #----------------------------------------------
       # run benchmarks and comment
       #----------------------------------------------
11 changes: 2 additions & 9 deletions .github/workflows/pytest_benchmark_commit.yml
@@ -53,19 +53,12 @@ jobs:
       #----------------------------------------------
       - name: Install dependencies
         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
-        run: |
-          sed -i 's/sqlglot = "1.23.1"/sqlglot = "1.23.1" \npyspark = "^3.2.1"/' ./pyproject.toml
-          poetry install --no-interaction --no-root
+        run: poetry install --no-interaction --no-root
       #----------------------------------------------
       # install your root project, if required
       #----------------------------------------------
       - name: Install library
-        run: |
-          if ! grep "pyspark" ./pyproject.toml
-          then
-            sed -i 's/sqlglot = "1.23.1"/sqlglot = "1.23.1" \npyspark = "^3.2.1"/' ./pyproject.toml
-          fi
-          poetry install --no-interaction
+        run: poetry install --no-interaction
      #----------------------------------------------
       # run benchmarks and comment
       #----------------------------------------------
11 changes: 2 additions & 9 deletions .github/workflows/pytest_run_tests_with_cache.yml
@@ -50,19 +50,12 @@ jobs:
       #----------------------------------------------
       - name: Install dependencies
         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
-        run: |
-          sed -i 's/sqlglot = "1.23.1"/sqlglot = "1.23.1" \npyspark = "^3.2.1"/' ./pyproject.toml
-          poetry install --no-interaction --no-root
+        run: poetry install --no-interaction --no-root
       #----------------------------------------------
       # install your root project, if required
       #----------------------------------------------
       - name: Install library
-        run: |
-          if ! grep "pyspark" ./pyproject.toml
-          then
-            sed -i 's/sqlglot = "1.23.1"/sqlglot = "1.23.1" \npyspark = "^3.2.1"/' ./pyproject.toml
-          fi
-          poetry install --no-interaction
+        run: poetry install --no-interaction
       #----------------------------------------------
       # run test suite
       #----------------------------------------------
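
The run steps deleted in the three workflows above worked around pyspark not being a declared dependency: sed patched pyproject.toml in place so poetry would install pyspark anyway. With pyspark now listed under dev-dependencies (see the pyproject.toml diff below), a plain poetry install suffices. A rough Python equivalent of that sed one-liner, for illustration only:

# Illustration only: roughly what the deleted sed step did. It inserted a
# pyspark requirement immediately after the sqlglot pin in pyproject.toml
# so CI could install pyspark despite it not being a declared dependency.
from pathlib import Path

pyproject = Path("./pyproject.toml")
text = pyproject.read_text()
text = text.replace(
    'sqlglot = "1.23.1"',
    'sqlglot = "1.23.1" \npyspark = "^3.2.1"',
)
pyproject.write_text(text)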
4 changes: 4 additions & 0 deletions benchmarking/test_performance.py
@@ -1,4 +1,6 @@
 # python3 -m pytest benchmarking/test_performance.py
+from rapidfuzz.distance.Levenshtein import distance
+
 from splink.duckdb.duckdb_linker import DuckDBLinker
 from splink.spark.spark_linker import SparkLinker
 from splink.sqlite.sqlite_linker import SQLiteLinker
@@ -303,6 +305,7 @@ def test_2_rounds_1k_sqlite(benchmark):
 
     def setup():
         con = sqlite3.connect(":memory:")
+        con.create_function("levenshtein", 2, distance)
         df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
         df.to_sql("input_df_tablename", con)
         return (con,), {"target_rows": 1e6}
@@ -321,6 +324,7 @@ def test_10_rounds_20k_sqlite(benchmark):
 
     def setup():
         con = sqlite3.connect(":memory:")
+        con.create_function("levenshtein", 2, distance)
         df = pd.read_csv("./benchmarking/fake_20000_from_splink_demos.csv")
         df.to_sql("input_df_tablename", con)
         return (con,), {"target_rows": 3e6}
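
The create_function calls added to the benchmark setup register rapidfuzz's Levenshtein distance as a SQL function on the in-memory connection. A minimal self-contained sketch of the pattern, independent of splink:

import sqlite3

from rapidfuzz.distance.Levenshtein import distance

con = sqlite3.connect(":memory:")
# Register the Python callable as a 2-argument SQL function named "levenshtein"
con.create_function("levenshtein", 2, distance)

# The UDF is now callable from any SQL executed on this connection
row = con.execute("SELECT levenshtein('chris', 'christopher')").fetchone()
print(row[0])  # 6: six insertions turn 'chris' into 'christopher'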
263 changes: 133 additions & 130 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "splink"
-version = "3.0.0.dev11"
+version = "3.0.0.dev12"
 description = "Implementation of Fellegi-Sunter's canonical model of record linkage in Apache Spark, including EM algorithm to estimate parameters"
 authors = ["Robin Linacre <[email protected]>", "Sam Lindsay", "Theodore Manassis"]
 license = "MIT"
@@ -15,7 +15,6 @@ pandas = "^1.0.0"
 duckdb = "0.3.2"
 sqlglot = "1.23.1"
 altair = "^4.2.0"
-rapidfuzz = "^2.0.3"
 Jinja2 = "^3.0.3"
 
 [tool.poetry.dev-dependencies]
@@ -27,7 +26,8 @@ black = "^22.1.0"
 flake8 = "^4.0.1"
 pyarrow = "^7.0.0"
 networkx = "2.5.1"
-
+pyspark = "^3.2.1"
+rapidfuzz = "^2.0.3"
 
 [build-system]
 requires = ["poetry>=0.12"]
2 changes: 0 additions & 2 deletions splink/sqlite/sqlite_linker.py
@@ -2,7 +2,6 @@
 from typing import Union
 import logging
 from math import pow, log2
-from rapidfuzz.distance.Levenshtein import distance
 
 
 from ..logging_messages import execute_sql_logging_message_info, log_sql
@@ -95,7 +94,6 @@ def __init__(
         self.con.row_factory = dict_factory
         self.con.create_function("log2", 1, log2)
         self.con.create_function("pow", 2, pow)
-        self.con.create_function("levenshtein", 2, distance)
 
         super().__init__(
             input_table_or_tables,
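
Because SQLiteLinker no longer imports rapidfuzz or registers levenshtein itself, a caller whose comparison levels use levenshtein(...) must register the UDF on the connection before the linker runs any SQL, which is exactly what the updated tests below do. A hedged sketch of the caller-side setup only; the subsequent SQLiteLinker construction is omitted, so see the tests in this commit for working invocations:

import sqlite3

import pandas as pd
from rapidfuzz.distance.Levenshtein import distance

con = sqlite3.connect(":memory:")
# Register the UDF before splink executes any SQL that calls levenshtein(...)
con.create_function("levenshtein", 2, distance)

df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
df.to_sql("input_df_tablename", con)
# `con` is then handed to SQLiteLinker, as in the tests changed below.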
2 changes: 2 additions & 0 deletions tests/test_compare_splink2.py
@@ -51,8 +51,10 @@ def test_splink_2_predict_spark(df_spark):
 def test_splink_2_predict_sqlite():
 
     import sqlite3
+    from rapidfuzz.distance.Levenshtein import distance
 
     con = sqlite3.connect(":memory:")
+    con.create_function("levenshtein", 2, distance)
     df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
     df.to_sql("fake_data_1", con, if_exists="replace")
     settings_dict = get_settings_dict()
3 changes: 3 additions & 0 deletions tests/test_full_example_sqlite.py
@@ -9,7 +9,10 @@
 
 def test_full_example_sqlite(tmp_path):
 
+    from rapidfuzz.distance.Levenshtein import distance
+
     con = sqlite3.connect(":memory:")
+    con.create_function("levenshtein", 2, distance)
     df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
 
     df.to_sql("input_df_tablename", con)
