diff --git a/.credo.exs b/.credo.exs new file mode 100644 index 0000000..c43c153 --- /dev/null +++ b/.credo.exs @@ -0,0 +1,183 @@ +# This file contains the configuration for Credo and you are probably reading +# this after creating it with `mix credo.gen.config`. +# +# If you find anything wrong or unclear in this file, please report an +# issue on GitHub: https://github.com/rrrene/credo/issues +# +%{ + # + # You can have as many configs as you like in the `configs:` field. + configs: [ + %{ + # + # Run any config using `mix credo -C `. If no config name is given + # "default" is used. + # + name: "default", + # + # These are the files included in the analysis: + files: %{ + # + # You can give explicit globs or simply directories. + # In the latter case `**/*.{ex,exs}` will be used. + # + included: ["lib/", "src/", "test/", "web/", "apps/"], + excluded: [~r"/_build/", ~r"/deps/", ~r"/node_modules/"] + }, + # + # Load and configure plugins here: + # + plugins: [], + # + # If you create your own checks, you must specify the source files for + # them here, so they can be loaded by Credo before running the analysis. + # + requires: [], + # + # Credo automatically checks for updates, like e.g. Hex does. + # You can disable this behaviour below: + check_for_updates: true, + # + # If you want to enforce a style guide and need a more traditional linting + # experience, you can change `strict` to `true` below: + # + strict: true, + # + # To modify the timeout for parsing files, change this value: + # + parse_timeout: 5000, + # + # If you want to use uncolored output by default, you can change `color` + # to `false` below: + # + color: true, + # + # You can customize the parameters of any check by adding a second element + # to the tuple. 
+ # + # To disable a check put `false` as second element: + # + # {Credo.Check.Design.DuplicatedCode, false} + # + checks: [ + # + ## Consistency Checks + # + {Credo.Check.Consistency.ExceptionNames, []}, + {Credo.Check.Consistency.LineEndings, []}, + {Credo.Check.Consistency.ParameterPatternMatching, []}, + {Credo.Check.Consistency.SpaceAroundOperators, []}, + {Credo.Check.Consistency.SpaceInParentheses, []}, + {Credo.Check.Consistency.TabsOrSpaces, []}, + + # + ## Design Checks + # + # You can customize the priority of any check + # Priority values are: `low, normal, high, higher` + # + {Credo.Check.Design.AliasUsage, + [priority: :low, if_nested_deeper_than: 2, if_called_more_often_than: 0]}, + # You can also customize the exit_status of each check. + # If you don't want TODO comments to cause `mix credo` to fail, just + # set this value to 0 (zero). + # + {Credo.Check.Design.TagTODO, [exit_status: 2]}, + {Credo.Check.Design.TagFIXME, []}, + + # + ## Readability Checks + # + {Credo.Check.Readability.AliasOrder, []}, + {Credo.Check.Readability.FunctionNames, []}, + {Credo.Check.Readability.LargeNumbers, []}, + {Credo.Check.Readability.MaxLineLength, [priority: :low, max_length: 120]}, + {Credo.Check.Readability.ModuleAttributeNames, []}, + {Credo.Check.Readability.ModuleDoc, []}, + {Credo.Check.Readability.ModuleNames, []}, + {Credo.Check.Readability.ParenthesesInCondition, []}, + {Credo.Check.Readability.ParenthesesOnZeroArityDefs, []}, + {Credo.Check.Readability.PredicateFunctionNames, []}, + {Credo.Check.Readability.PreferImplicitTry, []}, + {Credo.Check.Readability.RedundantBlankLines, []}, + {Credo.Check.Readability.Semicolons, []}, + {Credo.Check.Readability.SpaceAfterCommas, []}, + {Credo.Check.Readability.StringSigils, []}, + {Credo.Check.Readability.TrailingBlankLine, []}, + {Credo.Check.Readability.TrailingWhiteSpace, []}, + {Credo.Check.Readability.UnnecessaryAliasExpansion, []}, + {Credo.Check.Readability.VariableNames, []}, + + # + ## Refactoring 
Opportunities + # + {Credo.Check.Refactor.CondStatements, []}, + {Credo.Check.Refactor.CyclomaticComplexity, []}, + {Credo.Check.Refactor.FunctionArity, []}, + {Credo.Check.Refactor.LongQuoteBlocks, []}, + {Credo.Check.Refactor.MatchInCondition, []}, + {Credo.Check.Refactor.NegatedConditionsInUnless, []}, + {Credo.Check.Refactor.NegatedConditionsWithElse, []}, + {Credo.Check.Refactor.Nesting, []}, + {Credo.Check.Refactor.UnlessWithElse, []}, + {Credo.Check.Refactor.WithClauses, []}, + + # + ## Warnings + # + {Credo.Check.Warning.ApplicationConfigInModuleAttribute, []}, + {Credo.Check.Warning.BoolOperationOnSameValues, []}, + {Credo.Check.Warning.ExpensiveEmptyEnumCheck, []}, + {Credo.Check.Warning.IExPry, []}, + {Credo.Check.Warning.IoInspect, []}, + {Credo.Check.Warning.LazyLogging, false}, + {Credo.Check.Warning.MixEnv, []}, + {Credo.Check.Warning.OperationOnSameValues, []}, + {Credo.Check.Warning.OperationWithConstantResult, []}, + {Credo.Check.Warning.RaiseInsideRescue, []}, + {Credo.Check.Warning.UnusedEnumOperation, []}, + {Credo.Check.Warning.UnusedFileOperation, []}, + {Credo.Check.Warning.UnusedKeywordOperation, []}, + {Credo.Check.Warning.UnusedListOperation, []}, + {Credo.Check.Warning.UnusedPathOperation, []}, + {Credo.Check.Warning.UnusedRegexOperation, []}, + {Credo.Check.Warning.UnusedStringOperation, []}, + {Credo.Check.Warning.UnusedTupleOperation, []}, + {Credo.Check.Warning.UnsafeExec, []}, + + # + # Checks scheduled for next check update (opt-in for now, just replace `false` with `[]`) + + # + # Controversial and experimental checks (opt-in, just replace `false` with `[]`) + # + {Credo.Check.Consistency.MultiAliasImportRequireUse, []}, + {Credo.Check.Consistency.UnusedVariableNames, []}, + {Credo.Check.Design.DuplicatedCode, []}, + {Credo.Check.Readability.AliasAs, []}, + {Credo.Check.Readability.BlockPipe, []}, + {Credo.Check.Readability.ImplTrue, []}, + {Credo.Check.Readability.MultiAlias, []}, + {Credo.Check.Readability.SeparateAliasRequire, 
[]}, + {Credo.Check.Readability.SinglePipe, []}, + {Credo.Check.Readability.Specs, []}, + {Credo.Check.Readability.StrictModuleLayout, []}, + {Credo.Check.Readability.WithCustomTaggedTuple, []}, + {Credo.Check.Refactor.ABCSize, false}, + {Credo.Check.Refactor.AppendSingleItem, []}, + {Credo.Check.Refactor.DoubleBooleanNegation, []}, + {Credo.Check.Refactor.ModuleDependencies, []}, + {Credo.Check.Refactor.NegatedIsNil, []}, + {Credo.Check.Refactor.PipeChainStart, []}, + {Credo.Check.Refactor.VariableRebinding, []}, + {Credo.Check.Warning.LeakyEnvironment, []}, + {Credo.Check.Warning.MapGetUnsafePass, []}, + {Credo.Check.Warning.UnsafeToAtom, []} + + # + # Custom checks can be created using `mix credo.gen.check`. + # + ] + } + ] +} diff --git a/.dialyzerignore b/.dialyzerignore new file mode 100644 index 0000000..e69de29 diff --git a/.formatter.exs b/.formatter.exs index d2cda26..1b7c942 100644 --- a/.formatter.exs +++ b/.formatter.exs @@ -1,4 +1,14 @@ -# Used by "mix format" [ - inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"] + inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"], + locals_without_parens: [ + # Formatter tests + assert_format: 2, + assert_format: 3, + assert_same: 1, + assert_same: 2, + + # Errors tests + assert_eval_raise: 3 + ], + line_length: 120 ] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..81bac3d --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,15 @@ +# Contributing + +When contributing to this repository, please first discuss the change you wish to make via issue, +email, or any other method with the owners of this repository before making a change. + +## Pull Request Process + +1. Ensure any install or build dependencies are removed before the end of the layer when doing a + build. +2. Update the README.md with details of any structural change, this includes new environment + variables, exposed ports, useful file locations and container parameters. +3. 
Increase the version numbers in any example files and the README.md to the new version that this + Pull Request would represent. The versioning scheme we use is [SemVer](http://semver.org/). +4. Before submitting a Pull Request, run `mix test` and `mix check` and ensure that they complete without any errors. +5. You should request a review by at least one repo maintainer. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..517d621 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,5 @@ +FROM elixir:1.11.3 + +WORKDIR /code + +ENTRYPOINT ["/code/entrypoint"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e91d5e5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2015-2021 Prima.it + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE.
diff --git a/README.md b/README.md index fcbbfa1..1021278 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,23 @@ # ExFuzzywuzzy +[![Build Status](https://drone-1.prima.it/api/badges/primait/ex_fuzzywuzzy/status.svg)](https://drone-1.prima.it/primait/ex_fuzzywuzzy) +[![Module Version](https://img.shields.io/hexpm/v/ex_fuzzywuzzy.svg)](https://hex.pm/packages/ex_fuzzywuzzy) +[![Hex Docs](https://img.shields.io/badge/hex-docs-lightgreen.svg)](https://hexdocs.pm/ex_fuzzywuzzy/) +[![Total Download](https://img.shields.io/hexpm/dt/ex_fuzzywuzzy.svg)](https://hex.pm/packages/ex_fuzzywuzzy) +[![License](https://img.shields.io/hexpm/l/ex_fuzzywuzzy.svg)](https://hex.pm/packages/ex_fuzzywuzzy) +[![Last Updated](https://img.shields.io/github/last-commit/primait/ex_fuzzywuzzy.svg)](https://github.com/primait/ex_fuzzywuzzy/commits/master) -**TODO: Add description** +- [ ] _**TODO: Update badges**_ +- [ ] _**TODO: Publish**_ -## Installation +ExFuzzyWuzzy is a fuzzy string matching library that provides many ways of calculating +a matching ratio between two strings, starting from a similarity function which can be +based on Levenshtein or Jaro-Winkler or a custom one. + +The library is an Elixir port of SeatGeek's [fuzzywuzzy](https://github.com/seatgeek/fuzzywuzzy). -If [available in Hex](https://hex.pm/docs/publish), the package can be installed -by adding `ex_fuzzywuzzy` to your list of dependencies in `mix.exs`: +## Installation +To install ExFuzzyWuzzy, just add an entry to your `mix.exs`: ```elixir def deps do [ @@ -15,7 +26,58 @@ def deps do end ``` -Documentation can be generated with [ExDoc](https://github.com/elixir-lang/ex_doc) -and published on [HexDocs](https://hexdocs.pm). Once published, the docs can -be found at [https://hexdocs.pm/ex_fuzzywuzzy](https://hexdocs.pm/ex_fuzzywuzzy). 
+## Usage + + +Choose the ratio function that best fits your needs among those available, +providing the two strings to be matched and - if needed - overriding the +configured options. + +Available methods are: +- Simple ratio +- Quick ratio +- Partial ratio +- Token sort ratio +- Partial token sort ratio +- Token set ratio +- Partial token set ratio +- Best score ratio + +Available options are: +- Similarity function (Levenshtein and Jaro-Winkler provided in library) +- Case sensitivity of the match +- Decimal precision of output score + +Here are some examples. + +### Simple ratio +```elixir +iex> ExFuzzywuzzy.ratio("this is a test", "this is a test!") +96.55 +``` + +### Quick ratio +```elixir +iex> ExFuzzywuzzy.quick_ratio("this is a test", "this is a test!") +100.0 +``` + +### Partial ratio +```elixir +iex> ExFuzzywuzzy.partial_ratio("this is a test", "this is a test!") +100.0 +``` + +### Best Score ratio +```elixir +iex> ExFuzzywuzzy.best_score_ratio("this is a test", "this is a test!") +{:quick, 100.0} +``` + + +## Contributing +Thank you for considering helping with this project. Please see the +`CONTRIBUTING.md` file for details on how to contribute. +## License +MIT License.
Copyright (c) 2015-2021 Prima.it diff --git a/config/config.exs b/config/config.exs new file mode 100644 index 0000000..09dad82 --- /dev/null +++ b/config/config.exs @@ -0,0 +1,10 @@ +import Config + +config :logger, :console, colors: [enabled: false] + +config :ex_fuzzywuzzy, + similarity_fn: &ExFuzzywuzzy.Similarity.Levenshtein.calculate/2, + case_sensitive: false, + precision: 0 + +import_config "#{config_env()}.exs" diff --git a/config/dev.exs b/config/dev.exs new file mode 100644 index 0000000..becde76 --- /dev/null +++ b/config/dev.exs @@ -0,0 +1 @@ +import Config diff --git a/config/test.exs b/config/test.exs new file mode 100644 index 0000000..08d912c --- /dev/null +++ b/config/test.exs @@ -0,0 +1,3 @@ +import Config + +config :ex_fuzzywuzzy, :precision, 2 diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..5a56f55 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,8 @@ +version: "3" + +services: + web: + build: . + volumes: + - ".:/code" + working_dir: "/code" diff --git a/entrypoint b/entrypoint new file mode 100755 index 0000000..aeda1c9 --- /dev/null +++ b/entrypoint @@ -0,0 +1,12 @@ +#!/usr/bin/env sh + +mix local.hex --force +mix local.rebar +mix hex.info +mix deps.get + +if [ -n "$1" ]; then + exec $@ +else + mix run --no-halt +fi diff --git a/lib/ex_fuzzywuzzy.ex b/lib/ex_fuzzywuzzy.ex index aa855c4..5058f52 100644 --- a/lib/ex_fuzzywuzzy.ex +++ b/lib/ex_fuzzywuzzy.ex @@ -1,18 +1,325 @@ defmodule ExFuzzywuzzy do + @external_resource readme = "README.md" @moduledoc """ - Documentation for `ExFuzzywuzzy`. 
+ ex_fuzzywuzzy is a fuzzy string matching library that uses a customizable measure + to calculate a distance ratio + + #{ + readme + |> File.read!() + |> String.split("") + |> Enum.fetch!(1) + } + """ + + alias ExFuzzywuzzy.Algorithms.PartialMatch + + @typedoc """ + Ratio calculator-like signature + """ + @type ratio_calculator :: (String.t(), String.t() -> float()) + + @typedoc """ + Configurable runtime option types + """ + @type fuzzywuzzy_option :: + {:similarity_fn, ratio_calculator()} + | {:case_sensitive, boolean()} + | {:precision, non_neg_integer()} + + @typedoc """ + Configurable runtime options for ratio + """ + @type fuzzywuzzy_options :: [fuzzywuzzy_option()] + + @typedoc """ + Ratio methods available that match the full string + """ + @type full_match_method :: :standard | :quick | :token_sort | :token_set + + @typedoc """ + Ratio methods available that works on the best matching substring + """ + @type partial_match_method :: :partial | :partial_token_sort | :partial_token_set + + @typedoc """ + All ratio methods available + """ + @type match_method :: full_match_method() | partial_match_method() + + @doc """ + Calculates the standard ratio between two strings as a percentage. 
+ It demands the calculus to the chosen measure, standardizing the produced output + + ```elixir + iex> ratio("this is a test", "this is a test!") + 96.55 + ``` + """ + @spec ratio(String.t(), String.t(), fuzzywuzzy_options()) :: float() + def ratio(left, right, options \\ []) do + apply_ratio(left, right, &do_ratio/3, options) + end + + @spec do_ratio(String.t(), String.t(), ratio_calculator()) :: float() + defp do_ratio(left, right, ratio_fn), do: ratio_fn.(left, right) + + @doc """ + Like standard ratio, but ignores any non-alphanumeric character + + ```elixir + iex> quick_ratio("this is a test", "this is a test!") + 100.0 + ``` + """ + @spec quick_ratio(String.t(), String.t(), fuzzywuzzy_options()) :: float() + def quick_ratio(left, right, options \\ []) do + left + |> quick_ratio_normalizer() + |> apply_ratio(quick_ratio_normalizer(right), &do_ratio/3, options) + end + + @spec quick_ratio_normalizer(String.t()) :: String.t() + defp quick_ratio_normalizer(string) do + string + |> string_normalizer() + |> Enum.join(" ") + end + + @doc """ + Calculates the partial ratio between two strings, that is the ratio between + the best matching m-length substrings + + ```elixir + iex> partial_ratio("this is a test", "this is a test!") + 100.0 + + iex> partial_ratio("yankees", "new york yankees") + 100.0 + ``` + """ + @spec partial_ratio(String.t(), String.t(), fuzzywuzzy_options()) :: float() + def partial_ratio(left, right, options \\ []) do + apply_ratio(left, right, &do_partial_ratio/3, options) + end + + @spec do_partial_ratio(String.t(), String.t(), ratio_calculator()) :: float() + defp do_partial_ratio(left, right, ratio_fn) do + left + |> PartialMatch.matching_blocks(right) + |> Enum.map(fn %PartialMatch{left_block: left_candidate, right_block: right_candidate} -> + ratio_fn.(left_candidate, right_candidate) + end) + |> Enum.max() + end + + @doc """ + Calculates the token sort ratio between two strings, that is the ratio calculated + after tokenizing and sorting 
alphabetically each string + + ```elixir + iex> token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear") + 100.0 + + iex> token_sort_ratio("fuzzy muzzy was a bear", "wuzzy fuzzy was a bear") + 77.27 + ``` + """ + @spec token_sort_ratio(String.t(), String.t(), fuzzywuzzy_options()) :: float() + def token_sort_ratio(left, right, options \\ []) do + apply_ratio(left, right, &do_token_sort_ratio/3, options) + end + + @spec do_token_sort_ratio(String.t(), String.t(), ratio_calculator()) :: float() + defp do_token_sort_ratio(left, right, ratio_fn) do + left + |> token_sort_normalizer() + |> ratio_fn.(token_sort_normalizer(right)) + end + + @spec token_sort_normalizer(String.t()) :: String.t() + defp token_sort_normalizer(string) do + string + |> string_normalizer() + |> Enum.sort() + |> Enum.join(" ") + end + + @doc """ + Like token sort ratio, but a partial ratio - instead of a standard one - is applied + + ```elixir + iex> partial_token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear") + 100.0 + + iex> partial_token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear") + 81.25 + ``` """ + @spec partial_token_sort_ratio(String.t(), String.t(), fuzzywuzzy_options()) :: float() + def partial_token_sort_ratio(left, right, options \\ []) do + apply_ratio(left, right, &do_partial_token_sort_ratio/3, options) + end + + @spec do_partial_token_sort_ratio(String.t(), String.t(), ratio_calculator()) :: float() + defp do_partial_token_sort_ratio(left, right, _) do + do_token_sort_ratio(left, right, fn a, b -> partial_ratio(a, b) / 100 end) + end @doc """ - Hello world. 
+ Calculates the token set ratio between two strings, that is the ratio calculated + after tokenizing each string, splitting in two sets (a set with fully matching tokens, + a set with other tokens), then sorting on set membership and alphabetically + + ```elixir + iex> token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear") + 100.0 + + iex> token_set_ratio("fuzzy was a bear", "muzzy wuzzy was a bear") + 78.95 + ``` + """ + @spec token_set_ratio(String.t(), String.t(), fuzzywuzzy_options()) :: float() + def token_set_ratio(left, right, options \\ []), do: apply_ratio(left, right, &do_token_set_ratio/3, options) + + @spec do_token_set_ratio(String.t(), String.t(), ratio_calculator()) :: float() + defp do_token_set_ratio(left, right, ratio_fn) do + left_tokens = token_set_normalizer(left) + right_tokens = token_set_normalizer(right) - ## Examples + base = + left_tokens + |> MapSet.intersection(right_tokens) + |> Enum.sort() + |> Enum.join(" ") + |> String.trim() + + left_minus_right = token_set_diff(left_tokens, right_tokens, base) + + right_minus_left = token_set_diff(right_tokens, left_tokens, base) + + [ + {base, left_minus_right}, + {base, right_minus_left}, + {left_minus_right, right_minus_left} + ] + |> Enum.map(fn {left, right} -> ratio_fn.(left, right) end) + |> Enum.max() + end + + @spec token_set_normalizer(String.t()) :: MapSet.t() + defp token_set_normalizer(string) do + string + |> string_normalizer() + |> MapSet.new() + end + + @spec token_set_diff(MapSet.t(), MapSet.t(), String.t()) :: String.t() + defp token_set_diff(left, right, prefix) do + body = + left + |> MapSet.difference(right) + |> Enum.sort() + |> Enum.join(" ") + + String.trim(prefix <> " " <> body) + end + + @doc """ + Like token set ratio, but a partial ratio - instead a full one - is applied - iex> ExFuzzywuzzy.hello() - :world + ```elixir + iex> partial_token_set_ratio("grizzly was a bear", "a grizzly inside a box") + 100.0 + iex> partial_token_set_ratio("grizzly was a bear", "be 
what you wear") + 43.75 + ``` """ - def hello do - :world + @spec partial_token_set_ratio(String.t(), String.t(), fuzzywuzzy_options()) :: float() + def partial_token_set_ratio(left, right, options \\ []) do + apply_ratio(left, right, &do_partial_token_set_ratio/3, options) + end + + @spec do_partial_token_set_ratio(String.t(), String.t(), ratio_calculator()) :: float() + defp do_partial_token_set_ratio(left, right, _) do + do_token_set_ratio(left, right, fn a, b -> partial_ratio(a, b) / 100 end) + end + + @doc """ + Calculates the ratio between the strings using various methods, returning the best score and algorithm + """ + @spec best_score_ratio(String.t(), String.t(), boolean(), fuzzywuzzy_options()) :: {match_method(), float()} + def best_score_ratio(left, right, partial \\ false, options \\ []) do + [ + {:standard, &ratio/3}, + {:quick, &quick_ratio/3}, + {:token_sort, &token_sort_ratio/3}, + {:token_set, &token_set_ratio/3} + ] + |> Enum.concat( + if partial do + [ + {:partial, &partial_ratio/3}, + {:partial_token_sort, &partial_token_sort_ratio/3}, + {:partial_token_set, &partial_token_set_ratio/3} + ] + else + [] + end + ) + |> Enum.map(fn {method, calculator} -> {method, calculator.(left, right, options)} end) + |> Enum.max_by(&elem(&1, 1)) + end + + @doc """ + Weighted ratio. Not implemented yet + """ + + @spec weighted_ratio(String.t(), String.t(), fuzzywuzzy_options()) :: float() + def weighted_ratio(_, _, _) do + raise "not_implemented" + end + + @doc """ + Process a list of strings, finding the best match on a string reference. 
Not implemented yet + """ + @spec process(String.t(), [String.t()], fuzzywuzzy_options()) :: String.t() + def process(_, _, _) do + raise "not_implemented" + end + + @spec string_normalizer(String.t()) :: [String.t()] + defp string_normalizer(string), do: String.split(string, ~R/[^[:alnum:]\-]/u, trim: true) + + @spec apply_ratio( + String.t(), + String.t(), + (String.t(), String.t(), ratio_calculator() -> float()), + fuzzywuzzy_options() + ) :: + float() + defp apply_ratio("", _, _, _), do: 0.0 + defp apply_ratio(_, "", _, _), do: 0.0 + defp apply_ratio(string, string, _, _), do: 100.0 + + defp apply_ratio(left, right, ratio_fn, options) do + {left, right} = + if get_option(options, :case_sensitive), + do: {left, right}, + else: {String.upcase(left), String.upcase(right)} + + similarity_fn = get_option(options, :similarity_fn) + precision = get_option(options, :precision) + Float.round(100 * ratio_fn.(left, right, similarity_fn), precision) + end + + @spec get_option(fuzzywuzzy_options(), atom()) :: any() + defp get_option(options, option) do + Keyword.get( + options, + option, + Application.get_env(:ex_fuzzywuzzy, option) + ) end end diff --git a/lib/ex_fuzzywuzzy/algorithms/longest_common_substring.ex b/lib/ex_fuzzywuzzy/algorithms/longest_common_substring.ex new file mode 100644 index 0000000..6f78b4c --- /dev/null +++ b/lib/ex_fuzzywuzzy/algorithms/longest_common_substring.ex @@ -0,0 +1,66 @@ +defmodule ExFuzzywuzzy.Algorithms.LongestCommonSubstring do + @moduledoc """ + Helper module for the calculus of the longest common substring algorithm between two strings + """ + + defstruct [:substring, :left_starting_index, :right_starting_index, :length] + + @typedoc """ + The data collected applying partial matching algorithm + """ + @type t :: %__MODULE__{ + substring: String.t(), + left_starting_index: non_neg_integer(), + right_starting_index: non_neg_integer(), + length: non_neg_integer() + } + + @typep grapheme :: String.t() + @typep row :: map() + @typep match 
:: {integer(), integer(), integer()} + @typep longest_match :: {{row(), row()}, match()} + + @doc """ + Calculates the longest common substring between two strings, returning a tuple containing + the matched substring, the length of the substring itself, the starting index of the matches + on the left and on the right. + """ + @spec lcs(String.t(), String.t()) :: nil | t() + def lcs(left, right) do + left_list = left |> String.graphemes() |> Enum.with_index() + right_list = right |> String.graphemes() |> Enum.with_index() + + {_, match} = lcs_dynamic_programming(left_list, right_list) + build_result(left, match) + end + + @spec lcs_dynamic_programming([grapheme()], [grapheme()]) :: longest_match() + defp lcs_dynamic_programming(left, right) do + Enum.reduce(left, {{%{}, %{}}, {0, 0, 0}}, fn x, acc -> + {{_, current}, lcs} = Enum.reduce(right, acc, &step(x, &1, &2)) + {{current, %{}}, lcs} + end) + end + + @spec step({integer(), integer()}, {integer(), integer()}, longest_match()) :: longest_match() + defp step({c, i}, {c, j}, {{previous, current}, match = {_, _, lcs_length}}) do + length = Map.get(previous, j - 1, 0) + 1 + current = Map.put(current, j, length) + match = if length > lcs_length, do: {i - length + 1, j - length + 1, length}, else: match + {{previous, current}, match} + end + + defp step(_, _, acc), do: acc + + @spec build_result(String.t(), match()) :: nil | t() + defp build_result(_, {_, _, 0}), do: nil + + defp build_result(left, {left_start, right_start, length}) do + %__MODULE__{ + substring: String.slice(left, left_start, length), + left_starting_index: left_start, + right_starting_index: right_start, + length: length + } + end +end diff --git a/lib/ex_fuzzywuzzy/algorithms/partial_match.ex b/lib/ex_fuzzywuzzy/algorithms/partial_match.ex new file mode 100644 index 0000000..bf1cb09 --- /dev/null +++ b/lib/ex_fuzzywuzzy/algorithms/partial_match.ex @@ -0,0 +1,148 @@ +defmodule ExFuzzywuzzy.Algorithms.PartialMatch do + @moduledoc """ + Implementation 
for the partial matching algorithms used by the library interface. + The model defined is linked to the calling ratio functions, making no sense to be used externally + """ + + alias ExFuzzywuzzy.Algorithms.LongestCommonSubstring + + defstruct [:left_block, :right_block, :left_starting_index, :right_starting_index, :length] + + @typedoc """ + The position of a grapheme in a string + """ + @type index :: non_neg_integer() + + @typedoc """ + The data collected applying partial matching algorithm + """ + @type t :: %__MODULE__{ + left_block: String.t(), + right_block: String.t(), + left_starting_index: index(), + right_starting_index: index(), + length: non_neg_integer() + } + + @typep slice :: {index(), index(), index(), index()} + + @doc """ + Calculates a list of string pairs which are the best matching substrings extracted from the provided ones + """ + @spec matching_blocks(String.t(), String.t()) :: [t()] + def matching_blocks(left, right), do: matching_blocks(left, right, String.length(left), String.length(right)) + + @spec matching_blocks(String.t(), String.t(), index(), index()) :: [t()] + def matching_blocks(left, right, left_length, right_length) when right_length < left_length do + # swapping after calculus isn't done in order to guarantee the same ratio when the calling order is swapped + matching_blocks(right, left, right_length, left_length) + end + + def matching_blocks(left, right, left_length, right_length) do + [{0, left_length, 0, right_length}] + |> do_matching_blocks(left, right, []) + |> Enum.concat([ + %__MODULE__{ + left_block: left, + right_block: String.slice(right, right_length - left_length, left_length), + left_starting_index: left_length, + right_starting_index: right_length, + length: 0 + } + ]) + |> Enum.sort() + |> Enum.reduce([], fn + %__MODULE__{ + left_block: first_left_block, + right_block: first_right_block, + left_starting_index: first_left_index, + right_starting_index: first_right_index, + length: first_length + }, + [ + 
%__MODULE__{ + left_block: second_left_block, + right_block: second_right_block, + left_starting_index: second_left_index, + right_starting_index: second_right_index, + length: second_length + } + | other_matches + ] + when first_left_index + first_length == second_left_index and + first_right_index + first_length == second_right_index -> + [ + %__MODULE__{ + left_block: first_left_block <> second_left_block, + right_block: first_right_block <> second_right_block, + left_starting_index: first_left_index + second_left_index, + right_starting_index: first_right_index + second_right_index, + length: first_length + second_length + } + | other_matches + ] + + match, matches -> + [match | matches] + end) + |> Enum.reverse() + end + + @spec do_matching_blocks([slice()], String.t(), String.t(), [t()]) :: [t()] + defp do_matching_blocks(to_be_processed, left, right, matches) + + defp do_matching_blocks([], _, _, acc), do: acc + + defp do_matching_blocks([{left_from, left_to, right_from, right_to} | remaining], left, right, matches) do + case LongestCommonSubstring.lcs( + String.slice(left, left_from, left_to - left_from), + String.slice(right, right_from, right_to - right_from) + ) do + nil -> + do_matching_blocks(remaining, left, right, matches) + + lcs = %LongestCommonSubstring{ + left_starting_index: left_index, + right_starting_index: right_index, + length: k + } -> + i = left_from + left_index + j = right_from + right_index + + remaining + |> update_left_boundary(left_from, right_from, lcs) + |> update_right_boundary(left_from, left_to, right_from, right_to, lcs) + |> do_matching_blocks(left, right, [ + %__MODULE__{ + left_block: left, + right_block: String.slice(right, max(j - i, 0), String.length(left)), + left_starting_index: i, + right_starting_index: j, + length: k + } + | matches + ]) + end + end + + @spec update_left_boundary([slice()], index(), index(), LongestCommonSubstring.t()) :: [slice()] + defp update_left_boundary(remaining, left_from, right_from, 
%LongestCommonSubstring{ + left_starting_index: left_index, + right_starting_index: right_index + }) + when left_index > 0 and right_index > 0, + do: [{left_from, left_from + left_index, right_from, right_from + right_index} | remaining] + + defp update_left_boundary(remaining, _, _, _), do: remaining + + @spec update_right_boundary([slice()], index(), index(), index(), index(), LongestCommonSubstring.t()) :: [slice()] + defp update_right_boundary(remaining, left_from, left_to, right_from, right_to, %LongestCommonSubstring{ + left_starting_index: left_index, + right_starting_index: right_index, + length: k + }) + when left_from + left_index + k < left_to and right_from + right_index + k < right_to, + do: [{left_from + left_index + k, left_to, right_from + right_index + k, right_to} | remaining] + + defp update_right_boundary(remaining, _, _, _, _, _), do: remaining +end diff --git a/lib/ex_fuzzywuzzy/similarity.ex b/lib/ex_fuzzywuzzy/similarity.ex new file mode 100644 index 0000000..7a5be4c --- /dev/null +++ b/lib/ex_fuzzywuzzy/similarity.ex @@ -0,0 +1,12 @@ +defmodule ExFuzzywuzzy.Similarity do + @moduledoc """ + Defines the `ExFuzzywuzzy.Similarity` behaviour for implementing a similarity calculator. + + A custom calculator expects two strings and calculates the similarity between them + as a floating-point decimal between 0 and 1. + + Out-of-the-box, ExFuzzywuzzy provides Levenshtein and Jaro algorithms. 
+ """ + + @callback calculate(String.t(), String.t()) :: float() +end diff --git a/lib/ex_fuzzywuzzy/similarity/jaro_winkler.ex b/lib/ex_fuzzywuzzy/similarity/jaro_winkler.ex new file mode 100644 index 0000000..fbbec49 --- /dev/null +++ b/lib/ex_fuzzywuzzy/similarity/jaro_winkler.ex @@ -0,0 +1,15 @@ +defmodule ExFuzzywuzzy.Similarity.JaroWinkler do + @moduledoc """ + Implements the similarity calculus basing on the + [Jaro-Winkler method](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) + """ + + @behaviour ExFuzzywuzzy.Similarity + + @doc """ + The Jaro-Winkler + Calculus delegates to Elixir standard library implementation + """ + @spec calculate(String.t(), String.t()) :: float() + defdelegate calculate(left, right), to: String, as: :jaro_distance +end diff --git a/lib/ex_fuzzywuzzy/similarity/levenshtein.ex b/lib/ex_fuzzywuzzy/similarity/levenshtein.ex new file mode 100644 index 0000000..f1c1c99 --- /dev/null +++ b/lib/ex_fuzzywuzzy/similarity/levenshtein.ex @@ -0,0 +1,23 @@ +defmodule ExFuzzywuzzy.Similarity.Levenshtein do + @moduledoc """ + Implements the similarity calculus basing on the + [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) + """ + + @behaviour ExFuzzywuzzy.Similarity + + alias ExFuzzywuzzy.Algorithms.PartialMatch + + @doc """ + The Levenshtein distance is calculated as the minimum number of edits in order to transition from one string to the other + Implementation follows [Hjelmqvist algorithm](https://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm-2) + """ + @spec calculate(String.t(), String.t()) :: float() + def calculate(left, right) do + left + |> PartialMatch.matching_blocks(right) + |> Enum.map(& &1.length) + |> Enum.sum() + |> (fn matches -> 2 * matches / (length(String.graphemes(left)) + length(String.graphemes(right))) end).() + end +end diff --git a/mix.exs b/mix.exs index 8b318f6..37f7555 100644 --- a/mix.exs +++ b/mix.exs @@ -15,7 +15,7 @@ defmodule 
ExFuzzywuzzy.MixProject do docs: docs(), aliases: aliases(), package: package(), - description: "Fuzzy string matching", + description: "A fuzzy string matching library", dialyzer: [ plt_add_apps: [:mix], plt_add_deps: :transitive, @@ -24,6 +24,10 @@ defmodule ExFuzzywuzzy.MixProject do ] end + def application do + [] + end + defp package do [ maintainers: ["Carlo Suriano"], @@ -32,9 +36,11 @@ defmodule ExFuzzywuzzy.MixProject do ] end - def project do + defp deps do [ - elixirc_paths: elixirc_paths(Mix.env()), + {:credo, "~> 1.5", only: [:dev, :test], runtime: false}, + {:dialyxir, "~> 1.0", only: [:dev, :test], runtime: false}, + {:ex_doc, "~> 0.23", only: :dev} ] end @@ -47,16 +53,6 @@ defmodule ExFuzzywuzzy.MixProject do ] end - def application do - [ - extra_applications: [:logger] - ] - end - - defp deps do - [] - end - defp aliases do [ dep_check: ["deps.unlock --check-unused"], @@ -69,7 +65,7 @@ defmodule ExFuzzywuzzy.MixProject do ], "format.all": [ "format mix.exs \"lib/**/*.{ex,exs}\" \"test/**/*.{ex,exs}\" \"priv/**/*.{ex,exs}\" \"config/**/*.{ex,exs}\"" - ], + ] ] end end diff --git a/mix.lock b/mix.lock new file mode 100644 index 0000000..1a653e5 --- /dev/null +++ b/mix.lock @@ -0,0 +1,14 @@ +%{ + "bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], [], "hexpm", "7af5c7e09fe1d40f76c8e4f9dd2be7cebd83909f31fee7cd0e9eadc567da8353"}, + "credo": {:hex, :credo, "1.5.5", "e8f422026f553bc3bebb81c8e8bf1932f498ca03339856c7fec63d3faac8424b", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2.8", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "dd8623ab7091956a855dc9f3062486add9c52d310dfd62748779c4315d8247de"}, + "dialyxir": {:hex, :dialyxir, "1.1.0", "c5aab0d6e71e5522e77beff7ba9e08f8e02bad90dfbeffae60eaf0cb47e29488", [:mix], [{:erlex, ">= 0.2.6", [hex: :erlex, repo: "hexpm", 
optional: false]}], "hexpm", "07ea8e49c45f15264ebe6d5b93799d4dd56a44036cf42d0ad9c960bc266c0b9a"}, + "earmark_parser": {:hex, :earmark_parser, "1.4.12", "b245e875ec0a311a342320da0551da407d9d2b65d98f7a9597ae078615af3449", [:mix], [], "hexpm", "711e2cc4d64abb7d566d43f54b78f7dc129308a63bc103fbd88550d2174b3160"}, + "erlex": {:hex, :erlex, "0.2.6", "c7987d15e899c7a2f34f5420d2a2ea0d659682c06ac607572df55a43753aa12e", [:mix], [], "hexpm", "2ed2e25711feb44d52b17d2780eabf998452f6efda104877a3881c2f8c0c0c75"}, + "ex_doc": {:hex, :ex_doc, "0.24.1", "15673de99154f93ca7f05900e4e4155ced1ee0cd34e0caeee567900a616871a4", [:mix], [{:earmark_parser, "~> 1.4.0", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "07972f17bdf7dc7b5bd76ec97b556b26178ed3f056e7ec9288eb7cea7f91cce2"}, + "file_system": {:hex, :file_system, "0.2.10", "fb082005a9cd1711c05b5248710f8826b02d7d1784e7c3451f9c1231d4fc162d", [:mix], [], "hexpm", "41195edbfb562a593726eda3b3e8b103a309b733ad25f3d642ba49696bf715dc"}, + "jason": {:hex, :jason, "1.2.2", "ba43e3f2709fd1aa1dce90aaabfd039d000469c05c56f0b8e31978e03fa39052", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "18a228f5f0058ee183f29f9eae0805c6e59d61c3b006760668d8d18ff0d12179"}, + "makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"}, + "makeup_elixir": {:hex, :makeup_elixir, "0.15.1", "b5888c880d17d1cc3e598f05cdb5b5a91b7b17ac4eaf5f297cb697663a1094dd", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.1", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], 
"hexpm", "db68c173234b07ab2a07f645a5acdc117b9f99d69ebf521821d89690ae6c6ec8"}, + "makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"}, + "nimble_parsec": {:hex, :nimble_parsec, "1.1.0", "3a6fca1550363552e54c216debb6a9e95bd8d32348938e13de5eda962c0d7f89", [:mix], [], "hexpm", "08eb32d66b706e913ff748f11694b17981c0b04a33ef470e33e11b3d3ac8f54b"}, +} diff --git a/test/algorithms/longest_common_substring_test.exs b/test/algorithms/longest_common_substring_test.exs new file mode 100644 index 0000000..1997d33 --- /dev/null +++ b/test/algorithms/longest_common_substring_test.exs @@ -0,0 +1,72 @@ +defmodule ExFuzzywuzzy.Test do + use ExUnit.Case + + alias ExFuzzywuzzy.Algorithms.LongestCommonSubstring + + test "lcs" do + assert LongestCommonSubstring.lcs("aaaaaa", "aaaaaa") == %LongestCommonSubstring{ + substring: "aaaaaa", + left_starting_index: 0, + right_starting_index: 0, + length: 6 + } + + assert LongestCommonSubstring.lcs("XXXaaaaaaXXXX", "YYaaaaaaYYYYY") == %LongestCommonSubstring{ + substring: "aaaaaa", + left_starting_index: 3, + right_starting_index: 2, + length: 6 + } + + assert LongestCommonSubstring.lcs("aabc", "dc") == %LongestCommonSubstring{ + substring: "c", + left_starting_index: 3, + right_starting_index: 1, + length: 1 + } + + assert LongestCommonSubstring.lcs("XXXaaaaaaXXXXaaaaaa", "YYaaaaaaYYYYYaaaaaa") == %LongestCommonSubstring{ + substring: "aaaaaa", + left_starting_index: 3, + right_starting_index: 2, + length: 6 + } + + assert LongestCommonSubstring.lcs("XaaXaaa", "YaaYaaa") == %LongestCommonSubstring{ + substring: "aaa", + left_starting_index: 4, + right_starting_index: 4, + length: 3 + } + + assert LongestCommonSubstring.lcs("XXXaaaaaaXXXXaaaaaaa", "YYaaaaaaYYYYYaaaaaaa") == %LongestCommonSubstring{ + substring: "aaaaaaa", + 
left_starting_index: 13, + right_starting_index: 13, + length: 7 + } + + assert LongestCommonSubstring.lcs("stringX", "stringY") == %LongestCommonSubstring{ + substring: "string", + left_starting_index: 0, + right_starting_index: 0, + length: 6 + } + + assert LongestCommonSubstring.lcs("stringX", "Ystring") == %LongestCommonSubstring{ + substring: "string", + left_starting_index: 0, + right_starting_index: 1, + length: 6 + } + + assert LongestCommonSubstring.lcs("Xstring", "Ystring") == %LongestCommonSubstring{ + substring: "string", + left_starting_index: 1, + right_starting_index: 1, + length: 6 + } + + assert is_nil(LongestCommonSubstring.lcs("bbbbbb", "aaaaaa")) + end +end diff --git a/test/algorithms/partial_match_test.exs b/test/algorithms/partial_match_test.exs new file mode 100644 index 0000000..ab5027f --- /dev/null +++ b/test/algorithms/partial_match_test.exs @@ -0,0 +1,56 @@ +defmodule ExFuzzywuzzy.PartialMatch.Test do + use ExUnit.Case + + alias ExFuzzywuzzy.Algorithms.PartialMatch + + test "extract matching blocks" do + assert PartialMatch.matching_blocks("abcd", "abxcd") == + [ + %PartialMatch{ + left_block: "abcd", + right_block: "abxc", + left_starting_index: 0, + right_starting_index: 0, + length: 2 + }, + %PartialMatch{ + left_block: "abcd", + right_block: "bxcd", + left_starting_index: 2, + right_starting_index: 3, + length: 2 + }, + %PartialMatch{ + left_block: "abcd", + right_block: "bxcd", + left_starting_index: 4, + right_starting_index: 5, + length: 0 + } + ] + end + + test "swap test" do + assert PartialMatch.matching_blocks("test-ab", "ab") == [ + %PartialMatch{ + left_block: "ab", + right_block: "ab", + left_starting_index: 0, + right_starting_index: 5, + length: 2 + }, + %PartialMatch{ + left_block: "ab", + right_block: "ab", + left_starting_index: 2, + right_starting_index: 7, + length: 0 + } + ] + end + + test "swap test invariant" do + assert PartialMatch.matching_blocks("abcd", "abxcd") == PartialMatch.matching_blocks("abxcd", 
"abcd") + assert PartialMatch.matching_blocks("test-ab", "ab") == PartialMatch.matching_blocks("ab", "test-ab") + end +end diff --git a/test/data/japanese.txt b/test/data/japanese.txt new file mode 100644 index 0000000..0d51ed8 --- /dev/null +++ b/test/data/japanese.txt @@ -0,0 +1,10 @@ +索れぐ信司ヘタリチ庁値ヱヌカ企際せみゅん防業ドせめぽ済立たむな情4更トネカフ養質86名じルざだ綿理ゃず申侍勃卑厩あゅ。天76搬なこけぱ事府メノ検死レノナオ経鮮以ゃき国打ほをゆぴ歳売っほろル幕武カ状目あ組時トチレケ触督ノロ評載ミ夜7除いへよ機浜フス命階齢都フぐし。層よて分勝禁棚1有そ事完へねめ道育テナ齢対横東らぽ断果ン蓄宣ソキテケ世規玄しでラか需土ルヱク献性ワムア誌聞ホ証園返道きスどん。 +転ホヱア宅需カヌ優朝レヌオシ日原マ蓮日部トド属弾サニ増落てな身芸ソ外京どよん年2窓はうざイ自竹チヱス趣開でせく芳似だゆにぶ。治ソワ芸戦リク指奈コテソ権続員ヤトヒ険患キネ工都容勝ネサヨロ謎見わやてづ報操びきの死揺るぶ注60佑俗吐堺1読93風四ソル増気なぽ理光懐さらラけ。4問せこ年稿ツスハモ参鳥うれひろ暮習フ組破ふぴ就梅も修視ほあっ始楽玲すフゃ虐屋ヤイネム能森ッとしず権反据灯琴ぽく。 +本カツ歩前ト改挙かぶリば高読ム益著ルチ禁末シフ室階ぞんっぜ早組セヱエ出違見て週月ぐくどえ載択特ごろべで際断択ヲイヒホ識拍繁貫費たおる。信ユイ兼勝たリぎれ歩結ぞゅか島4科エマユス強政ほ以駐でだま立異づ止属禁トぽや場村78契絡ぼラ。益ヘ万教らつぎや物民キテメス喫食おちあよ説数ゃ化20断いよル移光レ件66芸キ会著リ表文ロ示公みラぽば環好名提あつづ歩値確速アユホウ最令陸班もへぐ。 +輝けクてフ警復ばぎ不資くん朝政ルツヌ全牟さで氷行オヲワム覚国スヌツナ漢民サキオ車南タケホ年意シオ更刀情用イヲネモ段面ふ治93米ネコラ柏均1催単村がびごわ。条能和トヱロ碁87誇れゆフえ恐拠はす女本わで能運め気部ひまへ決号ソコ韓過やどっつ減児ハキル裏枝逮ケサ争沢タヒナ尾50質レか庁恋県ヘオ半根スづして必州査俺ずよぞね。 +終三ヤ工代変るつ手教ヱ話文ユカムキ著目にめゃし容入さふって憶社タイアツ超並ち育出ニヌマツ崎若式がつざき。以くひ切足エネツ述1市わをず敷35建け形輸民坊ざぶラ層勤あクぽ均聞ソキ市模レトッぶ体踏びひべ動模ホフ認隠まぶら講置ケヒルヨ覧稿立ず棋事三以億ぽぜ。択めゃざ給済エラヒオ込消ソケ東年ぼ町下げ提佳死読セキ席額げや論31下春環5回り検税イ食供市ユウラロ並大イワミト卵買非刑駐と。 +紙モ直育力減げつぐ皇画うぞ医話ゃとねそ置逃エ際29洗オム列塁げリど稿継セリ習投ラでたか転古つ駆更おい放選歳磨ちば。告なラ堀情た者調タヲメ投5論コモ長単9目よゃに相作ヨヱ刊再ーだご王百問シ用勝トヨ必列知乱さン戻後ハヘリ付神っ部波セテニク湖上ご寺模まご。題ゆ詳悪テスニ写数6対想ニサ告買リ慎国ざぼ百線ぎん供1協ヒコ試千ドンイま否急埋詰ぴもス。 +軽ぞすぽ懸府ミノフレ下阪えはトま典無ごぞ更自ケフク状25認ニコヌム盛層問ず集守ヨツナロ面進ゃりぶが動強ドつ柱順にめン利妻モヌ巣宝とぴおじ佐津強ヱウナ術国びゆ抵享凡喬幽らイ。軍エ速9疎ク属図秀カチヨシ聞今覧イ初新き力門ろと重5社コ考血箱リナ年会ぶあつげ黒測稿氏努せがく。曜歌ラむけ非子にぼじ属速フニ京経ヤオ周9財て融込際とびご動変ロノレ先企ナム自同用しすぎ当2前来役きげょ。 +加は綱防ばふもじ広稿ツネコ等大をゃーレ今7井団イだわ府流ナマ市午リハ意要ラキコ府解よち民実ス読質イ校島クヨテハ家事こ行受きかリん詳倒ナニロセ見9丹庄弥れ。芸ヌムト作形テヌヱマ明選ばぼも属造すぱせわ展煙87秋めつん戦進度ヘイノス記現帯ウテヨ園住ユ祈憶ソサヌ自島伎末拳き。止カヤ意寅之ウノレア学3一づね最報生宇引みども静病ゆ同7開げをぶ載層ケ海成て夕投学せどよ勢丹庄弥陥もむ。 +教ませぎ出之何キモテフ見證場ニテリ加演シ参治ゃげら性満ツ書全エヤ金49外び身第ほち青標みひ棋宙ホスヘ目刻草劣く。高れ査環ワマヤ年能くげフ促1京シ国仕ぐ飛揮ヌタク共著ずだいゅ注疑ぎえ者償のべねず必晶局75全ネ力左不過げに。視セホラ経旅ほ必早ニモ代収南ごぱおの伝政ひもられ迫番よ際授人ヨヘイツ欺藤を際誰ケヤイウ速牛ッた座表もあぶぎ抗69見形スコユ様点圧あせ。 
+鮮へ回表そらぴゆ施世に内13症6意るむ校申交金すきぼ題歳レウ惑係シアロク公開長マ来簡ふそと査時カテノ会作びうさず平行フカムナ任誰町奨ば。五マソ第反潟応イニヘ陳題ソミウ目黒ネワ事投ゆび発体すせはイ報遷申べぴち当見江スクや決69覧キ定博みリご町能科ま正観めゃ。方メ表改なじひル算種つっほ報35費ヲヱ練後じてせ逃呉ヒモサ量59辞を委断ト不心モキ助快カヨワヌ込僕滞硬菜れ。 diff --git a/test/data/lorem.txt b/test/data/lorem.txt new file mode 100644 index 0000000..6e98020 --- /dev/null +++ b/test/data/lorem.txt @@ -0,0 +1,30 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin dapibus a libero ut placerat. Duis in tellus mauris. Fusce congue gravida ante ut molestie. Nulla et arcu quis lectus feugiat accumsan ut id quam. Phasellus eget urna ac velit convallis vulputate. Vivamus lobortis blandit ex, id porttitor enim egestas vitae. In interdum gravida lorem a gravida. Nulla interdum tellus eget purus mattis, sit amet tristique ex lacinia. Suspendisse augue tortor, finibus ut lorem ut, accumsan venenatis felis. Curabitur laoreet tellus sed elementum egestas. +Morbi sit amet efficitur quam. Fusce tortor orci, elementum sollicitudin pulvinar sit amet, pulvinar in sapien. Vivamus eu iaculis mi. Aenean a risus consectetur, tincidunt lacus ac, tempor purus. Praesent varius rutrum eros, id porta nibh. In auctor tincidunt ullamcorper. Proin libero urna, aliquam lobortis tortor in, volutpat porta sapien. Integer pharetra ornare justo vel ultricies. Pellentesque sagittis venenatis metus vel ullamcorper. +Duis efficitur libero vitae blandit sollicitudin. In volutpat vulputate nunc vehicula pretium. Phasellus tincidunt dui nec turpis tristique efficitur ut vitae erat. Nullam mollis sollicitudin magna, eget aliquam urna egestas non. Vivamus varius quis ipsum vitae facilisis. Aenean purus turpis, dapibus eget tortor nec, fringilla facilisis dui. Donec lobortis quam at tortor feugiat posuere. +Vivamus orci ligula, ullamcorper eu condimentum in, cursus porta massa. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Donec blandit quam eget tristique dapibus. 
Nullam tellus lacus, interdum non hendrerit vel, cursus a sem. Praesent tristique tincidunt mi, vel vestibulum lectus commodo vitae. Integer congue neque et sapien posuere molestie. Cras eget fringilla risus. Donec pulvinar auctor ipsum, sed elementum nibh. Ut ultricies, enim vel mollis condimentum, neque sapien pellentesque lacus, id eleifend enim lectus in leo. Pellentesque fringilla laoreet neque, maximus lacinia magna tincidunt fringilla. Praesent quis facilisis augue. +Sed leo diam, sodales at fermentum lacinia, convallis eget arcu. Vivamus et lectus vitae lacus fringilla suscipit eu blandit massa. Aenean elementum eget ex eu auctor. Cras id dui leo. Sed dui orci, pharetra sit amet eros ac, suscipit porta ex. Nulla porta, ipsum sed lacinia tincidunt, arcu nibh semper purus, quis hendrerit turpis velit ut tellus. Aliquam erat volutpat. Pellentesque felis mi, lobortis quis est vitae, consequat semper nulla. In sed libero velit. Curabitur non lectus tincidunt, pulvinar velit eget, pharetra est. Proin tempus ultricies porta. Vestibulum rhoncus sagittis dolor non interdum. Aliquam erat volutpat. +Suspendisse viverra egestas nisi, eget sagittis libero. Nunc sed lorem eget leo rhoncus mollis. Etiam eget dui sed lectus dictum ornare. Praesent tempor sem elit, non finibus nisi commodo ut. Aliquam vel tortor varius, consequat lectus nec, consectetur erat. Aenean ultricies erat eget ultricies tempor. Suspendisse elementum fermentum urna, ac viverra felis suscipit ut. Sed at vehicula nisi. Sed a aliquet orci. Donec mollis lectus quis arcu molestie aliquam. Duis fringilla aliquet nibh, eget semper dolor condimentum sed. Aenean libero odio, maximus quis facilisis vel, convallis sit amet lacus. Etiam lacinia eget odio sed molestie. Ut vulputate quam sed dignissim auctor. Fusce convallis sem tellus, commodo ornare purus suscipit eget. Suspendisse sit amet orci metus. 
+Aenean vulputate, risus lobortis vehicula sodales, augue leo pulvinar risus, quis semper nibh augue eget diam. Aliquam vel velit sed dolor tincidunt facilisis. Nam a eros sed lacus pulvinar laoreet. Donec lorem elit, commodo ac faucibus ut, tristique a ex. Duis felis odio, dapibus eget felis eu, vestibulum rhoncus lorem. Quisque ut lorem tempor, ultricies neque at, placerat justo. Phasellus gravida ante in volutpat varius. +Proin sed augue odio. In posuere, odio eu condimentum bibendum, metus elit volutpat ipsum, non posuere nunc augue quis sem. Nulla pretium blandit luctus. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Vivamus ut ipsum quis velit commodo vulputate. Donec aliquet egestas ligula in vulputate. Donec cursus arcu ut ligula maximus, ac congue massa vehicula. Fusce dapibus pellentesque velit nec vestibulum. Aenean vitae orci sed lorem lacinia tempus. Donec suscipit sollicitudin ultricies. +Proin consequat mollis convallis. Pellentesque non pretium quam. Pellentesque eu condimentum metus, sit amet tincidunt dui. Vestibulum erat lectus, facilisis eu posuere eu, blandit ac augue. Mauris et blandit urna, id pellentesque leo. Duis sed nibh sem. Ut dapibus eros id risus rhoncus, ac cursus lorem vehicula. Sed at tincidunt ipsum. Vestibulum a sem est. +Quisque dictum sollicitudin quam at pharetra. Integer volutpat, massa ac convallis sollicitudin, velit libero fringilla eros, at viverra leo lacus at tortor. Nunc et feugiat urna. Vivamus at molestie felis. Nulla semper neque at ligula eleifend, nec lacinia ligula lobortis. Duis luctus, lectus quis imperdiet tempus, justo leo porttitor elit, non interdum tortor nunc vel ipsum. Vivamus imperdiet sed tellus tristique lacinia. Etiam fringilla lobortis arcu, sed vehicula ipsum condimentum non. Duis accumsan sem sapien, sit amet vestibulum velit pulvinar nec. Fusce vulputate ornare leo eget rhoncus. +Proin elementum odio imperdiet neque vulputate lacinia. 
Sed rhoncus sem non porta varius. Sed suscipit, odio nec vulputate blandit, odio enim tempus tellus, eget convallis turpis dui suscipit felis. Cras eleifend erat arcu, vel sodales velit iaculis vitae. Vestibulum tincidunt eleifend dolor non eleifend. Donec maximus erat leo, sit amet tempus tortor sagittis at. Donec molestie tortor lectus, in hendrerit neque congue in. +Pellentesque feugiat nisi neque, non tempor dui elementum eget. Praesent tristique, turpis eget scelerisque laoreet, ante diam posuere orci, id tincidunt urna est et sem. Quisque eu accumsan libero. Maecenas mauris enim, ullamcorper eu scelerisque a, laoreet vitae risus. Etiam rhoncus lorem sit amet purus mattis, vitae facilisis justo ornare. Nullam ligula ligula, congue id nulla pulvinar, ullamcorper convallis urna. Integer sit amet lectus placerat, accumsan quam lobortis, interdum nulla. Praesent et sollicitudin nibh, sit amet ornare metus. Maecenas porttitor arcu eu aliquam ornare. Donec ligula nibh, ultricies vitae molestie quis, fermentum eget ex. +Ut eu imperdiet nulla, gravida dignissim nisi. Nulla facilisi. Praesent quam nulla, ornare id ligula in, porttitor efficitur felis. Fusce ut magna sem. Proin ut mattis ex, ut efficitur erat. Proin finibus lectus vehicula quam ornare eleifend. In sem massa, rhoncus a ultricies eu, consequat in erat. Morbi sed nibh nec arcu condimentum gravida. Suspendisse elementum vestibulum ullamcorper. In porttitor iaculis justo et maximus. Ut non ultrices tellus. Nulla elit nulla, scelerisque at scelerisque in, aliquam ac lectus. Vestibulum vestibulum libero a hendrerit placerat. +Pellentesque molestie condimentum mauris et facilisis. Cras eu erat ut nisl luctus rhoncus. Quisque ut libero non magna viverra finibus. Donec scelerisque eros et euismod tincidunt. Donec sit amet finibus erat. Sed ultricies porttitor leo, semper bibendum turpis ullamcorper ut. Nullam cursus luctus quam quis efficitur. Sed eleifend eros in tincidunt consectetur. 
Vestibulum pretium, eros vel egestas scelerisque, eros elit semper nisl, fermentum sagittis massa tellus a justo. Sed viverra enim a mauris suscipit, nec dictum orci elementum. Vestibulum vel urna sollicitudin, rhoncus nulla nec, accumsan ligula. Proin eleifend purus id massa cursus, id semper lorem rutrum. Sed molestie ornare ultrices. Proin quis mattis erat. Curabitur eleifend, leo ut finibus consequat, nunc sapien auctor dui, et volutpat magna est eget nisl. +Fusce fringilla orci eget interdum sagittis. Proin hendrerit mauris quis tortor sodales, eu molestie turpis tristique. Pellentesque at tristique massa. Fusce convallis diam purus, sed interdum nulla facilisis vel. Nulla urna leo, ornare eu faucibus non, vehicula dapibus urna. Nulla ut tincidunt tortor, at accumsan mauris. Sed nec lorem sem. Morbi eu malesuada metus. Mauris enim orci, luctus eget risus ac, scelerisque molestie ligula. +Suspendisse lobortis enim sed magna ultrices tempus. Proin sit amet quam sed nunc volutpat congue. Vivamus eu risus tortor. Vivamus molestie leo in sapien interdum tincidunt. Nullam eu ipsum iaculis libero mollis efficitur at et arcu. Pellentesque egestas mi magna, ac iaculis magna vulputate a. Nam pharetra felis a nisi fringilla posuere. Morbi vestibulum, nunc tincidunt facilisis commodo, mauris est pharetra mi, pellentesque semper dui purus eu tortor. Proin iaculis tellus ac condimentum finibus. +In sed massa sed nisl dictum dapibus. Curabitur vitae ipsum ligula. Nullam maximus suscipit luctus. Nam at semper ligula. Suspendisse massa sapien, viverra nec velit vitae, sagittis sagittis turpis. Sed auctor arcu quis risus rutrum feugiat. Donec varius scelerisque dui, et vehicula sem congue non. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Nam sollicitudin ex vel eros scelerisque, eget dictum sapien porttitor. In molestie convallis felis, in congue ante mollis et. Phasellus ex quam, elementum mollis consectetur sed, tempor a arcu. 
Mauris cursus pharetra erat, at lobortis enim feugiat varius. Maecenas eu augue at lorem pellentesque faucibus ultricies non lectus. Aliquam erat volutpat. Fusce id laoreet ipsum. +Proin at urna metus. Vivamus iaculis eget nisi in blandit. Pellentesque quis vestibulum dui, non hendrerit dolor. Donec pellentesque ex quis eleifend vulputate. Interdum et malesuada fames ac ante ipsum primis in faucibus. Vivamus sollicitudin ut nisl ac mollis. Cras euismod volutpat arcu nec placerat. Vivamus euismod sit amet risus ut tincidunt. Ut sed neque ex. +Vestibulum auctor leo lobortis massa mattis, at ultricies purus venenatis. Vivamus sed massa mattis, euismod magna a, venenatis ante. Aliquam dignissim tortor arcu, in imperdiet nisl pretium vel. Proin bibendum placerat dictum. Nam molestie nec nunc eget egestas. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Fusce dapibus justo in lacus sodales, sit amet ultricies ipsum ullamcorper. Vivamus eu dui nec mauris pellentesque blandit. Quisque mattis massa at ipsum fringilla, sit amet placerat lorem fermentum. Proin pharetra enim non ante accumsan, et tristique massa posuere. Nulla varius urna nec ligula convallis, ut elementum libero tristique. Integer porta enim quis nibh vehicula, ac euismod leo tincidunt. Integer leo ligula, efficitur et lobortis eget, consequat eget nibh. In convallis aliquet convallis. +Duis in lacus ac libero molestie hendrerit non eu lorem. Nullam cursus finibus tellus sit amet vestibulum. Mauris condimentum tincidunt facilisis. In ut risus eu augue tristique rhoncus quis vitae ipsum. Phasellus rhoncus elit turpis, vitae interdum sem rhoncus non. Cras consequat, nisi a commodo cursus, quam turpis dignissim ipsum, sed congue diam massa in quam. Sed nec tincidunt arcu. Donec molestie risus at ante convallis, et tincidunt justo molestie. +Quisque vitae condimentum augue. Sed id risus eget dolor molestie pretium non a purus. Nam tincidunt at enim quis accumsan. 
Sed finibus eros sem, ac molestie eros dignissim sed. Vivamus dignissim ultricies tincidunt. Aenean purus libero, convallis et malesuada ut, rutrum at sapien. Morbi fringilla, nunc non vulputate tristique, dui metus posuere purus, vel lobortis tortor neque sed risus. +Donec convallis sapien id consequat lacinia. Ut viverra lacinia tincidunt. Sed elementum lobortis augue, sit amet maximus elit gravida a. Ut ut lacus finibus, consectetur quam in, accumsan justo. Nulla aliquet blandit lorem, eu tempor quam luctus eu. In fermentum urna eu dui commodo, ut vulputate purus tristique. Phasellus facilisis est ac lobortis aliquet. Integer ultricies placerat nulla at condimentum. Praesent bibendum lobortis pharetra. +Nullam quis porta arcu. Mauris pretium nulla magna, id iaculis nisi laoreet in. Donec quis ullamcorper purus, vitae iaculis diam. Suspendisse potenti. Sed fermentum viverra est. Sed vel risus sem. Sed feugiat pellentesque maximus. In vel urna eu urna maximus scelerisque nec sed nunc. Suspendisse potenti. Vestibulum sollicitudin metus non semper vulputate. Sed odio felis, tempus faucibus porttitor in, imperdiet non mauris. +Nullam laoreet magna a lorem rhoncus porttitor. Duis condimentum nisi elit, vitae iaculis neque ullamcorper nec. Curabitur non risus sit amet velit dictum volutpat sit amet vel mauris. Donec neque orci, dictum nec laoreet congue, congue ut augue. Phasellus nec nisi ut mi euismod sollicitudin. Praesent sit amet ante nec libero pulvinar sagittis. Nulla porttitor diam sit amet magna tempor, ac vehicula dui dictum. Phasellus non erat at mauris commodo fermentum a in arcu. Quisque sollicitudin lacinia interdum. +Aliquam vitae rhoncus erat. Cras sollicitudin facilisis libero. Duis sit amet posuere risus. Praesent ex elit, vehicula at condimentum vitae, fermentum ac sem. Duis sit amet nunc est. Mauris lobortis odio eget justo tempus mollis. Fusce luctus ligula mattis lacinia suscipit. Integer vestibulum tincidunt est mollis pharetra. 
Integer ut varius leo. Praesent sit amet imperdiet neque. Praesent eu scelerisque elit, ac rutrum justo. Donec volutpat, felis eget pulvinar lobortis, arcu felis accumsan mi, at lacinia nisi justo sit amet sapien. Maecenas molestie a quam ac tristique. Donec interdum lacus et purus eleifend, in fringilla diam malesuada. +Nullam porttitor interdum massa ac consectetur. Quisque pharetra, risus vehicula vehicula dictum, erat dui hendrerit risus, auctor accumsan dui sapien sit amet purus. Etiam vel lacinia lorem, id porta urna. Integer eleifend eget nunc in varius. Nullam id massa nec elit cursus pretium. Aliquam luctus urna vitae ante auctor vehicula. Pellentesque facilisis augue et laoreet sollicitudin. Nunc efficitur pulvinar diam eget feugiat. Donec laoreet feugiat nunc quis commodo. +Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Nunc feugiat suscipit sem, vel posuere risus sodales at. Aliquam id justo non arcu accumsan feugiat eget vel libero. Nullam rhoncus neque ut magna finibus, et vehicula massa tempus. Sed sagittis risus eget massa vehicula molestie. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Etiam urna quam, placerat vitae luctus vitae, fringilla eget tellus. Maecenas at est molestie, porttitor sem eget, finibus lacus. +Sed velit nisi, malesuada in ipsum ut, bibendum ornare ipsum. Nam vitae dapibus enim. Aenean ut leo quis dolor tempus bibendum. Morbi condimentum diam sapien, nec pretium justo dapibus a. Nullam in felis in elit varius aliquet sit amet at lorem. Mauris rutrum eros nec est blandit, id lobortis mauris blandit. Nam suscipit, tortor pharetra volutpat suscipit, turpis quam volutpat magna, et pharetra nisi ligula non tortor. Integer sodales porttitor dolor. Donec ut tempus dolor, nec placerat nunc. Phasellus molestie fringilla blandit. +Nulla sollicitudin dictum ante, ac pretium libero hendrerit sed. 
Vestibulum malesuada arcu dapibus dolor ullamcorper sodales. Aenean consectetur turpis sapien, vitae porttitor tortor accumsan non. Nulla tempus libero sit amet consectetur suscipit. Etiam quis nisi id felis imperdiet posuere congue et nisi. Fusce efficitur varius est. Quisque ac tempus nulla. Praesent fringilla risus ipsum, sed molestie metus ullamcorper at. Nulla a fermentum justo. Duis dignissim, ipsum vel maximus finibus, dolor metus feugiat lectus, ac consequat ex purus sit amet lectus. Phasellus lacus dui, interdum in malesuada a, ullamcorper quis felis. +Aenean aliquet posuere ex, vitae vulputate nunc euismod eu. Sed quis mi fermentum, luctus ex at, bibendum lectus. Vivamus nec lectus nec ipsum aliquam cursus. In vitae suscipit turpis. Pellentesque vulputate consequat dui, ac ornare leo elementum vel. Vestibulum pharetra velit nisl, id malesuada nibh tempus ut. Nunc sollicitudin arcu nec arcu commodo, at pellentesque neque porta. Ut venenatis, augue in aliquet fermentum, mi lectus volutpat quam, ac fermentum mi dolor et lorem. Aenean lacus sapien, lacinia vel mauris ut, tincidunt iaculis diam. Etiam iaculis volutpat justo, vel tempus tellus tincidunt ut. Nullam euismod accumsan dolor, ac viverra dolor elementum quis. In fermentum eros efficitur tellus commodo, sit amet aliquam nibh vulputate. 
diff --git a/test/ex_fuzzywuzzy_performance_test.exs b/test/ex_fuzzywuzzy_performance_test.exs new file mode 100644 index 0000000..29870d6 --- /dev/null +++ b/test/ex_fuzzywuzzy_performance_test.exs @@ -0,0 +1,35 @@ +defmodule ExFuzzywuzzyPerformanceTest do + use ExUnit.Case + + import ExFuzzywuzzy + alias ExFuzzywuzzy.Similarity.JaroWinkler + + @lorem_sample "lorem.txt" + @japanese_sample "japanese.txt" + + test "levenshtein performance match" do + left = read_sample(@lorem_sample) + right = read_sample(@japanese_sample) + + assert ratio(left, left) == 100.0 + assert ratio(right, right) == 100.0 + # a bit slow, maybe improve underlying implementation + assert ratio(left, right) < 10.0 + end + + test "jaro-winkler performance match" do + left = read_sample(@lorem_sample) + right = read_sample(@japanese_sample) + + assert ratio(left, left, similarity_fn: &JaroWinkler.calculate/2) == 100.0 + assert ratio(right, right, similarity_fn: &JaroWinkler.calculate/2) == 100.0 + assert ratio(left, right, similarity_fn: &JaroWinkler.calculate/2) < 40.0 + end + + defp read_sample(sample_name) do + __DIR__ + |> Path.join("data") + |> Path.join(sample_name) + |> File.read!() + end +end diff --git a/test/ex_fuzzywuzzy_test.exs b/test/ex_fuzzywuzzy_test.exs index 961619b..c7db330 100644 --- a/test/ex_fuzzywuzzy_test.exs +++ b/test/ex_fuzzywuzzy_test.exs @@ -1,8 +1,98 @@ defmodule ExFuzzywuzzyTest do use ExUnit.Case - doctest ExFuzzywuzzy + doctest ExFuzzywuzzy, import: true - test "greets the world" do - assert ExFuzzywuzzy.hello() == :world + import ExFuzzywuzzy + alias ExFuzzywuzzy.Similarity.JaroWinkler + + test "standard ratio full match" do + assert ratio("new york mets", "new york mets", case_sensitive: true) == 100.0 + assert ratio("new york mets", "new YORK mets", case_sensitive: true, precision: 0) == 69.0 + + assert ratio("new york mets", "new york mets", + case_sensitive: true, + similarity_fn: &JaroWinkler.calculate/2, + precision: 0 + ) == 100 + + assert ratio("{", "{", 
case_sensitive: true) == 100.0 + assert ratio("{a", "{a", case_sensitive: true) == 100.0 + + assert ratio("new york mets", "new york mets", + case_sensitive: true, + similarity_fn: &JaroWinkler.calculate/2, + precision: 0 + ) == 100 + + assert ratio("神是狗", "神是狗", similarity_fn: &JaroWinkler.calculate/2) == 100.0 + end + + test "standard ratio full match case insensitive" do + assert ratio("new york mets", "new YORK mets", case_sensitive: false) == 100.0 + + assert ratio("new york mets", "new YORK mets", + case_sensitive: false, + similarity_fn: &JaroWinkler.calculate/2, + precision: 1 + ) == 100.0 + + assert ratio("你貴姓大名?", "你貴姓大名?", case_sensitive: false) == 100.0 + end + + test "standard ratio non full match case insensitive" do + assert ratio("ciao", "ci", precision: 0) == 67.0 + assert ratio("神狗", "神是狗", similarity_fn: &JaroWinkler.calculate/2, precision: 3) == 61.111 + end + + test "partial ratio" do + assert partial_ratio("the wonderful new york mets", "new YORK mets", case_sensitive: false) == 100.0 + assert partial_ratio("the wonderful new york mets", "new YORK mets", case_sensitive: true) == 69.23 + end + + test "token sort ratio" do + assert token_sort_ratio("the wonderful new york mets", "new YORK mets the WONDERFUL", case_sensitive: false) == + 100.0 + + assert token_sort_ratio("神 是 狗", "狗 是 神", case_sensitive: false) == 100.0 + end + + test "token set ratio" do + assert token_set_ratio("the wonderful new york new york mets", "new YORK mets the WONDERFUL", case_sensitive: false) == + 100.0 + + assert token_set_ratio( + """ + 神 + 神 + 是 + 神 + 神 + 是 + """, + "狗 是 神", + case_sensitive: false + ) == 100.0 + end + + test "best score ratio" do + assert best_score_ratio( + """ + 神 + 神 + 是 + 神 + 神 + 是 + """, + "狗 是 神", + case_sensitive: false + ) == + {:token_set, 100.0} + + assert best_score_ratio("blue jeans are the most wonderful wear", "next month the new YORK mets will be WONDERFUL", + partial: true, + case_sensitive: true + ) == + {:partial_token_set, 
100.0} end end diff --git a/test/test_helper.exs b/test/test_helper.exs index 869559e..3c96d9e 100644 --- a/test/test_helper.exs +++ b/test/test_helper.exs @@ -1 +1 @@ -ExUnit.start() +ExUnit.start(exclude: [pending: true])