Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Draft][Require RFC] mb_levenshtein function #16043

Draft
wants to merge 16 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions ext/mbstring/mbstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -3166,6 +3166,124 @@ PHP_FUNCTION(mb_rtrim)
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
}

PHP_FUNCTION(mb_levenshtein)
{
zend_string *string1, *string2, *enc_name = NULL;
zend_long cost_ins = 1;
zend_long cost_rep = 1;
zend_long cost_del = 1;

ZEND_PARSE_PARAMETERS_START(2, 6)
Z_PARAM_STR(string1)
Z_PARAM_STR(string2)
Z_PARAM_OPTIONAL
Z_PARAM_LONG(cost_ins)
Z_PARAM_LONG(cost_rep)
Z_PARAM_LONG(cost_del)
Z_PARAM_STR_OR_NULL(enc_name)
ZEND_PARSE_PARAMETERS_END();

if (ZSTR_LEN(string1) == 0) {
RETVAL_LONG(ZSTR_LEN(string2) * cost_ins);
}

if (ZSTR_LEN(string2) == 0) {
RETVAL_LONG(ZSTR_LEN(string1) * cost_del);
}

const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 6);
if (!enc) {
RETURN_THROWS();
}

/* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
* that the distance is symmetric. If string1 is shorter than string 2 we can save memory (and CPU time)
* by having shorter rows (p1 & p2). */
if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) {
zend_string *tmp = string1;
string1 = string2;
string2 = tmp;
}

uint32_t wchar_buf_1[128], wchar_buf_2[128];
size_t i1, i2;
zend_long *p1, *p2, *tmp;
size_t strlen_1 = mb_get_strlen(string1, enc);
size_t strlen_2 = mb_get_strlen(string2, enc);
size_t len_2 = 0;
size_t in_len_1 = ZSTR_LEN(string1);
size_t in_len_2 = ZSTR_LEN(string2);
unsigned char *in_1 = (unsigned char*)ZSTR_VAL(string1);
unsigned char *in_2 = (unsigned char*)ZSTR_VAL(string2);
unsigned int state = 0;

zend_long c0, c1, c2;

p1 = safe_emalloc(strlen_1, sizeof(zend_long), 0);
p2 = safe_emalloc(strlen_2, sizeof(zend_long), 0);

for (i2 = 0; i2 <= strlen_2; i2++) {
p1[i2] = i2 * cost_ins;
}

zend_long tmp_wchar_len_1 = 0;
zend_long tmp_wchar_len_2 = 0;
bool first = true;

while (in_len_1) {
tmp_wchar_len_1 = enc->to_wchar(&in_1, &in_len_1, wchar_buf_1, 128, &state);
ZEND_ASSERT(in_len_1 <= 128);
tmp_wchar_len_2 = enc->to_wchar(&in_2, &in_len_2, wchar_buf_2, 128, &state);
len_2 += tmp_wchar_len_2;
ZEND_ASSERT(in_len_2 <= 128);

for (i1 = 0; i1 < tmp_wchar_len_1; i1++) {
/* First loop that does not cross a 128 code points */
if (first) {
p2[0] = p1[0] + cost_del;
}
/* Insertion process when there is a surplus of 128 code points. */
if (tmp_wchar_len_2 == 0) {
for (i2 = 0; i2 < tmp_wchar_len_1; i2++) {
c0 = p1[i2 + (len_2 - tmp_wchar_len_1)] + cost_rep;
c1 = p1[i2 + (len_2 - tmp_wchar_len_1) + 1] + cost_del;
if (c1 < c0) {
c0 = c1;
}
c2 = p2[i2 + (len_2 - tmp_wchar_len_1)] + cost_ins;
if (c2 < c0) {
c0 = c2;
}
p2[i2 + (len_2 - tmp_wchar_len_1) + 1] = c0;
}
} else {
for (i2 = 0; i2 < tmp_wchar_len_2; i2++) {
c0 = p1[i2 + (len_2 - tmp_wchar_len_2)] + (wchar_buf_1[i1] == wchar_buf_2[i2] ? 0 : cost_rep);
c1 = p1[i2 + (len_2 - tmp_wchar_len_2) + 1] + cost_del;
if (c1 < c0) {
c0 = c1;
}
c2 = p2[i2 + (len_2 - tmp_wchar_len_2)] + cost_ins;
if (c2 < c0) {
c0 = c2;
}
p2[i2 + (len_2 - tmp_wchar_len_2) + 1] = c0;
}
}
tmp = p1;
p1 = p2;
p2 = tmp;
}
first = false;
}

c0 = p1[strlen_2];
efree(p1);
efree(p2);

RETVAL_LONG(c0);
}

static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
{
const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
Expand Down
2 changes: 2 additions & 0 deletions ext/mbstring/mbstring.stub.php
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ function mb_ltrim(string $string, ?string $characters = null, ?string $encoding

function mb_rtrim(string $string, ?string $characters = null, ?string $encoding = null): string {}

function mb_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1, ?string $encoding = null): int {}

/** @refcount 1 */
function mb_detect_encoding(string $string, array|string|null $encodings = null, bool $strict = false): string|false {}

Expand Down
13 changes: 12 additions & 1 deletion ext/mbstring/mbstring_arginfo.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

86 changes: 86 additions & 0 deletions ext/mbstring/tests/mb_levenshtein.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
--TEST--
mb_levenshtein() function test
--FILE--
<?php
// Exercises mb_levenshtein(). After the two strings, the optional positional
// arguments are the insertion, replacement and deletion costs (each defaulting
// to 1), followed by the encoding.

echo '--- Equal ---' . \PHP_EOL;
var_dump(mb_levenshtein('12345', '12345'));

// Empty operands: the distance is the other string's length times the
// insertion or deletion cost.
echo '--- First string empty ---' . \PHP_EOL;
var_dump(mb_levenshtein('', 'xyz'));
echo '--- Second string empty ---' . \PHP_EOL;
var_dump(mb_levenshtein('xyz', ''));
echo '--- Both empty ---' . \PHP_EOL;
var_dump(mb_levenshtein('', ''));
var_dump(mb_levenshtein('', '', 10, 10, 10));

echo '--- 1 character ---' . \PHP_EOL;
var_dump(mb_levenshtein('1', '2'));
echo '--- 2 character swapped ---' . \PHP_EOL;
var_dump(mb_levenshtein('12', '21'));

// Insertion cost 2, deletion cost left at its default of 1.
echo '--- Inexpensive deletion ---' . \PHP_EOL;
var_dump(mb_levenshtein('2121', '11', 2));
// Insertion cost 2, replacement cost 1, deletion cost 5.
echo '--- Expensive deletion ---' . \PHP_EOL;
var_dump(mb_levenshtein('2121', '11', 2, 1, 5));

echo '--- Inexpensive insertion ---' . \PHP_EOL;
var_dump(mb_levenshtein('11', '2121'));
// Insertion cost 5.
echo '--- Expensive insertion ---' . \PHP_EOL;
var_dump(mb_levenshtein('11', '2121', 5));

// When replacing costs more than a deletion plus an insertion, the cheaper
// delete+insert pair is expected to win.
echo '--- Expensive replacement ---' . \PHP_EOL;
var_dump(mb_levenshtein('111', '121', 2, 3, 2));
echo '--- Very expensive replacement ---' . \PHP_EOL;
var_dump(mb_levenshtein('111', '121', 2, 9, 2));

// Inputs longer than 128 code points, i.e. longer than the 128-entry decode
// buffer used by the C implementation.
echo '--- 128 codepoints over ---' . \PHP_EOL;
var_dump(mb_levenshtein(str_repeat("a", 128) . "abc", str_repeat("a", 128) . "aaa"));
echo '--- 128 codepoints over only $string1 ---' . \PHP_EOL;
var_dump(mb_levenshtein(str_repeat("a", 128) . "abc", "aaa"));
echo '--- 128 codepoints over only $string2 ---' . \PHP_EOL;
var_dump(mb_levenshtein("abc", str_repeat("a", 128) . "aaa"));
// Multibyte input: the distance is counted in characters, not bytes.
echo '--- 128 codepoints over Hiragana ---' . \PHP_EOL;
var_dump(mb_levenshtein(str_repeat("あ", 128) . "あああ", str_repeat("あ", 128) . "あいう"));

// Same comparison with the operands converted to Shift_JIS and the encoding
// passed explicitly as a named argument.
echo '--- 128 codepoints over Hiragana in Shift_JIS ---' . \PHP_EOL;
$hiragana_a = mb_convert_encoding("あ", "SJIS", "UTF-8");
$hiragana_aiu = mb_convert_encoding("あいう", "SJIS", "UTF-8");
var_dump(mb_levenshtein(str_repeat($hiragana_a, 128 + 3), str_repeat($hiragana_a, 128) . $hiragana_aiu, encoding: "SJIS"));
?>
--EXPECT--
--- Equal ---
int(0)
--- First string empty ---
int(3)
--- Second string empty ---
int(3)
--- Both empty ---
int(0)
int(0)
--- 1 character ---
int(1)
--- 2 character swapped ---
int(2)
--- Inexpensive deletion ---
int(2)
--- Expensive deletion ---
int(10)
--- Inexpensive insertion ---
int(2)
--- Expensive insertion ---
int(10)
--- Expensive replacement ---
int(3)
--- Very expensive replacement ---
int(4)
--- 128 codepoints over ---
int(2)
--- 128 codepoints over only $string1 ---
int(128)
--- 128 codepoints over only $string2 ---
int(130)
--- 128 codepoints over Hiragana ---
int(2)
--- 128 codepoints over Hiragana in Shift_JIS ---
int(2)
Loading