-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update REGEXP_MATCH scalar function to support Utf8View #14449
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -193,6 +193,29 @@ NULL | |
[Köln] | ||
[إسرائيل] | ||
|
||
# test string view | ||
statement ok | ||
CREATE TABLE t_stringview AS | ||
SELECT arrow_cast(str, 'Utf8View') as str, arrow_cast(pattern, 'Utf8View') as pattern, arrow_cast(flags, 'Utf8View') as flags FROM t; | ||
|
||
query ? | ||
SELECT regexp_match(str, pattern, flags) FROM t_stringview; | ||
---- | ||
[a] | ||
[A] | ||
[B] | ||
NULL | ||
NULL | ||
NULL | ||
[010] | ||
[Düsseldorf] | ||
[Москва] | ||
[Köln] | ||
[إسرائيل] | ||
|
||
statement ok | ||
DROP TABLE t_stringview; | ||
|
||
query ? | ||
SELECT regexp_match('foobarbequebaz', ''); | ||
---- | ||
|
@@ -354,6 +377,29 @@ X | |
X | ||
X | ||
|
||
# test string view | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As discussed in #11911 (comment), it would be great to move these tests into string.slt, but we can totally do it as a follow-on as well. |
||
statement ok | ||
CREATE TABLE t_stringview AS | ||
SELECT arrow_cast(str, 'Utf8View') as str, arrow_cast(pattern, 'Utf8View') as pattern, arrow_cast(flags, 'Utf8View') as flags FROM t; | ||
|
||
query T | ||
SELECT regexp_replace(str, pattern, 'X', concat('g', flags)) FROM t_stringview; | ||
---- | ||
Xbc | ||
X | ||
aXc | ||
AbC | ||
aBC | ||
4000 | ||
X | ||
X | ||
X | ||
X | ||
X | ||
|
||
statement ok | ||
DROP TABLE t_stringview; | ||
|
||
query T | ||
SELECT regexp_replace('ABCabcABC', '(abc)', 'X', 'gi'); | ||
---- | ||
|
@@ -621,7 +667,7 @@ CREATE TABLE t_stringview AS | |
SELECT arrow_cast(str, 'Utf8View') as str, arrow_cast(pattern, 'Utf8View') as pattern, arrow_cast(start, 'Int64') as start, arrow_cast(flags, 'Utf8View') as flags FROM t; | ||
|
||
query I | ||
SELECT regexp_count(str, '\w') from t; | ||
SELECT regexp_count(str, '\w') from t_stringview; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 👍 This looks like a drive-by fix. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, it was. Couldn't resist. |
||
---- | ||
3 | ||
3 | ||
|
@@ -636,7 +682,7 @@ SELECT regexp_count(str, '\w') from t; | |
7 | ||
|
||
query I | ||
SELECT regexp_count(str, '\w{2}', start) from t; | ||
SELECT regexp_count(str, '\w{2}', start) from t_stringview; | ||
---- | ||
1 | ||
1 | ||
|
@@ -651,7 +697,7 @@ SELECT regexp_count(str, '\w{2}', start) from t; | |
3 | ||
|
||
query I | ||
SELECT regexp_count(str, 'ab', 1, 'i') from t; | ||
SELECT regexp_count(str, 'ab', 1, 'i') from t_stringview; | ||
---- | ||
1 | ||
1 | ||
|
@@ -667,7 +713,7 @@ SELECT regexp_count(str, 'ab', 1, 'i') from t; | |
|
||
|
||
query I | ||
SELECT regexp_count(str, pattern) from t; | ||
SELECT regexp_count(str, pattern) from t_stringview; | ||
---- | ||
1 | ||
1 | ||
|
@@ -682,7 +728,7 @@ SELECT regexp_count(str, pattern) from t; | |
1 | ||
|
||
query I | ||
SELECT regexp_count(str, pattern, start) from t; | ||
SELECT regexp_count(str, pattern, start) from t_stringview; | ||
---- | ||
1 | ||
1 | ||
|
@@ -697,7 +743,7 @@ SELECT regexp_count(str, pattern, start) from t; | |
1 | ||
|
||
query I | ||
SELECT regexp_count(str, pattern, start, flags) from t; | ||
SELECT regexp_count(str, pattern, start, flags) from t_stringview; | ||
---- | ||
1 | ||
1 | ||
|
@@ -713,7 +759,7 @@ SELECT regexp_count(str, pattern, start, flags) from t; | |
|
||
# test type coercion | ||
query I | ||
SELECT regexp_count(arrow_cast(str, 'Utf8'), arrow_cast(pattern, 'LargeUtf8'), arrow_cast(start, 'Int32'), flags) from t; | ||
SELECT regexp_count(arrow_cast(str, 'Utf8'), arrow_cast(pattern, 'LargeUtf8'), arrow_cast(start, 'Int32'), flags) from t_stringview; | ||
---- | ||
1 | ||
1 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -783,7 +783,7 @@ EXPLAIN SELECT | |
FROM test; | ||
---- | ||
logical_plan | ||
01)Projection: regexp_match(CAST(test.column1_utf8view AS Utf8), Utf8("^https?://(?:www\.)?([^/]+)/.*$")) AS k | ||
01)Projection: regexp_match(test.column1_utf8view, Utf8View("^https?://(?:www\.)?([^/]+)/.*$")) AS k | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🎉 |
||
02)--TableScan: test projection=[column1_utf8view] | ||
|
||
## Ensure no casts for REGEXP_REPLACE | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree that, technically speaking, this is an API change, but I also think it is small and is OK. I will flag this PR as an API change, but I think it will be minimally disruptive.