Skip to content

Commit

Permalink
address comments
Browse files Browse the repository at this point in the history
  • Loading branch information
gaoyangxiaozhu committed Jul 1, 2024
1 parent f8b740a commit 304914c
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 20 deletions.
15 changes: 11 additions & 4 deletions velox/docs/functions/spark/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -195,14 +195,21 @@ Unless specified otherwise, all functions return NULL if at least one of the arg

SELECT soundex('Miller'); -- "M460"

.. spark:function:: split(string, delimiter) -> array(string)
Splits ``string`` on ``delimiter`` and returns an array.
The delimiter is any string matching regex, supported by re2. ::
.. spark:function:: split(string, delimiter[, limit]) -> array(string)
Splits ``string`` around occurrences that match ``delimiter`` and returns an array
with a length of at most ``limit``. ``delimiter`` is a string representing a regular
expression. ``limit`` is an integer which controls the number of times the regex is
applied. By default, ``limit`` is -1. When ``limit`` > 0, the resulting array's
length will not be more than ``limit``, and the resulting array's last entry will
contain all input beyond the last matched ``delimiter``. When ``limit`` <= 0, ``delimiter``
will be applied as many times as possible, and the resulting array can be of any size. ::

SELECT split('oneAtwoBthreeC', '[ABC]'); -- ["one","two","three",""]
SELECT split('oneAtwoBthreeC', '[ABC]', 2); -- ["one","twoBthreeC"]
SELECT split('one', ''); -- ["o", "n", "e", ""]
SELECT split('one', '1'); -- ["one"]
SELECT split('abcd', ''); -- ["a", "b", "c", "d"]
SELECT split('abcd', '', 3); -- ["a", "b", "c"]

.. spark:function:: split(string, delimiter, limit) -> array(string)
:noindex:
Expand Down
68 changes: 52 additions & 16 deletions velox/functions/sparksql/SplitFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,28 @@ class Split final : public exec::VectorFunction {
const auto* rawStrings = strings->data<StringView>();
const auto delim = delims->valueAt<StringView>(0);
rows.applyToSelected([&](vector_size_t row) {
applyInner(rawStrings[row], delim, limit, row, resultWriter);
if (delim.size() == 0) {
splitEmptyDelimer(rawStrings[row], limit, row, resultWriter);
} else {
splitInner(rawStrings[row], delim, limit, row, resultWriter);
}
});
} else {
// The rest of the cases are handled through this general path and no
// direct access.
rows.applyToSelected([&](vector_size_t row) {
applyInner(
strings->valueAt<StringView>(row),
delims->valueAt<StringView>(row),
limit,
row,
resultWriter);
const auto delim = delims->valueAt<StringView>(row);
if (delim.size() == 0) {
splitEmptyDelimer(
strings->valueAt<StringView>(row), limit, row, resultWriter);
} else {
splitInner(
strings->valueAt<StringView>(row),
delim,
limit,
row,
resultWriter);
}
});
}

Expand All @@ -78,7 +88,40 @@ class Split final : public exec::VectorFunction {
->acquireSharedStringBuffers(strings->base());
}

inline void applyInner(
private:
mutable functions::detail::ReCache cache_;

// When pattern is empty, split each character out. Since Spark 3.4, when
// delimiter is empty, the result does not include an empty tail string, e.g.
// split('abc', '') outputs ["a", "b", "c"] instead of ["a", "b", "c", ""].
// The result does not include remaining string when limit is smaller than the
// string size, e.g. split('abc', '', 2) outputs ["a", "b"] instead of ["a",
// "bc"].
void splitEmptyDelimer(
const StringView current,
int64_t limit,
vector_size_t row,
exec::VectorWriter<Array<Varchar>>& resultWriter) const {
resultWriter.setOffset(row);
auto& arrayWriter = resultWriter.current();
if (current.size() == 0) {
arrayWriter.add_item().setNoCopy(StringView());
resultWriter.commit();
return;
}

const char* const begin = current.begin();
const char* const end = current.end();
const char* pos = begin;
while (pos < end && pos < limit + begin) {
arrayWriter.add_item().setNoCopy(StringView(pos, 1));
pos += 1;
}
resultWriter.commit();
}

// Split with a non-empty pattern.
void splitInner(
StringView input,
const StringView delim,
int64_t limit,
Expand All @@ -99,6 +142,7 @@ class Split final : public exec::VectorFunction {
// adding them to the elements vector, until we reached the end of the
// string or the limit.
int32_t addedElements{0};
bool emptyDelim = delim.size() == 0 ? true : false;
auto* re = cache_.findOrCompile(delim);
const auto re2String = re2::StringPiece(input.data(), input.size());
size_t pos = 0;
Expand All @@ -110,11 +154,6 @@ class Split final : public exec::VectorFunction {
auto offset = fullMatch.data() - start;
const auto size = fullMatch.size();

if (size == 0) {
// delimer is empty string
offset += 1;
}

if (offset >= input.size()) {
break;
}
Expand All @@ -135,9 +174,6 @@ class Split final : public exec::VectorFunction {
StringView(input.data() + pos, input.size() - pos));
resultWriter.commit();
}

private:
mutable functions::detail::ReCache cache_;
};

std::shared_ptr<exec::VectorFunction> createSplit(
Expand Down
7 changes: 7 additions & 0 deletions velox/functions/sparksql/tests/SplitFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,13 @@ TEST_F(SplitTest, split) {
{""},
});
assertEqualVectors(expected, run(inputStrings, delim, "split(C0, C1)"));
auto expected2 = makeArrayVector<StringView>({
{"I", ","},
{"o", "n"},
{""},
});
assertEqualVectors(
expected2, run(inputStrings, delim, "split(C0, C1, C2)", 2));

// Non-ascii, flat strings, flat delimiter, no limit.
delim = "లేదా";
Expand Down

0 comments on commit 304914c

Please sign in to comment.