Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

correctly treat backslash in datafusion-cli #14844

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions datafusion-cli/src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use crate::helper::split_from_semicolon;
use crate::print_format::PrintFormat;
use crate::{
command::{Command, OutputFormat},
helper::{unescape_input, CliHelper},
helper::CliHelper,
object_storage::get_object_store,
print_options::{MaxRows, PrintOptions},
};
Expand Down Expand Up @@ -172,7 +172,7 @@ pub async fn exec_from_repl(
}
}
Ok(line) => {
let lines = split_from_semicolon(line);
let lines = split_from_semicolon(&line);
for line in lines {
rl.add_history_entry(line.trim_end())?;
tokio::select! {
Expand Down Expand Up @@ -215,7 +215,6 @@ pub(super) async fn exec_and_print(
sql: String,
) -> Result<()> {
let now = Instant::now();
let sql = unescape_input(&sql)?;
let task_ctx = ctx.task_ctx();
let dialect = &task_ctx.session_config().options().sql_parser.dialect;
let dialect = dialect_from_str(dialect).ok_or_else(|| {
Expand Down
73 changes: 14 additions & 59 deletions datafusion-cli/src/helper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,8 @@ use std::borrow::Cow;

use crate::highlighter::{NoSyntaxHighlighter, SyntaxHighlighter};

use datafusion::common::sql_datafusion_err;
use datafusion::error::DataFusionError;
use datafusion::sql::parser::{DFParser, Statement};
use datafusion::sql::sqlparser::dialect::dialect_from_str;
use datafusion::sql::sqlparser::parser::ParserError;

use rustyline::completion::{Completer, FilenameCompleter, Pair};
use rustyline::error::ReadlineError;
Expand Down Expand Up @@ -63,15 +60,6 @@ impl CliHelper {

fn validate_input(&self, input: &str) -> Result<ValidationResult> {
if let Some(sql) = input.strip_suffix(';') {
let sql = match unescape_input(sql) {
Ok(sql) => sql,
Err(err) => {
return Ok(ValidationResult::Invalid(Some(format!(
" 🤔 Invalid statement: {err}",
))))
}
};

let dialect = match dialect_from_str(&self.dialect) {
Some(dialect) => dialect,
None => {
Expand Down Expand Up @@ -166,40 +154,8 @@ impl Validator for CliHelper {

impl Helper for CliHelper {}

/// Unescape input string from readline.
///
/// The data read from stdio will be escaped, so we need to unescape the input before executing the input
pub fn unescape_input(input: &str) -> datafusion::error::Result<String> {
Comment on lines -171 to -172
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this function was the source of the bug. Per the comment above, someone had a reason to write it though. Do you maybe know what changed? Like, did we change how we obtain input lines?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One thing that might have changed is that sqlparser got much more sophistcated in handling escaping itself / added a bunch of handling for the escaping in the parser (like maybe \u{} style syntax 🤔 )

let mut chars = input.chars();

let mut result = String::with_capacity(input.len());
while let Some(char) = chars.next() {
if char == '\\' {
if let Some(next_char) = chars.next() {
// https://static.rust-lang.org/doc/master/reference.html#literals
result.push(match next_char {
'0' => '\0',
'n' => '\n',
'r' => '\r',
't' => '\t',
'\\' => '\\',
_ => {
return Err(sql_datafusion_err!(ParserError::TokenizerError(
format!("unsupported escape char: '\\{}'", next_char)
)))
}
});
}
} else {
result.push(char);
}
}

Ok(result)
}

/// Splits a string which consists of multiple queries.
pub(crate) fn split_from_semicolon(sql: String) -> Vec<String> {
pub(crate) fn split_from_semicolon(sql: &str) -> Vec<String> {
let mut commands = Vec::new();
let mut current_command = String::new();
let mut in_single_quote = false;
Expand Down Expand Up @@ -310,14 +266,13 @@ mod tests {
)?;
assert!(matches!(result, ValidationResult::Valid(None)));

// should be invalid
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removed this check since it would throw error later
image

let result = readline_direct(
Cursor::new(
r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\u{07}');"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

worth keeping this test case?

.as_bytes()),
&validator,
)?;
assert!(matches!(result, ValidationResult::Invalid(Some(_))));
Cursor::new(
r"select '\', '\\', '\\\\\', 'dsdsds\\\\', '\t', '\0', '\n';".as_bytes(),
),
&validator,
)?;
assert!(matches!(result, ValidationResult::Valid(None)));

Ok(())
}
Expand Down Expand Up @@ -346,34 +301,34 @@ mod tests {
fn test_split_from_semicolon() {
let sql = "SELECT 1; SELECT 2;";
let expected = vec!["SELECT 1;", "SELECT 2;"];
assert_eq!(split_from_semicolon(sql.to_string()), expected);
assert_eq!(split_from_semicolon(sql), expected);

let sql = r#"SELECT ";";"#;
let expected = vec![r#"SELECT ";";"#];
assert_eq!(split_from_semicolon(sql.to_string()), expected);
assert_eq!(split_from_semicolon(sql), expected);

let sql = "SELECT ';';";
let expected = vec!["SELECT ';';"];
assert_eq!(split_from_semicolon(sql.to_string()), expected);
assert_eq!(split_from_semicolon(sql), expected);

let sql = r#"SELECT 1; SELECT 'value;value'; SELECT 1 as "text;text";"#;
let expected = vec![
"SELECT 1;",
"SELECT 'value;value';",
r#"SELECT 1 as "text;text";"#,
];
assert_eq!(split_from_semicolon(sql.to_string()), expected);
assert_eq!(split_from_semicolon(sql), expected);

let sql = "";
let expected: Vec<String> = Vec::new();
assert_eq!(split_from_semicolon(sql.to_string()), expected);
assert_eq!(split_from_semicolon(sql), expected);

let sql = "SELECT 1";
let expected = vec!["SELECT 1;"];
assert_eq!(split_from_semicolon(sql.to_string()), expected);
assert_eq!(split_from_semicolon(sql), expected);

let sql = "SELECT 1; ";
let expected = vec!["SELECT 1;"];
assert_eq!(split_from_semicolon(sql.to_string()), expected);
assert_eq!(split_from_semicolon(sql), expected);
}
}
4 changes: 4 additions & 0 deletions datafusion-cli/tests/cli_integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ fn init() {
["--command", "select 1; select 2;", "--format", "json", "-q"],
"[{\"Int64(1)\":1}]\n[{\"Int64(2)\":2}]\n"
)]
#[case::exec_backslash(
["--file", "tests/data/backslash.txt", "--format", "json", "-q"],
"[{\"Utf8(\\\"\\\\\\\")\":\"\\\\\",\"Utf8(\\\"\\\\\\\\\\\")\":\"\\\\\\\\\",\"Utf8(\\\"\\\\\\\\\\\\\\\\\\\\\\\")\":\"\\\\\\\\\\\\\\\\\\\\\",\"Utf8(\\\"dsdsds\\\\\\\\\\\\\\\\\\\")\":\"dsdsds\\\\\\\\\\\\\\\\\",\"Utf8(\\\"\\\\t\\\")\":\"\\\\t\",\"Utf8(\\\"\\\\0\\\")\":\"\\\\0\",\"Utf8(\\\"\\\\n\\\")\":\"\\\\n\"}]\n"
)]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's so many backslashes on a single line.
What does it test? How do we know it's really correct?
(when i test's semantics is debatable, it's prone to change later; it's better to have a test that "is obviously correct")

#[case::exec_from_files(
["--file", "tests/data/sql.txt", "--format", "json", "-q"],
"[{\"Int64(1)\":1}]\n"
Expand Down
1 change: 1 addition & 0 deletions datafusion-cli/tests/data/backslash.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
select '\', '\\', '\\\\\', 'dsdsds\\\\', '\t', '\0', '\n';