From 5a418d724d74dee213552ffa114a04a4486b95fc Mon Sep 17 00:00:00 2001 From: molsonkiko <46202915+molsonkiko@users.noreply.github.com> Date: Thu, 8 Feb 2024 19:36:31 -0800 Subject: [PATCH] regex form guess CSV params; fix multiline tb bug 1. the regex search form now automatically tries to guess the delimiter, eol and number of columns for a CSV file if the new auto_try_guess_csv_delim_newline setting is on. This is pretty slow, but there's room for improvement, and it makes life a lot easier. 2. Fix bug (introduced in last commit) where the Enter key does not add a new line in multiline textboxes. --- .github/workflows/CI_build.yml | 90 +++++----- CHANGELOG.md | 8 +- JsonToolsNppPlugin/Forms/NppFormHelper.cs | 25 +++ JsonToolsNppPlugin/Forms/RegexSearchForm.cs | 75 ++++++-- JsonToolsNppPlugin/Forms/TreeViewer.cs | 2 + JsonToolsNppPlugin/JSONTools/CsvSniffer.cs | 68 ++++++++ .../JSONTools/RemesPathFunctions.cs | 4 +- JsonToolsNppPlugin/JsonToolsNppPlugin.csproj | 2 + JsonToolsNppPlugin/Properties/AssemblyInfo.cs | 4 +- JsonToolsNppPlugin/Tests/CsvSnifferTests.cs | 163 ++++++++++++++++++ JsonToolsNppPlugin/Tests/TestRunner.cs | 2 + JsonToolsNppPlugin/Utils/Settings.cs | 7 + docs/README.md | 2 + most recent errors.txt | 88 +++++----- 14 files changed, 433 insertions(+), 107 deletions(-) create mode 100644 JsonToolsNppPlugin/JSONTools/CsvSniffer.cs create mode 100644 JsonToolsNppPlugin/Tests/CsvSnifferTests.cs diff --git a/.github/workflows/CI_build.yml b/.github/workflows/CI_build.yml index f598e75..7183730 100644 --- a/.github/workflows/CI_build.yml +++ b/.github/workflows/CI_build.yml @@ -1,45 +1,45 @@ -name: Continuous Integration - -on: - push: - paths-ignore: - - 'docs/**' - - '*.md' - - '*.txt' - - '*.PNG' - - 'makerelease.bat' - - 'testfiles/**' - -jobs: - build: - runs-on: windows-2022 - strategy: - max-parallel: 4 - matrix: - build_configuration: [Release, Debug] - build_platform: [x64, x86] - - steps: - - name: Checkout repo - uses: actions/checkout@v4 - - 
- name: Add msbuild to PATH - uses: microsoft/setup-msbuild@v2.0.0 - - - name: MSBuild of solution - run: msbuild JsonToolsNppPlugin/JsonToolsNppPlugin.sln /p:configuration="${{ matrix.build_configuration }}" /p:platform="${{ matrix.build_platform }}" /m /verbosity:minimal - - - name: Archive artifacts for x64 - if: matrix.build_platform == 'x64' && matrix.build_configuration == 'Release' - uses: actions/upload-artifact@v4 - with: - name: plugin_dll_x64 - path: JsonToolsNppPlugin\bin\${{ matrix.build_configuration }}-x64\JsonTools.dll - - - name: Archive artifacts for x86 - if: matrix.build_platform == 'x86' && matrix.build_configuration == 'Release' - uses: actions/upload-artifact@v4 - with: - name: plugin_dll_x86 - path: JsonToolsNppPlugin\bin\${{ matrix.build_configuration }}\JsonTools.dll - +name: Continuous Integration + +on: + push: + paths-ignore: + - 'docs/**' + - '*.md' + - '*.txt' + - '*.PNG' + - 'makerelease.bat' + - 'testfiles/**' + +jobs: + build: + runs-on: windows-2022 + strategy: + max-parallel: 4 + matrix: + build_configuration: [Release, Debug] + build_platform: [x64, x86] + + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Add msbuild to PATH + uses: microsoft/setup-msbuild@v2.0.0 + + - name: MSBuild of solution + run: msbuild JsonToolsNppPlugin/JsonToolsNppPlugin.sln /p:configuration="${{ matrix.build_configuration }}" /p:platform="${{ matrix.build_platform }}" /m /verbosity:minimal + + - name: Archive artifacts for x64 + if: matrix.build_platform == 'x64' && matrix.build_configuration == 'Release' + uses: actions/upload-artifact@v4 + with: + name: plugin_dll_x64 + path: JsonToolsNppPlugin\bin\${{ matrix.build_configuration }}-x64\JsonTools.dll + + - name: Archive artifacts for x86 + if: matrix.build_platform == 'x86' && matrix.build_configuration == 'Release' + uses: actions/upload-artifact@v4 + with: + name: plugin_dll_x86 + path: JsonToolsNppPlugin\bin\${{ matrix.build_configuration }}\JsonTools.dll + diff --git 
a/CHANGELOG.md b/CHANGELOG.md index 705e182..ec7ddba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,11 +26,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/). * Add `uses_context` field to ArgFunction instances, so that they have JQueryContext appended to their arguments, and they can reference fields of that JQueryContext. * This way we don't have to have these methods mutating and referencing a global static variable. * Additionally, the presence of a function with `uses_context=true` would serve as a flag that the query cannot be executed in parallel, because doing so would cause race conditions associated with the shared JQueryContext fields. -7. Make it so the regex search form makes a very basic effort to determine the quote character, delimiter, and number of columns in CSV files. - * maybe only try to do this for files with the `.csv` and `.tsv` extensions - * only test the `,` and `\t` delimiters, and only the `"` or `'` quote characters - * test only the first 10KB of the file, or first 25 lines, whichever comes first. -8. Unit tests that randomly generate text with JSON chars to make sure JSON parser never throws for any reason, since errors aren't caught. +7. Unit tests that randomly generate text with JSON chars to make sure JSON parser never throws for any reason, since errors aren't caught. ### To Be Changed @@ -52,6 +48,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/). - issue with treeview closing when a file with a treeview is moved from one view to another - `loop()` function used in `s_sub` callbacks is not thread-safe. *This doesn't matter right now* because RemesPath is single-threaded, but it could matter in the future. - __GrepperForm loses its JSON permanently when the buffer associated with its treeview is deleted.__ +- Since v7.0, holding down `Enter` in a multiline textbox (like the [tree viewer query box](/docs/README.md#remespath)) only adds one newline when the key is lifted. 
## [7.0.0] - (UNRELEASED) YYYY-MM-DD @@ -63,6 +60,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/). 4. [Python-style single-line comments in RemesPath](/docs/RemesPath.md#comments-added-in-v62) 5. A [RemesPath user-defined language (UDL) file](/RemesPath%20UDL.xml), providing some very basic syntax highlighting. It is buggy, but that is because the UDL system is inherently buggy, not because I did anything wrong (as far as I know). 6. A `:` character between two key-value pairs in an object no longer causes a fatal error that makes the parser quit. +7. Add new `auto_try_guess_csv_delim_newline` setting. If this is true (default false), [Regex search form](/docs/README.md#regex-search-form) now makes a very basic attempt to "sniff" if the current file is CSV whenever it is opened, or when the `Parse as CSV?` button is toggled on. ### Changed diff --git a/JsonToolsNppPlugin/Forms/NppFormHelper.cs b/JsonToolsNppPlugin/Forms/NppFormHelper.cs index 721222f..574682b 100644 --- a/JsonToolsNppPlugin/Forms/NppFormHelper.cs +++ b/JsonToolsNppPlugin/Forms/NppFormHelper.cs @@ -79,6 +79,8 @@ public static void GenericKeyUpHandler(Form form, object sender, KeyEventArgs e, // Enter has the same effect as clicking a selected button btn.PerformClick(); } + else + PressEnterInTextBoxHandler(sender, isModal); } // Escape -> // * if this.IsModal (meaning this is a pop-up dialog), close this. @@ -97,6 +99,29 @@ public static void GenericKeyUpHandler(Form form, object sender, KeyEventArgs e, } } + /// <summary> + /// NPPM_MODELESSDIALOG consumes the KeyDown and KeyPress events for the Enter key,<br></br> + /// so our KeyUp handler needs to simulate pressing enter to add a new line in a multiline text box.<br></br> + /// Note that this does not fully repair the functionality of the Enter key in a multiline text box, + /// because only one newline can be created for a single keypress of Enter, no matter how long the key is held down. 
+ /// </summary> + /// <param name="sender">the text box that sent the message</param> + /// <param name="isModal">if true, this blocks the parent application until closed. THIS IS ONLY TRUE OF POP-UP DIALOGS</param> + public static void PressEnterInTextBoxHandler(object sender, bool isModal) + { + + if (!isModal && sender is TextBox tb && tb.Multiline) + { + int selstart = tb.SelectionStart; + tb.SelectedText = ""; + string text = tb.Text; + tb.Text = text.Substring(0, selstart) + "\r\n" + text.Substring(selstart); + tb.SelectionStart = selstart + 2; // after the inserted newline + tb.SelectionLength = 0; + tb.ScrollToCaret(); + } + } + /// <summary> /// CALL THIS IN YOUR Dispose(bool disposing) METHOD, INSIDE OF THE ".Designer.cs" FILE<br></br> /// If this was a modeless dialog (i.e., !isModal; a dialog that does not block Notepad++ while open),<br></br> diff --git a/JsonToolsNppPlugin/Forms/RegexSearchForm.cs b/JsonToolsNppPlugin/Forms/RegexSearchForm.cs index 5a5aada..41726dc 100644 --- a/JsonToolsNppPlugin/Forms/RegexSearchForm.cs +++ b/JsonToolsNppPlugin/Forms/RegexSearchForm.cs @@ -5,6 +5,7 @@ using JSON_Tools.JSON_Tools; using System.Linq; using System.Collections.Generic; +using Kbg.NppPluginNET.PluginInfrastructure; namespace JSON_Tools.Forms { @@ -53,6 +54,10 @@ public RegexSearchForm() "}"), 0); GetTreeViewInRegexMode(); + // check it, see if we have a CSV + ParseAsCsvCheckBox.Checked = true; + if (NColumnsTextBox.Text.Length == 0) + ParseAsCsvCheckBox.Checked = false; } public void GrabFocus() @@ -86,7 +91,7 @@ private void GetTreeViewInRegexMode() private static readonly Dictionary<string, int> HEADER_HANDLING_ABBREV_MAP = new Dictionary<string, int> { ["\"h\""] = 1, ["\"n\""] = 0, ["\"d\""] = 2, ["1"] = 1, ["2"] = 2, ["0"] = 0 }; private static readonly Dictionary<string, int> NEWLINE_MAP = new Dictionary<string, int> { ["\"\\r\\n\""] = 0, ["\"\\n\""] = 1, ["\"\\r\""] = 2, ["0"] = 0, ["1"] = 1, ["2"] = 2 }; - + public void SearchButton_Click(object 
sender, EventArgs e) { GetTreeViewInRegexMode(); @@ -151,22 +156,68 @@ private void RegexSearchForm_KeyUp(object sender, KeyEventArgs e) NppFormHelper.GenericKeyUpHandler(this, sender, e, false); } + /// <summary> + /// <strong>Checking</strong> the ParseAsCsvCheckBox does the following:<br></br> + /// - reveals all the CSV-related controls<br></br> + /// - disables the regex-related controls<br></br> + /// - if the auto_try_guess_csv_delim_newline setting is on, sniffs the first 1600 characters of the document (or 16 lines, whichever comes first) + /// using every combo of (',', '\t') delimiters and ('\r\n', '\n', '\r') newlines + /// and sets the CSV controls appropriately if a match is found<br></br> + /// <strong>Unchecking</strong> the ParseAsCsvCheckBox does the following:<br></br> + /// - hides the CSV related controls<br></br> + /// - enables the regex-related controls + /// </summary> public void ParseAsCsvCheckBox_CheckedChanged(object sender, EventArgs e) { bool showCsvButtons = ParseAsCsvCheckBox.Checked; - QuoteCharTextBox.Visible = showCsvButtons; - QuoteCharTextBoxLabel.Visible = showCsvButtons; - DelimiterTextBox.Visible = showCsvButtons; - DelimiterTextBoxLabel.Visible = showCsvButtons; - NewlineComboBox.Visible = showCsvButtons; - NewlineComboBoxLabel.Visible = showCsvButtons; - HeaderHandlingComboBox.Visible = showCsvButtons; + // thanks to the magical mysteries of registering this form with NPPM_MODELESSDIALOG, + // the order in which I make controls visible defines their tab order. 
+ DelimiterTextBox.Visible = showCsvButtons; + DelimiterTextBoxLabel.Visible = showCsvButtons; + QuoteCharTextBox.Visible = showCsvButtons; + QuoteCharTextBoxLabel.Visible = showCsvButtons; + NewlineComboBox.Visible = showCsvButtons; + NewlineComboBoxLabel.Visible = showCsvButtons; + NColumnsTextBox.Visible = showCsvButtons; + NColumnsTextBoxLabel.Visible = showCsvButtons; + HeaderHandlingComboBox.Visible = showCsvButtons; HeaderHandlingComboBoxLabel.Visible = showCsvButtons; - NColumnsTextBox.Visible = showCsvButtons; - NColumnsTextBoxLabel.Visible = showCsvButtons; - RegexTextBox.Enabled = !showCsvButtons; - IgnoreCaseCheckBox.Enabled = !showCsvButtons; + RegexTextBox.Enabled = !showCsvButtons; + IgnoreCaseCheckBox.Enabled = !showCsvButtons; IncludeFullMatchAsFirstItemCheckBox.Enabled = !showCsvButtons; + if (showCsvButtons && Main.settings.auto_try_guess_csv_delim_newline) + { + if (TrySniffCommonDelimsAndEols(out EndOfLine eol, out char delim, out int nColumns)) + { + // we found possible NColumns, delimiter, and Newline values + NColumnsTextBox.Text = nColumns.ToString(); + DelimiterTextBox.Text = ArgFunction.CsvCleanChar(delim); + QuoteCharTextBox.Text = "\""; + NewlineComboBox.SelectedIndex = eol == EndOfLine.CRLF ? 0 : eol == EndOfLine.LF ? 
1 : 2; + } + } + } + + private static bool TrySniffCommonDelimsAndEols(out EndOfLine eol, out char delim, out int nColumns) + { + eol = EndOfLine.CRLF; + delim = '\x00'; + nColumns = -1; + string text = Npp.editor.GetText(CsvSniffer.DEFAULT_MAX_CHARS_TO_SNIFF * 3 / 2); + foreach (EndOfLine maybeEol in new EndOfLine[]{EndOfLine.CRLF, EndOfLine.LF, EndOfLine.CR}) + { + foreach (char maybeDelim in ",\t") + { + nColumns = CsvSniffer.Sniff(text, maybeEol, maybeDelim, '"'); + if (nColumns >= 2) + { + delim = maybeDelim; + eol = maybeEol; + return true; + } + } + } + return false; } diff --git a/JsonToolsNppPlugin/Forms/TreeViewer.cs b/JsonToolsNppPlugin/Forms/TreeViewer.cs index 68ba51f..0864d62 100644 --- a/JsonToolsNppPlugin/Forms/TreeViewer.cs +++ b/JsonToolsNppPlugin/Forms/TreeViewer.cs @@ -144,6 +144,8 @@ private void TreeViewer_KeyUp(object sender, KeyEventArgs e) selected.Collapse(true); // don't collapse the children as well else selected.Expand(); } + else if (QueryBox.Focused) + NppFormHelper.PressEnterInTextBoxHandler(QueryBox, false); } // Escape -> go to editor else if (e.KeyData == Keys.Escape) diff --git a/JsonToolsNppPlugin/JSONTools/CsvSniffer.cs b/JsonToolsNppPlugin/JSONTools/CsvSniffer.cs new file mode 100644 index 0000000..0c7815e --- /dev/null +++ b/JsonToolsNppPlugin/JSONTools/CsvSniffer.cs @@ -0,0 +1,68 @@ +using Kbg.NppPluginNET.PluginInfrastructure; +using System.Text.RegularExpressions; + +namespace JSON_Tools.JSON_Tools +{ + public class CsvSniffer + { + public const int DEFAULT_MAX_CHARS_TO_SNIFF = 1600; + + /// <summary> + /// Attempt to parse text as an RFC 4180-compliant CSV file with delimiter delimiter, newline eol, and quote character '"'.<br></br> + /// Each line, count how many columns there are. 
If there are two lines with different numbers of columns, or if all lines have one column, return -1.<br></br> + /// If maxLinesToSniff lines are consumed before hitting maxCharsToSniff characters, + /// and all the lines have the same number of columns, return that number of columns<br></br> + /// If maxCharsToSniff characters are consumed, return -1 unless at least minLinesToDecide lines were consumed. + /// </summary> + /// <param name="eol">CR, CRLF, or LF</param> + /// <param name="delimiter">the delimiter (e.g., ',' for a CSV file, '\t' for a TSV file)</param> + /// <param name="maxLinesToSniff">consume no more than this many complete lines while sniffing</param> + /// <param name="maxCharsToSniff">consume no more than this many characters while sniffing</param> + /// <param name="minLinesToDecide">return -1 if fewer than this many complete lines were consumed</param> + /// <returns>the number of columns in the file, or -1 if text does not appear to have that delimiter-eol combo</returns> + public static int Sniff(string text, EndOfLine eol, char delimiter, char quote, int maxLinesToSniff = 16, int maxCharsToSniff = DEFAULT_MAX_CHARS_TO_SNIFF, int minLinesToDecide = 6) + { + maxCharsToSniff = maxCharsToSniff > text.Length ? text.Length : maxCharsToSniff; + int nColumns = -1; + int nColumnsThisLine = 0; + int matchStart = 0; + int linesConsumed = 0; + string delimStr = ArgFunction.CsvCleanChar(delimiter); + string newline = eol == EndOfLine.CRLF ? "\r\n" : eol == EndOfLine.CR ? 
"\r" : "\n"; + string escapedNewline = JNode.StrToString(newline, false); + string newlineOrDelimiter = $"(?:{delimStr}|{escapedNewline}|\\z)"; + string regexStr = ArgFunction.CsvColumnRegex(ArgFunction.CsvCleanChar(delimiter), new string(quote, 1)) + newlineOrDelimiter; + Regex regex = new Regex(regexStr, RegexOptions.Compiled); + while (matchStart < maxCharsToSniff) + { + Match match = regex.Match(text, matchStart); + if (!match.Success) + return -1; + nColumnsThisLine++; + int matchEnd = matchStart + match.Length; + if (matchEnd == matchStart) + matchEnd++; + bool atEndOfLine = matchEnd >= text.Length || match.Value.EndsWith(newline); + if (atEndOfLine) + { + if (nColumns == -1) // first line + nColumns = nColumnsThisLine; + else if (nColumns == 1 // a row has only one column + || (nColumns >= 0 && nColumnsThisLine != nColumns)) // two rows with different numbers of columns + return -1; + nColumnsThisLine = 0; + linesConsumed++; + if (linesConsumed == maxLinesToSniff) + return nColumns; + } + matchStart = matchEnd; + } + if (linesConsumed < 2 // only one line is never enough to decide anything + || (matchStart < text.Length && linesConsumed < minLinesToDecide)) + // we haven't consumed enough lines to be confident in our delimiter-eol combo + // (unless we consumed the whole file, in which case we're fine) + return -1; + return nColumns; + } + } +} diff --git a/JsonToolsNppPlugin/JSONTools/RemesPathFunctions.cs b/JsonToolsNppPlugin/JSONTools/RemesPathFunctions.cs index 385b716..436aef7 100644 --- a/JsonToolsNppPlugin/JSONTools/RemesPathFunctions.cs +++ b/JsonToolsNppPlugin/JSONTools/RemesPathFunctions.cs @@ -2485,12 +2485,12 @@ public static JNode StrFind(List<JNode> args) /// <summary> /// converts the delimiter to a format suitable for use in regular expressions /// </summary> - private static string CsvCleanChar(char c) + public static string CsvCleanChar(char c) { return c == '\t' ? 
"\\t" : Regex.Escape(new string(c, 1)); } - private static string CsvColumnRegex(string delimiter, string quote) + public static string CsvColumnRegex(string delimiter, string quote) { return CSV_BASE_COLUMN_REGEX.Replace("{QUOTE}", quote).Replace("{DELIM}", delimiter); } diff --git a/JsonToolsNppPlugin/JsonToolsNppPlugin.csproj b/JsonToolsNppPlugin/JsonToolsNppPlugin.csproj index a41630b..53791c9 100644 --- a/JsonToolsNppPlugin/JsonToolsNppPlugin.csproj +++ b/JsonToolsNppPlugin/JsonToolsNppPlugin.csproj @@ -116,6 +116,7 @@ <Compile Include="Forms\TreeViewer.Designer.cs"> <DependentUpon>TreeViewer.cs</DependentUpon> </Compile> + <Compile Include="JSONTools\CsvSniffer.cs" /> <Compile Include="JSONTools\Dson.cs" /> <Compile Include="PluginInfrastructure\ClikeStringArray.cs" /> <Compile Include="PluginInfrastructure\DllExport\DllExportAttribute.cs" /> @@ -206,6 +207,7 @@ <DependentUpon>Resources.resx</DependentUpon> </Compile> <Compile Include="Tests\Benchmarker.cs" /> + <Compile Include="Tests\CsvSnifferTests.cs" /> <Compile Include="Tests\IniFileParserTests.cs" /> <Compile Include="Tests\JsonGrepperTests.cs" /> <Compile Include="Tests\JsonParserTests.cs" /> diff --git a/JsonToolsNppPlugin/Properties/AssemblyInfo.cs b/JsonToolsNppPlugin/Properties/AssemblyInfo.cs index 8aa0fcd..ddeacc0 100644 --- a/JsonToolsNppPlugin/Properties/AssemblyInfo.cs +++ b/JsonToolsNppPlugin/Properties/AssemblyInfo.cs @@ -28,5 +28,5 @@ // Build Number // Revision // -[assembly: AssemblyVersion("6.1.1.19")] -[assembly: AssemblyFileVersion("6.1.1.19")] +[assembly: AssemblyVersion("6.1.1.20")] +[assembly: AssemblyFileVersion("6.1.1.20")] diff --git a/JsonToolsNppPlugin/Tests/CsvSnifferTests.cs b/JsonToolsNppPlugin/Tests/CsvSnifferTests.cs new file mode 100644 index 0000000..e8728c4 --- /dev/null +++ b/JsonToolsNppPlugin/Tests/CsvSnifferTests.cs @@ -0,0 +1,163 @@ +using System; +using System.Collections.Generic; +using System.Text.RegularExpressions; +using JSON_Tools.JSON_Tools; +using 
JSON_Tools.Utils; +using Kbg.NppPluginNET.PluginInfrastructure; + +namespace JSON_Tools.Tests +{ + public class CsvSnifferTests + { + public static bool Test() + { + var testcases = new (string text, char delim, EndOfLine eol, char quote, int maxLinesToSniff, int maxCharsToSniff, int minLinesToDecide, int correctOutput)[]{ + ("foo\tbar\tbaz\r\n" + + "1\t2\t3\r\n" + + "\"4\t5\r\n\"\t6\t7", '\t', EndOfLine.CRLF, '"',16, 1000, 6, 3), + ("foo\tbar\tbaz\r\n" + + "1\t2\t3\r\n" + + "\"4\t5\r\n\"\t6\t7\r\n", '\t', EndOfLine.CRLF, '"', 16, 1000, 6, 3), // make sure trailing newlines tolerated + ("foo\tbar\tbaz\r\n" + + "1\t2\t3\r\n" + + "\"4\t5\r\n\"\t6\t7", '\t', EndOfLine.CRLF, '"', 16, 30, 6, -1), // stop before consuming last line + ("foo\tbar\tbaz\tquz\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + // the sniffer should stop at the end of this line + "1\t2\t3\t\"1\r\"\"2\"\t3\r\n" + // there are more than 4 columns here, but that shouldn't matter because the sniffer should have stopped + "1\t2\t3\t\"1\r\"\"2\"\t\t\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\t\t\t\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "\"4\t5\r\n\"\t6\t7\t9", '\t', EndOfLine.CRLF, '"', 16, 10_000, 6, 4), + ("foo\tbar\tbaz\tquz\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + 
"1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\t3\r\n" + // there are more than 4 columns here, and the sniffer should go far enough to notice it and complain + "1\t2\t3\t\"1\r\"\"2\"\t\t\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\t\t\t\r\n" + + "1\t2\t3\t\"1\r\"\"2\"\r\n" + + "\"4\t5\r\n\"\t6\t7\t9", '\t', EndOfLine.CRLF, '"', 20, 10_000, 6, -1), + ("baz,quz\n" + + "3,\"1\"\",2\"\n" + + "3,\"1\"\",2\"\n" + + "3,\"1\"\",2\"\n" + + "3,\"1\"\",2\"\n" + + "3,\"1\"\",2\"\n" + + "3,\"1\"\",2\"\n" + + "3,\"1\"\",2\"\n" + + "3,\"1\"\",2\"\n" + + "3,\"1\"\",2\"\n" + + "3,\"1\"\",2\"\n" + // the sniffer should stop at the end of this line + "3,\"1\"\",2\",3\n" + // there are more than 2 columns here, but that shouldn't matter because the sniffer should have stopped + "3,\"1\"\",2\",,\n" + + "\"4,5\n\",6,7,9", ',', EndOfLine.LF, '"', 11, 10_000, 6, 2), + ("baz$quz\r" + + "3$\"1\"\"$2\"\r" + + "3$\"1\"\"$2\"\r" + + "3$\"1\"\"$2\"\r" + + "3$\"1\"\"$2\"\r" + + "3$\"1\"\"$2\"\r" + + "3$\"1\"\"$2\"\r" + + "3$\"1\"\"$2\"\r" + // the sniffer should stop at the end of this line + "3$\"1\"\"$2\"$3\r" + // there are more than 2 columns here, but that shouldn't matter because the sniffer should have stopped + "3$\"1\"\"$2\"$$\r" + + "\"4$5\r\"$6$7$9", '$', EndOfLine.CR, '"', 8, 10_000, 5, 2), + ("baz$quz\r" + + "3$\"1\"\"$2\"\r" + + "3$\"1\"\"$2\"\r" + + "3$\"1\"\"$2\"\r" + + "3$\"1\"\"$2\"\r" + + "3$\"1\"\"$2\"\r" + + "3$\"1\"\"$2\"$3\r" + // the sniffer will stop here, and complain because there are more than 2 columns + "3$\"1\"\"$2\"$$\r" + + "\"4$5\r\"$6$7$9", '$', EndOfLine.CR, '"', 7, 10_000, 5, -1), + ("baz$quz\r" + + "3$\"1\"\"$2\"\r" + + "3$\"1\"\"$2\"\r" + + "3$\"1\"\"$2\"\r" + // the sniffer will stop here, and complain because it hasn't reached its minLinesToDecide of 5 + "3$\"1\"\"$2\"\r" + + "3$\"1\"\"$2\"\r" + + "3$\"1\"\"$2\"\r" + + "3$\"1\"\"$2\"\r" + + "3$\"1\"\"$2\"$3\r" + + 
"3$\"1\"\"$2\"$$\r" + + "\"4$5\r\"$6$7$9", '$', EndOfLine.CR, '"', 8, 38, 5, -1), + ("abc\r\nd,e\r\nf,g\r\nh,i\r\nj,k\r\nl,m\r\nn,o\r\np,q", '\t', EndOfLine.CRLF, '"', 16, 10_000, 5, -1), // 1 column, must quit + ("abc\r\nd,e\r\nf,g\r\nh,i\r\nj,k\r\nl,m\r\nn,o\r\np,q", ',', EndOfLine.CRLF, '"', 16, 10_000, 5, -1), // first row has 1 column, must quit + ("a,bc\r\nd,\r\n,g\r\nh,\"\"\r\n\"\",k\r\n,\r\nn,o\r\np,q", ',', EndOfLine.CRLF, '"', 16, 10_000, 5, 2), // rows with empty values + (",bc\nd,e\nf,g\nh,i\nj,\nl,m\nn,o\np,q", ',', EndOfLine.LF, '"', 16, 10_000, 5, 2), + ("nums,names,cities,date,zone,subzone,contaminated\r\nnan,Bluds,BUS,,1,a,TRUE\r\nnan,Bluds,BUS,,1,b,FALSE\r\nnan,Bluds,BUS,,1,c,FALSE\r\n0.5,dfsd,FUDG,12/13/2020 0:00,2,c,TRUE\r\n0.5,\"df\"\"sd\",FUDG,12/13/2020 0:00,2,d,FALSE\r\n0.5,dfsd,FUDG,12/13/2020 0:00,2,e,FALSE\r\n0.5,dfsd,\"FU\r\nDG\",12/13/2020 0:00,2,g,FALSE\r\n1.2,qere,GOLAR,,3,f,TRUE\r\n1.2,qere,GOLAR,,3,h,TRUE\r\n3.4,flodt,\"q,tün\",,4,q,FALSE\r\n4.6,Kjond,YUNOB,10/17/2014 0:00,5,w,TRUE\r\n4.6,Kjond,YUNOB,10/17/2014 0:00,5,z,FALSE\r\n7,Unyir,MOKJI,5/11/2017 0:00,6,i,TRUE\r\n", + ',', EndOfLine.CRLF, '"', 16, 10_000, 6, 7), + ("nums,names,cities,date,zone,subzone,contaminated\nnan,Bluds,BUS,,1,a,TRUE\nnan,Bluds,BUS,,1,b,FALSE\nnan,Bluds,BUS,,1,c,FALSE\n0.5,dfsd,FUDG,12/13/2020 0:00,2,c,TRUE\n0.5,\"df\"\"sd\",FUDG,12/13/2020 0:00,2,d,FALSE\n0.5,dfsd,FUDG,12/13/2020 0:00,2,e,FALSE\n0.5,dfsd,\"FU\nDG\",12/13/2020 0:00,2,g,FALSE\n1.2,qere,GOLAR,,3,f,TRUE\n1.2,qere,GOLAR,,3,h,TRUE\n3.4,flodt,\"q,tün\",,4,q,FALSE\n4.6,Kjond,YUNOB,10/17/2014 0:00,5,w,TRUE\n4.6,Kjond,YUNOB,10/17/2014 0:00,5,z,FALSE\n7,Unyir,MOKJI,5/11/2017 0:00,6,i,TRUE\n", + ',', EndOfLine.LF, '"', 16, 400, 5, 7), + ("nums,names,cities,date,zone,subzone,contaminated\nnan,Bluds,BUS,,1,a,TRUE\nnan,Bluds,BUS,,1,b,FALSE\nnan,Bluds,BUS,,1,c,FALSE\n0.5,dfsd,FUDG,12/13/2020 0:00,2,c,TRUE\n0.5,\"df\"\"sd\",FUDG,12/13/2020 0:00,2,d,FALSE\n0.5,dfsd,FUDG,12/13/2020 
0:00,2,e,FALSE\n0.5,dfsd,\"FU\nDG\",12/13/2020 0:00,2,g,FALSE\n1.2,qere,GOLAR,,3,f,TRUE\n1.2,qere,GOLAR,,3,h,TRUE\n3.4,flodt,\"q,tün\",,4,q,FALSE\n4.6,Kjond,YUNOB,10/17/2014 0:00,5,w,TRUE\n4.6,Kjond,YUNOB,10/17/2014 0:00,5,z,FALSE\n7,Unyir,MOKJI,5/11/2017 0:00,6,i,TRUE\n", + ',', EndOfLine.CRLF, '"', 16, 400, 5, -1), + ("nums,names,cities,date,zone,subzone,contaminated\rnan,Bluds,BUS,,1,a,TRUE\rnan,Bluds,BUS,,1,b,FALSE\rnan,Bluds,BUS,,1,c,FALSE\r0.5,dfsd,FUDG,12/13/2020 0:00,2,c,TRUE\r0.5,\"df\"\"sd\",FUDG,12/13/2020 0:00,2,d,FALSE\r0.5,dfsd,FUDG,12/13/2020 0:00,2,e,FALSE\r0.5,dfsd,\"FU\rDG\",12/13/2020 0:00,2,g,FALSE\r1.2,qere,GOLAR,,3,f,TRUE\r1.2,qere,GOLAR,,3,h,TRUE\r3.4,flodt,\"q,tün\",,4,q,FALSE\r4.6,Kjond,YUNOB,10/17/2014 0:00,5,w,TRUE\r4.6,Kjond,YUNOB,10/17/2014 0:00,5,z,FALSE\r7,Unyir,MOKJI,5/11/2017 0:00,6,i,TRUE\r", + ',', EndOfLine.CR, '"', 16, 400, 7, 7), + ("nums,names,cities,date,zone,subzone,contaminated\rnan,Bluds,BUS,,1,a,TRUE\rnan,Bluds,BUS,,1,b,FALSE\rnan,Bluds,BUS,,1,c,FALSE\r0.5,dfsd,FUDG,12/13/2020 0:00,2,c,TRUE\r0.5,\"df\"\"sd\",FUDG,12/13/2020 0:00,2,d,FALSE\r0.5,dfsd,FUDG,12/13/2020 0:00,2,e,FALSE\r0.5,dfsd,\"FU\rDG\",12/13/2020 0:00,2,g,FALSE\r1.2,qere,GOLAR,,3,f,TRUE\r1.2,qere,GOLAR,,3,h,TRUE\r3.4,flodt,\"q,tün\",,4,q,FALSE\r4.6,Kjond,YUNOB,10/17/2014 0:00,5,w,TRUE\r4.6,Kjond,YUNOB,10/17/2014 0:00,5,z,FALSE\r7,Unyir,MOKJI,5/11/2017 0:00,6,i,TRUE\r", + ',', EndOfLine.CR, '"', 16, 4000, 7, 7), + ("nums,names,cities,date,zone,subzone,contaminated\rnan,Bluds,BUS,,1,a,TRUE\rnan,Bluds,BUS,,1,b,FALSE\rnan,Bluds,BUS,,1,c,FALSE\r0.5,dfsd,FUDG,12/13/2020 0:00,2,c,TRUE\r0.5,\"df\"\"sd\",FUDG,12/13/2020 0:00,2,d,FALSE\r0.5,dfsd,FUDG,12/13/2020 0:00,2,e,FALSE\r0.5,dfsd,\"FU\rDG\",12/13/2020 0:00,2,g,FALSE\r1.2,qere,GOLAR,,3,f,TRUE\r1.2,qere,GOLAR,,3,h,TRUE\r3.4,flodt,\"q,tün\",,4,q,FALSE\r4.6,Kjond,YUNOB,10/17/2014 0:00,5,w,TRUE\r4.6,Kjond,YUNOB,10/17/2014 0:00,5,z,FALSE\r7,Unyir,MOKJI,5/11/2017 0:00,6,i,TRUE", + ',', EndOfLine.CR, '"', 16, 4000, 7, 
7), // same as previous, but w/o trailing EOL + ("nums,names,cities,date,zone,subzone,contaminated\rnan,Bluds,BUS,,1,a,TRUE\rnan,Bluds,BUS,,1,b,FALSE\rnan,Bluds,BUS,,1,c,FALSE\r0.5,dfsd,FUDG,12/13/2020 0:00,2,c,TRUE\r0.5,\"df\"\"sd\",FUDG,12/13/2020 0:00,2,d,FALSE\r0.5,dfsd,FUDG,12/13/2020 0:00,2,e,FALSE\r0.5,dfsd,\"FU\rDG\",12/13/2020 0:00,2,g,FALSE\r1.2,qere,GOLAR,,3,f,TRUE\r1.2,qere,GOLAR,,3,h,TRUE\r3.4,flodt,\"q,tün\",,4,q,FALSE\r4.6,Kjond,YUNOB,10/17/2014 0:00,5,w,TRUE\r4.6,Kjond,YUNOB,10/17/2014 0:00,5,z,FALSE\r7,Unyir,MOKJI,5/11/2017 0:00,6,i,TRUE\r", + ',', EndOfLine.CR, '"', 16, 400, 15, -1), // not enough lines to meet minLinesToDecide + ("nums,names,cities,date,zone,subzone,contaminated\rnan,Bluds,BUS,,1,a,TRUE\rnan,Bluds,BUS,,1,b,FALSE\rnan,Bluds,BUS,,1,c,FALSE\r0.5,dfsd,FUDG,12/13/2020 0:00,2,c,TRUE\r0.5,\"df\"\"sd\",FUDG,12/13/2020 0:00,2,d,FALSE\r0.5,dfsd,FUDG,12/13/2020 0:00,2,e,FALSE\r0.5,dfsd,\"FU\rDG\",12/13/2020 0:00,2,g,FALSE\r1.2,qere,GOLAR,,3,f,TRUE\r1.2,qere,GOLAR,,3,h,TRUE\r3.4,flodt,\"q,tün\",,4,q,FALSE\r4.6,Kjond,YUNOB,10/17/2014 0:00,5,w,TRUE\r4.6,Kjond,YUNOB,10/17/2014 0:00,5,z,FALSE\r7,Unyir,MOKJI,5/11/2017 0:00,6,i,TRUE\r", + ',', EndOfLine.CRLF, '"', 16, 400, 7, -1), + ("a,b,c,d,e,f,g,h,i", ',', EndOfLine.CRLF, '"', 16, 10_000, 5, -1), // one line, so can't decide anything + ("a\tb\tc\td\te\tf\r\n", ',', EndOfLine.CRLF, '"', 16, 10_000, 5, -1), // one line, so can't decide anything + }; + int testsFailed = 0; + int ii = 0; + + foreach ((string text, char delim, EndOfLine eol, char quote, int maxLinesToSniff, int maxCharsToSniff, int minLinesToDecide, int correctOutput) in testcases) + { + ii++; + int output; + string baseMsg = $"Expected CsvSniffer.Sniff({JNode.StrToString(text, true)}, {eol}, {quote}, '{ArgFunction.CsvCleanChar(delim)}', {maxLinesToSniff}, {maxCharsToSniff}, {minLinesToDecide}) to return {correctOutput}, but got"; + try + { + output = CsvSniffer.Sniff(text, eol, delim, quote, maxLinesToSniff, maxCharsToSniff, 
minLinesToDecide); + } + catch (Exception ex) + { + Npp.AddLine($"{baseMsg} exception:\r\n{RemesParser.PrettifyException(ex)}"); + testsFailed++; + continue; + } + if (output != correctOutput) + { + testsFailed++; + Npp.AddLine($"{baseMsg} {output}"); + } + } + Npp.AddLine($"Failed {testsFailed} tests."); + Npp.AddLine($"Passed {ii - testsFailed} tests."); + return testsFailed > 0; + } + } +} diff --git a/JsonToolsNppPlugin/Tests/TestRunner.cs b/JsonToolsNppPlugin/Tests/TestRunner.cs index 87b6e3b..0050446 100644 --- a/JsonToolsNppPlugin/Tests/TestRunner.cs +++ b/JsonToolsNppPlugin/Tests/TestRunner.cs @@ -47,6 +47,8 @@ public static async Task RunAll() (JsonSchemaValidatorTester.Test, "JsonSchema validator", false, false), (JsonTabularizerTester.Test, "JSON tabularizer", false, false), + + (CsvSnifferTests.Test, "CSV sniffer", false, false), // tests that require reading files (skip on Notepad++ earlier than v8) (JsonGrepperTester.TestFnames, "JSON grepper's file reading ability", true, false), diff --git a/JsonToolsNppPlugin/Utils/Settings.cs b/JsonToolsNppPlugin/Utils/Settings.cs index 2373ec6..cca1290 100644 --- a/JsonToolsNppPlugin/Utils/Settings.cs +++ b/JsonToolsNppPlugin/Utils/Settings.cs @@ -149,6 +149,13 @@ public class Settings : SettingsBase "If you want there to be NO toolbar icons, enter a character that does not represent an icon; do NOT leave this field empty."), Category("Miscellaneous"), DefaultValue("tcpo")] public string toolbar_icons { get; set; } + + [Description("If this setting is true,\r\n" + + "when the regex search form is opened, or when the \"Parse as CSV?\" checkbox in that form is toggled on,\r\n" + + "JsonTools will attempt to guess whether the current document is a CSV or TSV file, and how many columns and what newline it has.\r\n" + + "The regex search form will take slightly longer to open if this is true."), + Category("Miscellaneous"), DefaultValue(false)] + public bool auto_try_guess_csv_delim_newline { get; set; } #endregion 
#region GREP_API_SETTINGS diff --git a/docs/README.md b/docs/README.md index c9c62f7..c2cc5bf 100644 --- a/docs/README.md +++ b/docs/README.md @@ -627,6 +627,8 @@ Opening up a document in regex mode allows __querying and mutating the raw text You can view CSV files (any delimiter, quote character, and newline are allowed) with the treeview, providing that they comply with [RFC 4180](https://www.ietf.org/rfc/rfc4180.txt). +Beginning in [v7.0](/CHANGELOG.md#700---unreleased-yyyy-mm-dd), if the new `auto_try_guess_csv_delim_newline` global setting is set to `true`, whenever the regex search form is opened, or the `Parse as CSV?` checkbox is toggled on, the regex search form will check the first 1600 characters of the current document to detect if it is a CSV or TSV file. This makes the regex search form load more slowly, but it makes it easier to parse CSV files. +  If you want to edit your document using RemesPath, the [`s_sub` function](/docs/RemesPath.md#vectorized-functions) may prove useful for regex-replacement, and the [`to_csv` function](/docs/RemesPath.md#non-vectorized-functions) may be useful for CSV editing. diff --git a/most recent errors.txt b/most recent errors.txt index c5f3800..6507e66 100644 --- a/most recent errors.txt +++ b/most recent errors.txt @@ -1,4 +1,4 @@ -Test results for JsonTools v6.1.1.19 on Notepad++ 8.5.8 64bit +Test results for JsonTools v6.1.1.20 on Notepad++ 8.5.8 64bit NOTE: Ctrl-F (regular expressions *on*) for "Failed [1-9]\d*" to find all failed tests Tests failed: YAML dumper ========================= @@ -154,6 +154,12 @@ Testing JSON tabularizer Failed 0 tests. Passed 61 tests. ========================= +Testing CSV sniffer +========================= + +Failed 0 tests. +Passed 23 tests. 
+========================= Testing JSON grepper's file reading ability ========================= @@ -195,33 +201,33 @@ Testing JsonParser performance Preview of json: [{"A": "Ky'c^g#~)0", "a": 1850111954, "b": 9318359041, "B": "Oyi:/ xxe2", "C": "sKCSa_^7Gg", "c": 7974777124, "d": 2670309238, "D": "0d_K)HmX!.", "E": ".uM*Z{0EJ_", "e": 6958410336, "f": 8050244728, "F": "1%SG_A!xB\t", "g": 3799657125, "G": "il1^k\\\nat*", "H": {"a": 6079042826, "b": 7292804611, "c" ... -To convert JSON string of size 89556 into JNode took 3.971 +/- 3.573 ms over 32 trials -Load times (ms): 4, 6, 22, 4, 1, 2, 1, 4, 1, 2, 6, 2, 2, 2, 3, 1, 1, 4, 5, 2, 2, 2, 5, 3, 5, 4, 1, 3, 5, 1, 2, 5 +To convert JSON string of size 89556 into JNode took 2.496 +/- 1.157 ms over 32 trials +Load times (ms): 2, 1, 2, 4, 2, 2, 2, 3, 1, 1, 4, 2, 1, 1, 2, 1, 1, 5, 1, 1, 1, 3, 1, 1, 5, 1, 1, 1, 3, 1, 4, 1 ========================= Performance tests for RemesPath (float arithmetic) ========================= -Compiling query "@[@[:].a * @[:].t < @[:].e]" took 0.054 ms the first time, including approximately 0.055 ms to tokenize the query. Subsequent executions are effectively free due to caching. -To run pre-compiled query "@[@[:].a * @[:].t < @[:].e]" on JNode from JSON of size 89556 into took 0.055 +/- 0.151 ms over 40 trials -Query times (ms): 0.066, 0.034, 0.046, 0.022, 0.022, 0.024, 0.023, 0.03, 0.023, 0.025, 0.041, 0.038, 0.036, 0.03, 0.022, 0.023, 0.023, 0.022, 0.022, 0.043, 0.028, 0.022, 0.023, 0.028, 0.039, 0.039, 0.038, 0.041, 0.994, 0.024, 0.021, 0.028, 0.022, 0.023, 0.037, 0.023, 0.023, 0.049, 0.037, 0.038 +Compiling query "@[@[:].a * @[:].t < @[:].e]" took 0.052 ms the first time, including approximately 0.051 ms to tokenize the query. Subsequent executions are effectively free due to caching. 
+To run pre-compiled query "@[@[:].a * @[:].t < @[:].e]" on JNode from JSON of size 89556 into took 0.027 +/- 0.015 ms over 40 trials +Query times (ms): 0.113, 0.046, 0.023, 0.05, 0.023, 0.023, 0.022, 0.028, 0.022, 0.023, 0.022, 0.022, 0.027, 0.027, 0.023, 0.022, 0.023, 0.022, 0.022, 0.026, 0.023, 0.022, 0.022, 0.022, 0.023, 0.026, 0.023, 0.023, 0.023, 0.022, 0.022, 0.026, 0.022, 0.022, 0.023, 0.023, 0.022, 0.025, 0.022, 0.024 Preview of result: [{"A": "Ky'c^g#~)0", "a": 1850111954, "b": 9318359041, "B": "Oyi:/ xxe2", "C": "sKCSa_^7Gg", "c": 7974777124, "d": 2670309238, "D": "0d_K)HmX!.", "E": ".uM*Z{0EJ_", "e": 6958410336, "f": 8050244728, "F": "1%SG_A!xB\t", "g": 3799657125, "G": "il1^k\\\nat*", "H": {"a": 6079042826, "b": 7292804611, "c" ... ========================= Performance tests for RemesPath (string operations) ========================= -Compiling query "@[@[:].z =~ `(?i)[a-z]{5}`]" took 0.051 ms the first time, including approximately 0.087 ms to tokenize the query. Subsequent executions are effectively free due to caching. -To run pre-compiled query "@[@[:].z =~ `(?i)[a-z]{5}`]" on JNode from JSON of size 89556 into took 0.073 +/- 0.043 ms over 40 trials -Query times (ms): 0.118, 0.057, 0.076, 0.096, 0.089, 0.054, 0.053, 0.057, 0.054, 0.054, 0.054, 0.053, 0.055, 0.059, 0.056, 0.055, 0.054, 0.053, 0.064, 0.055, 0.053, 0.054, 0.058, 0.057, 0.056, 0.055, 0.057, 0.097, 0.099, 0.098, 0.095, 0.108, 0.099, 0.084, 0.055, 0.053, 0.314, 0.064, 0.055, 0.054 +Compiling query "@[@[:].z =~ `(?i)[a-z]{5}`]" took 0.047 ms the first time, including approximately 0.048 ms to tokenize the query. Subsequent executions are effectively free due to caching. 
+To run pre-compiled query "@[@[:].z =~ `(?i)[a-z]{5}`]" on JNode from JSON of size 89556 into took 0.072 +/- 0.022 ms over 40 trials +Query times (ms): 0.149, 0.07, 0.063, 0.114, 0.081, 0.062, 0.061, 0.063, 0.062, 0.061, 0.061, 0.062, 0.078, 0.068, 0.061, 0.058, 0.06, 0.058, 0.06, 0.059, 0.06, 0.11, 0.104, 0.082, 0.064, 0.072, 0.063, 0.062, 0.058, 0.06, 0.058, 0.062, 0.06, 0.063, 0.062, 0.153, 0.065, 0.08, 0.072, 0.074 Preview of result: [{"A": "\n]o1VQ5t6g", "a": 4710024278, "b": 3268860721, "B": "g4Y7+ew^.v", "C": "<E_7XL7YS`", "c": 4921465277, "d": 9420665097, "D": "Q&S>NK<OOn", "E": "M?6Ll1W\nFM", "e": 4146283970, "f": 8384193493, "F": "z[jPvslL\tc", "g": 1578133296, "G": "m'M4h,`|Wk", "H": {"a": 5184250383, "b": 5337791147, "c" ... ========================= Performance tests for RemesPath (basic recursive search) ========================= -Compiling query "@..*" took 0.024 ms the first time, including approximately 0.031 ms to tokenize the query. Subsequent executions are effectively free due to caching. -To run pre-compiled query "@..*" on JNode from JSON of size 89556 into took 0.406 +/- 0.349 ms over 40 trials -Query times (ms): 0.438, 0.345, 0.377, 0.337, 0.346, 0.336, 0.366, 0.344, 1.066, 0.333, 0.352, 0.337, 0.365, 0.376, 0.284, 0.377, 0.231, 0.224, 0.263, 0.229, 0.224, 0.221, 0.233, 0.227, 0.228, 0.25, 0.35, 0.388, 0.316, 2.395, 0.395, 0.382, 0.366, 0.367, 0.374, 0.372, 0.374, 0.36, 0.362, 0.72 +Compiling query "@..*" took 0.021 ms the first time, including approximately 0.028 ms to tokenize the query. Subsequent executions are effectively free due to caching. 
+To run pre-compiled query "@..*" on JNode from JSON of size 89556 into took 0.407 +/- 0.592 ms over 40 trials +Query times (ms): 4.057, 0.388, 0.427, 0.41, 0.317, 0.305, 0.302, 0.299, 0.296, 0.301, 0.298, 0.296, 0.31, 0.328, 0.299, 0.308, 0.304, 0.302, 0.31, 0.301, 0.298, 0.303, 0.308, 0.297, 0.304, 0.296, 0.296, 0.321, 0.307, 0.32, 0.851, 0.249, 0.23, 0.228, 0.23, 0.269, 0.235, 0.234, 0.238, 0.296 Preview of result: [1850111954, 9318359041, 7974777124, 2670309238, 6958410336, 8050244728, 3799657125, 2612807147, 7785993340, 9842767454, 2257474583, 2736529372, 4821265864, 3302084501, null, Infinity, true, false, true, 0.201077552261751, 0.110978036654776, 0.50917270025261, 0.798199326980627, 0.615212956451379, 0. ... ========================= @@ -231,12 +237,12 @@ Performance tests for RemesPath (group_by, projections and aggregations) Compiling query "group_by(@, s).*{ Hmax: max((@[:].H)..*[is_num(@)][abs(@) < Infinity]), min_N: min((@[:].N)..*[is_num(@)][abs(@) < Infinity]) -}" took 0.546 ms the first time, including approximately 0.304 ms to tokenize the query. Subsequent executions are effectively free due to caching. +}" took 0.171 ms the first time, including approximately 0.159 ms to tokenize the query. Subsequent executions are effectively free due to caching. 
To run pre-compiled query "group_by(@, s).*{ Hmax: max((@[:].H)..*[is_num(@)][abs(@) < Infinity]), min_N: min((@[:].N)..*[is_num(@)][abs(@) < Infinity]) -}" on JNode from JSON of size 89556 into took 0.319 +/- 0.111 ms over 40 trials -Query times (ms): 0.442, 0.281, 0.319, 0.269, 0.588, 0.302, 0.301, 0.278, 0.265, 0.238, 0.284, 0.263, 0.253, 0.524, 0.178, 0.185, 0.248, 0.226, 0.2, 0.172, 0.286, 0.567, 0.332, 0.306, 0.301, 0.304, 0.272, 0.282, 0.287, 0.635, 0.403, 0.409, 0.287, 0.229, 0.287, 0.294, 0.294, 0.554, 0.292, 0.302 +}" on JNode from JSON of size 89556 into took 0.323 +/- 0.318 ms over 40 trials +Query times (ms): 0.406, 0.272, 0.26, 0.256, 0.202, 0.187, 0.187, 0.187, 0.256, 0.253, 0.25, 0.252, 0.209, 0.188, 2.224, 0.178, 0.238, 0.247, 0.251, 0.232, 0.249, 0.249, 0.428, 0.236, 0.233, 0.263, 0.28, 0.274, 0.279, 0.326, 0.539, 0.282, 0.257, 0.254, 0.247, 0.248, 0.228, 0.362, 0.662, 0.3 Preview of result: {"false": {"Hmax": 9703256074.0, "min_N": 0.0395243372266771}, "true": {"Hmax": 9695512197.0, "min_N": 0.0231773915713427}} ... ========================= @@ -246,12 +252,12 @@ Performance tests for RemesPath (variable assignments and simple aggregations) Compiling query "var qmask = @[:].q; var nmax_q = max(@[qmask].n); var nmax_notq = max(@[not qmask].n); -ifelse(nmax_q > nmax_notq, `when q=true, nmax = ` + str(nmax_q), `when q=false, nmax= ` + str(nmax_notq))" took 0.276 ms the first time, including approximately 0.211 ms to tokenize the query. Subsequent executions are effectively free due to caching. +ifelse(nmax_q > nmax_notq, `when q=true, nmax = ` + str(nmax_q), `when q=false, nmax= ` + str(nmax_notq))" took 0.147 ms the first time, including approximately 0.136 ms to tokenize the query. Subsequent executions are effectively free due to caching. 
To run pre-compiled query "var qmask = @[:].q; var nmax_q = max(@[qmask].n); var nmax_notq = max(@[not qmask].n); -ifelse(nmax_q > nmax_notq, `when q=true, nmax = ` + str(nmax_q), `when q=false, nmax= ` + str(nmax_notq))" on JNode from JSON of size 89556 into took 0.037 +/- 0.018 ms over 40 trials -Query times (ms): 0.146, 0.042, 0.027, 0.027, 0.04, 0.061, 0.032, 0.035, 0.033, 0.031, 0.034, 0.03, 0.034, 0.031, 0.037, 0.035, 0.033, 0.03, 0.039, 0.03, 0.029, 0.034, 0.032, 0.027, 0.028, 0.038, 0.034, 0.035, 0.036, 0.037, 0.03, 0.033, 0.035, 0.03, 0.031, 0.042, 0.038, 0.035, 0.033, 0.03 +ifelse(nmax_q > nmax_notq, `when q=true, nmax = ` + str(nmax_q), `when q=false, nmax= ` + str(nmax_notq))" on JNode from JSON of size 89556 into took 0.021 +/- 0.019 ms over 40 trials +Query times (ms): 0.094, 0.02, 0.047, 0.032, 0.026, 0.021, 0.11, 0.019, 0.016, 0.016, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.016, 0.015, 0.015, 0.015, 0.016, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.016, 0.015, 0.015, 0.015, 0.015, 0.015, 0.016, 0.016, 0.015, 0.015, 0.015, 0.016 Preview of result: "when q=false, nmax= 9830935647.0" ... ========================= @@ -260,11 +266,11 @@ Performance tests for RemesPath (references to compile-time constant variables) Compiling query "var X = X; var onetwo = j`[1, 2]`; -@[:]->at(@, X)->at(@, onetwo)" took 0.217 ms the first time, including approximately 0.118 ms to tokenize the query. Subsequent executions are effectively free due to caching. +@[:]->at(@, X)->at(@, onetwo)" took 0.084 ms the first time, including approximately 0.098 ms to tokenize the query. Subsequent executions are effectively free due to caching. 
To run pre-compiled query "var X = X; var onetwo = j`[1, 2]`; -@[:]->at(@, X)->at(@, onetwo)" on JNode from JSON of size 89556 into took 0.036 +/- 0.054 ms over 40 trials -Query times (ms): 0.089, 0.026, 0.022, 0.023, 0.024, 0.105, 0.056, 0.023, 0.022, 0.02, 0.021, 0.026, 0.024, 0.023, 0.022, 0.021, 0.025, 0.024, 0.02, 0.022, 0.024, 0.022, 0.023, 0.021, 0.023, 0.353, 0.026, 0.022, 0.024, 0.02, 0.024, 0.034, 0.023, 0.02, 0.02, 0.025, 0.023, 0.023, 0.022, 0.022 +@[:]->at(@, X)->at(@, onetwo)" on JNode from JSON of size 89556 into took 0.016 +/- 0.007 ms over 40 trials +Query times (ms): 0.048, 0.021, 0.013, 0.011, 0.012, 0.012, 0.017, 0.02, 0.013, 0.012, 0.012, 0.012, 0.015, 0.019, 0.014, 0.012, 0.012, 0.012, 0.012, 0.02, 0.019, 0.012, 0.011, 0.012, 0.012, 0.019, 0.021, 0.025, 0.017, 0.012, 0.011, 0.012, 0.012, 0.012, 0.012, 0.035, 0.012, 0.012, 0.011, 0.014 Preview of result: [[1695727848, 0.287562638736685], [2126430375, 0.00767794129708177], [5310550656, 0.380769772645687], [2519183283, 0.153176220930558], [6610062385, 0.662996225870666], [987168256, 0.924410189999928], [6615003609, 0.917112691225947], [4465232046, 0.684311931851536], [8654414565, 0.631485392105992], [ ... ========================= @@ -273,29 +279,29 @@ Performance tests for RemesPath (references to variables that are not compile-ti Compiling query "var X = @->`X`; var onetwo = @{1, 2}; -@[:]->at(@, X)->at(@, onetwo)" took 0.155 ms the first time, including approximately 0.153 ms to tokenize the query. Subsequent executions are effectively free due to caching. +@[:]->at(@, X)->at(@, onetwo)" took 0.417 ms the first time, including approximately 0.092 ms to tokenize the query. Subsequent executions are effectively free due to caching. 
To run pre-compiled query "var X = @->`X`; var onetwo = @{1, 2}; -@[:]->at(@, X)->at(@, onetwo)" on JNode from JSON of size 89556 into took 0.034 +/- 0.012 ms over 40 trials -Query times (ms): 0.096, 0.022, 0.027, 0.034, 0.033, 0.03, 0.048, 0.059, 0.037, 0.039, 0.031, 0.029, 0.041, 0.031, 0.018, 0.019, 0.027, 0.032, 0.033, 0.033, 0.033, 0.032, 0.028, 0.028, 0.029, 0.032, 0.032, 0.031, 0.033, 0.03, 0.033, 0.034, 0.03, 0.032, 0.03, 0.029, 0.032, 0.033, 0.031, 0.031 +@[:]->at(@, X)->at(@, onetwo)" on JNode from JSON of size 89556 into took 0.03 +/- 0.006 ms over 40 trials +Query times (ms): 0.062, 0.029, 0.028, 0.027, 0.028, 0.027, 0.027, 0.028, 0.027, 0.027, 0.029, 0.038, 0.027, 0.027, 0.038, 0.023, 0.028, 0.028, 0.028, 0.028, 0.031, 0.032, 0.033, 0.03, 0.029, 0.032, 0.029, 0.028, 0.029, 0.03, 0.028, 0.029, 0.029, 0.028, 0.028, 0.028, 0.027, 0.028, 0.027, 0.027 Preview of result: [[1695727848, 0.287562638736685], [2126430375, 0.00767794129708177], [5310550656, 0.380769772645687], [2519183283, 0.153176220930558], [6610062385, 0.662996225870666], [987168256, 0.924410189999928], [6615003609, 0.917112691225947], [4465232046, 0.684311931851536], [8654414565, 0.631485392105992], [ ... ========================= Performance tests for RemesPath (simple string mutations) ========================= -Compiling query "@[:].z = s_sub(@, g, B)" took 0.074 ms the first time, including approximately 0.087 ms to tokenize the query. Subsequent executions are effectively free due to caching. 
-To run pre-compiled query "@[:].z = s_sub(@, g, B)" on JNode from JSON of size 89556 into took 0.03 +/- 0.016 ms over 40 trials -Query times (ms): 0.046, 0.034, 0.041, 0.016, 0.013, 0.026, 0.04, 0.04, 0.02, 0.015, 0.014, 0.017, 0.039, 0.04, 0.114, 0.043, 0.026, 0.024, 0.018, 0.041, 0.041, 0.025, 0.028, 0.025, 0.027, 0.035, 0.022, 0.022, 0.026, 0.025, 0.025, 0.031, 0.033, 0.03, 0.034, 0.024, 0.023, 0.02, 0.019, 0.022 +Compiling query "@[:].z = s_sub(@, g, B)" took 0.05 ms the first time, including approximately 0.071 ms to tokenize the query. Subsequent executions are effectively free due to caching. +To run pre-compiled query "@[:].z = s_sub(@, g, B)" on JNode from JSON of size 89556 into took 0.021 +/- 0.008 ms over 40 trials +Query times (ms): 0.041, 0.024, 0.026, 0.016, 0.015, 0.014, 0.021, 0.02, 0.017, 0.016, 0.019, 0.017, 0.025, 0.024, 0.051, 0.042, 0.023, 0.013, 0.013, 0.018, 0.022, 0.022, 0.019, 0.012, 0.017, 0.017, 0.019, 0.016, 0.019, 0.018, 0.03, 0.019, 0.019, 0.02, 0.03, 0.024, 0.019, 0.017, 0.018, 0.019 Preview of result: [{"A": "Ky'c^g#~)0", "a": 1850111954, "b": 9318359041, "B": "Oyi:/ xxe2", "C": "sKCSa_^7Gg", "c": 7974777124, "d": 2670309238, "D": "0d_K)HmX!.", "E": ".uM*Z{0EJ_", "e": 6958410336, "f": 8050244728, "F": "1%SG_A!xB\t", "g": 3799657125, "G": "il1^k\\\nat*", "H": {"a": 6079042826, "b": 7292804611, "c" ... ========================= Performance tests for RemesPath (simple number mutations) ========================= -Compiling query "@[:].x = ifelse(@ < 0.5, @ + 3, @ - 3)" took 0.166 ms the first time, including approximately 0.136 ms to tokenize the query. Subsequent executions are effectively free due to caching. 
-To run pre-compiled query "@[:].x = ifelse(@ < 0.5, @ + 3, @ - 3)" on JNode from JSON of size 89556 into took 0.031 +/- 0.019 ms over 40 trials -Query times (ms): 0.053, 0.047, 0.038, 0.038, 0.045, 0.048, 0.033, 0.02, 0.023, 0.023, 0.053, 0.024, 0.02, 0.04, 0.042, 0.047, 0.034, 0.057, 0.126, 0.031, 0.019, 0.018, 0.018, 0.019, 0.019, 0.018, 0.018, 0.019, 0.019, 0.016, 0.028, 0.026, 0.032, 0.022, 0.018, 0.018, 0.02, 0.02, 0.019, 0.019 +Compiling query "@[:].x = ifelse(@ < 0.5, @ + 3, @ - 3)" took 0.08 ms the first time, including approximately 0.104 ms to tokenize the query. Subsequent executions are effectively free due to caching. +To run pre-compiled query "@[:].x = ifelse(@ < 0.5, @ + 3, @ - 3)" on JNode from JSON of size 89556 into took 0.038 +/- 0.013 ms over 40 trials +Query times (ms): 0.083, 0.031, 0.048, 0.039, 0.037, 0.036, 0.033, 0.036, 0.038, 0.031, 0.033, 0.036, 0.05, 0.046, 0.044, 0.071, 0.044, 0.034, 0.04, 0.046, 0.033, 0.033, 0.031, 0.035, 0.035, 0.044, 0.051, 0.051, 0.052, 0.054, 0.035, 0.022, 0.03, 0.029, 0.032, 0.028, 0.019, 0.02, 0.018, 0.02 Preview of result: [{"A": "Ky'c^g#~)0", "a": 1850111954, "b": 9318359041, "B": "Oyi:/ xxe2", "C": "sKCSa_^7Gg", "c": 7974777124, "d": 2670309238, "D": "0d_K)HmX!.", "E": ".uM*Z{0EJ_", "e": 6958410336, "f": 8050244728, "F": "1%SG_A!xB\t", "g": 3799657125, "G": "il1^k\\\nat*", "H": {"a": 6079042826, "b": 7292804611, "c" ... ========================= @@ -305,12 +311,12 @@ Performance tests for RemesPath (mutations with a for loop) Compiling query "var xhalf = @[:].x < 0.5; for lx = zip(@[:].l, xhalf); lx[0] = ifelse(lx[1], foo, bar); -end for;" took 0.17 ms the first time, including approximately 0.134 ms to tokenize the query. Subsequent executions are effectively free due to caching. +end for;" took 0.307 ms the first time, including approximately 0.247 ms to tokenize the query. Subsequent executions are effectively free due to caching. 
To run pre-compiled query "var xhalf = @[:].x < 0.5; for lx = zip(@[:].l, xhalf); lx[0] = ifelse(lx[1], foo, bar); -end for;" on JNode from JSON of size 89556 into took 0.044 +/- 0.014 ms over 40 trials -Query times (ms): 0.063, 0.041, 0.038, 0.038, 0.046, 0.04, 0.038, 0.037, 0.038, 0.042, 0.036, 0.035, 0.035, 0.036, 0.039, 0.036, 0.037, 0.062, 0.037, 0.038, 0.038, 0.04, 0.052, 0.038, 0.037, 0.036, 0.037, 0.038, 0.037, 0.037, 0.037, 0.076, 0.111, 0.068, 0.054, 0.052, 0.039, 0.037, 0.038, 0.042 +end for;" on JNode from JSON of size 89556 into took 0.128 +/- 0.232 ms over 40 trials +Query times (ms): 0.109, 0.08, 0.087, 0.091, 0.096, 0.08, 0.09, 0.09, 0.087, 0.206, 0.268, 0.081, 0.084, 0.074, 0.072, 0.07, 0.067, 0.067, 0.068, 0.071, 0.101, 0.111, 0.097, 0.164, 0.096, 0.077, 0.077, 0.08, 0.075, 1.555, 0.088, 0.074, 0.07, 0.084, 0.069, 0.067, 0.077, 0.049, 0.086, 0.095 Preview of result: [["bar", false], ["bar", false], ["foo", true], ["foo", true], ["foo", true], ["foo", true], ["foo", true], ["bar", false], ["bar", false], ["bar", false], ["foo", true], ["foo", true], ["bar", false], ["bar", false], ["foo", true], ["bar", false], ["bar", false], ["bar", false], ["foo", true], ["ba ... ========================= @@ -319,18 +325,18 @@ Testing performance of JSON compression and pretty-printing Preview of json: [{"A": "Ky'c^g#~)0", "a": 1850111954, "b": 9318359041, "B": "Oyi:/ xxe2", "C": "sKCSa_^7Gg", "c": 7974777124, "d": 2670309238, "D": "0d_K)HmX!.", "E": ".uM*Z{0EJ_", "e": 6958410336, "f": 8050244728, "F": "1%SG_A!xB\t", "g": 3799657125, "G": "il1^k\\\nat*", "H": {"a": 6079042826, "b": 7292804611, "c" ... 
-To compress JNode from JSON string of 89556 took 3.805 +/- 0.317 ms over 64 trials (minimal whitespace, sortKeys=TRUE) -To compress JNode from JSON string of 89556 took 1.999 +/- 0.178 ms over 64 trials (minimal whitespace, sortKeys=FALSE) -To Google-style pretty-print JNode from JSON string of 89556 took 4.167 +/- 0.543 ms over 64 trials (sortKeys=true, indent=4) -To Whitesmith-style pretty-print JNode from JSON string of 89556 took 4.219 +/- 0.571 ms over 64 trials (sortKeys=true, indent=4) -To PPrint-style pretty-print JNode from JSON string of 89556 took 6.007 +/- 0.666 ms over 64 trials (sortKeys=true, indent=4) +To compress JNode from JSON string of 89556 took 6.086 +/- 1.299 ms over 64 trials (minimal whitespace, sortKeys=TRUE) +To compress JNode from JSON string of 89556 took 2.518 +/- 0.471 ms over 64 trials (minimal whitespace, sortKeys=FALSE) +To Google-style pretty-print JNode from JSON string of 89556 took 4.161 +/- 0.382 ms over 64 trials (sortKeys=true, indent=4) +To Whitesmith-style pretty-print JNode from JSON string of 89556 took 4.405 +/- 0.741 ms over 64 trials (sortKeys=true, indent=4) +To PPrint-style pretty-print JNode from JSON string of 89556 took 8.421 +/- 1.694 ms over 64 trials (sortKeys=true, indent=4) ========================= Testing performance of JsonSchemaValidator and random JSON creation ========================= -To create a random set of tweet JSON of size 172726 (15 tweets) based on the matching schema took 6.23 +/- 2.932 ms over 64 trials -To compile the tweet schema to a validation function took 0.496 +/- 1.19 ms over 64 trials -To validate tweet JSON of size 172726 (15 tweets) based on the compiled schema took 1.028 +/- 0.196 ms over 64 trials +To create a random set of tweet JSON of size 157454 (15 tweets) based on the matching schema took 7.809 +/- 3.441 ms over 64 trials +To compile the tweet schema to a validation function took 0.6 +/- 1.325 ms over 64 trials +To validate tweet JSON of size 157454 (15 tweets) based on 
the compiled schema took 1.259 +/- 0.235 ms over 64 trials ========================= Testing JSON grepper's API request tool =========================