From 7a2ae16ade0a68f247c771fce6acbd9b42c0e6b0 Mon Sep 17 00:00:00 2001 From: Bart Kiers Date: Sun, 13 Oct 2024 18:36:16 +0200 Subject: [PATCH 1/5] Fix performance issue w.r.t. not_out_end --- src/main/antlr4/liquid/parser/v4/LiquidLexer.g4 | 1 + src/main/antlr4/liquid/parser/v4/LiquidParser.g4 | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/main/antlr4/liquid/parser/v4/LiquidLexer.g4 b/src/main/antlr4/liquid/parser/v4/LiquidLexer.g4 index 19231d77..cd03301d 100644 --- a/src/main/antlr4/liquid/parser/v4/LiquidLexer.g4 +++ b/src/main/antlr4/liquid/parser/v4/LiquidLexer.g4 @@ -104,6 +104,7 @@ fragment LineBreak : '\r'? '\n' | '\r'; fragment Letter : [a-zA-Z]; fragment Digit : [0-9]; +// Note that when adding tokens to this `IN_TAG` mode, be sure to include them in the parser rule `not_out_end` as well! mode IN_TAG; OutStart2 : '{{' -> pushMode(IN_TAG); diff --git a/src/main/antlr4/liquid/parser/v4/LiquidParser.g4 b/src/main/antlr4/liquid/parser/v4/LiquidParser.g4 index 35e9a4c6..8b714d69 100644 --- a/src/main/antlr4/liquid/parser/v4/LiquidParser.g4 +++ b/src/main/antlr4/liquid/parser/v4/LiquidParser.g4 @@ -230,8 +230,19 @@ output | {isWarn() || isLax()}? outStart term filter* unparsed=not_out_end? OutEnd ; +// When doing `( ~OutEnd )+`, it appears ANTLR is much slower on large input text. Even though the parser will never +// get here when `isStrict() == true`, the prediction algorithm still tries this branch and takes too much time when the +// much too large set `( ~OutEnd )+` is used. The tokens below are all tokens that are possible when the lexer is in +// the `IN_TAG` mode. +// +// The input from https://github.com/bkiers/Liqp/issues/310 is tested by parsing it 100 times. When this rule contains +// `( ~OutEnd )+`, it ran in about 8000-8500 ms on average. With the individual tokens specified in the `IN_TAG` mode, +// the average runtime was around 3000-3200 ms. 
not_out_end - : ( ~OutEnd )+ + : ( OutStart2 | TagEnd | Str | DotDot | Dot | NEq | Eq | EqSign | GtEq | Gt | LtEq | Lt | Minus | Pipe + | Col | Comma | OPar | CPar | OBr | CBr | QMark | PathSep | DoubleNum | LongNum | Contains | In | And + | Or | True | False | Nil | With | Offset | Continue | Reversed | Empty | Blank | IdChain | Id + )+ ; filter From 6c867c0c33cf483ad02bbd45339daffe3adb341e Mon Sep 17 00:00:00 2001 From: Bart Kiers Date: Sun, 13 Oct 2024 20:03:36 +0200 Subject: [PATCH 2/5] Remove another `~TagEnd+`, remove unused `other_than_tag_start` --- src/main/antlr4/liquid/parser/v4/LiquidParser.g4 | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/main/antlr4/liquid/parser/v4/LiquidParser.g4 b/src/main/antlr4/liquid/parser/v4/LiquidParser.g4 index 8b714d69..4a419094 100644 --- a/src/main/antlr4/liquid/parser/v4/LiquidParser.g4 +++ b/src/main/antlr4/liquid/parser/v4/LiquidParser.g4 @@ -124,10 +124,6 @@ comment_tag : TagStart CommentStart TagEnd .*? TagStart CommentEnd TagEnd ; -other_than_tag_start - : ~( TagStart )* - ; - if_tag : TagStart IfStart expr TagEnd block elsif_tag* else_tag? TagStart IfEnd TagEnd ; @@ -239,10 +235,7 @@ output // `( ~OutEnd )+`, it ran in about 8000-8500 ms on average. With the individual tokens specified in the `IN_TAG` mode, // the average runtime was around 3000-3200 ms. 
not_out_end - : ( OutStart2 | TagEnd | Str | DotDot | Dot | NEq | Eq | EqSign | GtEq | Gt | LtEq | Lt | Minus | Pipe - | Col | Comma | OPar | CPar | OBr | CBr | QMark | PathSep | DoubleNum | LongNum | Contains | In | And - | Or | True | False | Nil | With | Offset | Continue | Reversed | Empty | Blank | IdChain | Id - )+ + : other_than_tag_end ; filter @@ -345,7 +338,10 @@ other_tag_parameters ; other_than_tag_end - : ~TagEnd+ + : ( OutStart2 | TagEnd | Str | DotDot | Dot | NEq | Eq | EqSign | GtEq | Gt | LtEq | Lt | Minus | Pipe + | Col | Comma | OPar | CPar | OBr | CBr | QMark | PathSep | DoubleNum | LongNum | Contains | In | And + | Or | True | False | Nil | With | Offset | Continue | Reversed | Empty | Blank | IdChain | Id + )+ ; filename From fe198223e863ac3f6915575cf2d47f76259b1d83 Mon Sep 17 00:00:00 2001 From: Bart Kiers Date: Sun, 13 Oct 2024 20:04:28 +0200 Subject: [PATCH 3/5] Fix comment --- src/main/antlr4/liquid/parser/v4/LiquidLexer.g4 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/antlr4/liquid/parser/v4/LiquidLexer.g4 b/src/main/antlr4/liquid/parser/v4/LiquidLexer.g4 index cd03301d..fcabd88b 100644 --- a/src/main/antlr4/liquid/parser/v4/LiquidLexer.g4 +++ b/src/main/antlr4/liquid/parser/v4/LiquidLexer.g4 @@ -104,7 +104,8 @@ fragment LineBreak : '\r'? '\n' | '\r'; fragment Letter : [a-zA-Z]; fragment Digit : [0-9]; -// Note that when adding tokens to this `IN_TAG` mode, be sure to include them in the parser rule `not_out_end` as well! +// Note that when adding tokens to this `IN_TAG` mode, be sure to include them in the parser +// rule `other_than_tag_end` as well! mode IN_TAG; OutStart2 : '{{' -> pushMode(IN_TAG); From 2312fa4e069b68508dc0f121f8e3bff120e6fbcb Mon Sep 17 00:00:00 2001 From: Bart Kiers Date: Sun, 13 Oct 2024 20:21:10 +0200 Subject: [PATCH 4/5] Include correct tokens in `not_out_end` and `other_than_tag_end` This reverts commit fe198223e863ac3f6915575cf2d47f76259b1d83. 
--- src/main/antlr4/liquid/parser/v4/LiquidLexer.g4 | 2 +- src/main/antlr4/liquid/parser/v4/LiquidParser.g4 | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/main/antlr4/liquid/parser/v4/LiquidLexer.g4 b/src/main/antlr4/liquid/parser/v4/LiquidLexer.g4 index fcabd88b..b6bd353f 100644 --- a/src/main/antlr4/liquid/parser/v4/LiquidLexer.g4 +++ b/src/main/antlr4/liquid/parser/v4/LiquidLexer.g4 @@ -105,7 +105,7 @@ fragment Letter : [a-zA-Z]; fragment Digit : [0-9]; // Note that when adding tokens to this `IN_TAG` mode, be sure to include them in the parser -// rule `other_than_tag_end` as well! +// rules `not_out_end` and/or `other_than_tag_end` as well! mode IN_TAG; OutStart2 : '{{' -> pushMode(IN_TAG); diff --git a/src/main/antlr4/liquid/parser/v4/LiquidParser.g4 b/src/main/antlr4/liquid/parser/v4/LiquidParser.g4 index 4a419094..0ca0a9f1 100644 --- a/src/main/antlr4/liquid/parser/v4/LiquidParser.g4 +++ b/src/main/antlr4/liquid/parser/v4/LiquidParser.g4 @@ -235,7 +235,10 @@ output // `( ~OutEnd )+`, it ran in about 8000-8500 ms on average. With the individual tokens specified in the `IN_TAG` mode, // the average runtime was around 3000-3200 ms. 
not_out_end - : other_than_tag_end + : ( TagEnd | OutStart2 | Str | DotDot | Dot | NEq | Eq | EqSign | GtEq | Gt | LtEq | Lt | Minus | Pipe + | Col | Comma | OPar | CPar | OBr | CBr | QMark | PathSep | DoubleNum | LongNum | Contains | In | And + | Or | True | False | Nil | With | Offset | Continue | Reversed | Empty | Blank | IdChain | Id + )+ ; filter @@ -338,7 +341,7 @@ other_tag_parameters ; other_than_tag_end - : ( OutStart2 | TagEnd | Str | DotDot | Dot | NEq | Eq | EqSign | GtEq | Gt | LtEq | Lt | Minus | Pipe + : ( OutEnd | OutStart2 | Str | DotDot | Dot | NEq | Eq | EqSign | GtEq | Gt | LtEq | Lt | Minus | Pipe | Col | Comma | OPar | CPar | OBr | CBr | QMark | PathSep | DoubleNum | LongNum | Contains | In | And | Or | True | False | Nil | With | Offset | Continue | Reversed | Empty | Blank | IdChain | Id )+ From 2b8b61562ca2510b28cd98faa98f8ff15a052877 Mon Sep 17 00:00:00 2001 From: Bart Kiers Date: Mon, 14 Oct 2024 08:52:24 +0200 Subject: [PATCH 5/5] Update LiquidParser.g4 --- src/main/antlr4/liquid/parser/v4/LiquidParser.g4 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/antlr4/liquid/parser/v4/LiquidParser.g4 b/src/main/antlr4/liquid/parser/v4/LiquidParser.g4 index 0ca0a9f1..7e6c4638 100644 --- a/src/main/antlr4/liquid/parser/v4/LiquidParser.g4 +++ b/src/main/antlr4/liquid/parser/v4/LiquidParser.g4 @@ -234,6 +234,8 @@ output // The input from https://github.com/bkiers/Liqp/issues/310 is tested by parsing it 100 times. When this rule contains // `( ~OutEnd )+`, it ran in about 8000-8500 ms on average. With the individual tokens specified in the `IN_TAG` mode, // the average runtime was around 3000-3200 ms. 
+// +// All tokens in the `IN_TAG` mode _except_ the `OutEnd` token not_out_end : ( TagEnd | OutStart2 | Str | DotDot | Dot | NEq | Eq | EqSign | GtEq | Gt | LtEq | Lt | Minus | Pipe | Col | Comma | OPar | CPar | OBr | CBr | QMark | PathSep | DoubleNum | LongNum | Contains | In | And @@ -340,6 +342,7 @@ other_tag_parameters : other_than_tag_end ; +// All tokens in the `IN_TAG` mode _except_ the `TagEnd` token other_than_tag_end : ( OutEnd | OutStart2 | Str | DotDot | Dot | NEq | Eq | EqSign | GtEq | Gt | LtEq | Lt | Minus | Pipe | Col | Comma | OPar | CPar | OBr | CBr | QMark | PathSep | DoubleNum | LongNum | Contains | In | And