From 78300c0018276fdbb875812d1abd8dffa363d07c Mon Sep 17 00:00:00 2001 From: Junichi Kobayashi Date: Mon, 22 May 2023 22:19:08 +0900 Subject: [PATCH 1/7] Add a lexer test --- spec/lrama/lexer_spec.rb | 59 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/spec/lrama/lexer_spec.rb b/spec/lrama/lexer_spec.rb index 71824276..45c672df 100644 --- a/spec/lrama/lexer_spec.rb +++ b/spec/lrama/lexer_spec.rb @@ -367,6 +367,65 @@ class : keyword_class tSTRING keyword_end %prec tPLUS ]) end + it "named references" do + y = <<~INPUT +%{ +// Prologue +%} + +%token NUM + +%% + +line: expr + { printf("\t%.10g\n", $expr); } + ; + +expr[result]: NUM + | expr[left] expr[right] '+' + { $result = $left + $right; } + ; +%% + INPUT + lexer = Lrama::Lexer.new(y) + + expect(lexer.grammar_rules_tokens).to eq([ + T.new(type: T::Ident_Colon, s_value: "line"), + T.new(type: T::Ident, s_value: "expr"), + T.new(type: T::User_code, s_value: %Q({ printf("\t%.10g\n", $expr); })), + T.new(type: T::Semicolon, s_value: ";"), + + T.new(type: T::Ident_Colon, s_value: "expr"), + T.new(type: T::Named_Ref, s_value: "result"), + + T.new(type: T::Ident, s_value: "NUM"), + + T.new(type: T::Bar, s_value: "|"), + T.new(type: T::Ident, s_value: "expr"), + T.new(type: T::Named_Ref, s_value: "left"), + T.new(type: T::Ident, s_value: "expr"), + T.new(type: T::Named_Ref, s_value: "right"), + T.new(type: T::Char, s_value: "'+'"), + T.new(type: T::User_code, s_value: "{ $result = $left + $right; }"), + T.new(type: T::Semicolon, s_value: ";"), + ]) + + user_codes = lexer.grammar_rules_tokens.select do |t| + t.type == T::User_code + end + + expect(user_codes.map(&:references)).to eq([ + [ + [:dollar, "expr", nil, 20, 24], + ], + [ + [:dollar, "result", nil, 2, 8], + [:dollar, "left", nil, 12, 16], + [:dollar, "right", nil, 20, 25], + ] + ]) + end + describe "user codes" do it "parses comments correctly" do y = <<~INPUT From 3b8d4e585653ddf07273f4ee9a303d69a9daa38a Mon Sep 17 00:00:00 
2001 From: Junichi Kobayashi Date: Mon, 22 May 2023 23:00:20 +0900 Subject: [PATCH 2/7] Implement tokenize named references --- lib/lrama/lexer.rb | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/lrama/lexer.rb b/lib/lrama/lexer.rb index 6c1139b4..44d5b706 100644 --- a/lib/lrama/lexer.rb +++ b/lib/lrama/lexer.rb @@ -47,6 +47,7 @@ def self.define_type(name) define_type(:Number) # 0 define_type(:Ident_Colon) # k_if:, k_if : (spaces can be there) define_type(:Ident) # api.pure, tNUMBER + define_type(:Named_Ref) # [foo] define_type(:Semicolon) # ; define_type(:Bar) # | define_type(:String) # "str" @@ -166,10 +167,15 @@ def lex_common(lines, tokens) tokens << create_token(Token::Number, Integer(ss[0]), line, ss.pos - column) when ss.scan(/(<[a-zA-Z0-9_]+>)/) tokens << create_token(Token::Tag, ss[0], line, ss.pos - column) + when ss.scan(/([a-zA-Z_.][-a-zA-Z0-9_.]*)\[([a-zA-Z_.][-a-zA-Z0-9_.]*)\]\s*:/) + tokens << create_token(Token::Ident_Colon, ss[1], line, ss.pos - column) + tokens << create_token(Token::Named_Ref, ss[2], line, ss.pos - column) when ss.scan(/([a-zA-Z_.][-a-zA-Z0-9_.]*)\s*:/) tokens << create_token(Token::Ident_Colon, ss[1], line, ss.pos - column) when ss.scan(/([a-zA-Z_.][-a-zA-Z0-9_.]*)/) tokens << create_token(Token::Ident, ss[0], line, ss.pos - column) + when ss.scan(/\[([a-zA-Z_.][-a-zA-Z0-9_.]*)\]/) + tokens << create_token(Token::Named_Ref, ss[1], line, ss.pos - column) when ss.scan(/%expect/) tokens << create_token(Token::P_expect, ss[0], line, ss.pos - column) when ss.scan(/%define/) @@ -257,6 +263,9 @@ def lex_user_code(ss, line, column, lines) when ss.scan(/\$(<[a-zA-Z0-9_]+>)?(\d+)/) # $1, $2, $1 tag = ss[1] ? create_token(Token::Tag, ss[1], line, str.length) : nil references << [:dollar, Integer(ss[2]), tag, str.length, str.length + ss[0].length - 1] + when ss.scan(/\$(<[a-zA-Z0-9_]+>)?([a-zA-Z_.][-a-zA-Z0-9_.]*)/) # $foo, $expr, $program + tag = ss[1] ? 
create_token(Token::Tag, ss[1], line, str.length) : nil + references << [:dollar, ss[2], tag, str.length, str.length + ss[0].length - 1] when ss.scan(/@\$/) # @$ references << [:at, "$", nil, str.length, str.length + ss[0].length - 1] when ss.scan(/@(\d)+/) # @1 From 042957923f5a76e42b0bc9538112006058cd0dfc Mon Sep 17 00:00:00 2001 From: Junichi Kobayashi Date: Mon, 12 Jun 2023 02:06:07 +0900 Subject: [PATCH 3/7] Rename number to value in Lrama::Reference --- lib/lrama/grammar.rb | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/lib/lrama/grammar.rb b/lib/lrama/grammar.rb index 1daec444..25a60a9a 100644 --- a/lib/lrama/grammar.rb +++ b/lib/lrama/grammar.rb @@ -155,16 +155,16 @@ def translated_printer_code(tag) last_column = ref.last_column case - when ref.number == "$" && ref.type == :dollar # $$ + when ref.value == "$" && ref.type == :dollar # $$ # Omit "<>" member = tag.s_value[1..-2] str = "((*yyvaluep).#{member})" - when ref.number == "$" && ref.type == :at # @$ + when ref.value == "$" && ref.type == :at # @$ str = "(*yylocationp)" when ref.type == :dollar # $n - raise "$#{ref.number} can not be used in %printer." + raise "$#{ref.value} can not be used in %printer." when ref.type == :at # @n - raise "@#{ref.number} can not be used in %printer." + raise "@#{ref.value} can not be used in %printer." else raise "Unexpected. 
#{self}, #{ref}" end @@ -190,19 +190,19 @@ def translated_user_code last_column = ref.last_column case - when ref.number == "$" && ref.type == :dollar # $$ + when ref.value == "$" && ref.type == :dollar # $$ # Omit "<>" member = ref.tag.s_value[1..-2] str = "(yyval.#{member})" - when ref.number == "$" && ref.type == :at # @$ + when ref.value == "$" && ref.type == :at # @$ str = "(yyloc)" when ref.type == :dollar # $n - i = -ref.position_in_rhs + ref.number + i = -ref.position_in_rhs + ref.value # Omit "<>" member = ref.tag.s_value[1..-2] str = "(yyvsp[#{i}].#{member})" when ref.type == :at # @n - i = -ref.position_in_rhs + ref.number + i = -ref.position_in_rhs + ref.value str = "(yylsp[#{i}])" else raise "Unexpected. #{self}, #{ref}" @@ -226,14 +226,14 @@ def translated_initial_action_code last_column = ref.last_column case - when ref.number == "$" && ref.type == :dollar # $$ + when ref.value == "$" && ref.type == :dollar # $$ str = "yylval" - when ref.number == "$" && ref.type == :at # @$ + when ref.value == "$" && ref.type == :at # @$ str = "yylloc" when ref.type == :dollar # $n - raise "$#{ref.number} can not be used in initial_action." + raise "$#{ref.value} can not be used in initial_action." when ref.type == :at # @n - raise "@#{ref.number} can not be used in initial_action." + raise "@#{ref.value} can not be used in initial_action." else raise "Unexpected. #{self}, #{ref}" end @@ -247,7 +247,7 @@ def translated_initial_action_code # type: :dollar or :at # ex_tag: "$1" (Optional) - Reference = Struct.new(:type, :number, :ex_tag, :first_column, :last_column, :referring_symbol, :position_in_rhs, keyword_init: true) do + Reference = Struct.new(:type, :value, :ex_tag, :first_column, :last_column, :referring_symbol, :position_in_rhs, keyword_init: true) do def tag if ex_tag ex_tag @@ -382,8 +382,8 @@ def add_rule(lhs:, rhs:, lineno:) end def build_references(token_code) - token_code.references.map! 
do |type, number, tag, first_column, last_column| - Reference.new(type: type, number: number, ex_tag: tag, first_column: first_column, last_column: last_column) + token_code.references.map! do |type, value, tag, first_column, last_column| + Reference.new(type: type, value: value, ex_tag: tag, first_column: first_column, last_column: last_column) end token_code @@ -627,15 +627,14 @@ def normalize_rules ref.position_in_rhs = i - 1 next if ref.type == :at # $$, $n, @$, @n can be used in any actions - number = ref.number - if number == "$" + if ref.value == "$" # TODO: Should be postponed after middle actions are extracted? ref.referring_symbol = lhs else - raise "Can not refer following component. #{number} >= #{i}. #{token}" if number >= i - rhs1[number - 1].referred = true - ref.referring_symbol = rhs1[number - 1] + raise "Can not refer following component. #{ref.value} >= #{i}. #{token}" if ref.value >= i + rhs1[ref.value - 1].referred = true + ref.referring_symbol = rhs1[ref.value - 1] end end end From c64e13163a302c77b70e3af82de3a67069c9c8fd Mon Sep 17 00:00:00 2001 From: Junichi Kobayashi Date: Mon, 12 Jun 2023 02:08:04 +0900 Subject: [PATCH 4/7] Implement ref.value is a String --- lib/lrama/grammar.rb | 11 ++++++++++- lib/lrama/lexer.rb | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/lib/lrama/grammar.rb b/lib/lrama/grammar.rb index 25a60a9a..a13c5807 100644 --- a/lib/lrama/grammar.rb +++ b/lib/lrama/grammar.rb @@ -631,10 +631,19 @@ def normalize_rules if ref.value == "$" # TODO: Should be postponed after middle actions are extracted? ref.referring_symbol = lhs - else + elsif ref.value.is_a?(Integer) raise "Can not refer following component. #{ref.value} >= #{i}. 
#{token}" if ref.value >= i rhs1[ref.value - 1].referred = true ref.referring_symbol = rhs1[ref.value - 1] + elsif ref.value.is_a?(String) + target_tokens = ([lhs] + rhs1 + [code]).compact.first(i) + referring_symbol_candidate = target_tokens.filter {|token| token.referred_by?(ref.value) } + raise "Referring symbol `#{ref.value}` is duplicated. #{token}" if referring_symbol_candidate.size >= 2 + raise "Referring symbol `#{ref.value}` is not found. #{token}" if referring_symbol_candidate.count == 0 + + referring_symbol = referring_symbol_candidate.first + referring_symbol.referred = true + ref.referring_symbol = referring_symbol end end end diff --git a/lib/lrama/lexer.rb b/lib/lrama/lexer.rb index 44d5b706..6e04e2b1 100644 --- a/lib/lrama/lexer.rb +++ b/lib/lrama/lexer.rb @@ -18,6 +18,10 @@ def to_s "#{super} line: #{line}, column: #{column}" end + def referred_by?(string) + [self.s_value, self.alias].include?(string) + end + @i = 0 @types = [] From 24f60a8fb850338dea715d36f023027a477669f7 Mon Sep 17 00:00:00 2001 From: Junichi Kobayashi Date: Mon, 12 Jun 2023 01:49:55 +0900 Subject: [PATCH 5/7] Add Token#alias --- lib/lrama/lexer.rb | 6 +++++- lib/lrama/parser.rb | 6 ++++++ lib/lrama/parser/token_scanner.rb | 4 ++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/lib/lrama/lexer.rb b/lib/lrama/lexer.rb index 6e04e2b1..665a25ef 100644 --- a/lib/lrama/lexer.rb +++ b/lib/lrama/lexer.rb @@ -7,7 +7,7 @@ class Lexer include Lrama::Report::Duration # s_value is semantic value - Token = Struct.new(:type, :s_value, keyword_init: true) do + Token = Struct.new(:type, :s_value, :alias, keyword_init: true) do Type = Struct.new(:id, :name, keyword_init: true) attr_accessor :line, :column, :referred @@ -22,6 +22,10 @@ def referred_by?(string) [self.s_value, self.alias].include?(string) end + def ==(other) + self.class == other.class && self.type == other.type && self.s_value == other.s_value + end + @i = 0 @types = [] diff --git a/lib/lrama/parser.rb 
b/lib/lrama/parser.rb
index 1de8a7a4..2296ea9b 100644
--- a/lib/lrama/parser.rb
+++ b/lib/lrama/parser.rb
@@ -175,6 +175,9 @@ def parse_grammar_rule(ts, grammar)
       # LHS
       lhs = ts.consume!(T::Ident_Colon) # class:
       lhs.type = T::Ident
+      if named_ref = ts.consume(T::Named_Ref)
+        lhs.alias = named_ref.s_value
+      end

       rhs = parse_grammar_rule_rhs(ts, grammar)

@@ -247,6 +250,9 @@ def parse_grammar_rule_rhs(ts, grammar)
           grammar.build_references(code)
           a << code
           ts.next
+        when T::Named_Ref
+          ts.previous_token.alias = ts.current_token.s_value
+          ts.next
         when T::Bar
           # |
           break
diff --git a/lib/lrama/parser/token_scanner.rb b/lib/lrama/parser/token_scanner.rb
index b9c1522a..1ff67b30 100644
--- a/lib/lrama/parser/token_scanner.rb
+++ b/lib/lrama/parser/token_scanner.rb
@@ -14,6 +14,10 @@ def current_type
         current_token && current_token.type
       end

+      def previous_token # token just before the cursor, or nil at the very start
+        @tokens[@index - 1] if @index > 0 # index -1 would wrap around to the last token
+      end
+
       def next
         token = current_token
         @index += 1

From 99d8b294564550bafab1b7e4fce3b56c421ede30 Mon Sep 17 00:00:00 2001
From: Junichi Kobayashi
Date: Mon, 12 Jun 2023 01:48:03 +0900
Subject: [PATCH 6/7] Numberize named references

---
 lib/lrama/lexer.rb  | 28 ++++++++++++++++++++++++++++
 lib/lrama/parser.rb |  7 ++++---
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/lib/lrama/lexer.rb b/lib/lrama/lexer.rb
index 665a25ef..fd79a46b 100644
--- a/lib/lrama/lexer.rb
+++ b/lib/lrama/lexer.rb
@@ -26,6 +26,34 @@ def ==(other)
         self.class == other.class && self.type == other.type && self.s_value == other.s_value
       end

+      # Rewrites named references ($name) in this user code to their numeric
+      # form: "$" when the name denotes lhs, otherwise the 1-based position
+      # of the matching token in rhs. Other references are left untouched.
+      #
+      # lhs - Token on the left-hand side of the rule.
+      # rhs - Array of Tokens that precede this user code in the rule.
+      #
+      # Raises RuntimeError when the name matches neither lhs nor any rhs token.
+      def numberize_references(lhs, rhs)
+        self.references.map! {|ref|
+          ref_name = ref[1]
+          if ref_name.is_a?(String) && ref_name != '$'
+            value =
+              if lhs.referred_by?(ref_name)
+                '$'
+              else
+                index = rhs.find_index {|token| token.referred_by?(ref_name) }
+                # find_index returns nil for an unknown name; fail with a clear message
+                raise "Referring symbol `#{ref_name}` is not found. #{self}" unless index
+                index + 1
+              end
+            [ref[0], value, ref[2], ref[3], ref[4]]
+          else
+            ref
+          end
+        }
+      end
+
       @i = 0
       @types = []

diff --git a/lib/lrama/parser.rb b/lib/lrama/parser.rb
index 2296ea9b..7d83e45d 100644
--- a/lib/lrama/parser.rb
+++ b/lib/lrama/parser.rb
@@ -179,7 +179,7 @@ def parse_grammar_rule(ts, grammar)
         lhs.alias = named_ref.s_value
       end

-      rhs = parse_grammar_rule_rhs(ts, grammar)
+      rhs = parse_grammar_rule_rhs(ts, grammar, lhs)

       grammar.add_rule(lhs: lhs, rhs: rhs, lineno: rhs.first ? rhs.first.line : lhs.line)

@@ -189,7 +189,7 @@ def parse_grammar_rule(ts, grammar)
           # |
           bar_lineno = ts.current_token.line
           ts.next
-          rhs = parse_grammar_rule_rhs(ts, grammar)
+          rhs = parse_grammar_rule_rhs(ts, grammar, lhs)
           grammar.add_rule(lhs: lhs, rhs: rhs, lineno: rhs.first ? rhs.first.line : bar_lineno)
         when T::Semicolon
           # ;
@@ -208,7 +208,7 @@ def parse_grammar_rule(ts, grammar)
       end
     end

-    def parse_grammar_rule_rhs(ts, grammar)
+    def parse_grammar_rule_rhs(ts, grammar, lhs)
       a = []
       prec_seen = false
       code_after_prec = false
@@ -247,6 +247,7 @@ def parse_grammar_rule_rhs(ts, grammar, lhs)
           end

           code = ts.current_token
+          code.numberize_references(lhs, a)
           grammar.build_references(code)
           a << code
           ts.next

From faa42259023ebf6b3b0034d7200cd3f21d161f6d Mon Sep 17 00:00:00 2001
From: Junichi Kobayashi
Date: Sat, 10 Jun 2023 02:50:35 +0900
Subject: [PATCH 7/7] Add parser specs

---
 spec/lrama/parser_spec.rb | 133 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)

diff --git a/spec/lrama/parser_spec.rb b/spec/lrama/parser_spec.rb
index 0056c042..365681ce 100644
--- a/spec/lrama/parser_spec.rb
+++ b/spec/lrama/parser_spec.rb
@@ -1143,6 +1143,139 @@ class : keyword_class tSTRING keyword_end { code 1 }
           ),
         ])
       end
+
+      context "includes named references" do
+        it "can parse" do
+          y = <<~INPUT
+%{
+// Prologue
+%} + +%union { + int i; +} + +%token NUM +%type expr + +%% + +input : /* empty */ + | input line +; + +line : '\\n' + | expr '\\n' + { printf("\\t%.10g\\n", $expr); } +; + +expr[result]: NUM + | expr[left] expr[right] '+' + { $result = $left + $right; } + | expr expr '-' + { $$ = $1 - $2; } +; + INPUT + grammar = Lrama::Parser.new(y).parse + + expect(grammar.rules).to eq([ + Rule.new( + id: 0, + lhs: grammar.find_symbol_by_s_value!("$accept"), + rhs: [ + grammar.find_symbol_by_s_value!("input"), + grammar.find_symbol_by_s_value!("YYEOF"), + ], + code: nil, + nullable: false, + precedence_sym: grammar.find_symbol_by_s_value!("YYEOF"), + lineno: 14, + ), + Rule.new( + id: 1, + lhs: grammar.find_symbol_by_s_value!("input"), + rhs: [ + ], + code: nil, + nullable: true, + precedence_sym: nil, + lineno: 14, + ), + Rule.new( + id: 2, + lhs: grammar.find_symbol_by_s_value!("input"), + rhs: [ + grammar.find_symbol_by_s_value!("input"), + grammar.find_symbol_by_s_value!("line"), + ], + code: nil, + nullable: false, + precedence_sym: nil, + lineno: 15, + ), + Rule.new( + id: 3, + lhs: grammar.find_symbol_by_s_value!("line"), + rhs: [ + grammar.find_symbol_by_s_value!("'\\n'"), + ], + code: nil, + nullable: false, + precedence_sym: grammar.find_symbol_by_s_value!("'\\n'"), + lineno: 18, + ), + Rule.new( + id: 4, + lhs: grammar.find_symbol_by_s_value!("line"), + rhs: [ + grammar.find_symbol_by_s_value!("expr"), + grammar.find_symbol_by_s_value!("'\\n'"), + ], + code: Code.new(type: :user_code, token_code: T.new(type: T::User_code, s_value: "{ printf(\"\\t%.10g\\n\", $expr); }")), + nullable: false, + precedence_sym: grammar.find_symbol_by_s_value!("'\\n'"), + lineno: 19, + ), + Rule.new( + id: 5, + lhs: grammar.find_symbol_by_s_value!("expr"), + rhs: [ + grammar.find_symbol_by_s_value!("NUM"), + ], + code: nil, + nullable: false, + precedence_sym: grammar.find_symbol_by_s_value!("NUM"), + lineno: 23, + ), + Rule.new( + id: 6, + lhs: grammar.find_symbol_by_s_value!("expr"), + 
rhs: [ + grammar.find_symbol_by_s_value!("expr"), + grammar.find_symbol_by_s_value!("expr"), + grammar.find_symbol_by_s_value!("'+'"), + ], + code: Code.new(type: :user_code, token_code: T.new(type: T::User_code, s_value: "{ $result = $left + $right; }")), + nullable: false, + precedence_sym: grammar.find_symbol_by_s_value!("'+'"), + lineno: 24, + ), + Rule.new( + id: 7, + lhs: grammar.find_symbol_by_s_value!("expr"), + rhs: [ + grammar.find_symbol_by_s_value!("expr"), + grammar.find_symbol_by_s_value!("expr"), + grammar.find_symbol_by_s_value!("'-'"), + ], + code: Code.new(type: :user_code, token_code: T.new(type: T::User_code, s_value: "{ $$ = $1 - $2; }")), + nullable: false, + precedence_sym: grammar.find_symbol_by_s_value!("'-'"), + lineno: 26, + ), + ]) + end + end end end end