diff --git a/compiler/surface/lexer.cppo.ml b/compiler/surface/lexer.cppo.ml index 992417c9b..755039338 100644 --- a/compiler/surface/lexer.cppo.ml +++ b/compiler/surface/lexer.cppo.ml @@ -358,6 +358,12 @@ let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")] (** Operator explicit typing suffix chars *) let op_kind_re = [%sedlex.regexp? "" | MR_MONEY_OP_SUFFIX | Chars "!.@^"] +(** Regexp matching every character except newlines *) +let any_but_eol = [%sedlex.regexp? (Sub (any, Chars "\n\r"))] + +(** Regexp matching newlines *) +let eol = [%sedlex.regexp? Opt '\r', '\n' ] + let op_kind = function | "" -> Ast.KPoly | "!" -> Ast.KInt @@ -376,7 +382,7 @@ let rec lex_code (lexbuf : lexbuf) : token = (* Whitespaces *) L.update_acc lexbuf; lex_code lexbuf - | '#', Star (Compl '\n'), '\n' -> + | '#', Star any_but_eol, eol -> (* Comments *) L.update_acc lexbuf; lex_code lexbuf @@ -737,7 +743,7 @@ let rec lex_directive_args (lexbuf : lexbuf) : token = | MR_EXTERNAL -> MODULE_EXTERNAL | Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf) | Plus hspace -> lex_directive_args lexbuf - | '\n' | eof -> + | eol | eof -> L.context := Law; END_DIRECTIVE | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme @@ -753,7 +759,7 @@ let rec lex_directive (lexbuf : lexbuf) : token = | ":" -> L.context := Directive_args; COLON - | '\n' | eof -> + | eol | eof -> L.context := Law; END_DIRECTIVE | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme @@ -765,19 +771,20 @@ let lex_raw (lexbuf : lexbuf) : token = if at_bol then match%sedlex lexbuf with | eof -> EOF - | "```", Star hspace, ('\n' | eof) -> + | "```", Star hspace, (eol | eof) -> L.context := Law; LAW_TEXT (Utf8.lexeme lexbuf) | _ -> ( (* Nested match for lower priority; `_` matches length 0 so we effectively retry the sub-match at the same point *) match%sedlex lexbuf with - | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf) + | Star (any_but_eol), (eol | eof) -> + LAW_TEXT (Utf8.lexeme lexbuf) | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme) else match%sedlex lexbuf with | eof -> EOF - | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf) + | Star any_but_eol, (eol | eof) -> LAW_TEXT (Utf8.lexeme lexbuf) | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme (** Main lexing function used outside code blocks *) @@ -788,10 +795,10 @@ let lex_law (lexbuf : lexbuf) : token = if at_bol then match%sedlex lexbuf with | eof -> EOF - | "```catala", Star white_space, ('\n' | eof) -> + | "```catala", Star white_space, (eol | eof) -> L.context := Code; BEGIN_CODE - | "```catala-metadata", Star white_space, ('\n' | eof) -> + | "```catala-metadata", Star white_space, (eol | eof) -> L.context := Code; BEGIN_METADATA | "```", Star (idchar | '-') -> @@ -800,18 +807,18 @@ let lex_law (lexbuf : lexbuf) : token = | '>' -> L.context := Directive; BEGIN_DIRECTIVE - | Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) -> + | Plus '#', Star hspace, Plus any_but_eol, Star hspace, (eol | eof) -> L.get_law_heading lexbuf | _ -> ( (* Nested match for lower priority; `_` matches length 0 so we effectively retry the sub-match at the same point *) match%sedlex lexbuf with - | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf) + | Star any_but_eol, (eol | eof) -> LAW_TEXT (Utf8.lexeme lexbuf) | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme) else match%sedlex lexbuf with | eof -> EOF - | Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf) + | Star any_but_eol, (eol | eof) -> LAW_TEXT (Utf8.lexeme lexbuf) | _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme (** Entry point of the lexer, distributes to {!val: lex_code} or {!val:lex_law} @@ -860,9 +867,9 @@ let line_dir_arg_upcase_re = let lex_line (lexbuf : lexbuf) : (string * L.line_token) option = match%sedlex lexbuf with | eof -> None - | "```catala-test-inline", Star hspace, ('\n' | eof) -> + | "```catala-test-inline", Star hspace, (eol | eof) -> Some (Utf8.lexeme lexbuf, LINE_INLINE_TEST) - | "```catala-test", Star (Compl '\n'), ('\n' | eof) -> + | "```catala-test", Star (any_but_eol), (eol | eof) -> let str = Utf8.lexeme lexbuf in (try let id = Re.Group.get (Re.exec line_test_id_re str) 1 in @@ -872,10 +879,10 @@ let lex_line (lexbuf : lexbuf) : (string * L.line_token) option = "Ignored invalid test section, must have an explicit \ `{ id = \"name\" }` specification"; Some (str, LINE_ANY)) - | "```", Star hspace, ('\n' | eof) -> + | "```", Star hspace, (eol | eof) -> Some (Utf8.lexeme lexbuf, LINE_BLOCK_END) - | '>', Star hspace, MR_LAW_INCLUDE, Star hspace, ':', Plus (Compl '\n'), - ('\n' | eof) -> + | '>', Star hspace, MR_LAW_INCLUDE, Star hspace, ':', Plus any_but_eol, + (eol | eof) -> let str = Utf8.lexeme lexbuf in (try let file = Re.Group.get (Re.exec line_dir_arg_re str) 1 in @@ -883,25 +890,25 @@ let lex_line (lexbuf : lexbuf) : (string * L.line_token) option = with Not_found -> Some (str, LINE_ANY)) | '>', Star hspace, MR_MODULE_DEF, Plus hspace, uppercase, Star (Compl white_space), Plus hspace, - MR_EXTERNAL, Star hspace, ('\n' | eof) -> + MR_EXTERNAL, Star hspace, (eol | eof) -> let str = Utf8.lexeme lexbuf in (try let mdl = Re.Group.get (Re.exec line_dir_arg_upcase_re str) 1 in Some (str, LINE_MODULE_DEF (mdl, true)) with Not_found -> Some (str, LINE_ANY)) - | '>', Star hspace, MR_MODULE_DEF, Plus hspace, uppercase, Star (Compl '\n'), - ('\n' | eof) -> + | '>', Star hspace, MR_MODULE_DEF, Plus hspace, uppercase, Star any_but_eol, + (eol | eof) -> let str = Utf8.lexeme lexbuf in (try let mdl = Re.Group.get (Re.exec line_dir_arg_upcase_re str) 1 in Some (str, LINE_MODULE_DEF (mdl, false)) with Not_found -> Some (str, LINE_ANY)) - | '>', Star hspace, MR_MODULE_USE, Plus hspace, uppercase, Star (Compl '\n'), - ('\n' | eof) -> + | '>', Star hspace, MR_MODULE_USE, Plus hspace, uppercase, Star (any_but_eol), + (eol | eof) -> let str = Utf8.lexeme lexbuf in (try let mdl = Re.Group.get (Re.exec line_dir_arg_upcase_re str) 1 in Some (str, LINE_MODULE_USE mdl) with Not_found -> Some (str, LINE_ANY)) - | Star (Compl '\n'), ('\n' | eof) -> Some (Utf8.lexeme lexbuf, LINE_ANY) + | Star any_but_eol, (eol | eof) -> Some (Utf8.lexeme lexbuf, LINE_ANY) | _ -> assert false