Skip to content

Commit

Permalink
Handle carriage returns in the lexer (#770)
Browse files Browse the repository at this point in the history
  • Loading branch information
AltGr authored Jan 20, 2025
2 parents 5f43c27 + 959ef6b commit ba3b268
Showing 1 changed file with 29 additions and 22 deletions.
51 changes: 29 additions & 22 deletions compiler/surface/lexer.cppo.ml
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,12 @@ let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
(** Operator explicit typing suffix chars *)
let op_kind_re = [%sedlex.regexp? "" | MR_MONEY_OP_SUFFIX | Chars "!.@^"]

(** Regexp matching every character except newlines *)
let any_but_eol = [%sedlex.regexp? (Sub (any, Chars "\n\r"))]

(** Regexp matching newlines *)
let eol = [%sedlex.regexp? Opt '\r', '\n' ]

let op_kind = function
| "" -> Ast.KPoly
| "!" -> Ast.KInt
Expand All @@ -376,7 +382,7 @@ let rec lex_code (lexbuf : lexbuf) : token =
(* Whitespaces *)
L.update_acc lexbuf;
lex_code lexbuf
| '#', Star (Compl '\n'), '\n' ->
| '#', Star any_but_eol, eol ->
(* Comments *)
L.update_acc lexbuf;
lex_code lexbuf
Expand Down Expand Up @@ -737,7 +743,7 @@ let rec lex_directive_args (lexbuf : lexbuf) : token =
| MR_EXTERNAL -> MODULE_EXTERNAL
| Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
| Plus hspace -> lex_directive_args lexbuf
| '\n' | eof ->
| eol | eof ->
L.context := Law;
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
Expand All @@ -753,7 +759,7 @@ let rec lex_directive (lexbuf : lexbuf) : token =
| ":" ->
L.context := Directive_args;
COLON
| '\n' | eof ->
| eol | eof ->
L.context := Law;
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
Expand All @@ -765,19 +771,20 @@ let lex_raw (lexbuf : lexbuf) : token =
if at_bol then
match%sedlex lexbuf with
| eof -> EOF
| "```", Star hspace, ('\n' | eof) ->
| "```", Star hspace, (eol | eof) ->
L.context := Law;
LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> (
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
sub-match at the same point *)
match%sedlex lexbuf with
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| Star (any_but_eol), (eol | eof) ->
LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
else
match%sedlex lexbuf with
| eof -> EOF
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| Star any_but_eol, (eol | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme

(** Main lexing function used outside code blocks *)
Expand All @@ -788,10 +795,10 @@ let lex_law (lexbuf : lexbuf) : token =
if at_bol then
match%sedlex lexbuf with
| eof -> EOF
| "```catala", Star white_space, ('\n' | eof) ->
| "```catala", Star white_space, (eol | eof) ->
L.context := Code;
BEGIN_CODE
| "```catala-metadata", Star white_space, ('\n' | eof) ->
| "```catala-metadata", Star white_space, (eol | eof) ->
L.context := Code;
BEGIN_METADATA
| "```", Star (idchar | '-') ->
Expand All @@ -800,18 +807,18 @@ let lex_law (lexbuf : lexbuf) : token =
| '>' ->
L.context := Directive;
BEGIN_DIRECTIVE
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
| Plus '#', Star hspace, Plus any_but_eol, Star hspace, (eol | eof) ->
L.get_law_heading lexbuf
| _ -> (
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
sub-match at the same point *)
match%sedlex lexbuf with
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| Star any_but_eol, (eol | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
else
match%sedlex lexbuf with
| eof -> EOF
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| Star any_but_eol, (eol | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme

(** Entry point of the lexer, distributes to {!val: lex_code} or {!val:lex_law}
Expand Down Expand Up @@ -860,9 +867,9 @@ let line_dir_arg_upcase_re =
let lex_line (lexbuf : lexbuf) : (string * L.line_token) option =
match%sedlex lexbuf with
| eof -> None
| "```catala-test-inline", Star hspace, ('\n' | eof) ->
| "```catala-test-inline", Star hspace, (eol | eof) ->
Some (Utf8.lexeme lexbuf, LINE_INLINE_TEST)
| "```catala-test", Star (Compl '\n'), ('\n' | eof) ->
| "```catala-test", Star (any_but_eol), (eol | eof) ->
let str = Utf8.lexeme lexbuf in
(try
let id = Re.Group.get (Re.exec line_test_id_re str) 1 in
Expand All @@ -872,36 +879,36 @@ let lex_line (lexbuf : lexbuf) : (string * L.line_token) option =
"Ignored invalid test section, must have an explicit \
`{ id = \"name\" }` specification";
Some (str, LINE_ANY))
| "```", Star hspace, ('\n' | eof) ->
| "```", Star hspace, (eol | eof) ->
Some (Utf8.lexeme lexbuf, LINE_BLOCK_END)
| '>', Star hspace, MR_LAW_INCLUDE, Star hspace, ':', Plus (Compl '\n'),
('\n' | eof) ->
| '>', Star hspace, MR_LAW_INCLUDE, Star hspace, ':', Plus any_but_eol,
(eol | eof) ->
let str = Utf8.lexeme lexbuf in
(try
let file = Re.Group.get (Re.exec line_dir_arg_re str) 1 in
Some (str, LINE_INCLUDE file)
with Not_found -> Some (str, LINE_ANY))
| '>', Star hspace, MR_MODULE_DEF, Plus hspace,
uppercase, Star (Compl white_space), Plus hspace,
MR_EXTERNAL, Star hspace, ('\n' | eof) ->
MR_EXTERNAL, Star hspace, (eol | eof) ->
let str = Utf8.lexeme lexbuf in
(try
let mdl = Re.Group.get (Re.exec line_dir_arg_upcase_re str) 1 in
Some (str, LINE_MODULE_DEF (mdl, true))
with Not_found -> Some (str, LINE_ANY))
| '>', Star hspace, MR_MODULE_DEF, Plus hspace, uppercase, Star (Compl '\n'),
('\n' | eof) ->
| '>', Star hspace, MR_MODULE_DEF, Plus hspace, uppercase, Star any_but_eol,
(eol | eof) ->
let str = Utf8.lexeme lexbuf in
(try
let mdl = Re.Group.get (Re.exec line_dir_arg_upcase_re str) 1 in
Some (str, LINE_MODULE_DEF (mdl, false))
with Not_found -> Some (str, LINE_ANY))
| '>', Star hspace, MR_MODULE_USE, Plus hspace, uppercase, Star (Compl '\n'),
('\n' | eof) ->
| '>', Star hspace, MR_MODULE_USE, Plus hspace, uppercase, Star (any_but_eol),
(eol | eof) ->
let str = Utf8.lexeme lexbuf in
(try
let mdl = Re.Group.get (Re.exec line_dir_arg_upcase_re str) 1 in
Some (str, LINE_MODULE_USE mdl)
with Not_found -> Some (str, LINE_ANY))
| Star (Compl '\n'), ('\n' | eof) -> Some (Utf8.lexeme lexbuf, LINE_ANY)
| Star any_but_eol, (eol | eof) -> Some (Utf8.lexeme lexbuf, LINE_ANY)
| _ -> assert false

0 comments on commit ba3b268

Please sign in to comment.