From 95a24cb2efdd805c21aa8eb759864d3bb89b5fd4 Mon Sep 17 00:00:00 2001 From: Ivan Raikov Date: Mon, 10 Sep 2018 21:23:48 -0700 Subject: [PATCH] initial commit of C5 files --- lexgen.egg | 11 ++ lexgen.release-info | 7 ++ lexgen.scm | 298 ++++++++++++++++++++++++++++++++++++++++++++ tests/run.scm | 249 ++++++++++++++++++++++++++++++++++++ 4 files changed, 565 insertions(+) create mode 100644 lexgen.egg create mode 100644 lexgen.release-info create mode 100644 lexgen.scm create mode 100644 tests/run.scm diff --git a/lexgen.egg b/lexgen.egg new file mode 100644 index 0000000..433cf0f --- /dev/null +++ b/lexgen.egg @@ -0,0 +1,11 @@ +;; -*- Hen -*- + +((synopsis "Lexer combinators") + (license "GPL-3") + (category parsing) + (dependencies srfi-1 utf8 yasos) + (test-dependencies test srfi-14) + (author "Ivan Raikov") + (components (extension lexgen)) + ) + diff --git a/lexgen.release-info b/lexgen.release-info new file mode 100644 index 0000000..0967e13 --- /dev/null +++ b/lexgen.release-info @@ -0,0 +1,7 @@ +;; -*- scheme -*- +(repo git "git://github.com/iraikov/chicken-lexgen.git") +(uri targz "https://github.com/iraikov/chicken-lexgen/tarball/{egg-release}") +(uri files-list "http://code.call-cc.org/files-list?egg={egg-name};egg-release={egg-release};chicken-release={chicken-release}" old-uri) + +(release "8.0") + diff --git a/lexgen.scm b/lexgen.scm new file mode 100644 index 0000000..a3d694a --- /dev/null +++ b/lexgen.scm @@ -0,0 +1,298 @@ +;; +;; Lexer combinator library. +;; +;; Based on the SML lexer generator by Thant Tessman. +;; +;; Copyright 2009-2018 Ivan Raikov. +;; +;; +;; This program is free software: you can redistribute it and/or +;; modify it under the terms of the GNU General Public License as +;; published by the Free Software Foundation, either version 3 of the +;; License, or (at your option) any later version. +;; +;; This program is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; A full copy of the GPL license can be found at +;; . + +(module lexgen + + ( seq star bar + try pass pos opt lst + bind bind* rebind rebind* drop + lex tok char range set lit + ) + + + (import scheme (chicken base) + (only srfi-1 first second filter-map fold concatenate every lset<= ) + yasos yasos-collections + utf8 utf8-srfi-14) + + +;; +;; This is a lexer generator comprised in its core of five small +;; functions. The programmer assembles these functions into regular +;; expression pattern-matching functions. +;; +;; The idea is that a pattern matcher function takes a list of +;; streams, and returns a new list of streams advanced by every +;; combination allowed by the pattern matcher function. +;; +;; A stream is a list that can take one of two forms: +;; +;; 1) A list of two elements: the first element is a list of +;; elements consumed by the pattern matcher; the second element is a +;; list of characters not yet consumed. E.g., the list +;; +;; ((a) (b c d e)) +;; +;; represents a stream that contains the consumed character a, +;; and the unconsumed characters b c d e. +;; +;; 2) A list of three elements: the first two elements are as +;; before, but the third element is a procedure that is applied to +;; the tail of the unconsumed list, in order to obtain the next +;; character. E.g., the list: +;; +;; ((a) (b ) +;; +;; represents a stream that contains the consumed character a, the +;; unconsumed character b, and an input port to read subsequent +;; character from; and a procedure that reads one character from the +;; input port, and returns it along with the modified port. Note +;; that the use of side-effecting structures such as ports will lead +;; to erroneous results with backtracking parsers. +;; +;; Also note that the number of streams returned by the function +;; typically won't match the number of streams passed in. If the +;; pattern doesn't match at all, the empty list is returned. +;; + + +(define *eoi-object* (read (open-input-string ""))) +(define (eoi? x) (equal? x *eoi-object*)) +(define (make-eoi) *eoi-object*) + + +;; This matches a sequence of patterns. + +(define (seq p1 p2) + (lambda (sk fk strm) + (p1 (lambda (strm1) (p2 sk fk strm1)) fk strm))) + +;; This matches either one of two patterns. It's analogous to patterns +;; separated by the '|' in regular expressions. + +(define (bar p1 p2) + (lambda (sk fk strm) + (p1 sk (lambda _ (p2 sk fk strm)) strm))) + + +;; Kleene closure. Analogous to '*' + +(define (star p) + (lambda (sk fk strm) + (p (lambda (strm1) + (if (eoi? (cadr strm1)) (sk strm1) + ((star p) sk sk strm1))) sk strm))) + +;; this parser always succeeds + +(define (pass sk fk s) (sk s)) + +;; Positive closure. Analogous to '+' + +(define (pos pat) (seq pat (star pat))) + +;; Optional pattern. Analogous to '?' + +(define (opt pat) (bar pat pass)) + +;; Matches a consecutive list of patterns + +(define (lst ps) + (let ((ps (reverse ps))) + (let recur ((ps (cdr ps)) (p1 (car ps))) + (cond ((null? ps) p1) + (else (recur (cdr ps) (seq (car ps) p1))))))) + + +;; datatype used by bind and drop +(define-record-type box (make-box contents) + box? (contents box-contents )) + +(define box make-box) +(define unbox box-contents) + +;; Given a list (X_1 ... X_n), returns a list ( (X_1 ... X_(n-1)) X_n ) +(define-inline (split-at-last x) + (if (null? x) (list #f (list)) + (let loop ((prev (list (car x))) (rest (cdr x))) + (cond ((null? rest) + (if (null? (cdr prev)) + (list '() (car prev)) + (list (reverse (cdr prev)) (car prev)))) + (else (loop (cons (car rest) prev) (cdr rest))))))) + +;; helpers for bind +(define-inline (bind-apply f) + (lambda (s) + (cond ((pair? s) + (let ((eaten (car s)) + (food (cadr s))) + (let* ((ep (split-at-last eaten)) + (eaten1 (car ep)) + (eaten0 (cadr ep))) + (assert (box? eaten0)) + (let ((x (and (list? eaten1) (f eaten1)))) + (if x + (list (append x (unbox eaten0)) food) + (list (unbox eaten0) food))) + ))) + (else s)))) + +(define-inline (box-stream s) + (cond ((pair? s) + (let ((eaten (car s)) + (food (cadr s))) + (list (list (box eaten)) food))) + (else s))) + +;; Binds a procedure f to the consumed tokens returned by p +;; Calls failure on empty input +(define (bind f p) + (let ((ba (bind-apply f))) + (lambda (sk fk s) + (if (eoi? (cadr s)) + (fk s) + (let ((sk1 (lambda (s1) (sk (ba s1)))) + (fk1 (lambda (s1) (fk s)))) + (p sk1 fk1 (box-stream s))))))) + +;; Same as bind, but calls success on empty input +(define (bind* f p) + (let ((ba (bind-apply f))) + (lambda (sk fk s) + (if (eoi? (cadr s)) + (sk (ba (box-stream s))) + (let ((sk1 (lambda (s1) (sk (ba s1)))) + (fk1 (lambda (s1) (fk s)))) + (p sk1 fk1 (box-stream s))))))) + + +(define (drop p) + (bind (lambda x #f) p)) + + +;; helpers for rebind +(define-inline (rebind-apply g) + (lambda (i s) + (cond ((pair? s) + (let ((eaten (car s)) + (food (cdr s))) + (let* ((ep (split-at-last eaten)) + (eaten1 (car ep)) + (eaten0 (cadr ep))) + (assert (box? eaten0)) + (let* ((x (and (list? eaten1) (g i eaten1))) + (res (if x (cons (append x (unbox eaten0)) food) + (cons (unbox eaten0) food)))) + res)))) + (else s)))) + +;; Applies a procedure f to the un-consumed tokens, then applies +;; procedure g to the result of f and the tokens returned by p +;; Calls failure on empty input +(define (rebind f g p) + (let ((ra (rebind-apply g))) + (lambda (sk fk s) + (if (eoi? (cadr s)) + (fk s) + (let* ((info ((compose f cadr) s)) + (sk1 (lambda (s) (sk (ra info s))))) + (p sk1 fk (box-stream s))))))) + +;; Same as rebind, but calls success on empty input +(define (rebind* f g p) + (let ((ra (rebind-apply g))) + (lambda (sk fk s) + (if (eoi? (cadr s)) + (sk s) + (let* ((info ((compose f cadr) s)) + (sk1 (lambda (s) (sk (ra info s))))) + (p sk1 fk (box-stream s))))))) + + +;; This takes a pattern and a string, turns the string into a list of +;; streams (containing one stream), applies the pattern, and returns +;; the longest match. + +(define (->char-list s) + (if (string? s) (list (string->list s)) s)) + +(define (lex pat error ss) + (let* ((stream (cond ((string? ss) `(() . ,(->char-list ss))) + ((pair? ss) ss) + (else (error ss))))) + (pat (lambda (s) (list (reverse (first s)) (second s))) + (lambda (s) (error s)) stream))) + + + +;; 'tok' builds a pattern matcher function that applies procedure p to +;; a given token and an input character. If the procedure returns a +;; true value, that value is prepended to the list of consumed +;; elements, and the input character is removed from the list of input +;; elements. + +(define (tok t p) + (lambda (sk fk strm) + (let ((c (car strm)) + (u (cadr strm))) + (cond ((eoi? u) (fk strm)) + ((null? u) (fk (list c (make-eoi)))) + ((p t (lseq-first u)) => + (lambda (ans) (sk (list (cons ans c) (lseq-rest u))))) + (else (fk strm)) + ))) + ) + + + +;; Converts a binary predicate procedure to a binary procedure that +;; returns its right argument when the predicate is true, and false +;; otherwise. + +(define (try p) (lambda (x y) (let ((res (p x y))) (and res y)))) + + +;; Matches a single character + +(define (char c) (tok c (try char=?))) + +;; Matches any of a SRFI-14 set of characters. + +(define (set s) + (let ((cs (if (char-set? s) s (list->char-set (if (string? s) (string->list s) s))))) + (tok cs (try char-set-contains?)))) + +;; Range of characters. Analogous to character class '[]' + +(define (range a b) + (if (charchar-set (char->integer a) (+ 1 (char->integer b))) + (try char-set-contains?)))) + +;; Matches a literal string s + +(define (lit s) + (let ((f (lambda (t) (tok t (try char=?))))) + (lst (map f (if (string? s) (string->list s) s))))) + + +) diff --git a/tests/run.scm b/tests/run.scm new file mode 100644 index 0000000..d265036 --- /dev/null +++ b/tests/run.scm @@ -0,0 +1,249 @@ + +(require-extension scheme (chicken base) + (chicken format) + srfi-1 srfi-14 + test lexgen yasos yasos-collections) + +(define a-pat (tok #\a (try char=?))) +(define b-pat (tok #\b (try char=?))) +(define a-then-b-pat (seq a-pat b-pat)) +(define a-or-b-pat (bar a-pat b-pat)) +(define a-star-pat (star a-pat)) +(define a-star-or-b-pat (bar (star a-pat) b-pat)) +(define a-or-b-star-pat (star a-or-b-pat)) +(define a-b-opt-pat (seq a-pat (opt b-pat))) +(define b-opt-a-pat (seq (opt b-pat) a-pat)) +(define a-b-opt-a-pat (seq a-pat (seq (opt b-pat) a-pat))) +(define a-star-b-opt-pat (seq (star a-pat) (opt b-pat))) +(define aabac-pat (lit "aabac")) +(define drop-aabac-pat (drop aabac-pat)) +(define aa-pat (lit "aa")) +(define n4-pat (lst (list-tabulate 4 (lambda (i) (range #\0 #\9))))) + +(define abc-stream `(() ,(string->list "abc"))) +(define bac-stream `(() ,(string->list "bac"))) +(define aabac-stream `(() ,(string->list "aabac"))) +(define aaaabac-stream `(() ,(string->list "aaaabac"))) +(define num-stream `(() ,(string->list "1234"))) + + +(define (err s) + (print "lexical error on stream: " s) + `(error)) + +(test-group "lexgen test" + (test (sprintf "match [a] on ~S" "abc") + `((#\a) (#\b #\c)) (a-pat identity err abc-stream)) + + (test (sprintf "match [b] on ~S" "abc") + `(error) (b-pat identity err abc-stream)) + + (test (sprintf "match ab on ~S" "abc") + `((#\b #\a ) ( #\c)) + (a-then-b-pat identity err abc-stream)) + + (test (sprintf "match a|b on ~S" "abc") + `((#\a) (#\b #\c)) + (a-or-b-pat identity err abc-stream)) + + (test (sprintf "match a|b on ~S" "bac") + `((#\b) (#\a #\c)) + (a-or-b-pat identity err bac-stream)) + + (test (sprintf "match a* on ~S" "abc") + `((#\a) (#\b #\c)) + (a-star-pat identity err abc-stream)) + + (test (sprintf "match a* on ~S" "aabac") + `((#\a #\a) (#\b #\a #\c)) + (a-star-pat identity err aabac-stream)) + + (test (sprintf "match (a*|b) on ~S" "aabac") + `((#\a #\a) (#\b #\a #\c)) + (a-star-or-b-pat identity err aabac-stream)) + + (test (sprintf "match (a|b)* on ~S" "abc") + `((#\b #\a) (#\c)) + (a-or-b-star-pat identity err abc-stream)) + + (test (sprintf "match (a|b)* on ~S" "aabac") + `((#\a #\b #\a #\a) (#\c)) + (a-or-b-star-pat identity err aabac-stream)) + + (test (sprintf "match ab? on ~S" "abc") + `((#\b #\a) (#\c)) + (a-b-opt-pat identity err abc-stream)) + + (test (sprintf "match ab? on ~S" "aabac") + `((#\a) (#\a #\b #\a #\c)) + (a-b-opt-pat identity err aabac-stream)) + + (test (sprintf "match b?a on ~S" "abc") + `((#\a) (#\b #\c)) + (b-opt-a-pat identity err abc-stream)) + + (test (sprintf "match ab?a on ~S" "aabac") + `((#\a #\a) (#\b #\a #\c)) + (a-b-opt-a-pat identity err aabac-stream)) + + (test (sprintf "match a*b? on ~S" "aabac") + `((#\b #\a #\a) (#\a #\c)) + (a-star-b-opt-pat identity err aabac-stream)) + + (test (sprintf "match literal string ~S" "aabac") + `((#\c #\a #\b #\a #\a) ()) + (aabac-pat identity err aabac-stream)) + + (test (sprintf "match and drop literal string ~S" "aabac") + `(() ()) + (drop-aabac-pat identity err aabac-stream)) + + (test (sprintf "match n4 on ~S" "1234") + `((#\4 #\3 #\2 #\1) ()) + (n4-pat identity err num-stream)) + + ) +;; A pattern to match floating point numbers. +;; "-"?(([0-9]+(\\.[0-9]+)?)|(\\.[0-9]+))([eE][+-]?[0-9]+)? + +(define numpat + (let* ((digit (range #\0 #\9)) + (digits (pos digit)) + (fraction (seq (char #\.) digits)) + (significand (bar (seq digits (opt fraction)) fraction)) + (exp (seq (set "eE") (seq (opt (set "+-")) digits))) + (sign (opt (char #\-)))) + (seq sign (seq significand (opt exp))))) + +(print (lex numpat err "-123.45e-6")) + +(test-group "lexgen numpat test" + (test (sprintf "match numpat on ~S" "-123.45e-6") + `(#\- #\1 #\2 #\3 #\. #\4 #\5 #\e #\- #\6) + (car (lex numpat err "-123.45e-6"))) + (test (sprintf "match numpat on ~S" "hi there") + `(error) + (lex numpat err "hi there"))) + +(define (->char-list s) + (if (string? s) (string->list s) s)) + +(define (collect cs) + (let loop ((cs cs) (ax (list))) + (cond ((null? cs) `(,(list->string ax))) + ((atom? (car cs)) (loop (cdr cs) (cons (car cs) ax))) + (else (cons (list->string ax) cs))))) + +(define (make-exp x) + (or (and (pair? x) + (let ((x1 (collect x))) + (list `(exp . ,x1)))) x)) + +(define (make-significand x) + (or (and (pair? x) + (let ((x1 (collect x))) + (cons `(significand ,(car x1)) (cdr x1)))) x)) + +(define (make-sign x) + (or (and (pair? x) + (let ((x1 (collect x))) + (cons `(sign ,(car x1)) (cdr x1)))) x)) + +(define (check s) (lambda (s1) (if (null? s1) (err s) s1))) + +(define bnumpat + (let* ((digit (range #\0 #\9)) + (digits (star digit)) + (fraction (seq (char #\.) digits)) + (significand (bar (seq digits (opt fraction)) fraction)) + (exp (seq (set "eE") (seq (opt (set "+-")) digits))) + (sign (opt (char #\-)) ) + (pat (seq (bind make-sign sign) + (seq (bind make-significand significand) + (bind make-exp (opt exp)))))) + pat)) + +(define (num-parser s) (car (lex bnumpat err s))) + + +(test-group "lexgen num-parser test" + (test (sprintf "match num-parser on ~S" "-123.45e-6") + `((sign "-") (significand "123.45") (exp "e-6")) + (num-parser "-123.45e-6")) ) + + + +;; Tokens with position information + + +(define-record-type postok + (make-postok pos token) + postok? + (pos postok-pos ) + (token postok-token ) + ) + +(define pos? pair?) +(define pos-row car) +(define pos-col cdr) +(define make-pos cons) + +(define-record-printer (postok x out) + (fprintf out "#" + (postok-pos x) + (postok-token x))) + +(define (getpos p) + (let ((f (lambda (in) (and (pair? in) (postok-pos (car in))))) + (g (lambda (i s) (list (make-postok i (car s)))))) + (rebind f g p))) + +;; (define pos- +;; (let ((pos-tail +;; (lambda (strm) +;; (cond ((or (null? strm) (null? (cdr strm))) '()) +;; (else +;; (let* ((curtok (car strm)) +;; (pos0 (postok-pos curtok)) +;; (pos1 (let ((row0 (pos-row pos0)) +;; (col0 (pos-col pos0))) +;; (case (cadr strm) +;; ((#\newline) (make-pos (+ 1 row0) 1)) +;; ((#\return) (make-pos row0 1)) +;; (else (make-pos row0 (+ 1 col0)))))) +;; (res (cons (make-postok pos1 (cadr strm)) (cddr strm)))) +;; res))))) +;; (pos-null? null?) +;; (pos-head (compose postok-token car))) +;; (make- pos-null? pos-head pos-tail))) + +(define (make-pos-stream strm) + (let ((begpos (make-pos 1 1))) + `(() ,(cons (make-postok begpos (car strm)) (cdr strm))))) + +(define pos-numpat-stream + (make-pos-stream (string->list "-123.45e-6"))) + +;; (define pbnumpat +;; (let* ((digit (pos/range #\0 #\9)) +;; (digits (star digit)) +;; (fraction (seq (pos/char #\.) digits)) +;; (significand (bar (seq digits (opt fraction)) fraction)) +;; (exp (seq (pos/set "eE") (seq (opt (pos/set "+-")) digits))) +;; (sign (opt (pos/char #\-)) ) +;; (pat (seq (getpos (bind make-sign sign)) +;; (seq (getpos (bind make-significand significand)) +;; (getpos (bind make-exp (opt exp))))))) +;; pat)) + +(define (pos-num-parser s) (car (lex pbnumpat err s))) + +;; (test-group "lexgen pos-num-parser test" +;; (test (sprintf "match pos-num-parser on ~S" "-123.45e-6") +;; `(,(make-postok (make-pos 1 1) `(sign "-")) +;; ,(make-postok (make-pos 1 2) `(significand "123.45")) +;; ,(make-postok (make-pos 1 8) `(exp "e-6"))) +;; (pos-num-parser pos-numpat-stream)) +;; ) + +(test-exit)