mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-04-29 19:30:36 +02:00
* test-suite/tests/peg.test (comment-grammar): Z can be anything. ("simple comment with forbidden char"): Remove. (html-grammar, html-example): New variables. ("parsing with complex grammars"): New test. Signed-off-by: Ludovic Courtès <ludo@gnu.org>
399 lines
16 KiB
Text
399 lines
16 KiB
Text
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;;; PEG test suite.
|
|
;; Tests the parsing capabilities of (ice-9 peg). Could use more
|
|
;; tests for edge cases.
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
(define-module (test-suite test-peg)
|
|
:use-module (test-suite lib)
|
|
:use-module (ice-9 peg)
|
|
:use-module (ice-9 pretty-print)
|
|
:use-module (srfi srfi-1))
|
|
|
|
;; Doubled up for pasting into REPL.
|
|
(use-modules (test-suite lib))
|
|
(use-modules (ice-9 peg))
|
|
(use-modules (ice-9 pretty-print))
|
|
(use-modules (srfi srfi-1))
|
|
|
|
;; Evaluate FORM with `eval' in the interaction environment, i.e. at
;; the toplevel.  Running there means that any symbols FORM defines
;; are bound globally instead of in the scope of the enclosing pass-if
;; expression.  Returns whatever `eval' returns for FORM.
(define eeval
  (lambda (form)
    (eval form (interaction-environment))))
|
|
;; Local alias for the `make-prec' binding of (ice-9 peg), reached with
;; `@@'.  The tests in this file call it to build the values that
;; `match-pattern' results are compared against.
;; NOTE(review): the arguments look like start offset, end offset,
;; input string and parse tree -- confirm against (ice-9 peg).
(define make-prec (@@ (ice-9 peg) make-prec))
|
|
|
|
;; Association list taking each nonterminal defined in the PEG parser
;; written as a PEG to the matching nonterminal defined in the PEG
;; parser written with S-expressions.  Each entry has the shape
;; (NAME NAME): every nonterminal maps to itself, so the table is
;; generated from a single list of names.
(define grammar-mapping
  (map (lambda (nonterminal)
         (list nonterminal nonterminal))
       '(Grammar Definition Expression Sequence Prefix Suffix Primary
         Identifier Literal Class NotInClass Range Char
         LEFTARROW AND NOT QUESTION STAR PLUS DOT)))
|
|
|
|
;; Translate X through `grammar-mapping': when X is a key of the table
;; the mapped nonterminal is returned, otherwise X comes back
;; unchanged.
(define (grammar-transform x)
  (cond ((assoc x grammar-mapping) => cadr)
        (else x)))
|
|
|
|
;; Map FN over the atoms of a tree.  A proper list (including the
;; empty list) is rebuilt with every element transformed recursively;
;; anything else is treated as an atom and replaced by (FN atom).
;;
;; `list?' is tested once per list and the elements are walked with
;; `map'.  Recursing on the cdr instead would re-run the linear
;; `list?' check on every tail, which is quadratic in the list length.
(define (tree-map fn lst)
  (if (list? lst)
      (map (lambda (subtree) (tree-map fn subtree)) lst)
      (fn lst)))
|
|
|
|
;; Checks that the PEG grammar for PEGs, itself written as a PEG, can
;; be defined, and that parsing that PEG text with the resulting
;; `Grammar' nonterminal gives the same tree (after renaming
;; nonterminals with `grammar-transform') as parsing it with the
;; S-expression grammar `peg-grammar' -- i.e. that the two
;; formulations of the grammar are equivalent.
(with-test-prefix "PEG Grammar"
  (pass-if "defining PEGs with PEG"
    (let ((peg-source (@@ (ice-9 peg) peg-as-peg)))
      ;; Only the truth value of the definition matters here.
      (if (eeval `(define-peg-string-patterns ,peg-source)) #t #f)))
  (pass-if "equivalence of definitions"
    (let* ((peg-source (@@ (ice-9 peg) peg-as-peg))
           (sexp-tree
            (peg:tree (match-pattern (@@ (ice-9 peg) peg-grammar)
                                     peg-source)))
           (string-tree
            (peg:tree (match-pattern Grammar peg-source))))
      (equal? sexp-tree
              (tree-map grammar-transform string-tree)))))
|
|
|
|
;; A grammar for pascal-style comments from Wikipedia, as PEG source
;; text.  It is turned into patterns at run time by the "Parsing"
;; tests.  The string holds exactly one rule per line; stray lines
;; consisting only of `|' that had crept into the literal were
;; removed because they are not part of the grammar.
(define comment-grammar
  "Begin <-- '(*'
End <-- '*)'
C <- Begin N* End
N <- C / (!Begin !End Z)
Z <- .")
|
|
|
|
;; A short /etc/passwd file: six newline-terminated entries and
;; nothing else.  The "/etc/passwd" test expects a match that ends at
;; offset 220, which is the length of exactly these six lines, so the
;; literal must not contain any other characters.
(define *etc-passwd*
  "root:x:0:0:root:/root:/bin/bash
daemon:x:1:1:daemon:/usr/sbin:/bin/sh
bin:x:2:2:bin:/bin:/bin/sh
sys:x:3:3:sys:/dev:/bin/sh
nobody:x:65534:65534:nobody:/nonexistent:/bin/sh
messagebus:x:103:107::/var/run/dbus:/bin/false
")
|
|
|
|
;; A grammar for parsing /etc/passwd files.  `passwd' is the start
;; symbol: any number of entries followed by end of input.  Each entry
;; is seven colon-separated fields, optionally followed by newlines.
;; The string holds one rule per line; stray lines consisting only of
;; `|' that had crept into the literal were removed because they are
;; not part of the grammar.
(define-peg-string-patterns
  "passwd <-- entry* !.
entry <-- login CO pass CO uid CO gid CO nameORcomment CO homedir CO shell NL*
login <-- text
pass <-- text
uid <-- [0-9]*
gid <-- [0-9]*
nameORcomment <-- text
homedir <-- path
shell <-- path
path <-- (SLASH pathELEMENT)*
pathELEMENT <-- (!NL !CO !'/' .)*
text <- (!NL !CO .)*
CO < ':'
NL < '\n'
SLASH < '/'")
|
|
|
|
;; Tests some actual parsing using PEGs: first the pascal-style
;; comment grammar (defined at the toplevel from `comment-grammar'),
;; then the /etc/passwd grammar.  Each positive test compares the
;; whole match record against one built with `make-prec'.
(with-test-prefix "Parsing"
  (eeval `(define-peg-string-patterns ,comment-grammar))

  ;; Pascal-style comment parsing
  (pass-if "simple comment"
    (let ((input "(*blah*)"))
      (equal? (match-pattern C input)
              (make-prec 0 8 input
                         '((Begin "(*") "blah" (End "*)"))))))
  (pass-if "simple comment padded"
    (let ((input "(*blah*)abc"))
      (equal? (match-pattern C input)
              (make-prec 0 8 input
                         '((Begin "(*") "blah" (End "*)"))))))
  (pass-if "nested comment"
    (let ((input "(*1(*2*)*)"))
      (equal? (match-pattern C input)
              (make-prec 0 10 input
                         '((Begin "(*")
                           ("1" ((Begin "(*") "2" (End "*)")))
                           (End "*)"))))))
  (pass-if "early termination"
    (not (match-pattern C "(*blah")))
  (pass-if "never starts"
    (not (match-pattern C "blah")))

  ;; /etc/passwd parsing
  (pass-if "/etc/passwd"
    (let ((expected-tree
           '(passwd
             (entry (login "root") (pass "x") (uid "0") (gid "0")
                    (nameORcomment "root")
                    (homedir (path (pathELEMENT "root")))
                    (shell (path (pathELEMENT "bin") (pathELEMENT "bash"))))
             (entry (login "daemon") (pass "x") (uid "1") (gid "1")
                    (nameORcomment "daemon")
                    (homedir (path (pathELEMENT "usr") (pathELEMENT "sbin")))
                    (shell (path (pathELEMENT "bin") (pathELEMENT "sh"))))
             (entry (login "bin") (pass "x") (uid "2") (gid "2")
                    (nameORcomment "bin")
                    (homedir (path (pathELEMENT "bin")))
                    (shell (path (pathELEMENT "bin") (pathELEMENT "sh"))))
             (entry (login "sys") (pass "x") (uid "3") (gid "3")
                    (nameORcomment "sys")
                    (homedir (path (pathELEMENT "dev")))
                    (shell (path (pathELEMENT "bin") (pathELEMENT "sh"))))
             (entry (login "nobody") (pass "x") (uid "65534") (gid "65534")
                    (nameORcomment "nobody")
                    (homedir (path (pathELEMENT "nonexistent")))
                    (shell (path (pathELEMENT "bin") (pathELEMENT "sh"))))
             ;; The comment field of this entry is empty, so only the
             ;; bare nonterminal name is expected in the tree.
             (entry (login "messagebus") (pass "x") (uid "103") (gid "107")
                    nameORcomment
                    (homedir (path (pathELEMENT "var")
                                   (pathELEMENT "run")
                                   (pathELEMENT "dbus")))
                    (shell (path (pathELEMENT "bin")
                                 (pathELEMENT "false")))))))
      (equal? (match-pattern passwd *etc-passwd*)
              (make-prec 0 220 *etc-passwd* expected-tree)))))
|
|
|
|
;; Tests the functions for pulling data out of PEG Match Records:
;; searches "aabbcc" for a run of b's and checks every accessor on
;; the resulting record in one comparison.
(with-test-prefix "PEG Match Records"
  (define-peg-pattern bs all (peg "'b'+"))
  (pass-if "basic parameter extraction"
    (let* ((pm (search-for-pattern bs "aabbcc"))
           (observed (list (list 'string (peg:string pm))
                           (list 'start (peg:start pm))
                           (list 'end (peg:end pm))
                           (list 'substring (peg:substring pm))
                           (list 'tree (peg:tree pm))
                           (list 'record? (peg-record? pm)))))
      (equal? observed
              '((string "aabbcc")
                (start 2)
                (end 4)
                (substring "bb")
                (tree (bs "bb"))
                (record? #t))))))
|
|
|
|
;; PEG for parsing right-associative equations: in `sum' and `product'
;; the recursive reference sits on the right of the operator.  The
;; string holds one rule per line; stray lines consisting only of `|'
;; that had crept into the literal were removed because they are not
;; part of the grammar.
(define-peg-string-patterns
  "expr <- sum
sum <-- (product ('+' / '-') sum) / product
product <-- (value ('*' / '/') product) / value
value <-- number / '(' expr ')'
number <-- [0-9]+")
|
|
|
|
;; Procedures that turn the trees produced by the right-associative
;; grammar into prefix-notation S-expressions.
;;
;; `parse-sum' is applied to the elements of a `sum' node: the node
;; tag, the left-hand product subtree and, when an operator was
;; present, the operator string followed by the right-hand sum
;; subtree.  Without an operator the parsed product is returned as
;; is; otherwise the result is (OPERATOR LEFT RIGHT).
(define (parse-sum sum left . rest)
  (let ((lhs (apply parse-product left)))
    (cond ((null? rest) lhs)
          (else
           (let ((operator (string->symbol (car rest)))
                 (rhs (apply parse-sum (cadr rest))))
             (list operator lhs rhs))))))
|
|
|
|
;; Applied to the elements of a `product' node: the node tag, the
;; left-hand value subtree and, when an operator was present, the
;; operator string followed by the right-hand product subtree.
;; Without an operator the parsed value is returned as is; otherwise
;; the result is (OPERATOR LEFT RIGHT).
(define (parse-product product left . rest)
  (let ((lhs (apply parse-value left)))
    (cond ((null? rest) lhs)
          (else
           (let ((operator (string->symbol (car rest)))
                 (rhs (apply parse-product (cadr rest))))
             (list operator lhs rhs))))))
|
|
|
|
;; Applied to the elements of a `value' node.  When FIRST is the only
;; element after the tag it is a (number "DIGITS") subtree and the
;; digits are converted to a number.  When more elements follow, the
;; first of them is handed to `parse-sum' as the nested expression.
(define (parse-value value first . rest)
  (cond ((pair? rest) (apply parse-sum (car rest)))
        (else (string->number (cadr first)))))
|
|
|
|
;; Expression trees are handled by the same procedure as sums.
(define parse-expr parse-sum)

;; Parse the equation in STR with the `expr' pattern and convert the
;; resulting tree to a prefix-notation S-expression.
(define (eq-parse str)
  (let* ((record (match-pattern expr str))
         (tree (peg:tree record)))
    (apply parse-expr tree)))
|
|
|
|
;; Each case pairs an equation (which also serves as the test name)
;; with the prefix-notation tree `eq-parse' must produce for it when
;; operators associate to the right.
(with-test-prefix "Parsing right-associative equations"
  (for-each
   (lambda (test-case)
     (let ((equation (car test-case))
           (expected (cadr test-case)))
       (pass-if equation
         (equal? (eq-parse equation) expected))))
   '(("1" 1)
     ("1+2" (+ 1 2))
     ("1+2+3" (+ 1 (+ 2 3)))
     ("1+2*3+4" (+ 1 (+ (* 2 3) 4)))
     ("1+2/3*(4+5)/6-7-8"
      (+ 1 (- (/ 2 (* 3 (/ (+ 4 5) 6))) (- 7 8))))
     ("1+1/2*3+(1+1)/2"
      (+ 1 (+ (/ 1 (* 2 3)) (/ (+ 1 1) 2)))))))
|
|
|
|
;; PEG for parsing left-associative equations (normal ones): `sum' and
;; `product' are written as a repeated (operand operator) group
;; followed by a final operand.  The string holds one rule per line;
;; stray lines consisting only of `|' that had crept into the literal
;; were removed because they are not part of the grammar.
(define-peg-string-patterns
  "expr <- sum
sum <-- (product ('+' / '-'))* product
product <-- (value ('*' / '/'))* value
value <-- number / '(' expr ')'
number <-- [0-9]+")
|
|
|
|
;; Procedures that turn the trees produced by the left-associative
;; grammar into prefix-notation S-expressions.
;;
;; `make-left-parser' returns a procedure for one precedence level.
;; NEXT-FUNC is the procedure for the operands of that level; it is
;; always applied to an operand subtree with `apply'.  The returned
;; procedure is applied to the elements of a node (tag, FIRST, and
;; optionally one more element) and distinguishes three shapes:
;;
;;   * nothing after FIRST: FIRST is the lone operand subtree;
;;   * FIRST is a single (OPERAND "op") pair: the result is
;;     (op OPERAND LAST), LAST being the trailing operand;
;;   * FIRST is a list of (OPERAND "op") pairs: the pairs are folded
;;     from the left so that earlier operators nest innermost.
(define (make-left-parser next-func)
  (define (descend subtree)
    (apply next-func subtree))
  (lambda (tag first . rest)
    (cond
     ((null? rest)
      (descend first))
     ((string? (cadr first))
      (list (string->symbol (cadr first))
            (descend (car first))
            (descend (car rest))))
     (else
      ;; The accumulator is (EXPRESSION-SO-FAR PENDING-OPERATOR).  The
      ;; trailing operand is paired with the dummy operator "done" so
      ;; that it can be folded in like every other pair; the dummy is
      ;; dropped by the final `car'.
      (let ((seed (list (descend (caar first))
                        (string->symbol (cadar first))))
            (remaining (append (cdr first)
                               (list (append rest '("done"))))))
        (car (fold (lambda (item acc)
                     (list (list (cadr acc)
                                 (car acc)
                                 (descend (car item)))
                           (string->symbol (cadr item))))
                   seed
                   remaining)))))))
|
|
|
|
;; Applied to the elements of a `value' node.  A node with a single
;; element after the tag holds a (number "DIGITS") subtree, whose
;; digits are converted to a number; otherwise the first of the
;; remaining elements is handed to `parse-sum' as the nested
;; expression.
(define (parse-value value first . rest)
  (if (pair? rest)
      (apply parse-sum (car rest))
      (string->number (cadr first))))
|
|
;; Wire up the precedence levels: products are made of values and sums
;; are made of products.  The order of these definitions matters,
;; since each parser is handed the procedure of the level below it.
(define parse-product (make-left-parser parse-value))
(define parse-sum (make-left-parser parse-product))

;; Expression trees are handled by the same procedure as sums.
(define parse-expr parse-sum)

;; Parse the equation in STR with the `expr' pattern and convert the
;; resulting tree to a prefix-notation S-expression.
(define (eq-parse str)
  (let* ((record (match-pattern expr str))
         (tree (peg:tree record)))
    (apply parse-expr tree)))
|
|
|
|
;; Each case pairs an equation (which also serves as the test name)
;; with the prefix-notation tree `eq-parse' must produce for it when
;; operators associate to the left.
(with-test-prefix "Parsing left-associative equations"
  (for-each
   (lambda (test-case)
     (let ((equation (car test-case))
           (expected (cadr test-case)))
       (pass-if equation
         (equal? (eq-parse equation) expected))))
   '(("1" 1)
     ("1+2" (+ 1 2))
     ("1+2+3" (+ (+ 1 2) 3))
     ("1+2*3+4" (+ (+ 1 (* 2 3)) 4))
     ("1+2/3*(4+5)/6-7-8"
      (- (- (+ 1 (/ (* (/ 2 3) (+ 4 5)) 6)) 7) 8))
     ("1+1/2*3+(1+1)/2"
      (+ (+ 1 (* (/ 1 2) 3)) (/ (+ 1 1) 2))))))
|
|
|
|
|
|
;; PEG source text of a grammar for well formed HTML 5 documents, with
;; `html' as the start symbol.  It is turned into patterns at run time
;; by the "parsing with complex grammars" test.  Lines starting with
;; `#' inside the string are comments of the grammar itself, including
;; the licence notice of the code it is based on.  Stray lines
;; consisting only of `|' that had crept into the literal were removed
;; because they are not part of the grammar.
(define html-grammar
  "
# Based on code from https://github.com/Fantom-Factory/afHtmlParser
# 2014-2023 Steve Eynon. This code was originally released under the following
# terms:
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED \"AS IS\" AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE
# FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
# IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
# OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

# PEG Rules for parsing well formed HTML 5 documents
# https://html.spec.whatwg.org/multipage/syntax.html

html <-- bom? blurb* doctype? blurb* xmlProlog? blurb* elem blurb*
bom <-- \"\\uFEFF\"
xmlProlog <-- \"<?xml\" (!\"?>\" .)+ \"?>\"

# ---- Doctype ----

doctype <-- \"<!DOCTYPE\" [ \\t\\n\\f\\r]+ [a-zA-Z0-9]+ (doctypePublicId / doctypeSystemId)* [ \\t\\n\\f\\r]* \">\"
doctypePublicId <-- [ \\t\\n\\f\\r]+ \"PUBLIC\" [ \\t\\n\\f\\r]+ ((\"\\\"\" [^\"]* \"\\\"\") / (\"'\" [^']* \"'\"))
doctypeSystemId <-- [ \\t\\n\\f\\r]+ (\"SYSTEM\" [ \\t\\n\\f\\r]+)? ((\"\\\"\" [^\"]* \"\\\"\") / (\"'\" [^']* \"'\"))

# ---- Elems ----

elem <-- voidElem / rawTextElem / escRawTextElem / selfClosingElem / normalElem
voidElem <-- \"<\" voidElemName attributes \">\"
rawTextElem <-- \"<\" rawTextElemName attributes \">\" rawTextContent endElem
escRawTextElem <-- \"<\" escRawTextElemName attributes \">\" escRawTextContent endElem
selfClosingElem <-- \"<\" elemName attributes \"/>\"
normalElem <-- \"<\" elemName attributes \">\" normalContent? endElem?
endElem <-- \"</\" elemName \">\"

elemName <-- [a-zA-Z] [^\\t\\n\\f />]*
voidElemName <-- \"area\" / \"base\" / \"br\" / \"col\" / \"embed\" /
\"hr\" / \"img\" / \"input\" / \"keygen\" / \"link\" /
\"meta\" / \"param\" / \"source\" / \"track\" / \"wbr\"
rawTextElemName <-- \"script\" / \"style\"
escRawTextElemName <-- \"textarea\" / \"title\"

rawTextContent <-- (!(\"</script>\" / \"</style>\") .)+
escRawTextContent <-- ((!(\"</textarea>\" / \"</title>\" / \"&\") .)+ / charRef)*
normalContent <-- !\"</\" (([^<&]+ / charRef) / comment / cdata / elem)*

# ---- Attributes ----

attributes <-- (&[^/>] ([ \\t]+ / doubleQuoteAttr / singleQuoteAttr / unquotedAttr / emptyAttr))*
attrName <-- [^ \\t\\n\\r\\f\"'>/=]+
emptyAttr <-- attrName+
unquotedAttr <-- attrName [ \\t]* \"=\" [ \\t]* (charRef / [^ \\t\\n\\r\\f\"'=<>`&]+)+
singleQuoteAttr <-- attrName [ \\t]* \"=\" [ \\t]* \"'\" (charRef / [^'&]+)* \"'\"
doubleQuoteAttr <-- attrName [ \\t]* \"=\" [ \\t]* \"\\\"\" (charRef / [^\"&]+)* \"\\\"\"

# ---- Character References ----

charRef <-- &\"&\" (decNumCharRef / hexNumCharRef / namedCharRef / borkedRef)
namedCharRef <-- \"&\" [^;>]+ \";\"
decNumCharRef <-- \"&#\" [0-9]+ \";\"
hexNumCharRef <-- \"&#x\" [a-fA-F0-9]+ \";\"
borkedRef <-- \"&\" &[ \\t]

# ---- Misc ----

cdata <-- \"<![CDATA[\" (!\"]]>\" .)+ \"]]>\"
comment <-- \"<!--\" (!\"--\" .)+ \"-->\"
blurb <-- [ \\t\\n\\f\\r]+ / comment")
|
|
|
|
;; A small HTML document used as input for the "HTML parsing" test.
;; Stray lines consisting only of `|' that had crept into the literal
;; were removed because they are not part of the document; the single
;; blank line between </head> and <body> is kept.
;; NOTE(review): the expected parse tree in the "HTML parsing" test
;; has whitespace after several newlines, which suggests the markup
;; lines were originally indented -- confirm the exact indentation
;; against the upstream file.
(define html-example "
<!DOCTYPE html>
<html>
<head>
<title>Example Domain</title>
<meta charset=\"utf-8\" />
<meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\" />
<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />
<style type=\"text/css\">
body {
background-color: #f0f0f2;
margin: 0;
padding: 0;
}
</style>
</head>

<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples in documents. You may
use this domain in literature without prior coordination or asking for
permission.</p> <p><a href=\"https://www.iana.org/domains/example\">More
information...</a></p>
</div>
</body>
</html>
")
|
|
|
|
;; Defines the patterns of `html-grammar' at the toplevel (through
;; `eeval'), then checks that matching the `html' nonterminal against
;; `html-example' yields the expected parse tree.
;; NOTE(review): one string of the expected tree below is split across
;; two physical lines ("documents. " / "You may"), whereas
;; `html-example' has "documents. You may" on a single line --
;; presumably a line-wrapping artifact.  The whitespace that follows
;; "\n" inside the expected strings should also be confirmed against
;; the upstream file before relying on this test.
(with-test-prefix "parsing with complex grammars"
|
|
(eeval `(define-peg-string-patterns ,html-grammar))
|
|
(pass-if
|
|
"HTML parsing"
|
|
(equal?
|
|
(peg:tree (match-pattern html html-example))
|
|
'(html (blurb "\n") (doctype "<!DOCTYPE html>") (blurb "\n") (elem (normalElem "<" (elemName "html") attributes ">" (normalContent "\n" (elem (normalElem "<" (elemName "head") attributes ">" (normalContent "\n " (elem (escRawTextElem "<" (escRawTextElemName "title") attributes ">" (escRawTextContent "Example Domain") (endElem "</" (elemName "title") ">"))) "\n " (elem (selfClosingElem "<" (elemName "meta") (attributes " " (doubleQuoteAttr (attrName "charset") "=\"utf-8\"") " ") "/>")) "\n " (elem (selfClosingElem "<" (elemName "meta") (attributes " " (doubleQuoteAttr (attrName "http-equiv") "=\"Content-type\"") " " (doubleQuoteAttr (attrName "content") "=\"text/html; charset=utf-8\"") " ") "/>")) "\n " (elem (selfClosingElem "<" (elemName "meta") (attributes " " (doubleQuoteAttr (attrName "name") "=\"viewport\"") " " (doubleQuoteAttr (attrName "content") "=\"width=device-width, initial-scale=1\"") " ") "/>")) "\n " (elem (rawTextElem "<" (rawTextElemName "style") (attributes " " (doubleQuoteAttr (attrName "type") "=\"text/css\"")) ">" (rawTextContent "\n body {\n background-color: #f0f0f2;\n margin: 0;\n padding: 0;\n }\n ") (endElem "</" (elemName "style") ">"))) "\n") (endElem "</" (elemName "head") ">"))) "\n\n" (elem (normalElem "<" (elemName "body") attributes ">" (normalContent "\n" (elem (normalElem "<" (elemName "div") attributes ">" (normalContent "\n " (elem (normalElem "<" (elemName "h1") attributes ">" (normalContent "Example Domain") (endElem "</" (elemName "h1") ">"))) "\n " (elem (normalElem "<" (elemName "p") attributes ">" (normalContent "This domain is for use in illustrative examples in documents. 
You may\n use this domain in literature without prior coordination or asking for\n permission.") (endElem "</" (elemName "p") ">"))) " " (elem (normalElem "<" (elemName "p") attributes ">" (normalContent (elem (normalElem "<" (elemName "a") (attributes " " (doubleQuoteAttr (attrName "href") "=\"https://www.iana.org/domains/example\"")) ">" (normalContent "More\n information...") (endElem "</" (elemName "a") ">")))) (endElem "</" (elemName "p") ">"))) "\n") (endElem "</" (elemName "div") ">"))) "\n") (endElem "</" (elemName "body") ">"))) "\n") (endElem "</" (elemName "html") ">"))) (blurb "\n")))))
|