mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-04-30 03:40:34 +02:00
PEG: string-peg: Add HTML5 grammar test.
* test-suite/tests/peg.test (comment-grammar): Z can be anything. ("simple comment with forbidden char"): Remove. (html-grammar, html-example): New variables. ("parsing with complex grammars"): New test. Signed-off-by: Ludovic Courtès <ludo@gnu.org>
This commit is contained in:
parent
38ad264979
commit
6750f6cc80
1 changed files with 113 additions and 4 deletions
|
@ -86,7 +86,7 @@
|
||||||
End <-- '*)'
|
End <-- '*)'
|
||||||
C <- Begin N* End
|
C <- Begin N* End
|
||||||
N <- C / (!Begin !End Z)
|
N <- C / (!Begin !End Z)
|
||||||
Z <- [^X-Z]") ;; Forbid some characters to test not-in-range
|
Z <- .")
|
||||||
|
|
||||||
;; A short /etc/passwd file.
|
;; A short /etc/passwd file.
|
||||||
(define *etc-passwd*
|
(define *etc-passwd*
|
||||||
|
@ -126,9 +126,6 @@ SLASH < '/'")
|
||||||
(match-pattern C "(*blah*)")
|
(match-pattern C "(*blah*)")
|
||||||
(make-prec 0 8 "(*blah*)"
|
(make-prec 0 8 "(*blah*)"
|
||||||
'((Begin "(*") "blah" (End "*)")))))
|
'((Begin "(*") "blah" (End "*)")))))
|
||||||
(pass-if
|
|
||||||
"simple comment with forbidden char"
|
|
||||||
(not (match-pattern C "(*blYh*)")))
|
|
||||||
(pass-if
|
(pass-if
|
||||||
"simple comment padded"
|
"simple comment padded"
|
||||||
(equal?
|
(equal?
|
||||||
|
@ -288,3 +285,115 @@ number <-- [0-9]+")
|
||||||
(equal? (eq-parse "1+1/2*3+(1+1)/2")
|
(equal? (eq-parse "1+1/2*3+(1+1)/2")
|
||||||
'(+ (+ 1 (* (/ 1 2) 3)) (/ (+ 1 1) 2)))))
|
'(+ (+ 1 (* (/ 1 2) 3)) (/ (+ 1 1) 2)))))
|
||||||
|
|
||||||
|
|
||||||
|
(define html-grammar
|
||||||
|
"
|
||||||
|
# Based on code from https://github.com/Fantom-Factory/afHtmlParser
|
||||||
|
# 2014-2023 Steve Eynon. This code was originally released under the following
|
||||||
|
# terms:
|
||||||
|
#
|
||||||
|
# Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
# purpose with or without fee is hereby granted, provided that the above
|
||||||
|
# copyright notice and this permission notice appear in all copies.
|
||||||
|
#
|
||||||
|
# THE SOFTWARE IS PROVIDED \"AS IS\" AND THE AUTHOR DISCLAIMS ALL
|
||||||
|
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
|
||||||
|
# OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE
|
||||||
|
# FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
|
||||||
|
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
|
||||||
|
# IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
|
||||||
|
# OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
|
|
||||||
|
# PEG Rules for parsing well formed HTML 5 documents
|
||||||
|
# https://html.spec.whatwg.org/multipage/syntax.html
|
||||||
|
|
||||||
|
html <-- bom? blurb* doctype? blurb* xmlProlog? blurb* elem blurb*
|
||||||
|
bom <-- \"\\uFEFF\"
|
||||||
|
xmlProlog <-- \"<?xml\" (!\"?>\" .)+ \"?>\"
|
||||||
|
|
||||||
|
# ---- Doctype ----
|
||||||
|
|
||||||
|
doctype <-- \"<!DOCTYPE\" [ \\t\\n\\f\\r]+ [a-zA-Z0-9]+ (doctypePublicId / doctypeSystemId)* [ \\t\\n\\f\\r]* \">\"
|
||||||
|
doctypePublicId <-- [ \\t\\n\\f\\r]+ \"PUBLIC\" [ \\t\\n\\f\\r]+ ((\"\\\"\" [^\"]* \"\\\"\") / (\"'\" [^']* \"'\"))
|
||||||
|
doctypeSystemId <-- [ \\t\\n\\f\\r]+ (\"SYSTEM\" [ \\t\\n\\f\\r]+)? ((\"\\\"\" [^\"]* \"\\\"\") / (\"'\" [^']* \"'\"))
|
||||||
|
|
||||||
|
# ---- Elems ----
|
||||||
|
|
||||||
|
elem <-- voidElem / rawTextElem / escRawTextElem / selfClosingElem / normalElem
|
||||||
|
voidElem <-- \"<\" voidElemName attributes \">\"
|
||||||
|
rawTextElem <-- \"<\" rawTextElemName attributes \">\" rawTextContent endElem
|
||||||
|
escRawTextElem <-- \"<\" escRawTextElemName attributes \">\" escRawTextContent endElem
|
||||||
|
selfClosingElem <-- \"<\" elemName attributes \"/>\"
|
||||||
|
normalElem <-- \"<\" elemName attributes \">\" normalContent? endElem?
|
||||||
|
endElem <-- \"</\" elemName \">\"
|
||||||
|
|
||||||
|
elemName <-- [a-zA-Z] [^\\t\\n\\f />]*
|
||||||
|
voidElemName <-- \"area\" / \"base\" / \"br\" / \"col\" / \"embed\" /
|
||||||
|
\"hr\" / \"img\" / \"input\" / \"keygen\" / \"link\" /
|
||||||
|
\"meta\" / \"param\" / \"source\" / \"track\" / \"wbr\"
|
||||||
|
rawTextElemName <-- \"script\" / \"style\"
|
||||||
|
escRawTextElemName <-- \"textarea\" / \"title\"
|
||||||
|
|
||||||
|
rawTextContent <-- (!(\"</script>\" / \"</style>\") .)+
|
||||||
|
escRawTextContent <-- ((!(\"</textarea>\" / \"</title>\" / \"&\") .)+ / charRef)*
|
||||||
|
normalContent <-- !\"</\" (([^<&]+ / charRef) / comment / cdata / elem)*
|
||||||
|
|
||||||
|
# ---- Attributes ----
|
||||||
|
|
||||||
|
attributes <-- (&[^/>] ([ \\t]+ / doubleQuoteAttr / singleQuoteAttr / unquotedAttr / emptyAttr))*
|
||||||
|
attrName <-- [^ \\t\\n\\r\\f\"'>/=]+
|
||||||
|
emptyAttr <-- attrName+
|
||||||
|
unquotedAttr <-- attrName [ \\t]* \"=\" [ \\t]* (charRef / [^ \\t\\n\\r\\f\"'=<>`&]+)+
|
||||||
|
singleQuoteAttr <-- attrName [ \\t]* \"=\" [ \\t]* \"'\" (charRef / [^'&]+)* \"'\"
|
||||||
|
doubleQuoteAttr <-- attrName [ \\t]* \"=\" [ \\t]* \"\\\"\" (charRef / [^\"&]+)* \"\\\"\"
|
||||||
|
|
||||||
|
# ---- Character References ----
|
||||||
|
|
||||||
|
charRef <-- &\"&\" (decNumCharRef / hexNumCharRef / namedCharRef / borkedRef)
|
||||||
|
namedCharRef <-- \"&\" [^;>]+ \";\"
|
||||||
|
decNumCharRef <-- \"&#\" [0-9]+ \";\"
|
||||||
|
hexNumCharRef <-- \"&#x\" [a-fA-F0-9]+ \";\"
|
||||||
|
borkedRef <-- \"&\" &[ \\t]
|
||||||
|
|
||||||
|
# ---- Misc ----
|
||||||
|
|
||||||
|
cdata <-- \"<![CDATA[\" (!\"]]>\" .)+ \"]]>\"
|
||||||
|
comment <-- \"<!--\" (!\"--\" .)+ \"-->\"
|
||||||
|
blurb <-- [ \\t\\n\\f\\r]+ / comment")
|
||||||
|
|
||||||
|
(define html-example "
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Example Domain</title>
|
||||||
|
<meta charset=\"utf-8\" />
|
||||||
|
<meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\" />
|
||||||
|
<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />
|
||||||
|
<style type=\"text/css\">
|
||||||
|
body {
|
||||||
|
background-color: #f0f0f2;
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
<div>
|
||||||
|
<h1>Example Domain</h1>
|
||||||
|
<p>This domain is for use in illustrative examples in documents. You may
|
||||||
|
use this domain in literature without prior coordination or asking for
|
||||||
|
permission.</p> <p><a href=\"https://www.iana.org/domains/example\">More
|
||||||
|
information...</a></p>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
")
|
||||||
|
|
||||||
|
(with-test-prefix "parsing with complex grammars"
|
||||||
|
(eeval `(define-peg-string-patterns ,html-grammar))
|
||||||
|
(pass-if
|
||||||
|
"HTML parsing"
|
||||||
|
(equal?
|
||||||
|
(peg:tree (match-pattern html html-example))
|
||||||
|
'(html (blurb "\n") (doctype "<!DOCTYPE html>") (blurb "\n") (elem (normalElem "<" (elemName "html") attributes ">" (normalContent "\n" (elem (normalElem "<" (elemName "head") attributes ">" (normalContent "\n " (elem (escRawTextElem "<" (escRawTextElemName "title") attributes ">" (escRawTextContent "Example Domain") (endElem "</" (elemName "title") ">"))) "\n " (elem (selfClosingElem "<" (elemName "meta") (attributes " " (doubleQuoteAttr (attrName "charset") "=\"utf-8\"") " ") "/>")) "\n " (elem (selfClosingElem "<" (elemName "meta") (attributes " " (doubleQuoteAttr (attrName "http-equiv") "=\"Content-type\"") " " (doubleQuoteAttr (attrName "content") "=\"text/html; charset=utf-8\"") " ") "/>")) "\n " (elem (selfClosingElem "<" (elemName "meta") (attributes " " (doubleQuoteAttr (attrName "name") "=\"viewport\"") " " (doubleQuoteAttr (attrName "content") "=\"width=device-width, initial-scale=1\"") " ") "/>")) "\n " (elem (rawTextElem "<" (rawTextElemName "style") (attributes " " (doubleQuoteAttr (attrName "type") "=\"text/css\"")) ">" (rawTextContent "\n body {\n background-color: #f0f0f2;\n margin: 0;\n padding: 0;\n }\n ") (endElem "</" (elemName "style") ">"))) "\n") (endElem "</" (elemName "head") ">"))) "\n\n" (elem (normalElem "<" (elemName "body") attributes ">" (normalContent "\n" (elem (normalElem "<" (elemName "div") attributes ">" (normalContent "\n " (elem (normalElem "<" (elemName "h1") attributes ">" (normalContent "Example Domain") (endElem "</" (elemName "h1") ">"))) "\n " (elem (normalElem "<" (elemName "p") attributes ">" (normalContent "This domain is for use in illustrative examples in documents. You may\n use this domain in literature without prior coordination or asking for\n permission.") (endElem "</" (elemName "p") ">"))) " " (elem (normalElem "<" (elemName "p") attributes ">" (normalContent (elem (normalElem "<" (elemName "a") (attributes " " (doubleQuoteAttr (attrName "href") "=\"https://www.iana.org/domains/example\"")) ">" (normalContent "More\n information...") (endElem "</" (elemName "a") ">")))) (endElem "</" (elemName "p") ">"))) "\n") (endElem "</" (elemName "div") ">"))) "\n") (endElem "</" (elemName "body") ">"))) "\n") (endElem "</" (elemName "html") ">"))) (blurb "\n")))))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue