1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-04-30 03:40:34 +02:00

PEG: Add full support for PEG + some extensions

This commit adds support for PEG as described in:

    <https://bford.info/pub/lang/peg.pdf>

It adds support for the missing features (comments, underscores in
identifiers and escaping) while keeping the extensions (dashes in
identifiers, < and <--).

The naming system tries to be as close as possible to the one proposed
in the paper.

* module/ice-9/peg/string-peg.scm: Rewrite PEG parser.
* test-suite/tests/peg.test: Fix import

Signed-off-by: Ludovic Courtès <ludo@gnu.org>
This commit is contained in:
Ekaitz Zarraga 2024-09-11 21:19:26 +02:00 committed by Ludovic Courtès
parent 47807c9b11
commit ff11753df1
No known key found for this signature in database
GPG key ID: 090B11993D9AEBB5
4 changed files with 313 additions and 180 deletions

7
NEWS
View file

@ -20,6 +20,13 @@ downright unusable (e.g., <https://bugs.gnu.org/72378>), non-conforming
(e.g., <https://bugs.gnu.org/72383>), or buggy (e.g.,
<https://bugs.gnu.org/72372>).
* New interfaces and functionality
** PEG parser
PEG grammar parser in (ice-9 peg string-peg) has been rewritten to cover
all the functionality defined in <https://bford.info/pub/lang/peg.pdf>.
** GOOPS: Introduce new forms method* and define-method*
The module (oop goops) now exports method* and define-method* which are

View file

@ -17,6 +17,10 @@ Wikipedia has a clear and concise introduction to PEGs if you want to
familiarize yourself with the syntax:
@url{http://en.wikipedia.org/wiki/Parsing_expression_grammar}.
The paper that introduced PEG contains a more detailed description of how PEG
works, and describes its syntax in detail:
@url{https://bford.info/pub/lang/peg.pdf}
The @code{(ice-9 peg)} module works by compiling PEGs down to lambda
expressions. These can either be stored in variables at compile-time by
the define macros (@code{define-peg-pattern} and
@ -216,8 +220,8 @@ should propagate up the parse tree. The normal @code{<-} propagates the
matched text up the parse tree, @code{<--} propagates the matched text
up the parse tree tagged with the name of the nonterminal, and @code{<}
discards that matched text and propagates nothing up the parse tree.
Also, nonterminals may consist of any alphanumeric character or a ``-''
character (in normal PEGs nonterminals can only be alphabetic).
Also, nonterminals may include ``-'' character, while in normal PEG it is not
allowed.
For example, if we:
@lisp

View file

@ -1,6 +1,7 @@
;;;; string-peg.scm --- representing PEG grammars as strings
;;;;
;;;; Copyright (C) 2010, 2011 Free Software Foundation, Inc.
;;;; Copyright (C) 2010, 2011, Free Software Foundation, Inc.
;;;; Copyright (C) 2024 Ekaitz Zarraga <ekaitz@elenq.tech>
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
@ -21,10 +22,15 @@
#:export (peg-as-peg
define-peg-string-patterns
peg-grammar)
#:use-module (ice-9 match)
#:use-module (ice-9 peg using-parsers)
#:use-module (srfi srfi-1)
#:use-module (ice-9 peg codegen)
#:use-module (ice-9 peg simplify-tree))
;; This module provides support for PEG as described in:
;; <https://bford.info/pub/lang/peg.pdf>
;; Gets the left-hand depth of a list.
(define (depth lst)
(if (or (not (list? lst)) (null? lst))
@ -38,22 +44,58 @@
;; Grammar for PEGs in PEG grammar.
(define peg-as-peg
"grammar <-- (nonterminal ('<--' / '<-' / '<') sp pattern)+
pattern <-- alternative (SLASH sp alternative)*
alternative <-- ([!&]? sp suffix)+
suffix <-- primary ([*+?] sp)*
primary <-- '(' sp pattern ')' sp / '.' sp / literal / charclass / nonterminal !'<'
literal <-- ['] (!['] .)* ['] sp
charclass <-- LB (!']' (CCrange / CCsingle))* RB sp
CCrange <-- . '-' .
CCsingle <-- .
nonterminal <-- [a-zA-Z0-9-]+ sp
sp < [ \t\n]*
SLASH < '/'
LB < '['
RB < ']'
"# Hierarchical syntax
Grammar <-- Spacing Definition+ EndOfFile
Definition <-- Identifier LEFTARROW Expression
Expression <-- Sequence (SLASH Sequence)*
Sequence <-- Prefix*
Prefix <-- (AND / NOT)? Suffix
Suffix <-- Primary (QUESTION / STAR / PLUS)?
Primary <-- Identifier !LEFTARROW
/ OPEN Expression CLOSE
/ Literal / Class / DOT
# Lexical syntax
Identifier <-- IdentStart IdentCont* Spacing
# NOTE: `-` is an extension
IdentStart <- [a-zA-Z_] / '-'
IdentCont <- IdentStart / [0-9]
Literal <-- SQUOTE (!SQUOTE Char)* SQUOTE Spacing
/ DQUOTE (!DQUOTE Char)* DQUOTE Spacing
Class <-- OPENBRACKET !NOTIN (!CLOSEBRACKET Range)* CLOSEBRACKET Spacing
Range <-- Char DASH Char / Char
Char <-- '\\\\' [nrt'\"\\[\\]\\\\]
/ '\\\\' [0-7][0-7][0-7]
/ '\\\\' [0-7][0-7]?
/ !'\\\\' .
# NOTE: `<--` and `<` are extensions
LEFTARROW <- ('<--' / '<-' / '<') Spacing
SQUOTE < [']
DQUOTE < [\"]
DASH < '-'
OPENBRACKET < '['
CLOSEBRACKET < ']'
SLASH < '/' Spacing
AND <-- '&' Spacing
NOT <-- '!' Spacing
QUESTION <-- '?' Spacing
STAR <-- '*' Spacing
PLUS <-- '+' Spacing
OPEN < '(' Spacing
CLOSE < ')' Spacing
DOT <-- '.' Spacing
Spacing < (Space / Comment)*
Comment < '#' (!EndOfLine .)* EndOfLine
Space < ' ' / '\t' / EndOfLine
EndOfLine < '\\r\\n' / '\\n' / '\\r'
EndOfFile < !.
")
(define-syntax define-sexp-parser
(lambda (x)
(syntax-case x ()
@ -63,35 +105,78 @@ RB < ']'
(syn (wrap-parser-for-users x matchf accumsym #'sym)))
#`(define sym #,syn))))))
(define-sexp-parser peg-grammar all
(+ (and peg-nonterminal (or "<--" "<-" "<") peg-sp peg-pattern)))
(define-sexp-parser peg-pattern all
(and peg-alternative
(* (and (ignore "/") peg-sp peg-alternative))))
(define-sexp-parser peg-alternative all
(+ (and (? (or "!" "&")) peg-sp peg-suffix)))
(define-sexp-parser peg-suffix all
(and peg-primary (* (and (or "*" "+" "?") peg-sp))))
(define-sexp-parser peg-primary all
(or (and "(" peg-sp peg-pattern ")" peg-sp)
(and "." peg-sp)
peg-literal
peg-charclass
(and peg-nonterminal (not-followed-by "<"))))
(define-sexp-parser peg-literal all
(and "'" (* (and (not-followed-by "'") peg-any)) "'" peg-sp))
(define-sexp-parser peg-charclass all
(and (ignore "[")
(* (and (not-followed-by "]")
(or charclass-range charclass-single)))
(ignore "]")
peg-sp))
(define-sexp-parser charclass-range all (and peg-any "-" peg-any))
(define-sexp-parser charclass-single all peg-any)
(define-sexp-parser peg-nonterminal all
(and (+ (or (range #\a #\z) (range #\A #\Z) (range #\0 #\9) "-")) peg-sp))
(define-sexp-parser peg-sp none
(* (or " " "\t" "\n")))
(define-sexp-parser Grammar all
(and Spacing (+ Definition) EndOfFile))
(define-sexp-parser Definition all
(and Identifier LEFTARROW Expression))
(define-sexp-parser Expression all
(and Sequence (* (and SLASH Sequence))))
(define-sexp-parser Sequence all
(* Prefix))
(define-sexp-parser Prefix all
(and (? (or AND NOT)) Suffix))
(define-sexp-parser Suffix all
(and Primary (? (or QUESTION STAR PLUS))))
(define-sexp-parser Primary all
(or (and Identifier (not-followed-by LEFTARROW))
(and OPEN Expression CLOSE)
Literal
Class
DOT))
(define-sexp-parser Identifier all
(and IdentStart (* IdentCont) Spacing))
(define-sexp-parser IdentStart body
(or (or (range #\a #\z) (range #\A #\Z) "_") "-")) ; NOTE: - is an extension
(define-sexp-parser IdentCont body
(or IdentStart (range #\0 #\9)))
(define-sexp-parser Literal all
(or (and SQUOTE (* (and (not-followed-by SQUOTE) Char)) SQUOTE Spacing)
(and DQUOTE (* (and (not-followed-by DQUOTE) Char)) DQUOTE Spacing)))
(define-sexp-parser Class all
(and OPENBRACKET (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing))
(define-sexp-parser Range all
(or (and Char DASH Char) Char))
(define-sexp-parser Char all
(or (and "\\" (or "n" "r" "t" "'" "\"" "[" "]" "\\"))
(and "\\" (range #\0 #\7) (range #\0 #\7) (range #\0 #\7))
(and "\\" (range #\0 #\7) (? (range #\0 #\7)))
(and (not-followed-by "\\") peg-any)))
(define-sexp-parser LEFTARROW body
(and (or "<--" "<-" "<") Spacing)) ; NOTE: <-- and < are extensions
(define-sexp-parser SLASH none
(and "/" Spacing))
(define-sexp-parser AND all
(and "&" Spacing))
(define-sexp-parser NOT all
(and "!" Spacing))
(define-sexp-parser QUESTION all
(and "?" Spacing))
(define-sexp-parser STAR all
(and "*" Spacing))
(define-sexp-parser PLUS all
(and "+" Spacing))
(define-sexp-parser OPEN none
(and "(" Spacing))
(define-sexp-parser CLOSE none
(and ")" Spacing))
(define-sexp-parser DOT all
(and "." Spacing))
(define-sexp-parser SQUOTE none "'")
(define-sexp-parser DQUOTE none "\"")
(define-sexp-parser OPENBRACKET none "[")
(define-sexp-parser CLOSEBRACKET none "]")
(define-sexp-parser DASH none "-")
(define-sexp-parser Spacing none
(* (or Space Comment)))
(define-sexp-parser Comment none
(and "#" (* (and (not-followed-by EndOfLine) peg-any)) EndOfLine))
(define-sexp-parser Space none
(or " " "\t" EndOfLine))
(define-sexp-parser EndOfLine none
(or "\r\n" "\n" "\r"))
(define-sexp-parser EndOfFile none
(not-followed-by peg-any))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;; PARSE STRING PEGS
@ -101,7 +186,7 @@ RB < ']'
;; will define all of the nonterminals in the grammar with equivalent
;; PEG s-expressions.
(define (peg-parser str for-syntax)
(let ((parsed (match-pattern peg-grammar str)))
(let ((parsed (match-pattern Grammar str)))
(if (not parsed)
(begin
;; (display "Invalid PEG grammar!\n")
@ -110,11 +195,154 @@ RB < ']'
(cond
((or (not (list? lst)) (null? lst))
lst)
((eq? (car lst) 'peg-grammar)
((eq? (car lst) 'Grammar)
(Grammar->defn lst for-syntax)))))))
;; (Grammar (Definition ...) (Definition ...))
(define (Grammar->defn lst for-syntax)
#`(begin
#,@(map (lambda (x) (peg-nonterm->defn x for-syntax))
(context-flatten (lambda (lst) (<= (depth lst) 2))
(cdr lst))))))))))
#,@(map (lambda (x) (Definition->defn x for-syntax))
(context-flatten (lambda (lst) (<= (depth lst) 1))
(cdr lst)))))
;; (Definition (Identifier "Something") "<-" (Expression ...))
;; `-> (define-peg-pattern Something 'all ...)
(define (Definition->defn lst for-syntax)
(match lst
(('Definition ('Identifier identifier) grabber expression)
#`(define-peg-pattern
#,(datum->syntax for-syntax (string->symbol identifier))
#,(match grabber
("<--" (datum->syntax for-syntax 'all))
("<-" (datum->syntax for-syntax 'body))
("<" (datum->syntax for-syntax 'none)))
#,(compressor
(Expression->defn expression for-syntax)
for-syntax)))))
;; (Expression X)
;; `-> (or X)
;; (Expression X Y)
;; `-> (or X Y)
;; (Expression X (Y Z ...))
;; `-> (or X Y Z ...)
(define (Expression->defn lst for-syntax)
(match lst
(('Expression seq ...)
#`(or #,@(map (lambda (x) (Sequence->defn x for-syntax))
(keyword-flatten '(Sequence) seq))))))
;; (Sequence X)
;; `-> (and X)
;; (Sequence X Y)
;; `-> (and X Y)
;; (Sequence X (Y Z ...))
;; `-> (and X Y Z ...)
(define (Sequence->defn lst for-syntax)
(match lst
(('Sequence pre ...)
#`(and #,@(map (lambda (x) (Prefix->defn x for-syntax))
(keyword-flatten '(Prefix) pre))))))
;; (Prefix (Suffix ...))
;; `-> (...)
;; (Prefix (NOT "!") (Suffix ...))
;; `-> (not-followed-by ...)
;; (Prefix (AND "&") (Suffix ...))
;; `-> (followed-by ...)
(define (Prefix->defn lst for-syntax)
(match lst
(('Prefix ('AND _) su) #`(followed-by #,(Suffix->defn su for-syntax)))
(('Prefix ('NOT _) su) #`(not-followed-by #,(Suffix->defn su for-syntax)))
(('Prefix suffix) (Suffix->defn suffix for-syntax))))
;; (Suffix (Primary ...))
;; `-> (...)
;; (Suffix (Primary ...) (STAR "*"))
;; `-> (* ...)
;; (Suffix (Primary ...) (QUESTION "?"))
;; `-> (? ...)
;; (Suffix (Primary ...) (PLUS "+"))
;; `-> (+ ...)
(define (Suffix->defn lst for-syntax)
(match lst
(('Suffix prim) (Primary->defn prim for-syntax))
(('Suffix prim ('STAR _)) #`(* #,(Primary->defn prim for-syntax)))
(('Suffix prim ('QUESTION _)) #`(? #,(Primary->defn prim for-syntax)))
(('Suffix prim ('PLUS _)) #`(+ #,(Primary->defn prim for-syntax)))))
(define (Primary->defn lst for-syntax)
(let ((value (second lst)))
(match (car value)
('DOT #'peg-any)
('Identifier (Identifier->defn value for-syntax))
('Expression (Expression->defn value for-syntax))
('Literal (Literal->defn value for-syntax))
('Class (Class->defn value for-syntax)))))
;; (Identifier "hello")
;; `-> hello
(define (Identifier->defn lst for-syntax)
(datum->syntax for-syntax (string->symbol (second lst))))
;; (Literal (Char "a") (Char "b") (Char "c"))
;; `-> "abc"
(define (Literal->defn lst for-syntax)
(apply string (map (lambda (x) (Char->defn x for-syntax)) (cdr lst))))
;; TODO: empty Class can happen: `[]`, but what does it represent?
;; (Class ...)
;; `-> (or ...)
(define (Class->defn lst for-syntax)
#`(or #,@(map (lambda (x) (Range->defn x for-syntax))
(cdr lst))))
;; For one character:
;; (Range (Char "a"))
;; `-> "a"
;; Or for a range:
;; (Range (Char "a") (Char "b"))
;; `-> (range #\a #\b)
(define (Range->defn lst for-syntax)
(match lst
(('Range ch)
(string (Char->defn ch for-syntax)))
(('Range range-beginning range-end)
#`(range
#,(Char->defn range-beginning for-syntax)
#,(Char->defn range-end for-syntax)))))
;; (Char "a")
;; `-> #\a
;; (Char "\\n")
;; `-> #\newline
;; (Char "\\135")
;; `-> #\]
(define (Char->defn lst for-syntax)
(let* ((charstr (second lst))
(first (string-ref charstr 0)))
(cond
((= 1 (string-length charstr)) first)
((char-numeric? (string-ref charstr 1))
(integer->char
(reduce + 0
(map
(lambda (x y)
(* (- (char->integer x) (char->integer #\0)) y))
(reverse (string->list charstr 1))
'(1 8 64)))))
(else
(case (string-ref charstr 1)
((#\n) #\newline)
((#\r) #\return)
((#\t) #\tab)
((#\') #\')
((#\]) #\])
((#\\) #\\)
((#\[) #\[))))))
(define peg-grammar Grammar)
;; Macro wrapper for PEG-PARSER. Parses PEG grammars expressed as strings and
;; defines all the appropriate nonterminals.
@ -124,119 +352,6 @@ RB < ']'
((_ str)
(peg-parser (syntax->datum #'str) x)))))
;; lst has format (nonterm grabber pattern), where
;; nonterm is a symbol (the name of the nonterminal),
;; grabber is a string (either "<", "<-" or "<--"), and
;; pattern is the parse of a PEG pattern expressed as as string.
(define (peg-nonterm->defn lst for-syntax)
(let* ((nonterm (car lst))
(grabber (cadr lst))
(pattern (caddr lst))
(nonterm-name (datum->syntax for-syntax
(string->symbol (cadr nonterm)))))
#`(define-peg-pattern #,nonterm-name
#,(cond
((string=? grabber "<--") (datum->syntax for-syntax 'all))
((string=? grabber "<-") (datum->syntax for-syntax 'body))
(else (datum->syntax for-syntax 'none)))
#,(compressor (peg-pattern->defn pattern for-syntax) for-syntax))))
;; lst has format ('peg-pattern ...).
;; After the context-flatten, (cdr lst) has format
;; (('peg-alternative ...) ...), where the outer list is a collection
;; of elements from a '/' alternative.
(define (peg-pattern->defn lst for-syntax)
#`(or #,@(map (lambda (x) (peg-alternative->defn x for-syntax))
(context-flatten (lambda (x) (eq? (car x) 'peg-alternative))
(cdr lst)))))
;; lst has format ('peg-alternative ...).
;; After the context-flatten, (cdr lst) has the format
;; (item ...), where each item has format either ("!" ...), ("&" ...),
;; or ('peg-suffix ...).
(define (peg-alternative->defn lst for-syntax)
#`(and #,@(map (lambda (x) (peg-body->defn x for-syntax))
(context-flatten (lambda (x) (or (string? (car x))
(eq? (car x) 'peg-suffix)))
(cdr lst)))))
;; lst has the format either
;; ("!" ('peg-suffix ...)), ("&" ('peg-suffix ...)), or
;; ('peg-suffix ...).
(define (peg-body->defn lst for-syntax)
(cond
((equal? (car lst) "&")
#`(followed-by #,(peg-suffix->defn (cadr lst) for-syntax)))
((equal? (car lst) "!")
#`(not-followed-by #,(peg-suffix->defn (cadr lst) for-syntax)))
((eq? (car lst) 'peg-suffix)
(peg-suffix->defn lst for-syntax))
(else `(peg-parse-body-fail ,lst))))
;; lst has format ('peg-suffix <peg-primary> (? (/ "*" "?" "+")))
(define (peg-suffix->defn lst for-syntax)
(let ((inner-defn (peg-primary->defn (cadr lst) for-syntax)))
(cond
((null? (cddr lst))
inner-defn)
((equal? (caddr lst) "*")
#`(* #,inner-defn))
((equal? (caddr lst) "?")
#`(? #,inner-defn))
((equal? (caddr lst) "+")
#`(+ #,inner-defn)))))
;; Parse a primary.
(define (peg-primary->defn lst for-syntax)
(let ((el (cadr lst)))
(cond
((list? el)
(cond
((eq? (car el) 'peg-literal)
(peg-literal->defn el for-syntax))
((eq? (car el) 'peg-charclass)
(peg-charclass->defn el for-syntax))
((eq? (car el) 'peg-nonterminal)
(datum->syntax for-syntax (string->symbol (cadr el))))))
((string? el)
(cond
((equal? el "(")
(peg-pattern->defn (caddr lst) for-syntax))
((equal? el ".")
(datum->syntax for-syntax 'peg-any))
(else (datum->syntax for-syntax
`(peg-parse-any unknown-string ,lst)))))
(else (datum->syntax for-syntax
`(peg-parse-any unknown-el ,lst))))))
;; Trims characters off the front and end of STR.
;; (trim-1chars "'ab'") -> "ab"
(define (trim-1chars str) (substring str 1 (- (string-length str) 1)))
;; Parses a literal.
(define (peg-literal->defn lst for-syntax)
(datum->syntax for-syntax (trim-1chars (cadr lst))))
;; Parses a charclass.
(define (peg-charclass->defn lst for-syntax)
#`(or
#,@(map
(lambda (cc)
(cond
((eq? (car cc) 'charclass-range)
#`(range #,(datum->syntax
for-syntax
(string-ref (cadr cc) 0))
#,(datum->syntax
for-syntax
(string-ref (cadr cc) 2))))
((eq? (car cc) 'charclass-single)
(datum->syntax for-syntax (cadr cc)))))
(context-flatten
(lambda (x) (or (eq? (car x) 'charclass-range)
(eq? (car x) 'charclass-single)))
(cdr lst)))))
;; Compresses a list to save the optimizer work.
;; e.g. (or (and a)) -> a
(define (compressor-core lst)
@ -263,11 +378,10 @@ RB < ']'
(let ((string (syntax->datum #'str-stx)))
(compile-peg-pattern
(compressor
(peg-pattern->defn
(peg:tree (match-pattern peg-pattern string)) #'str-stx)
(Expression->defn
(peg:tree (match-pattern Expression string)) #'str-stx)
#'str-stx)
(if (eq? accum 'all) 'body accum))))
(else (error "Bad embedded PEG string" args))))
(add-peg-compiler! 'peg peg-string-compile)

View file

@ -28,17 +28,25 @@
;; the nonterminals defined in the PEG parser written with
;; S-expressions.
(define grammar-mapping
'((grammar peg-grammar)
(pattern peg-pattern)
(alternative peg-alternative)
(suffix peg-suffix)
(primary peg-primary)
(literal peg-literal)
(charclass peg-charclass)
(CCrange charclass-range)
(CCsingle charclass-single)
(nonterminal peg-nonterminal)
(sp peg-sp)))
'((Grammar Grammar)
(Definition Definition)
(Expression Expression)
(Sequence Sequence)
(Prefix Prefix)
(Suffix Suffix)
(Primary Primary)
(Identifier Identifier)
(Literal Literal)
(Class Class)
(Range Range)
(Char Char)
(LEFTARROW LEFTARROW)
(AND AND)
(NOT NOT)
(QUESTION QUESTION)
(STAR STAR)
(PLUS PLUS)
(DOT DOT)))
;; Transforms the nonterminals defined in the PEG parser written as a PEG to the nonterminals defined in the PEG parser written with S-expressions.
(define (grammar-transform x)
@ -69,7 +77,7 @@
(peg:tree (match-pattern (@@ (ice-9 peg) peg-grammar) (@@ (ice-9 peg) peg-as-peg)))
(tree-map
grammar-transform
(peg:tree (match-pattern grammar (@@ (ice-9 peg) peg-as-peg)))))))
(peg:tree (match-pattern Grammar (@@ (ice-9 peg) peg-as-peg)))))))
;; A grammar for pascal-style comments from Wikipedia.
(define comment-grammar