1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-04-29 19:30:36 +02:00

PEG: Add full support for PEG + some extensions

This commit adds support for PEG as described in:

    <https://bford.info/pub/lang/peg.pdf>

It adds support for the missing features (comments, underscores in
identifiers and escaping) while keeping the extensions (dashes in
identifiers, < and <--).

The naming system tries to be as close as possible to the one proposed
in the paper.

* module/ice-9/peg/string-peg.scm: Rewrite PEG parser.
* test-suite/tests/peg.test: Fix import

Signed-off-by: Ludovic Courtès <ludo@gnu.org>
This commit is contained in:
Ekaitz Zarraga 2024-09-11 21:19:26 +02:00 committed by Ludovic Courtès
parent 47807c9b11
commit ff11753df1
No known key found for this signature in database
GPG key ID: 090B11993D9AEBB5
4 changed files with 313 additions and 180 deletions

View file

@ -1,6 +1,7 @@
;;;; string-peg.scm --- representing PEG grammars as strings
;;;;
;;;; Copyright (C) 2010, 2011 Free Software Foundation, Inc.
;;;; Copyright (C) 2010, 2011, Free Software Foundation, Inc.
;;;; Copyright (C) 2024 Ekaitz Zarraga <ekaitz@elenq.tech>
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
@ -21,10 +22,15 @@
#:export (peg-as-peg
define-peg-string-patterns
peg-grammar)
#:use-module (ice-9 match)
#:use-module (ice-9 peg using-parsers)
#:use-module (srfi srfi-1)
#:use-module (ice-9 peg codegen)
#:use-module (ice-9 peg simplify-tree))
;; This module provides support for PEG as described in:
;; <https://bford.info/pub/lang/peg.pdf>
;; Gets the left-hand depth of a list.
(define (depth lst)
(if (or (not (list? lst)) (null? lst))
@ -38,22 +44,58 @@
;; Grammar for PEGs in PEG grammar.
(define peg-as-peg
"grammar <-- (nonterminal ('<--' / '<-' / '<') sp pattern)+
pattern <-- alternative (SLASH sp alternative)*
alternative <-- ([!&]? sp suffix)+
suffix <-- primary ([*+?] sp)*
primary <-- '(' sp pattern ')' sp / '.' sp / literal / charclass / nonterminal !'<'
literal <-- ['] (!['] .)* ['] sp
charclass <-- LB (!']' (CCrange / CCsingle))* RB sp
CCrange <-- . '-' .
CCsingle <-- .
nonterminal <-- [a-zA-Z0-9-]+ sp
sp < [ \t\n]*
SLASH < '/'
LB < '['
RB < ']'
"# Hierarchical syntax
Grammar <-- Spacing Definition+ EndOfFile
Definition <-- Identifier LEFTARROW Expression
Expression <-- Sequence (SLASH Sequence)*
Sequence <-- Prefix*
Prefix <-- (AND / NOT)? Suffix
Suffix <-- Primary (QUESTION / STAR / PLUS)?
Primary <-- Identifier !LEFTARROW
/ OPEN Expression CLOSE
/ Literal / Class / DOT
# Lexical syntax
Identifier <-- IdentStart IdentCont* Spacing
# NOTE: `-` is an extension
IdentStart <- [a-zA-Z_] / '-'
IdentCont <- IdentStart / [0-9]
Literal <-- SQUOTE (!SQUOTE Char)* SQUOTE Spacing
/ DQUOTE (!DQUOTE Char)* DQUOTE Spacing
Class <-- OPENBRACKET !NOTIN (!CLOSEBRACKET Range)* CLOSEBRACKET Spacing
Range <-- Char DASH Char / Char
Char <-- '\\\\' [nrt'\"\\[\\]\\\\]
/ '\\\\' [0-7][0-7][0-7]
/ '\\\\' [0-7][0-7]?
/ !'\\\\' .
# NOTE: `<--` and `<` are extensions
LEFTARROW <- ('<--' / '<-' / '<') Spacing
SQUOTE < [']
DQUOTE < [\"]
DASH < '-'
OPENBRACKET < '['
CLOSEBRACKET < ']'
SLASH < '/' Spacing
AND <-- '&' Spacing
NOT <-- '!' Spacing
QUESTION <-- '?' Spacing
STAR <-- '*' Spacing
PLUS <-- '+' Spacing
OPEN < '(' Spacing
CLOSE < ')' Spacing
DOT <-- '.' Spacing
Spacing < (Space / Comment)*
Comment < '#' (!EndOfLine .)* EndOfLine
Space < ' ' / '\t' / EndOfLine
EndOfLine < '\\r\\n' / '\\n' / '\\r'
EndOfFile < !.
")
(define-syntax define-sexp-parser
(lambda (x)
(syntax-case x ()
@ -63,35 +105,78 @@ RB < ']'
(syn (wrap-parser-for-users x matchf accumsym #'sym)))
#`(define sym #,syn))))))
(define-sexp-parser peg-grammar all
(+ (and peg-nonterminal (or "<--" "<-" "<") peg-sp peg-pattern)))
(define-sexp-parser peg-pattern all
(and peg-alternative
(* (and (ignore "/") peg-sp peg-alternative))))
(define-sexp-parser peg-alternative all
(+ (and (? (or "!" "&")) peg-sp peg-suffix)))
(define-sexp-parser peg-suffix all
(and peg-primary (* (and (or "*" "+" "?") peg-sp))))
(define-sexp-parser peg-primary all
(or (and "(" peg-sp peg-pattern ")" peg-sp)
(and "." peg-sp)
peg-literal
peg-charclass
(and peg-nonterminal (not-followed-by "<"))))
(define-sexp-parser peg-literal all
(and "'" (* (and (not-followed-by "'") peg-any)) "'" peg-sp))
(define-sexp-parser peg-charclass all
(and (ignore "[")
(* (and (not-followed-by "]")
(or charclass-range charclass-single)))
(ignore "]")
peg-sp))
(define-sexp-parser charclass-range all (and peg-any "-" peg-any))
(define-sexp-parser charclass-single all peg-any)
(define-sexp-parser peg-nonterminal all
(and (+ (or (range #\a #\z) (range #\A #\Z) (range #\0 #\9) "-")) peg-sp))
(define-sexp-parser peg-sp none
(* (or " " "\t" "\n")))
(define-sexp-parser Grammar all
(and Spacing (+ Definition) EndOfFile))
(define-sexp-parser Definition all
(and Identifier LEFTARROW Expression))
(define-sexp-parser Expression all
(and Sequence (* (and SLASH Sequence))))
(define-sexp-parser Sequence all
(* Prefix))
(define-sexp-parser Prefix all
(and (? (or AND NOT)) Suffix))
(define-sexp-parser Suffix all
(and Primary (? (or QUESTION STAR PLUS))))
(define-sexp-parser Primary all
(or (and Identifier (not-followed-by LEFTARROW))
(and OPEN Expression CLOSE)
Literal
Class
DOT))
(define-sexp-parser Identifier all
(and IdentStart (* IdentCont) Spacing))
(define-sexp-parser IdentStart body
(or (or (range #\a #\z) (range #\A #\Z) "_") "-")) ; NOTE: - is an extension
(define-sexp-parser IdentCont body
(or IdentStart (range #\0 #\9)))
(define-sexp-parser Literal all
(or (and SQUOTE (* (and (not-followed-by SQUOTE) Char)) SQUOTE Spacing)
(and DQUOTE (* (and (not-followed-by DQUOTE) Char)) DQUOTE Spacing)))
(define-sexp-parser Class all
(and OPENBRACKET (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing))
(define-sexp-parser Range all
(or (and Char DASH Char) Char))
(define-sexp-parser Char all
(or (and "\\" (or "n" "r" "t" "'" "\"" "[" "]" "\\"))
(and "\\" (range #\0 #\7) (range #\0 #\7) (range #\0 #\7))
(and "\\" (range #\0 #\7) (? (range #\0 #\7)))
(and (not-followed-by "\\") peg-any)))
(define-sexp-parser LEFTARROW body
(and (or "<--" "<-" "<") Spacing)) ; NOTE: <-- and < are extensions
(define-sexp-parser SLASH none
(and "/" Spacing))
(define-sexp-parser AND all
(and "&" Spacing))
(define-sexp-parser NOT all
(and "!" Spacing))
(define-sexp-parser QUESTION all
(and "?" Spacing))
(define-sexp-parser STAR all
(and "*" Spacing))
(define-sexp-parser PLUS all
(and "+" Spacing))
(define-sexp-parser OPEN none
(and "(" Spacing))
(define-sexp-parser CLOSE none
(and ")" Spacing))
(define-sexp-parser DOT all
(and "." Spacing))
(define-sexp-parser SQUOTE none "'")
(define-sexp-parser DQUOTE none "\"")
(define-sexp-parser OPENBRACKET none "[")
(define-sexp-parser CLOSEBRACKET none "]")
(define-sexp-parser DASH none "-")
(define-sexp-parser Spacing none
(* (or Space Comment)))
(define-sexp-parser Comment none
(and "#" (* (and (not-followed-by EndOfLine) peg-any)) EndOfLine))
(define-sexp-parser Space none
(or " " "\t" EndOfLine))
(define-sexp-parser EndOfLine none
(or "\r\n" "\n" "\r"))
(define-sexp-parser EndOfFile none
(not-followed-by peg-any))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;; PARSE STRING PEGS
@ -101,7 +186,7 @@ RB < ']'
;; will define all of the nonterminals in the grammar with equivalent
;; PEG s-expressions.
(define (peg-parser str for-syntax)
(let ((parsed (match-pattern peg-grammar str)))
(let ((parsed (match-pattern Grammar str)))
(if (not parsed)
(begin
;; (display "Invalid PEG grammar!\n")
@ -110,11 +195,154 @@ RB < ']'
(cond
((or (not (list? lst)) (null? lst))
lst)
((eq? (car lst) 'peg-grammar)
#`(begin
#,@(map (lambda (x) (peg-nonterm->defn x for-syntax))
(context-flatten (lambda (lst) (<= (depth lst) 2))
(cdr lst))))))))))
((eq? (car lst) 'Grammar)
(Grammar->defn lst for-syntax)))))))
;; (Grammar (Definition ...) (Definition ...))
(define (Grammar->defn lst for-syntax)
#`(begin
#,@(map (lambda (x) (Definition->defn x for-syntax))
(context-flatten (lambda (lst) (<= (depth lst) 1))
(cdr lst)))))
;; (Definition (Identifier "Something") "<-" (Expression ...))
;; `-> (define-peg-pattern Something 'all ...)
(define (Definition->defn lst for-syntax)
(match lst
(('Definition ('Identifier identifier) grabber expression)
#`(define-peg-pattern
#,(datum->syntax for-syntax (string->symbol identifier))
#,(match grabber
("<--" (datum->syntax for-syntax 'all))
("<-" (datum->syntax for-syntax 'body))
("<" (datum->syntax for-syntax 'none)))
#,(compressor
(Expression->defn expression for-syntax)
for-syntax)))))
;; (Expression X)
;; `-> (or X)
;; (Expression X Y)
;; `-> (or X Y)
;; (Expression X (Y Z ...))
;; `-> (or X Y Z ...)
(define (Expression->defn lst for-syntax)
(match lst
(('Expression seq ...)
#`(or #,@(map (lambda (x) (Sequence->defn x for-syntax))
(keyword-flatten '(Sequence) seq))))))
;; (Sequence X)
;; `-> (and X)
;; (Sequence X Y)
;; `-> (and X Y)
;; (Sequence X (Y Z ...))
;; `-> (and X Y Z ...)
(define (Sequence->defn lst for-syntax)
(match lst
(('Sequence pre ...)
#`(and #,@(map (lambda (x) (Prefix->defn x for-syntax))
(keyword-flatten '(Prefix) pre))))))
;; (Prefix (Suffix ...))
;; `-> (...)
;; (Prefix (NOT "!") (Suffix ...))
;; `-> (not-followed-by ...)
;; (Prefix (AND "&") (Suffix ...))
;; `-> (followed-by ...)
(define (Prefix->defn lst for-syntax)
(match lst
(('Prefix ('AND _) su) #`(followed-by #,(Suffix->defn su for-syntax)))
(('Prefix ('NOT _) su) #`(not-followed-by #,(Suffix->defn su for-syntax)))
(('Prefix suffix) (Suffix->defn suffix for-syntax))))
;; (Suffix (Primary ...))
;; `-> (...)
;; (Suffix (Primary ...) (STAR "*"))
;; `-> (* ...)
;; (Suffix (Primary ...) (QUESTION "?"))
;; `-> (? ...)
;; (Suffix (Primary ...) (PLUS "+"))
;; `-> (+ ...)
(define (Suffix->defn lst for-syntax)
(match lst
(('Suffix prim) (Primary->defn prim for-syntax))
(('Suffix prim ('STAR _)) #`(* #,(Primary->defn prim for-syntax)))
(('Suffix prim ('QUESTION _)) #`(? #,(Primary->defn prim for-syntax)))
(('Suffix prim ('PLUS _)) #`(+ #,(Primary->defn prim for-syntax)))))
(define (Primary->defn lst for-syntax)
(let ((value (second lst)))
(match (car value)
('DOT #'peg-any)
('Identifier (Identifier->defn value for-syntax))
('Expression (Expression->defn value for-syntax))
('Literal (Literal->defn value for-syntax))
('Class (Class->defn value for-syntax)))))
;; (Identifier "hello")
;; `-> hello
(define (Identifier->defn lst for-syntax)
(datum->syntax for-syntax (string->symbol (second lst))))
;; (Literal (Char "a") (Char "b") (Char "c"))
;; `-> "abc"
(define (Literal->defn lst for-syntax)
(apply string (map (lambda (x) (Char->defn x for-syntax)) (cdr lst))))
;; TODO: empty Class can happen: `[]`, but what does it represent?
;; (Class ...)
;; `-> (or ...)
(define (Class->defn lst for-syntax)
#`(or #,@(map (lambda (x) (Range->defn x for-syntax))
(cdr lst))))
;; For one character:
;; (Range (Char "a"))
;; `-> "a"
;; Or for a range:
;; (Range (Char "a") (Char "b"))
;; `-> (range #\a #\b)
(define (Range->defn lst for-syntax)
(match lst
(('Range ch)
(string (Char->defn ch for-syntax)))
(('Range range-beginning range-end)
#`(range
#,(Char->defn range-beginning for-syntax)
#,(Char->defn range-end for-syntax)))))
;; (Char "a")
;; `-> #\a
;; (Char "\\n")
;; `-> #\newline
;; (Char "\\135")
;; `-> #\]
(define (Char->defn lst for-syntax)
(let* ((charstr (second lst))
(first (string-ref charstr 0)))
(cond
((= 1 (string-length charstr)) first)
((char-numeric? (string-ref charstr 1))
(integer->char
(reduce + 0
(map
(lambda (x y)
(* (- (char->integer x) (char->integer #\0)) y))
(reverse (string->list charstr 1))
'(1 8 64)))))
(else
(case (string-ref charstr 1)
((#\n) #\newline)
((#\r) #\return)
((#\t) #\tab)
((#\') #\')
((#\]) #\])
((#\\) #\\)
((#\[) #\[))))))
(define peg-grammar Grammar)
;; Macro wrapper for PEG-PARSER. Parses PEG grammars expressed as strings and
;; defines all the appropriate nonterminals.
@ -124,119 +352,6 @@ RB < ']'
((_ str)
(peg-parser (syntax->datum #'str) x)))))
;; lst has format (nonterm grabber pattern), where
;; nonterm is a symbol (the name of the nonterminal),
;; grabber is a string (either "<", "<-" or "<--"), and
;; pattern is the parse of a PEG pattern expressed as as string.
(define (peg-nonterm->defn lst for-syntax)
(let* ((nonterm (car lst))
(grabber (cadr lst))
(pattern (caddr lst))
(nonterm-name (datum->syntax for-syntax
(string->symbol (cadr nonterm)))))
#`(define-peg-pattern #,nonterm-name
#,(cond
((string=? grabber "<--") (datum->syntax for-syntax 'all))
((string=? grabber "<-") (datum->syntax for-syntax 'body))
(else (datum->syntax for-syntax 'none)))
#,(compressor (peg-pattern->defn pattern for-syntax) for-syntax))))
;; lst has format ('peg-pattern ...).
;; After the context-flatten, (cdr lst) has format
;; (('peg-alternative ...) ...), where the outer list is a collection
;; of elements from a '/' alternative.
(define (peg-pattern->defn lst for-syntax)
#`(or #,@(map (lambda (x) (peg-alternative->defn x for-syntax))
(context-flatten (lambda (x) (eq? (car x) 'peg-alternative))
(cdr lst)))))
;; lst has format ('peg-alternative ...).
;; After the context-flatten, (cdr lst) has the format
;; (item ...), where each item has format either ("!" ...), ("&" ...),
;; or ('peg-suffix ...).
(define (peg-alternative->defn lst for-syntax)
#`(and #,@(map (lambda (x) (peg-body->defn x for-syntax))
(context-flatten (lambda (x) (or (string? (car x))
(eq? (car x) 'peg-suffix)))
(cdr lst)))))
;; lst has the format either
;; ("!" ('peg-suffix ...)), ("&" ('peg-suffix ...)), or
;; ('peg-suffix ...).
(define (peg-body->defn lst for-syntax)
(cond
((equal? (car lst) "&")
#`(followed-by #,(peg-suffix->defn (cadr lst) for-syntax)))
((equal? (car lst) "!")
#`(not-followed-by #,(peg-suffix->defn (cadr lst) for-syntax)))
((eq? (car lst) 'peg-suffix)
(peg-suffix->defn lst for-syntax))
(else `(peg-parse-body-fail ,lst))))
;; lst has format ('peg-suffix <peg-primary> (? (/ "*" "?" "+")))
(define (peg-suffix->defn lst for-syntax)
(let ((inner-defn (peg-primary->defn (cadr lst) for-syntax)))
(cond
((null? (cddr lst))
inner-defn)
((equal? (caddr lst) "*")
#`(* #,inner-defn))
((equal? (caddr lst) "?")
#`(? #,inner-defn))
((equal? (caddr lst) "+")
#`(+ #,inner-defn)))))
;; Parse a primary.
(define (peg-primary->defn lst for-syntax)
(let ((el (cadr lst)))
(cond
((list? el)
(cond
((eq? (car el) 'peg-literal)
(peg-literal->defn el for-syntax))
((eq? (car el) 'peg-charclass)
(peg-charclass->defn el for-syntax))
((eq? (car el) 'peg-nonterminal)
(datum->syntax for-syntax (string->symbol (cadr el))))))
((string? el)
(cond
((equal? el "(")
(peg-pattern->defn (caddr lst) for-syntax))
((equal? el ".")
(datum->syntax for-syntax 'peg-any))
(else (datum->syntax for-syntax
`(peg-parse-any unknown-string ,lst)))))
(else (datum->syntax for-syntax
`(peg-parse-any unknown-el ,lst))))))
;; Trims characters off the front and end of STR.
;; (trim-1chars "'ab'") -> "ab"
(define (trim-1chars str) (substring str 1 (- (string-length str) 1)))
;; Parses a literal.
(define (peg-literal->defn lst for-syntax)
(datum->syntax for-syntax (trim-1chars (cadr lst))))
;; Parses a charclass.
(define (peg-charclass->defn lst for-syntax)
#`(or
#,@(map
(lambda (cc)
(cond
((eq? (car cc) 'charclass-range)
#`(range #,(datum->syntax
for-syntax
(string-ref (cadr cc) 0))
#,(datum->syntax
for-syntax
(string-ref (cadr cc) 2))))
((eq? (car cc) 'charclass-single)
(datum->syntax for-syntax (cadr cc)))))
(context-flatten
(lambda (x) (or (eq? (car x) 'charclass-range)
(eq? (car x) 'charclass-single)))
(cdr lst)))))
;; Compresses a list to save the optimizer work.
;; e.g. (or (and a)) -> a
(define (compressor-core lst)
@ -263,11 +378,10 @@ RB < ']'
(let ((string (syntax->datum #'str-stx)))
(compile-peg-pattern
(compressor
(peg-pattern->defn
(peg:tree (match-pattern peg-pattern string)) #'str-stx)
(Expression->defn
(peg:tree (match-pattern Expression string)) #'str-stx)
#'str-stx)
(if (eq? accum 'all) 'body accum))))
(else (error "Bad embedded PEG string" args))))
(add-peg-compiler! 'peg peg-string-compile)