mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-04-30 11:50:28 +02:00
Extensible PEG Syntax
* module/ice-9/peg/codegen.scm: Make the PEG syntax extensible, and move most of the current code generators to the new interface * doc/ref/api-peg.texi: Document PEG extensions in the PEG Internals section of the manual
This commit is contained in:
parent
bbc5564c42
commit
94e8517c16
2 changed files with 133 additions and 75 deletions
|
@ -992,3 +992,35 @@ interface.
|
||||||
|
|
||||||
The above function can be used to match a string by running
|
The above function can be used to match a string by running
|
||||||
@code{(peg-parse match-a-b "ab")}.
|
@code{(peg-parse match-a-b "ab")}.
|
||||||
|
|
||||||
|
@subsubheading Code Generators and Extensible Syntax
|
||||||
|
|
||||||
|
PEG expressions, such as those in a @code{define-nonterm} form, are
|
||||||
|
interpreted internally in two steps.
|
||||||
|
|
||||||
|
First, any string PEG is expanded into an s-expression PEG by the code
|
||||||
|
in the @code{(ice-9 peg string-peg)} module.
|
||||||
|
|
||||||
|
Then, then s-expression PEG that results is compiled into a parsing
|
||||||
|
function by the @code{(ice-9 peg codegen)} module. In particular, the
|
||||||
|
function @code{peg-sexp-compile} is called on the s-expression. It then
|
||||||
|
decides what to do based on the form it is passed.
|
||||||
|
|
||||||
|
The PEG syntax can be expanded by providing @code{peg-sexp-compile} more
|
||||||
|
options for what to do with its forms. The extended syntax will be
|
||||||
|
associated with a symbol, for instance @code{my-parsing-form}, and will
|
||||||
|
be called on all PEG expressions of the form
|
||||||
|
@lisp
|
||||||
|
(my-parsing-form ...)
|
||||||
|
@end lisp
|
||||||
|
|
||||||
|
The parsing function should take two arguments. The first will be a
|
||||||
|
syntax object containing a list with all of the arguments to the form
|
||||||
|
(but not the form's name), and the second will be the
|
||||||
|
@code{capture-type} argument that is passed to @code{define-nonterm}.
|
||||||
|
|
||||||
|
New functions can be registered by calling @code{(add-peg-compiler!
|
||||||
|
symbol function)}, where @code{symbol} is the symbol that will indicate
|
||||||
|
a form of this type and @code{function} is the code generating function
|
||||||
|
described above. The function @code{add-peg-compiler!} is exported from
|
||||||
|
the @code{(ice-9 peg codegen)} module.
|
||||||
|
|
|
@ -18,9 +18,7 @@
|
||||||
;;;;
|
;;;;
|
||||||
|
|
||||||
(define-module (ice-9 peg codegen)
|
(define-module (ice-9 peg codegen)
|
||||||
#:export (peg-sexp-compile wrap-parser-for-users)
|
#:export (peg-sexp-compile wrap-parser-for-users add-peg-compiler!)
|
||||||
#:use-module (ice-9 peg)
|
|
||||||
#:use-module (ice-9 peg string-peg)
|
|
||||||
#:use-module (ice-9 pretty-print)
|
#:use-module (ice-9 pretty-print)
|
||||||
#:use-module (system base pmatch))
|
#:use-module (system base pmatch))
|
||||||
|
|
||||||
|
@ -123,18 +121,35 @@ return EXP."
|
||||||
|
|
||||||
;; Generates code for matching a range of characters between start and end.
|
;; Generates code for matching a range of characters between start and end.
|
||||||
;; E.g.: (cg-range syntax #\a #\z 'body)
|
;; E.g.: (cg-range syntax #\a #\z 'body)
|
||||||
(define (cg-range start end accum)
|
(define (cg-range pat accum)
|
||||||
|
(syntax-case pat ()
|
||||||
|
((start end)
|
||||||
|
(if (not (and (char? (syntax->datum #'start))
|
||||||
|
(char? (syntax->datum #'end))))
|
||||||
|
(error "range PEG should have characters after it; instead got"
|
||||||
|
#'start #'end))
|
||||||
#`(lambda (str len pos)
|
#`(lambda (str len pos)
|
||||||
(and (< pos len)
|
(and (< pos len)
|
||||||
(let ((c (string-ref str pos)))
|
(let ((c (string-ref str pos)))
|
||||||
(and (char>=? c #,start)
|
(and (char>=? c start)
|
||||||
(char<=? c #,end)
|
(char<=? c end)
|
||||||
#,(case accum
|
#,(case accum
|
||||||
((all) #`(list (1+ pos) (list 'cg-range (string c))))
|
((all) #`(list (1+ pos) (list 'cg-range (string c))))
|
||||||
((name) #`(list (1+ pos) 'cg-range))
|
((name) #`(list (1+ pos) 'cg-range))
|
||||||
((body) #`(list (1+ pos) (string c)))
|
((body) #`(list (1+ pos) (string c)))
|
||||||
((none) #`(list (1+ pos) '()))
|
((none) #`(list (1+ pos) '()))
|
||||||
(else (error "bad accum" accum))))))))
|
(else (error "bad accum" accum))))))))))
|
||||||
|
|
||||||
|
;; Generate code to match a pattern and do nothing with the result
|
||||||
|
(define (cg-ignore pat accum)
|
||||||
|
(syntax-case pat ()
|
||||||
|
((inner)
|
||||||
|
(peg-sexp-compile #'inner 'none))))
|
||||||
|
|
||||||
|
(define (cg-capture pat accum)
|
||||||
|
(syntax-case pat ()
|
||||||
|
((inner)
|
||||||
|
(peg-sexp-compile #'inner 'body))))
|
||||||
|
|
||||||
;; Filters the accum argument to peg-sexp-compile for buildings like string
|
;; Filters the accum argument to peg-sexp-compile for buildings like string
|
||||||
;; literals (since we don't want to tag them with their name if we're doing an
|
;; literals (since we don't want to tag them with their name if we're doing an
|
||||||
|
@ -147,35 +162,11 @@ return EXP."
|
||||||
((eq? accum 'none) 'none)))
|
((eq? accum 'none) 'none)))
|
||||||
(define baf builtin-accum-filter)
|
(define baf builtin-accum-filter)
|
||||||
|
|
||||||
;; Takes an arbitrary expressions and accumulation variable, then parses it.
|
|
||||||
;; E.g.: (peg-sexp-compile syntax '(and "abc" (or "-" (range #\a #\z))) 'all)
|
|
||||||
(define (peg-sexp-compile pat accum)
|
|
||||||
(syntax-case pat (peg-any range ignore capture peg and or body)
|
|
||||||
(peg-any
|
|
||||||
(cg-peg-any (baf accum)))
|
|
||||||
(sym (identifier? #'sym) ;; nonterminal
|
|
||||||
#'sym)
|
|
||||||
(str (string? (syntax->datum #'str)) ;; literal string
|
|
||||||
(cg-string (syntax->datum #'str) (baf accum)))
|
|
||||||
((range start end) ;; range of characters (e.g. [a-z])
|
|
||||||
(and (char? (syntax->datum #'start)) (char? (syntax->datum #'end)))
|
|
||||||
(cg-range (syntax->datum #'start) (syntax->datum #'end) (baf accum)))
|
|
||||||
((ignore pat) ;; match but don't parse
|
|
||||||
(peg-sexp-compile #'pat 'none))
|
|
||||||
((capture pat) ;; parse
|
|
||||||
(peg-sexp-compile #'pat 'body))
|
|
||||||
((and pat ...)
|
|
||||||
(cg-and #'(pat ...) (baf accum)))
|
|
||||||
((or pat ...)
|
|
||||||
(cg-or #'(pat ...) (baf accum)))
|
|
||||||
((body type pat num)
|
|
||||||
(cg-body (baf accum) #'type #'pat #'num))))
|
|
||||||
|
|
||||||
;; Top-level function builder for AND. Reduces to a call to CG-AND-INT.
|
;; Top-level function builder for AND. Reduces to a call to CG-AND-INT.
|
||||||
(define (cg-and clauses accum)
|
(define (cg-and clauses accum)
|
||||||
#`(lambda (str len pos)
|
#`(lambda (str len pos)
|
||||||
(let ((body '()))
|
(let ((body '()))
|
||||||
#,(cg-and-int clauses accum #'str #'len #'pos #'body))))
|
#,(cg-and-int clauses (baf accum) #'str #'len #'pos #'body))))
|
||||||
|
|
||||||
;; Internal function builder for AND (calls itself).
|
;; Internal function builder for AND (calls itself).
|
||||||
(define (cg-and-int clauses accum str strlen at body)
|
(define (cg-and-int clauses accum str strlen at body)
|
||||||
|
@ -195,7 +186,7 @@ return EXP."
|
||||||
;; Top-level function builder for OR. Reduces to a call to CG-OR-INT.
|
;; Top-level function builder for OR. Reduces to a call to CG-OR-INT.
|
||||||
(define (cg-or clauses accum)
|
(define (cg-or clauses accum)
|
||||||
#`(lambda (str len pos)
|
#`(lambda (str len pos)
|
||||||
#,(cg-or-int clauses accum #'str #'len #'pos)))
|
#,(cg-or-int clauses (baf accum) #'str #'len #'pos)))
|
||||||
|
|
||||||
;; Internal function builder for OR (calls itself).
|
;; Internal function builder for OR (calls itself).
|
||||||
(define (cg-or-int clauses accum str strlen at)
|
(define (cg-or-int clauses accum str strlen at)
|
||||||
|
@ -207,40 +198,75 @@ return EXP."
|
||||||
#,(cg-or-int #'(rest ...) accum str strlen at)))))
|
#,(cg-or-int #'(rest ...) accum str strlen at)))))
|
||||||
|
|
||||||
;; Returns a function that parses a BODY element.
|
;; Returns a function that parses a BODY element.
|
||||||
(define (cg-body accum type pat num)
|
(define (cg-body args accum)
|
||||||
|
(syntax-case args ()
|
||||||
|
((type pat num)
|
||||||
#`(lambda (str strlen at)
|
#`(lambda (str strlen at)
|
||||||
(let ((body '()))
|
(let ((body '()))
|
||||||
(let lp ((end at) (count 0))
|
(let lp ((end at) (count 0))
|
||||||
(let* ((match (#,(peg-sexp-compile pat accum) str strlen end))
|
(let* ((match (#,(peg-sexp-compile #'pat (baf accum))
|
||||||
|
str strlen end))
|
||||||
(new-end (if match (car match) end))
|
(new-end (if match (car match) end))
|
||||||
(count (if (> new-end end) (1+ count) count)))
|
(count (if (> new-end end) (1+ count) count)))
|
||||||
(if (> new-end end)
|
(if (> new-end end)
|
||||||
(push-not-null! body (single-filter (cadr match))))
|
(push-not-null! body (single-filter (cadr match))))
|
||||||
(if (and (> new-end end)
|
(if (and (> new-end end)
|
||||||
#,(syntax-case num (+ * ?)
|
#,(syntax-case #'num (+ * ?)
|
||||||
(n (number? (syntax->datum #'n))
|
(n (number? (syntax->datum #'n))
|
||||||
#'(< count n))
|
#'(< count n))
|
||||||
(+ #t)
|
(+ #t)
|
||||||
(* #t)
|
(* #t)
|
||||||
(? #'(< count 1))))
|
(? #'(< count 1))))
|
||||||
(lp new-end count)
|
(lp new-end count)
|
||||||
(let ((success #,(syntax-case num (+ * ?)
|
(let ((success #,(syntax-case #'num (+ * ?)
|
||||||
(n (number? (syntax->datum #'n))
|
(n (number? (syntax->datum #'n))
|
||||||
#'(= count n))
|
#'(= count n))
|
||||||
(+ #'(>= count 1))
|
(+ #'(>= count 1))
|
||||||
(* #t)
|
(* #t)
|
||||||
(? #t))))
|
(? #t))))
|
||||||
#,(syntax-case type (! & lit)
|
#,(syntax-case #'type (! & lit)
|
||||||
(!
|
(!
|
||||||
#`(if success
|
#`(if success
|
||||||
#f
|
#f
|
||||||
#,(cggr accum 'cg-body #''() #'at)))
|
#,(cggr (baf accum) 'cg-body #''() #'at)))
|
||||||
(&
|
(&
|
||||||
#`(and success
|
#`(and success
|
||||||
#,(cggr accum 'cg-body #''() #'at)))
|
#,(cggr (baf accum) 'cg-body #''() #'at)))
|
||||||
(lit
|
(lit
|
||||||
#`(and success
|
#`(and success
|
||||||
#,(cggr accum 'cg-body #'(reverse body) #'new-end)))))))))))
|
#,(cggr (baf accum) 'cg-body
|
||||||
|
#'(reverse body) #'new-end)))))))))))))
|
||||||
|
|
||||||
|
;; Association list of functions to handle different expressions as PEGs
|
||||||
|
(define peg-compiler-alist '())
|
||||||
|
|
||||||
|
(define (add-peg-compiler! symbol function)
|
||||||
|
(set! peg-compiler-alist
|
||||||
|
(assq-set! peg-compiler-alist symbol function)))
|
||||||
|
|
||||||
|
(add-peg-compiler! 'range cg-range)
|
||||||
|
(add-peg-compiler! 'ignore cg-ignore)
|
||||||
|
(add-peg-compiler! 'capture cg-capture)
|
||||||
|
(add-peg-compiler! 'and cg-and)
|
||||||
|
(add-peg-compiler! 'or cg-or)
|
||||||
|
(add-peg-compiler! 'body cg-body)
|
||||||
|
|
||||||
|
;; Takes an arbitrary expressions and accumulation variable, then parses it.
|
||||||
|
;; E.g.: (peg-sexp-compile syntax '(and "abc" (or "-" (range #\a #\z))) 'all)
|
||||||
|
(define (peg-sexp-compile pat accum)
|
||||||
|
(syntax-case pat (peg-any range ignore capture peg and or body)
|
||||||
|
(peg-any
|
||||||
|
(cg-peg-any (baf accum)))
|
||||||
|
(sym (identifier? #'sym) ;; nonterminal
|
||||||
|
#'sym)
|
||||||
|
(str (string? (syntax->datum #'str)) ;; literal string
|
||||||
|
(cg-string (syntax->datum #'str) (baf accum)))
|
||||||
|
((name . args) (let* ((nm (syntax->datum #'name))
|
||||||
|
(entry (assq-ref peg-compiler-alist nm)))
|
||||||
|
(if entry
|
||||||
|
(entry #'args accum)
|
||||||
|
(error "Bad peg form" nm #'args
|
||||||
|
"Not one of" (map car peg-compiler-alist)))))))
|
||||||
|
|
||||||
;; Packages the results of a parser
|
;; Packages the results of a parser
|
||||||
(define (wrap-parser-for-users for-syntax parser accumsym s-syn)
|
(define (wrap-parser-for-users for-syntax parser accumsym s-syn)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue