1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-04-30 20:00:19 +02:00

Split peg.scm

* module/ice-9/peg.scm: move code generators to new module
 * module/ice-9/peg/codegen.scm: new module for PEG code generators
This commit is contained in:
Noah Lavine 2011-03-05 15:23:59 -05:00 committed by Andy Wingo
parent 718bc349af
commit bff3ccd957
2 changed files with 250 additions and 218 deletions

View file

@ -18,8 +18,7 @@
;;;; ;;;;
(define-module (ice-9 peg) (define-module (ice-9 peg)
#:export (peg-sexp-compile #:export (peg-string-compile
peg-string-compile
context-flatten context-flatten
peg-parse peg-parse
define-nonterm define-nonterm
@ -34,8 +33,9 @@
peg:substring peg:substring
peg-record? peg-record?
keyword-flatten) keyword-flatten)
#:use-module (system base pmatch) #:use-module (ice-9 peg codegen)
#:use-module (ice-9 pretty-print)) #:re-export (peg-sexp-compile)
#:use-module (system base pmatch))
;;; ;;;
;;; Helper Macros ;;; Helper Macros
@ -58,221 +58,8 @@ execute the STMTs and try again."
((_) #t) ((_) #t)
(else #f))))) (else #f)))))
(define-syntax push!
(syntax-rules ()
"Push an object onto a list."
((_ lst obj)
(set! lst (cons obj lst)))))
(define-syntax single-filter
(syntax-rules ()
"If EXP is a list of one element, return the element. Otherwise
return EXP."
((_ exp)
(pmatch exp
((,elt) elt)
(,elts elts)))))
(define-syntax push-not-null!
(syntax-rules ()
"If OBJ is non-null, push it onto LST, otherwise do nothing."
((_ lst obj)
(if (not (null? obj))
(push! lst obj)))))
(eval-when (compile load eval) (eval-when (compile load eval)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;; CODE GENERATORS
;; These functions generate scheme code for parsing PEGs.
;; Conventions:
;; accum: (all name body none)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Code we generate will have a certain return structure depending on how we're
;; accumulating (the ACCUM variable).
(define (cg-generic-ret accum name body-uneval at)
;; name, body-uneval and at are syntax
#`(let ((body #,body-uneval))
#,(cond
((and (eq? accum 'all) name)
#`(list #,at
(cond
((not (list? body)) (list '#,name body))
((null? body) '#,name)
((symbol? (car body)) (list '#,name body))
(else (cons '#,name body)))))
((eq? accum 'name)
#`(list #,at '#,name))
((eq? accum 'body)
#`(list #,at
(cond
((single? body) (car body))
(else body))))
((eq? accum 'none)
#`(list #,at '()))
(else
(begin
(pretty-print `(cg-generic-ret-error ,accum ,name ,body-uneval ,at))
(pretty-print "Defaulting to accum of none.\n")
#`(list #,at '()))))))
;; The short name makes the formatting below much easier to read.
(define cggr cg-generic-ret)
;; Generates code that matches a particular string.
;; E.g.: (cg-string syntax "abc" 'body)
(define (cg-string pat accum)
(let ((plen (string-length pat)))
#`(lambda (str len pos)
(let ((end (+ pos #,plen)))
(and (<= end len)
(string= str #,pat pos end)
#,(case accum
((all) #`(list end (list 'cg-string #,pat)))
((name) #`(list end 'cg-string))
((body) #`(list end #,pat))
((none) #`(list end '()))
(else (error "bad accum" accum))))))))
;; Generates code for matching any character.
;; E.g.: (cg-peg-any syntax 'body)
(define (cg-peg-any accum)
#`(lambda (str len pos)
(and (< pos len)
#,(case accum
((all) #`(list (1+ pos)
(list 'cg-peg-any (substring str pos (1+ pos)))))
((name) #`(list (1+ pos) 'cg-peg-any))
((body) #`(list (1+ pos) (substring str pos (1+ pos))))
((none) #`(list (1+ pos) '()))
(else (error "bad accum" accum))))))
;; Generates code for matching a range of characters between start and end.
;; E.g.: (cg-range syntax #\a #\z 'body)
(define (cg-range start end accum)
#`(lambda (str len pos)
(and (< pos len)
(let ((c (string-ref str pos)))
(and (char>=? c #,start)
(char<=? c #,end)
#,(case accum
((all) #`(list (1+ pos) (list 'cg-range (string c))))
((name) #`(list (1+ pos) 'cg-range))
((body) #`(list (1+ pos) (string c)))
((none) #`(list (1+ pos) '()))
(else (error "bad accum" accum))))))))
;; Filters the accum argument to peg-sexp-compile for buildings like string
;; literals (since we don't want to tag them with their name if we're doing an
;; "all" accum).
(define (builtin-accum-filter accum)
(cond
((eq? accum 'all) 'body)
((eq? accum 'name) 'name)
((eq? accum 'body) 'body)
((eq? accum 'none) 'none)))
(define baf builtin-accum-filter)
;; Takes an arbitrary expressions and accumulation variable, then parses it.
;; E.g.: (peg-sexp-compile syntax '(and "abc" (or "-" (range #\a #\z))) 'all)
(define (peg-sexp-compile pat accum)
(syntax-case pat (peg-any range ignore capture peg and or body)
(peg-any
(cg-peg-any (baf accum)))
(sym (identifier? #'sym) ;; nonterminal
#'sym)
(str (string? (syntax->datum #'str)) ;; literal string
(cg-string (syntax->datum #'str) (baf accum)))
((range start end) ;; range of characters (e.g. [a-z])
(and (char? (syntax->datum #'start)) (char? (syntax->datum #'end)))
(cg-range (syntax->datum #'start) (syntax->datum #'end) (baf accum)))
((ignore pat) ;; match but don't parse
(peg-sexp-compile #'pat 'none))
((capture pat) ;; parse
(peg-sexp-compile #'pat 'body))
((peg pat) ;; embedded PEG string
(string? (syntax->datum #'pat))
(peg-string-compile #'pat (baf accum)))
((and pat ...)
(cg-and #'(pat ...) (baf accum)))
((or pat ...)
(cg-or #'(pat ...) (baf accum)))
((body type pat num)
(cg-body (baf accum) #'type #'pat #'num))))
;; Top-level function builder for AND. Reduces to a call to CG-AND-INT.
(define (cg-and clauses accum)
#`(lambda (str len pos)
(let ((body '()))
#,(cg-and-int clauses accum #'str #'len #'pos #'body))))
;; Internal function builder for AND (calls itself).
(define (cg-and-int clauses accum str strlen at body)
(syntax-case clauses ()
(()
(cggr accum 'cg-and #`(reverse #,body) at))
((first rest ...)
#`(let ((res (#,(peg-sexp-compile #'first accum) #,str #,strlen #,at)))
(and res
;; update AT and BODY then recurse
(let ((newat (car res))
(newbody (cadr res)))
(set! #,at newat)
(push-not-null! #,body (single-filter newbody))
#,(cg-and-int #'(rest ...) accum str strlen at body)))))))
;; Top-level function builder for OR. Reduces to a call to CG-OR-INT.
(define (cg-or clauses accum)
#`(lambda (str len pos)
#,(cg-or-int clauses accum #'str #'len #'pos)))
;; Internal function builder for OR (calls itself).
(define (cg-or-int clauses accum str strlen at)
(syntax-case clauses ()
(()
#f)
((first rest ...)
#`(or (#,(peg-sexp-compile #'first accum) #,str #,strlen #,at)
#,(cg-or-int #'(rest ...) accum str strlen at)))))
;; Returns a function that parses a BODY element.
(define (cg-body accum type pat num)
#`(lambda (str strlen at)
(let ((body '()))
(let lp ((end at) (count 0))
(let* ((match (#,(peg-sexp-compile pat accum) str strlen end))
(new-end (if match (car match) end))
(count (if (> new-end end) (1+ count) count)))
(if (> new-end end)
(push-not-null! body (single-filter (cadr match))))
(if (and (> new-end end)
#,(syntax-case num (+ * ?)
(n (number? (syntax->datum #'n))
#'(< count n))
(+ #t)
(* #t)
(? #'(< count 1))))
(lp new-end count)
(let ((success #,(syntax-case num (+ * ?)
(n (number? (syntax->datum #'n))
#'(= count n))
(+ #'(>= count 1))
(* #t)
(? #t))))
#,(syntax-case type (! & lit)
(!
#`(if success
#f
#,(cggr accum 'cg-body #''() #'at)))
(&
#`(and success
#,(cggr accum 'cg-body #''() #'at)))
(lit
#`(and success
#,(cggr accum 'cg-body #'(reverse body) #'new-end)))))))))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;; FOR DEFINING AND USING NONTERMINALS ;;;;; FOR DEFINING AND USING NONTERMINALS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -511,7 +298,7 @@ RB < ']'
(let ((parsed (peg-parse peg-grammar str))) (let ((parsed (peg-parse peg-grammar str)))
(if (not parsed) (if (not parsed)
(begin (begin
;; (pretty-print "Invalid PEG grammar!\n") ;; (display "Invalid PEG grammar!\n")
#f) #f)
(let ((lst (peg:tree parsed))) (let ((lst (peg:tree parsed)))
(cond (cond

View file

@ -0,0 +1,245 @@
;;;; codegen.scm --- code generation for composable parsers
;;;;
;;;; Copyright (C) 2011 Free Software Foundation, Inc.
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
;;;; License as published by the Free Software Foundation; either
;;;; version 3 of the License, or (at your option) any later version.
;;;;
;;;; This library is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;;; Lesser General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU Lesser General Public
;;;; License along with this library; if not, write to the Free Software
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;;;;
(define-module (ice-9 peg codegen)
#:export (peg-sexp-compile)
#:use-module (ice-9 peg)
#:use-module (ice-9 pretty-print)
#:use-module (system base pmatch))
(define-syntax single?
(syntax-rules ()
"Return #t if X is a list of one element."
((_ x)
(pmatch x
((_) #t)
(else #f)))))
(define-syntax single-filter
(syntax-rules ()
"If EXP is a list of one element, return the element. Otherwise
return EXP."
((_ exp)
(pmatch exp
((,elt) elt)
(,elts elts)))))
(define-syntax push-not-null!
(syntax-rules ()
"If OBJ is non-null, push it onto LST, otherwise do nothing."
((_ lst obj)
(if (not (null? obj))
(push! lst obj)))))
(define-syntax push!
(syntax-rules ()
"Push an object onto a list."
((_ lst obj)
(set! lst (cons obj lst)))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;; CODE GENERATORS
;; These functions generate scheme code for parsing PEGs.
;; Conventions:
;; accum: (all name body none)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Code we generate will have a certain return structure depending on how we're
;; accumulating (the ACCUM variable).
(define (cg-generic-ret accum name body-uneval at)
;; name, body-uneval and at are syntax
#`(let ((body #,body-uneval))
#,(cond
((and (eq? accum 'all) name)
#`(list #,at
(cond
((not (list? body)) (list '#,name body))
((null? body) '#,name)
((symbol? (car body)) (list '#,name body))
(else (cons '#,name body)))))
((eq? accum 'name)
#`(list #,at '#,name))
((eq? accum 'body)
#`(list #,at
(cond
((single? body) (car body))
(else body))))
((eq? accum 'none)
#`(list #,at '()))
(else
(begin
(pretty-print `(cg-generic-ret-error ,accum ,name ,body-uneval ,at))
(pretty-print "Defaulting to accum of none.\n")
#`(list #,at '()))))))
;; The short name makes the formatting below much easier to read.
(define cggr cg-generic-ret)
;; Generates code that matches a particular string.
;; E.g.: (cg-string syntax "abc" 'body)
(define (cg-string pat accum)
(let ((plen (string-length pat)))
#`(lambda (str len pos)
(let ((end (+ pos #,plen)))
(and (<= end len)
(string= str #,pat pos end)
#,(case accum
((all) #`(list end (list 'cg-string #,pat)))
((name) #`(list end 'cg-string))
((body) #`(list end #,pat))
((none) #`(list end '()))
(else (error "bad accum" accum))))))))
;; Generates code for matching any character.
;; E.g.: (cg-peg-any syntax 'body)
(define (cg-peg-any accum)
#`(lambda (str len pos)
(and (< pos len)
#,(case accum
((all) #`(list (1+ pos)
(list 'cg-peg-any (substring str pos (1+ pos)))))
((name) #`(list (1+ pos) 'cg-peg-any))
((body) #`(list (1+ pos) (substring str pos (1+ pos))))
((none) #`(list (1+ pos) '()))
(else (error "bad accum" accum))))))
;; Generates code for matching a range of characters between start and end.
;; E.g.: (cg-range syntax #\a #\z 'body)
(define (cg-range start end accum)
#`(lambda (str len pos)
(and (< pos len)
(let ((c (string-ref str pos)))
(and (char>=? c #,start)
(char<=? c #,end)
#,(case accum
((all) #`(list (1+ pos) (list 'cg-range (string c))))
((name) #`(list (1+ pos) 'cg-range))
((body) #`(list (1+ pos) (string c)))
((none) #`(list (1+ pos) '()))
(else (error "bad accum" accum))))))))
;; Filters the accum argument to peg-sexp-compile for buildings like string
;; literals (since we don't want to tag them with their name if we're doing an
;; "all" accum).
(define (builtin-accum-filter accum)
(cond
((eq? accum 'all) 'body)
((eq? accum 'name) 'name)
((eq? accum 'body) 'body)
((eq? accum 'none) 'none)))
(define baf builtin-accum-filter)
;; Takes an arbitrary expressions and accumulation variable, then parses it.
;; E.g.: (peg-sexp-compile syntax '(and "abc" (or "-" (range #\a #\z))) 'all)
(define (peg-sexp-compile pat accum)
(syntax-case pat (peg-any range ignore capture peg and or body)
(peg-any
(cg-peg-any (baf accum)))
(sym (identifier? #'sym) ;; nonterminal
#'sym)
(str (string? (syntax->datum #'str)) ;; literal string
(cg-string (syntax->datum #'str) (baf accum)))
((range start end) ;; range of characters (e.g. [a-z])
(and (char? (syntax->datum #'start)) (char? (syntax->datum #'end)))
(cg-range (syntax->datum #'start) (syntax->datum #'end) (baf accum)))
((ignore pat) ;; match but don't parse
(peg-sexp-compile #'pat 'none))
((capture pat) ;; parse
(peg-sexp-compile #'pat 'body))
((peg pat) ;; embedded PEG string
(string? (syntax->datum #'pat))
(peg-string-compile #'pat (baf accum)))
((and pat ...)
(cg-and #'(pat ...) (baf accum)))
((or pat ...)
(cg-or #'(pat ...) (baf accum)))
((body type pat num)
(cg-body (baf accum) #'type #'pat #'num))))
;; Top-level function builder for AND. Reduces to a call to CG-AND-INT.
(define (cg-and clauses accum)
#`(lambda (str len pos)
(let ((body '()))
#,(cg-and-int clauses accum #'str #'len #'pos #'body))))
;; Internal function builder for AND (calls itself).
(define (cg-and-int clauses accum str strlen at body)
(syntax-case clauses ()
(()
(cggr accum 'cg-and #`(reverse #,body) at))
((first rest ...)
#`(let ((res (#,(peg-sexp-compile #'first accum) #,str #,strlen #,at)))
(and res
;; update AT and BODY then recurse
(let ((newat (car res))
(newbody (cadr res)))
(set! #,at newat)
(push-not-null! #,body (single-filter newbody))
#,(cg-and-int #'(rest ...) accum str strlen at body)))))))
;; Top-level function builder for OR. Reduces to a call to CG-OR-INT.
(define (cg-or clauses accum)
#`(lambda (str len pos)
#,(cg-or-int clauses accum #'str #'len #'pos)))
;; Internal function builder for OR (calls itself).
(define (cg-or-int clauses accum str strlen at)
(syntax-case clauses ()
(()
#f)
((first rest ...)
#`(or (#,(peg-sexp-compile #'first accum) #,str #,strlen #,at)
#,(cg-or-int #'(rest ...) accum str strlen at)))))
;; Returns a function that parses a BODY element.
(define (cg-body accum type pat num)
#`(lambda (str strlen at)
(let ((body '()))
(let lp ((end at) (count 0))
(let* ((match (#,(peg-sexp-compile pat accum) str strlen end))
(new-end (if match (car match) end))
(count (if (> new-end end) (1+ count) count)))
(if (> new-end end)
(push-not-null! body (single-filter (cadr match))))
(if (and (> new-end end)
#,(syntax-case num (+ * ?)
(n (number? (syntax->datum #'n))
#'(< count n))
(+ #t)
(* #t)
(? #'(< count 1))))
(lp new-end count)
(let ((success #,(syntax-case num (+ * ?)
(n (number? (syntax->datum #'n))
#'(= count n))
(+ #'(>= count 1))
(* #t)
(? #t))))
#,(syntax-case type (! & lit)
(!
#`(if success
#f
#,(cggr accum 'cg-body #''() #'at)))
(&
#`(and success
#,(cggr accum 'cg-body #''() #'at)))
(lit
#`(and success
#,(cggr accum 'cg-body #'(reverse body) #'new-end)))))))))))