diff --git a/module/ice-9/peg.scm b/module/ice-9/peg.scm index d91a74e57..0acc4598b 100644 --- a/module/ice-9/peg.scm +++ b/module/ice-9/peg.scm @@ -18,8 +18,7 @@ ;;;; (define-module (ice-9 peg) - #:export (peg-sexp-compile - peg-string-compile + #:export (peg-string-compile context-flatten peg-parse define-nonterm @@ -34,8 +33,9 @@ peg:substring peg-record? keyword-flatten) - #:use-module (system base pmatch) - #:use-module (ice-9 pretty-print)) + #:use-module (ice-9 peg codegen) + #:re-export (peg-sexp-compile) + #:use-module (system base pmatch)) ;;; ;;; Helper Macros @@ -58,221 +58,8 @@ execute the STMTs and try again." ((_) #t) (else #f))))) -(define-syntax push! - (syntax-rules () - "Push an object onto a list." - ((_ lst obj) - (set! lst (cons obj lst))))) - -(define-syntax single-filter - (syntax-rules () - "If EXP is a list of one element, return the element. Otherwise -return EXP." - ((_ exp) - (pmatch exp - ((,elt) elt) - (,elts elts))))) - -(define-syntax push-not-null! - (syntax-rules () - "If OBJ is non-null, push it onto LST, otherwise do nothing." - ((_ lst obj) - (if (not (null? obj)) - (push! lst obj))))) - (eval-when (compile load eval) -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;;; CODE GENERATORS -;; These functions generate scheme code for parsing PEGs. -;; Conventions: -;; accum: (all name body none) -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; Code we generate will have a certain return structure depending on how we're -;; accumulating (the ACCUM variable). -(define (cg-generic-ret accum name body-uneval at) - ;; name, body-uneval and at are syntax - #`(let ((body #,body-uneval)) - #,(cond - ((and (eq? accum 'all) name) - #`(list #,at - (cond - ((not (list? body)) (list '#,name body)) - ((null? body) '#,name) - ((symbol? (car body)) (list '#,name body)) - (else (cons '#,name body))))) - ((eq? accum 'name) - #`(list #,at '#,name)) - ((eq? accum 'body) - #`(list #,at - (cond - ((single? body) (car body)) - (else body)))) - ((eq? accum 'none) - #`(list #,at '())) - (else - (begin - (pretty-print `(cg-generic-ret-error ,accum ,name ,body-uneval ,at)) - (pretty-print "Defaulting to accum of none.\n") - #`(list #,at '())))))) - -;; The short name makes the formatting below much easier to read. -(define cggr cg-generic-ret) - -;; Generates code that matches a particular string. -;; E.g.: (cg-string syntax "abc" 'body) -(define (cg-string pat accum) - (let ((plen (string-length pat))) - #`(lambda (str len pos) - (let ((end (+ pos #,plen))) - (and (<= end len) - (string= str #,pat pos end) - #,(case accum - ((all) #`(list end (list 'cg-string #,pat))) - ((name) #`(list end 'cg-string)) - ((body) #`(list end #,pat)) - ((none) #`(list end '())) - (else (error "bad accum" accum)))))))) - -;; Generates code for matching any character. -;; E.g.: (cg-peg-any syntax 'body) -(define (cg-peg-any accum) - #`(lambda (str len pos) - (and (< pos len) - #,(case accum - ((all) #`(list (1+ pos) - (list 'cg-peg-any (substring str pos (1+ pos))))) - ((name) #`(list (1+ pos) 'cg-peg-any)) - ((body) #`(list (1+ pos) (substring str pos (1+ pos)))) - ((none) #`(list (1+ pos) '())) - (else (error "bad accum" accum)))))) - -;; Generates code for matching a range of characters between start and end. -;; E.g.: (cg-range syntax #\a #\z 'body) -(define (cg-range start end accum) - #`(lambda (str len pos) - (and (< pos len) - (let ((c (string-ref str pos))) - (and (char>=? c #,start) - (char<=? c #,end) - #,(case accum - ((all) #`(list (1+ pos) (list 'cg-range (string c)))) - ((name) #`(list (1+ pos) 'cg-range)) - ((body) #`(list (1+ pos) (string c))) - ((none) #`(list (1+ pos) '())) - (else (error "bad accum" accum)))))))) - -;; Filters the accum argument to peg-sexp-compile for buildings like string -;; literals (since we don't want to tag them with their name if we're doing an -;; "all" accum). -(define (builtin-accum-filter accum) - (cond - ((eq? accum 'all) 'body) - ((eq? accum 'name) 'name) - ((eq? accum 'body) 'body) - ((eq? accum 'none) 'none))) -(define baf builtin-accum-filter) - -;; Takes an arbitrary expressions and accumulation variable, then parses it. -;; E.g.: (peg-sexp-compile syntax '(and "abc" (or "-" (range #\a #\z))) 'all) -(define (peg-sexp-compile pat accum) - (syntax-case pat (peg-any range ignore capture peg and or body) - (peg-any - (cg-peg-any (baf accum))) - (sym (identifier? #'sym) ;; nonterminal - #'sym) - (str (string? (syntax->datum #'str)) ;; literal string - (cg-string (syntax->datum #'str) (baf accum))) - ((range start end) ;; range of characters (e.g. [a-z]) - (and (char? (syntax->datum #'start)) (char? (syntax->datum #'end))) - (cg-range (syntax->datum #'start) (syntax->datum #'end) (baf accum))) - ((ignore pat) ;; match but don't parse - (peg-sexp-compile #'pat 'none)) - ((capture pat) ;; parse - (peg-sexp-compile #'pat 'body)) - ((peg pat) ;; embedded PEG string - (string? (syntax->datum #'pat)) - (peg-string-compile #'pat (baf accum))) - ((and pat ...) - (cg-and #'(pat ...) (baf accum))) - ((or pat ...) - (cg-or #'(pat ...) (baf accum))) - ((body type pat num) - (cg-body (baf accum) #'type #'pat #'num)))) - -;; Top-level function builder for AND. Reduces to a call to CG-AND-INT. -(define (cg-and clauses accum) - #`(lambda (str len pos) - (let ((body '())) - #,(cg-and-int clauses accum #'str #'len #'pos #'body)))) - -;; Internal function builder for AND (calls itself). -(define (cg-and-int clauses accum str strlen at body) - (syntax-case clauses () - (() - (cggr accum 'cg-and #`(reverse #,body) at)) - ((first rest ...) - #`(let ((res (#,(peg-sexp-compile #'first accum) #,str #,strlen #,at))) - (and res - ;; update AT and BODY then recurse - (let ((newat (car res)) - (newbody (cadr res))) - (set! #,at newat) - (push-not-null! #,body (single-filter newbody)) - #,(cg-and-int #'(rest ...) accum str strlen at body))))))) - -;; Top-level function builder for OR. Reduces to a call to CG-OR-INT. -(define (cg-or clauses accum) - #`(lambda (str len pos) - #,(cg-or-int clauses accum #'str #'len #'pos))) - -;; Internal function builder for OR (calls itself). -(define (cg-or-int clauses accum str strlen at) - (syntax-case clauses () - (() - #f) - ((first rest ...) - #`(or (#,(peg-sexp-compile #'first accum) #,str #,strlen #,at) - #,(cg-or-int #'(rest ...) accum str strlen at))))) - -;; Returns a function that parses a BODY element. -(define (cg-body accum type pat num) - #`(lambda (str strlen at) - (let ((body '())) - (let lp ((end at) (count 0)) - (let* ((match (#,(peg-sexp-compile pat accum) str strlen end)) - (new-end (if match (car match) end)) - (count (if (> new-end end) (1+ count) count))) - (if (> new-end end) - (push-not-null! body (single-filter (cadr match)))) - (if (and (> new-end end) - #,(syntax-case num (+ * ?) - (n (number? (syntax->datum #'n)) - #'(< count n)) - (+ #t) - (* #t) - (? #'(< count 1)))) - (lp new-end count) - (let ((success #,(syntax-case num (+ * ?) - (n (number? (syntax->datum #'n)) - #'(= count n)) - (+ #'(>= count 1)) - (* #t) - (? #t)))) - #,(syntax-case type (! & lit) - (! - #`(if success - #f - #,(cggr accum 'cg-body #''() #'at))) - (& - #`(and success - #,(cggr accum 'cg-body #''() #'at))) - (lit - #`(and success - #,(cggr accum 'cg-body #'(reverse body) #'new-end))))))))))) - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;; FOR DEFINING AND USING NONTERMINALS ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -511,7 +298,7 @@ RB < ']' (let ((parsed (peg-parse peg-grammar str))) (if (not parsed) (begin - ;; (pretty-print "Invalid PEG grammar!\n") + ;; (display "Invalid PEG grammar!\n") #f) (let ((lst (peg:tree parsed))) (cond diff --git a/module/ice-9/peg/codegen.scm b/module/ice-9/peg/codegen.scm new file mode 100644 index 000000000..43f44cc66 --- /dev/null +++ b/module/ice-9/peg/codegen.scm @@ -0,0 +1,245 @@ +;;;; codegen.scm --- code generation for composable parsers +;;;; +;;;; Copyright (C) 2011 Free Software Foundation, Inc. +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +(define-module (ice-9 peg codegen) + #:export (peg-sexp-compile) + #:use-module (ice-9 peg) + #:use-module (ice-9 pretty-print) + #:use-module (system base pmatch)) + +(define-syntax single? + (syntax-rules () + "Return #t if X is a list of one element." + ((_ x) + (pmatch x + ((_) #t) + (else #f))))) + +(define-syntax single-filter + (syntax-rules () + "If EXP is a list of one element, return the element. Otherwise +return EXP." + ((_ exp) + (pmatch exp + ((,elt) elt) + (,elts elts))))) + +(define-syntax push-not-null! + (syntax-rules () + "If OBJ is non-null, push it onto LST, otherwise do nothing." + ((_ lst obj) + (if (not (null? obj)) + (push! lst obj))))) + +(define-syntax push! + (syntax-rules () + "Push an object onto a list." + ((_ lst obj) + (set! lst (cons obj lst))))) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;; CODE GENERATORS +;; These functions generate scheme code for parsing PEGs. +;; Conventions: +;; accum: (all name body none) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Code we generate will have a certain return structure depending on how we're +;; accumulating (the ACCUM variable). +(define (cg-generic-ret accum name body-uneval at) + ;; name, body-uneval and at are syntax + #`(let ((body #,body-uneval)) + #,(cond + ((and (eq? accum 'all) name) + #`(list #,at + (cond + ((not (list? body)) (list '#,name body)) + ((null? body) '#,name) + ((symbol? (car body)) (list '#,name body)) + (else (cons '#,name body))))) + ((eq? accum 'name) + #`(list #,at '#,name)) + ((eq? accum 'body) + #`(list #,at + (cond + ((single? body) (car body)) + (else body)))) + ((eq? accum 'none) + #`(list #,at '())) + (else + (begin + (pretty-print `(cg-generic-ret-error ,accum ,name ,body-uneval ,at)) + (pretty-print "Defaulting to accum of none.\n") + #`(list #,at '())))))) + +;; The short name makes the formatting below much easier to read. +(define cggr cg-generic-ret) + +;; Generates code that matches a particular string. +;; E.g.: (cg-string syntax "abc" 'body) +(define (cg-string pat accum) + (let ((plen (string-length pat))) + #`(lambda (str len pos) + (let ((end (+ pos #,plen))) + (and (<= end len) + (string= str #,pat pos end) + #,(case accum + ((all) #`(list end (list 'cg-string #,pat))) + ((name) #`(list end 'cg-string)) + ((body) #`(list end #,pat)) + ((none) #`(list end '())) + (else (error "bad accum" accum)))))))) + +;; Generates code for matching any character. +;; E.g.: (cg-peg-any syntax 'body) +(define (cg-peg-any accum) + #`(lambda (str len pos) + (and (< pos len) + #,(case accum + ((all) #`(list (1+ pos) + (list 'cg-peg-any (substring str pos (1+ pos))))) + ((name) #`(list (1+ pos) 'cg-peg-any)) + ((body) #`(list (1+ pos) (substring str pos (1+ pos)))) + ((none) #`(list (1+ pos) '())) + (else (error "bad accum" accum)))))) + +;; Generates code for matching a range of characters between start and end. +;; E.g.: (cg-range syntax #\a #\z 'body) +(define (cg-range start end accum) + #`(lambda (str len pos) + (and (< pos len) + (let ((c (string-ref str pos))) + (and (char>=? c #,start) + (char<=? c #,end) + #,(case accum + ((all) #`(list (1+ pos) (list 'cg-range (string c)))) + ((name) #`(list (1+ pos) 'cg-range)) + ((body) #`(list (1+ pos) (string c))) + ((none) #`(list (1+ pos) '())) + (else (error "bad accum" accum)))))))) + +;; Filters the accum argument to peg-sexp-compile for buildings like string +;; literals (since we don't want to tag them with their name if we're doing an +;; "all" accum). +(define (builtin-accum-filter accum) + (cond + ((eq? accum 'all) 'body) + ((eq? accum 'name) 'name) + ((eq? accum 'body) 'body) + ((eq? accum 'none) 'none))) +(define baf builtin-accum-filter) + +;; Takes an arbitrary expressions and accumulation variable, then parses it. +;; E.g.: (peg-sexp-compile syntax '(and "abc" (or "-" (range #\a #\z))) 'all) +(define (peg-sexp-compile pat accum) + (syntax-case pat (peg-any range ignore capture peg and or body) + (peg-any + (cg-peg-any (baf accum))) + (sym (identifier? #'sym) ;; nonterminal + #'sym) + (str (string? (syntax->datum #'str)) ;; literal string + (cg-string (syntax->datum #'str) (baf accum))) + ((range start end) ;; range of characters (e.g. [a-z]) + (and (char? (syntax->datum #'start)) (char? (syntax->datum #'end))) + (cg-range (syntax->datum #'start) (syntax->datum #'end) (baf accum))) + ((ignore pat) ;; match but don't parse + (peg-sexp-compile #'pat 'none)) + ((capture pat) ;; parse + (peg-sexp-compile #'pat 'body)) + ((peg pat) ;; embedded PEG string + (string? (syntax->datum #'pat)) + (peg-string-compile #'pat (baf accum))) + ((and pat ...) + (cg-and #'(pat ...) (baf accum))) + ((or pat ...) + (cg-or #'(pat ...) (baf accum))) + ((body type pat num) + (cg-body (baf accum) #'type #'pat #'num)))) + +;; Top-level function builder for AND. Reduces to a call to CG-AND-INT. +(define (cg-and clauses accum) + #`(lambda (str len pos) + (let ((body '())) + #,(cg-and-int clauses accum #'str #'len #'pos #'body)))) + +;; Internal function builder for AND (calls itself). +(define (cg-and-int clauses accum str strlen at body) + (syntax-case clauses () + (() + (cggr accum 'cg-and #`(reverse #,body) at)) + ((first rest ...) + #`(let ((res (#,(peg-sexp-compile #'first accum) #,str #,strlen #,at))) + (and res + ;; update AT and BODY then recurse + (let ((newat (car res)) + (newbody (cadr res))) + (set! #,at newat) + (push-not-null! #,body (single-filter newbody)) + #,(cg-and-int #'(rest ...) accum str strlen at body))))))) + +;; Top-level function builder for OR. Reduces to a call to CG-OR-INT. +(define (cg-or clauses accum) + #`(lambda (str len pos) + #,(cg-or-int clauses accum #'str #'len #'pos))) + +;; Internal function builder for OR (calls itself). +(define (cg-or-int clauses accum str strlen at) + (syntax-case clauses () + (() + #f) + ((first rest ...) + #`(or (#,(peg-sexp-compile #'first accum) #,str #,strlen #,at) + #,(cg-or-int #'(rest ...) accum str strlen at))))) + +;; Returns a function that parses a BODY element. +(define (cg-body accum type pat num) + #`(lambda (str strlen at) + (let ((body '())) + (let lp ((end at) (count 0)) + (let* ((match (#,(peg-sexp-compile pat accum) str strlen end)) + (new-end (if match (car match) end)) + (count (if (> new-end end) (1+ count) count))) + (if (> new-end end) + (push-not-null! body (single-filter (cadr match)))) + (if (and (> new-end end) + #,(syntax-case num (+ * ?) + (n (number? (syntax->datum #'n)) + #'(< count n)) + (+ #t) + (* #t) + (? #'(< count 1)))) + (lp new-end count) + (let ((success #,(syntax-case num (+ * ?) + (n (number? (syntax->datum #'n)) + #'(= count n)) + (+ #'(>= count 1)) + (* #t) + (? #t)))) + #,(syntax-case type (! & lit) + (! + #`(if success + #f + #,(cggr accum 'cg-body #''() #'at))) + (& + #`(and success + #,(cggr accum 'cg-body #''() #'at))) + (lit + #`(and success + #,(cggr accum 'cg-body #'(reverse body) #'new-end)))))))))))