1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-06-09 21:40:33 +02:00

peg: refactor peg-sexp-compile to operate on syntax directly

* module/ice-9/peg.scm (cg-generic-ret): Remove unused for-syntax
  argument.
  (peg-sexp-compile): Take the pattern as syntax directly, and use
  syntax-case to destructure it and dispatch to the code generators.
  (cg-and, cg-and-int, cg-or, cg-or-int): Refactor to operate on syntax
  instead of on s-expressions.
  (cg-body): Likewise; though this was a larger refactor.
  (define-nonterm, peg-match): Adapt to peg-sexp-compile calling
  convention change.
  (peg-string-compile): Likewise, and just take the grammar as a syntax
  object.
This commit is contained in:
Andy Wingo 2011-02-18 21:27:36 +01:00
parent 5041b82067
commit 3be2799eaf

View file

@ -91,7 +91,7 @@ return EXP."
;; Code we generate will have a certain return structure depending on how we're ;; Code we generate will have a certain return structure depending on how we're
;; accumulating (the ACCUM variable). ;; accumulating (the ACCUM variable).
(define (cg-generic-ret for-syntax accum name body-uneval at) (define (cg-generic-ret accum name body-uneval at)
;; name, body-uneval and at are syntax ;; name, body-uneval and at are syntax
#`(let ((body #,body-uneval)) #`(let ((body #,body-uneval))
#,(cond #,(cond
@ -183,135 +183,102 @@ return EXP."
;; Takes an arbitrary expressions and accumulation variable, then parses it. ;; Takes an arbitrary expressions and accumulation variable, then parses it.
;; E.g.: (peg-sexp-compile syntax '(and "abc" (or "-" (range #\a #\z))) 'all) ;; E.g.: (peg-sexp-compile syntax '(and "abc" (or "-" (range #\a #\z))) 'all)
(define (peg-sexp-compile for-syntax pat accum) (define (peg-sexp-compile pat accum)
(cond (syntax-case pat (peg-any range ignore capture peg and or body)
((string? pat) (cg-string pat (baf accum))) (peg-any
((symbol? pat) ;; either peg-any or a nonterminal (cg-peg-any (baf accum)))
(cond (sym (identifier? #'sym) ;; nonterminal
((eq? pat 'peg-any) (cg-peg-any (baf accum))) #'sym)
;; if pat is any other symbol it's a nonterminal, so just return it (str (string? (syntax->datum #'str)) ;; literal string
(else (datum->syntax for-syntax pat)))) (cg-string (syntax->datum #'str) (baf accum)))
((or (not (list? pat)) (null? pat)) ((range start end) ;; range of characters (e.g. [a-z])
;; anything besides a string, symbol, or list is an error (and (char? (syntax->datum #'start)) (char? (syntax->datum #'end)))
(datum->syntax for-syntax (cg-range (syntax->datum #'start) (syntax->datum #'end) (baf accum)))
(error-val `(peg-sexp-compile-error-1 ,pat ,accum)))) ((ignore pat) ;; match but don't parse
((eq? (car pat) 'range) ;; range of characters (e.g. [a-z]) (peg-sexp-compile #'pat 'none))
(cg-range (cadr pat) (caddr pat) (baf accum))) ((capture pat) ;; parse
((eq? (car pat) 'ignore) ;; match but don't parse (peg-sexp-compile #'pat 'body))
(peg-sexp-compile for-syntax (cadr pat) 'none)) ((peg pat) ;; embedded PEG string
((eq? (car pat) 'capture) ;; parse (string? (syntax->datum #'pat))
(peg-sexp-compile for-syntax (cadr pat) 'body)) (peg-string-compile #'pat (baf accum)))
((eq? (car pat) 'peg) ;; embedded PEG string ((and pat ...)
(peg-string-compile for-syntax (cadr pat) (baf accum))) (cg-and #'(pat ...) (baf accum)))
((eq? (car pat) 'and) ((or pat ...)
(cg-and for-syntax (cdr pat) (baf accum))) (cg-or #'(pat ...) (baf accum)))
((eq? (car pat) 'or) ((body type pat num)
(cg-or for-syntax (cdr pat) (baf accum))) (cg-body (baf accum) #'type #'pat #'num))))
((eq? (car pat) 'body)
(if (not (= (length pat) 4))
(datum->syntax for-syntax
(error-val `(peg-sexp-compile-error-2 ,pat ,accum)))
(datum->syntax for-syntax
(apply cg-body for-syntax (cons (baf accum) (cdr pat))))))
(else (datum->syntax for-syntax
(error-val `(peg-sexp-compile-error-3 ,pat ,accum))))))
;; Top-level function builder for AND. Reduces to a call to CG-AND-INT. ;; Top-level function builder for AND. Reduces to a call to CG-AND-INT.
(define (cg-and for-syntax arglst accum) (define (cg-and clauses accum)
#`(lambda (str strlen at) #`(lambda (str len pos)
(let ((body '())) (let ((body '()))
#,(cg-and-int for-syntax arglst accum #'str #'strlen #'at #'body)))) #,(cg-and-int clauses accum #'str #'len #'pos #'body))))
;; Internal function builder for AND (calls itself). ;; Internal function builder for AND (calls itself).
(define (cg-and-int for-syntax arglst accum str strlen at body) (define (cg-and-int clauses accum str strlen at body)
(if (null? arglst) (syntax-case clauses ()
(cggr for-syntax accum 'cg-and #`(reverse #,body) at) ;; base case (()
(let ((mf (peg-sexp-compile for-syntax (car arglst) accum))) ;; match function (cggr accum 'cg-and #`(reverse #,body) at))
#`(let ((res (#,mf #,str #,strlen #,at))) ((first rest ...)
(if (not res) #`(let ((res (#,(peg-sexp-compile #'first accum) #,str #,strlen #,at)))
#f ;; if the match failed, the and failed (and res
;; otherwise update AT and BODY then recurse ;; update AT and BODY then recurse
(let ((newat (car res)) (let ((newat (car res))
(newbody (cadr res))) (newbody (cadr res)))
(set! #,at newat) (set! #,at newat)
(push-not-null! #,body (single-filter newbody)) (push-not-null! #,body (single-filter newbody))
#,(cg-and-int for-syntax (cdr arglst) accum str strlen at body))))))) #,(cg-and-int #'(rest ...) accum str strlen at body)))))))
;; Top-level function builder for OR. Reduces to a call to CG-OR-INT. ;; Top-level function builder for OR. Reduces to a call to CG-OR-INT.
(define (cg-or for-syntax arglst accum) (define (cg-or clauses accum)
#`(lambda (str strlen at) #`(lambda (str len pos)
#,(cg-or-int for-syntax arglst accum #'str #'strlen #'at #'body))) #,(cg-or-int clauses accum #'str #'len #'pos)))
;; Internal function builder for OR (calls itself). ;; Internal function builder for OR (calls itself).
(define (cg-or-int for-syntax arglst accum str strlen at body) (define (cg-or-int clauses accum str strlen at)
(if (null? arglst) (syntax-case clauses ()
#f ;; base case (()
(let ((mf (peg-sexp-compile for-syntax (car arglst) accum))) #f)
#`(let ((res (#,mf #,str #,strlen #,at))) ((first rest ...)
(if res ;; if the match succeeds, we're done #`(or (#,(peg-sexp-compile #'first accum) #,str #,strlen #,at)
#,(cggr for-syntax accum 'cg-or #`(cadr res) #`(car res)) #,(cg-or-int #'(rest ...) accum str strlen at)))))
#,(cg-or-int for-syntax (cdr arglst) accum str strlen at body))))))
;; Returns a block of code that tries to match PAT, and on success updates AT
;; and BODY, return #f on failure and #t on success.
(define (cg-body-test for-syntax pat accum str strlen at body)
(let ((mf (peg-sexp-compile for-syntax pat accum)))
#`(let ((at2-body2 (#,mf #,str #,strlen #,at)))
(if (or (not at2-body2) (= #,at (car at2-body2)))
#f
(let ((at2 (car at2-body2))
(body2 (cadr at2-body2)))
(set! #,at at2)
(push-not-null!
#,body
(single-filter body2))
#t)))))
;; Returns a block of code that sees whether NUM wants us to try and match more
;; given that we've already matched COUNT.
(define (cg-body-more for-syntax num count)
(cond ((number? num) #`(< #,count #,(datum->syntax for-syntax num)))
((eq? num '+) #t)
((eq? num '*) #t)
((eq? num '?) #`(< #,count 1))
(else (error-val `(cg-body-more-error ,num ,count)))))
;; Returns a function that takes a paramter indicating whether or not the match
;; was succesful and returns what the body expression should return.
(define (cg-body-ret for-syntax accum type name body at at2)
#`(lambda (success)
#,(cond ((eq? type '!)
#`(if success #f #,(cggr for-syntax accum name ''() at)))
((eq? type '&)
#`(if success #,(cggr for-syntax accum name ''() at) #f))
((eq? type 'lit)
#`(if success
#,(cggr for-syntax accum name #`(reverse #,body) at2) #f))
(else (error-val
`(cg-body-ret-error ,type ,accum ,name ,body ,at ,at2))))))
;; Returns a block of code that sees whether COUNT satisfies the constraints of
;; NUM.
(define (cg-body-success for-syntax num count)
(cond ((number? num) #`(= #,count #,num))
((eq? num '+) #`(>= #,count 1))
((eq? num '*) #t)
((eq? num '?) #`(<= #,count 1))
(else `(cg-body-success-error ,num))))
;; Returns a function that parses a BODY element. ;; Returns a function that parses a BODY element.
(define (cg-body for-syntax accum type pat num) (define (cg-body accum type pat num)
(let (; this doesn't work with regular syntax, and I'd really #`(lambda (str strlen at)
; like to know why. (let ((body '()))
(at2 (datum->syntax for-syntax (gensym)))) (let lp ((end at) (count 0))
#`(lambda (str strlen at) (let* ((match (#,(peg-sexp-compile pat accum) str strlen end))
(let ((#,at2 at) (count 0) (body '())) (new-end (if match (car match) end))
(while (and #,(cg-body-test for-syntax pat accum (count (if (> new-end end) (1+ count) count)))
#'str #'strlen at2 #'body) (if (> new-end end)
(set! count (+ count 1)) (push-not-null! body (single-filter (cadr match))))
#,(cg-body-more for-syntax num #'count))) (if (and (> new-end end)
(#,(cg-body-ret for-syntax accum type 'cg-body #'body #'at at2) #,(syntax-case num (+ * ?)
#,(cg-body-success for-syntax num #'count)))))) (n (number? (syntax->datum #'n))
#'(< count n))
(+ #t)
(* #t)
(? #'(< count 1))))
(lp new-end count)
(let ((success #,(syntax-case num (+ * ?)
(n (number? (syntax->datum #'n))
#'(= count n))
(+ #'(>= count 1))
(* #t)
(? #t))))
#,(syntax-case type (! & lit)
(!
#`(if success
#f
#,(cggr accum 'cg-body #''() #'at)))
(&
#`(and success
#,(cggr accum 'cg-body #''() #'at)))
(lit
#`(and success
#,(cggr accum 'cg-body #'(reverse body) #'new-end)))))))))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;; FOR DEFINING AND USING NONTERMINALS ;;;;; FOR DEFINING AND USING NONTERMINALS
@ -355,8 +322,7 @@ return EXP."
(lambda (x) (lambda (x)
(syntax-case x () (syntax-case x ()
((_ sym accum pat) ((_ sym accum pat)
(let ((matchf (peg-sexp-compile x (syntax->datum #'pat) (let ((matchf (peg-sexp-compile #'pat (syntax->datum #'accum)))
(syntax->datum #'accum)))
(accumsym (syntax->datum #'accum)) (accumsym (syntax->datum #'accum))
(c (datum->syntax x (gensym))));; the cache (c (datum->syntax x (gensym))));; the cache
;; CODE is the code to parse the string if the result isn't cached. ;; CODE is the code to parse the string if the result isn't cached.
@ -389,12 +355,11 @@ return EXP."
(define-syntax peg-match (define-syntax peg-match
(lambda (x) (lambda (x)
(syntax-case x () (syntax-case x ()
((_ peg-matcher string-uncopied) ((_ pattern string-uncopied)
(let ((pmsym (syntax->datum #'peg-matcher))) (let ((pmsym (syntax->datum #'pattern)))
(let ((peg-sexp-compile (let ((matcher (if (string? (syntax->datum #'pattern))
(if (string? pmsym) (peg-string-compile #'pattern 'body)
(peg-string-compile x pmsym 'body) (peg-sexp-compile #'pattern 'body))))
(peg-sexp-compile x pmsym 'body))))
;; We copy the string before using it because it might have been ;; We copy the string before using it because it might have been
;; modified in-place since the last time it was parsed, which would ;; modified in-place since the last time it was parsed, which would
;; invalidate the cache. Guile uses copy-on-write for strings, so ;; invalidate the cache. Guile uses copy-on-write for strings, so
@ -403,8 +368,7 @@ return EXP."
(strlen (string-length string-uncopied)) (strlen (string-length string-uncopied))
(at 0)) (at 0))
(let ((ret (until (or (>= at strlen) (let ((ret (until (or (>= at strlen)
(#,peg-sexp-compile (#,matcher string strlen at))
string strlen at))
(set! at (+ at 1))))) (set! at (+ at 1)))))
(if (eq? ret #t) ;; (>= at strlen) succeeded (if (eq? ret #t) ;; (>= at strlen) succeeded
#f #f
@ -674,9 +638,13 @@ RB < ']'
(else (map compressor lst))))) (else (map compressor lst)))))
;; Builds a lambda-expressions for the pattern STR using accum. ;; Builds a lambda-expressions for the pattern STR using accum.
(define (peg-string-compile for-syntax str accum) (define (peg-string-compile str-stx accum)
(peg-sexp-compile for-syntax (peg-sexp-compile
(compressor (peg-parse-pattern (peg:tree (peg-parse peg-pattern str)))) (datum->syntax
str-stx
(compressor
(peg-parse-pattern
(peg:tree (peg-parse peg-pattern (syntax->datum str-stx))))))
accum)) accum))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;