mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-04-30 03:40:34 +02:00
add #:doctype-handler to xml->sxml
* module/sxml/simple.scm (read-internal-doctype-as-string): New helper. (xml->sxml): Add #:doctype-handler argument. * doc/ref/sxml.texi (Reading and Writing XML): Document #:doctype-handler. Fix some other examples, and fix the default value of #:declare-namespaces?. * test-suite/tests/sxml.simple.test: Add all tests from the manual here.
This commit is contained in:
parent
1260fd0b2c
commit
e10c250928
3 changed files with 170 additions and 23 deletions
|
@ -57,7 +57,8 @@ to text.
|
|||
|
||||
@deffn {Scheme Procedure} xml->sxml [string-or-port] [#:namespaces='()] @
|
||||
[#:declare-namespaces?=#t] [#:trim-whitespace?=#f] @
|
||||
[#:entities='()] [#:default-entity-handler=#f]
|
||||
[#:entities='()] [#:default-entity-handler=#f] @
|
||||
[#:doctype-handler=#f]
|
||||
Use SSAX to parse an XML document into SXML. Takes one optional
|
||||
argument, @var{string-or-port}, which defaults to the current input
|
||||
port. Returns the resulting SXML document. If @var{string-or-port} is
|
||||
|
@ -99,18 +100,19 @@ for certain namespaces with the @code{#:namespaces} keyword argument to
|
|||
@result{} (*TOP* (foo (ns2:baz)))
|
||||
@end example
|
||||
|
||||
Passing a true @code{#:declare-namespaces?} argument will cause the
|
||||
user-given @code{#:namespaces} to be treated as if they were declared on
|
||||
the root element.
|
||||
By default, namespaces passed to @code{xml->sxml} are treated as if they
|
||||
were declared on the root element. Passing a false
|
||||
@code{#:declare-namespaces?} argument will disable this behavior,
|
||||
requiring in-document declarations of namespaces before use..
|
||||
|
||||
@example
|
||||
(xml->sxml "<foo><ns2:baz/></foo>"
|
||||
#:namespaces '((ns2 . "http://example.org/ns2")))
|
||||
@result{} error: undeclared namespace: `bar'
|
||||
@result{} (*TOP* (foo (ns2:baz)))
|
||||
(xml->sxml "<foo><ns2:baz/></foo>"
|
||||
#:namespaces '((ns2 . "http://example.org/ns2"))
|
||||
#:declare-namespaces? #t)
|
||||
@result{} (*TOP* (foo (ns2:baz)))
|
||||
#:declare-namespaces? #f)
|
||||
@result{} error: undeclared namespace: `bar'
|
||||
@end example
|
||||
|
||||
By default, all whitespace in XML is significant. Passing the
|
||||
|
@ -120,10 +122,10 @@ whitespace in front, behind and between elements, treating it as
|
|||
|
||||
@example
|
||||
(xml->sxml "<foo>\n<bar> Alfie the parrot! </bar>\n</foo>")
|
||||
@result{} (*TOP* (foo "\n" (bar " Alfie the parrot! ") "\n")
|
||||
@result{} (*TOP* (foo "\n" (bar " Alfie the parrot! ") "\n"))
|
||||
(xml->sxml "<foo>\n<bar> Alfie the parrot! </bar>\n</foo>"
|
||||
#:trim-whitespace? #t)
|
||||
@result{} (*TOP* (foo (bar " Alfie the parrot! "))
|
||||
@result{} (*TOP* (foo (bar " Alfie the parrot! ")))
|
||||
@end example
|
||||
|
||||
Parsed entities may be declared with the @code{#:entities} keyword
|
||||
|
@ -159,6 +161,38 @@ numeric character entities.
|
|||
@result{} (*TOP* (foo "\xa0 foo"))
|
||||
@end example
|
||||
|
||||
By default, @code{xml->sxml} skips over the @code{<!DOCTYPE>}
|
||||
declaration, if any. This behavior can be overridden with the
|
||||
@code{#:doctype-handler} argument, which should be a procedure of three
|
||||
arguments: the @dfn{docname} (a symbol), @dfn{systemid} (a string), and
|
||||
the internal doctype subset (as a string or @code{#f} if not present).
|
||||
|
||||
The handler should return keyword arguments as multiple values, as if it
|
||||
were calling its continuation with keyword arguments. The continuation
|
||||
accepts the @code{#:entities} and @code{#:namespaces} keyword arguments,
|
||||
in the same format that @code{xml->sxml} itself takes. These entities
|
||||
and namespaces will be prepended to those given to the @code{xml->sxml}
|
||||
invocation.
|
||||
|
||||
@example
|
||||
(define (handle-foo docname systemid internal-subset)
|
||||
(case docname
|
||||
((foo)
|
||||
(values #:entities '((greets . "<i>Hello, world!</i>"))))
|
||||
(else
|
||||
(values))))
|
||||
|
||||
(xml->sxml "<!DOCTYPE foo><p>&greets;</p>"
|
||||
#:doctype-handler handle-foo)
|
||||
@result{} (*TOP* (p (i "Hello, world!")))
|
||||
@end example
|
||||
|
||||
If the document has no doctype declaration, the @var{doctype-handler} is
|
||||
invoked with @code{#f} for the three arguments.
|
||||
|
||||
In the future, the continuation may accept other keyword arguments, for
|
||||
example to validate the parsed SXML against the doctype.
|
||||
|
||||
@deffn {Scheme Procedure} sxml->xml tree [port]
|
||||
Serialize the SXML tree @var{tree} as XML. The output will be written to
|
||||
the current output port, unless the optional argument @var{port} is
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
;;; Code:
|
||||
|
||||
(define-module (sxml simple)
|
||||
#:use-module (sxml ssax input-parse)
|
||||
#:use-module (sxml ssax)
|
||||
#:use-module (sxml transform)
|
||||
#:use-module (ice-9 match)
|
||||
|
@ -35,10 +36,6 @@
|
|||
;; Helpers from upstream/SSAX.scm.
|
||||
;;
|
||||
|
||||
(define (ssax:warn port msg . args)
|
||||
(format (current-ssax-error-port)
|
||||
";;; SSAX warning: ~a ~a\n" msg args))
|
||||
|
||||
; ssax:reverse-collect-str LIST-OF-FRAGS -> LIST-OF-FRAGS
|
||||
; given the list of fragments (some of which are text strings)
|
||||
; reverse the list and concatenate adjacent text strings.
|
||||
|
@ -65,6 +62,17 @@
|
|||
(cons (string-concatenate/shared strs) result)))
|
||||
'())))))))
|
||||
|
||||
(define (read-internal-doctype-as-string port)
|
||||
(string-concatenate/shared
|
||||
(let loop ()
|
||||
(let ((fragment
|
||||
(next-token '() '(#\]) "reading internal DOCTYPE" port)))
|
||||
(if (eqv? #\> (peek-next-char port))
|
||||
(begin
|
||||
(read-char port)
|
||||
(cons fragment '()))
|
||||
(cons* fragment "]" (loop)))))))
|
||||
|
||||
;; Ideas for the future for this interface:
|
||||
;;
|
||||
;; * Allow doctypes to provide parsed entities
|
||||
|
@ -81,7 +89,8 @@
|
|||
(declare-namespaces? #t)
|
||||
(trim-whitespace? #f)
|
||||
(entities '())
|
||||
(default-entity-handler #f))
|
||||
(default-entity-handler #f)
|
||||
(doctype-handler #f))
|
||||
"Use SSAX to parse an XML document into SXML. Takes one optional
|
||||
argument, @var{string-or-port}, which defaults to the current input
|
||||
port."
|
||||
|
@ -96,7 +105,7 @@ port."
|
|||
;; NAMESPACES: list of (DOC-PREFIX . (USER-PREFIX . URI)).
|
||||
;; A DOC-PREFIX of #f indicates that it comes from the user.
|
||||
;; Otherwise, prefixes are symbols.
|
||||
(define (user-namespaces)
|
||||
(define (munge-namespaces namespaces)
|
||||
(map (lambda (el)
|
||||
(match el
|
||||
((prefix . uri-string)
|
||||
|
@ -105,6 +114,9 @@ port."
|
|||
(ssax:uri-string->symbol uri-string)))))
|
||||
namespaces))
|
||||
|
||||
(define (user-namespaces)
|
||||
(munge-namespaces namespaces))
|
||||
|
||||
(define (user-entities)
|
||||
(if (and default-entity-handler
|
||||
(not (assq '*DEFAULT* entities)))
|
||||
|
@ -117,6 +129,13 @@ port."
|
|||
(symbol-append prefix (string->symbol ":") local-part))
|
||||
(_ name)))
|
||||
|
||||
(define (doctype-continuation seed)
|
||||
(lambda* (#:key (entities '()) (namespaces '()))
|
||||
(values #f
|
||||
(append entities (user-entities))
|
||||
(append (munge-namespaces namespaces) (user-namespaces))
|
||||
seed)))
|
||||
|
||||
;; The SEED in this parser is the SXML: initialized to '() at each new
|
||||
;; level by the fdown handlers; built in reverse by the fhere parsers;
|
||||
;; and reverse-collected by the fup handlers.
|
||||
|
@ -159,18 +178,29 @@ port."
|
|||
;;
|
||||
;; SEED builds up the content.
|
||||
(lambda (port docname systemid internal-subset? seed)
|
||||
(call-with-values
|
||||
(lambda ()
|
||||
(cond
|
||||
(doctype-handler
|
||||
(doctype-handler docname systemid
|
||||
(and internal-subset?
|
||||
(read-internal-doctype-as-string port))))
|
||||
(else
|
||||
(when internal-subset?
|
||||
(ssax:warn port "Internal DTD subset is not currently handled ")
|
||||
(ssax:skip-internal-dtd port))
|
||||
(ssax:warn port "DOCTYPE DECL " docname " "
|
||||
systemid " found and skipped")
|
||||
(values #f (user-entities) (user-namespaces) seed))
|
||||
(values))))
|
||||
(doctype-continuation seed)))
|
||||
|
||||
UNDECL-ROOT
|
||||
;; This is like the DOCTYPE handler, but for documents that do not
|
||||
;; have a <!DOCTYPE!> entry.
|
||||
(lambda (elem-gi seed)
|
||||
(values #f (user-entities) (user-namespaces) seed))
|
||||
(call-with-values
|
||||
(lambda ()
|
||||
(if doctype-handler
|
||||
(doctype-handler #f #f #f)
|
||||
(values)))
|
||||
(doctype-continuation seed)))
|
||||
|
||||
PI
|
||||
((*DEFAULT*
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
;;;; sxml.simple.test --- (sxml simple) -*- mode: scheme; coding: utf-8; -*-
|
||||
;;;;
|
||||
;;;; Copyright (C) 2010 Free Software Foundation, Inc.
|
||||
;;;; Copyright (C) 2010, 2013 Free Software Foundation, Inc.
|
||||
;;;;
|
||||
;;;; This library is free software; you can redistribute it and/or
|
||||
;;;; modify it under the terms of the GNU Lesser General Public
|
||||
|
@ -20,6 +20,8 @@
|
|||
#:use-module (test-suite lib)
|
||||
#:use-module (sxml simple))
|
||||
|
||||
(define parser-error '(parser-error . ""))
|
||||
|
||||
(define %xml-sample
|
||||
;; An XML sample without any space in between tags, to make it easier.
|
||||
(string-append "<?xml version='1.0' encoding='utf-8'?>"
|
||||
|
@ -50,3 +52,84 @@
|
|||
(lambda ()
|
||||
(sxml->xml
|
||||
(xml->sxml (open-input-string %xml-sample))))))))))
|
||||
|
||||
(with-test-prefix "namespaces"
|
||||
(pass-if-equal
|
||||
(xml->sxml "<foo xmlns=\"http://example.org/ns1\">text</foo>")
|
||||
'(*TOP* (http://example.org/ns1:foo "text")))
|
||||
|
||||
(pass-if-equal
|
||||
(xml->sxml "<foo xmlns=\"http://example.org/ns1\">text</foo>"
|
||||
#:namespaces '((ns1 . "http://example.org/ns1")))
|
||||
'(*TOP* (ns1:foo "text")))
|
||||
|
||||
(pass-if-equal
|
||||
(xml->sxml "<foo xmlns:bar=\"http://example.org/ns2\"><bar:baz/></foo>"
|
||||
#:namespaces '((ns2 . "http://example.org/ns2")))
|
||||
'(*TOP* (foo (ns2:baz))))
|
||||
|
||||
(pass-if-equal
|
||||
(xml->sxml "<foo><ns2:baz/></foo>"
|
||||
#:namespaces '((ns2 . "http://example.org/ns2")))
|
||||
'(*TOP* (foo (ns2:baz))))
|
||||
|
||||
(pass-if-exception "namespace undeclared" parser-error
|
||||
(xml->sxml "<foo><ns2:baz/></foo>"
|
||||
#:namespaces '((ns2 . "http://example.org/ns2"))
|
||||
#:declare-namespaces? #f)))
|
||||
|
||||
(with-test-prefix "whitespace"
|
||||
(pass-if-equal
|
||||
(xml->sxml "<foo>\n<bar> Alfie the parrot! </bar>\n</foo>")
|
||||
'(*TOP* (foo "\n" (bar " Alfie the parrot! ") "\n")))
|
||||
|
||||
(pass-if-equal
|
||||
(xml->sxml "<foo>\n<bar> Alfie the parrot! </bar>\n</foo>"
|
||||
#:trim-whitespace? #t)
|
||||
'(*TOP* (foo (bar " Alfie the parrot! ")))))
|
||||
|
||||
(with-test-prefix "parsed entities"
|
||||
(pass-if-equal
|
||||
'(*TOP* (foo "&"))
|
||||
(xml->sxml "<foo>&</foo>"))
|
||||
|
||||
(pass-if-exception "nbsp undefined" parser-error
|
||||
(xml->sxml "<foo> </foo>"))
|
||||
|
||||
(pass-if-equal
|
||||
'(*TOP* (foo "\xA0"))
|
||||
(xml->sxml "<foo> </foo>"
|
||||
#:entities '((nbsp . "\xA0"))))
|
||||
|
||||
(pass-if-equal
|
||||
'(*TOP* (foo "\xA0"))
|
||||
(xml->sxml "<foo> </foo>"))
|
||||
|
||||
(let ((ents '()))
|
||||
(pass-if-equal
|
||||
(xml->sxml "<foo> &foo;</foo>"
|
||||
#:default-entity-handler
|
||||
(lambda (port name)
|
||||
(case name
|
||||
((nbsp) "\xa0")
|
||||
(else
|
||||
(set! ents (cons name ents))
|
||||
"qux"))))
|
||||
'(*TOP* (foo "\xa0 qux")))
|
||||
|
||||
(pass-if-equal
|
||||
ents
|
||||
'(foo))))
|
||||
|
||||
(with-test-prefix "doctype handlers"
|
||||
(define (handle-foo docname systemid internal-subset)
|
||||
(case docname
|
||||
((foo)
|
||||
(values #:entities '((greets . "<i>Hello, world!</i>"))))
|
||||
(else
|
||||
(values))))
|
||||
|
||||
(pass-if-equal
|
||||
(xml->sxml "<!DOCTYPE foo><p>&greets;</p>"
|
||||
#:doctype-handler handle-foo)
|
||||
'(*TOP* (p (i "Hello, world!")))))
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue