mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-06-22 03:30:22 +02:00
Improve handling of Unicode byte-order marks (BOMs).
* libguile/ports-internal.h (struct scm_port_internal): Add new members 'at_stream_start_for_bom_read' and 'at_stream_start_for_bom_write'. (SCM_UNICODE_BOM): New macro. (scm_i_port_iconv_descriptors): Add 'mode' parameter to prototype. * libguile/ports.c (scm_new_port_table_entry): Initialize 'at_stream_start_for_bom_read' and 'at_stream_start_for_bom_write'. (get_iconv_codepoint): Pass new 'mode' parameter to 'scm_i_port_iconv_descriptors'. (get_codepoint): After reading a codepoint at stream start, record that we're no longer at stream start, and consume a BOM where appropriate. (scm_seek): Set the stream start flags according to the new position. (looking_at_bytes): New static function. (scm_utf8_bom, scm_utf16be_bom, scm_utf16le_bom, scm_utf32be_bom, scm_utf32le_bom): New static const arrays. (decide_utf16_encoding, decide_utf32_encoding): New static functions. (scm_i_port_iconv_descriptors): Add new 'mode' parameter. If the specified encoding is UTF-16 or UTF-32, make that precise by deciding what byte order to use, and construct iconv descriptors based on the precise encoding. (scm_i_set_port_encoding_x): Record that we are now at stream start. Do not open the new iconv descriptors immediately; let them be initialized lazily. * libguile/print.c (display_string_using_iconv): Record that we're no longer at stream start. Write a BOM if appropriate. * doc/ref/api-io.texi (BOM Handling): New node. * test-suite/tests/ports.test ("set-port-encoding!, wrong encoding"): Adapt test to cope with the fact that 'set-port-encoding!' does not immediately open the iconv descriptors. (bv-read-test): New procedure. ("unicode byte-order marks (BOMs)"): New test prefix.
This commit is contained in:
parent
45c0878b86
commit
cdd3d6c9f4
5 changed files with 515 additions and 32 deletions
|
@ -24,7 +24,8 @@
|
|||
#:use-module (ice-9 popen)
|
||||
#:use-module (ice-9 rdelim)
|
||||
#:use-module (rnrs bytevectors)
|
||||
#:use-module ((rnrs io ports) #:select (open-bytevector-input-port)))
|
||||
#:use-module ((rnrs io ports) #:select (open-bytevector-input-port
|
||||
open-bytevector-output-port)))
|
||||
|
||||
(define (display-line . args)
|
||||
(for-each display args)
|
||||
|
@ -918,7 +919,9 @@
|
|||
|
||||
(pass-if-exception "set-port-encoding!, wrong encoding"
|
||||
exception:miscellaneous-error
|
||||
(set-port-encoding! (open-input-string "") "does-not-exist"))
|
||||
(let ((p (open-input-string "")))
|
||||
(set-port-encoding! p "does-not-exist")
|
||||
(read p)))
|
||||
|
||||
(pass-if-exception "%default-port-encoding, wrong encoding"
|
||||
exception:miscellaneous-error
|
||||
|
@ -1233,6 +1236,283 @@
|
|||
|
||||
|
||||
|
||||
(with-test-prefix "unicode byte-order marks (BOMs)"
|
||||
|
||||
(define (bv-read-test* encoding bv proc)
|
||||
(let ((port (open-bytevector-input-port bv)))
|
||||
(set-port-encoding! port encoding)
|
||||
(proc port)))
|
||||
|
||||
(define (bv-read-test encoding bv)
|
||||
(bv-read-test* encoding bv read-string))
|
||||
|
||||
(define (bv-write-test* encoding proc)
|
||||
(call-with-values
|
||||
(lambda () (open-bytevector-output-port))
|
||||
(lambda (port get-bytevector)
|
||||
(set-port-encoding! port encoding)
|
||||
(proc port)
|
||||
(get-bytevector))))
|
||||
|
||||
(define (bv-write-test encoding str)
|
||||
(bv-write-test* encoding
|
||||
(lambda (p)
|
||||
(display str p))))
|
||||
|
||||
(pass-if-equal "BOM not discarded from Latin-1 stream"
|
||||
"\xEF\xBB\xBF\x61"
|
||||
(bv-read-test "ISO-8859-1" #vu8(#xEF #xBB #xBF #x61)))
|
||||
|
||||
(pass-if-equal "BOM not discarded from Latin-2 stream"
|
||||
"\u010F\u0165\u017C\x61"
|
||||
(bv-read-test "ISO-8859-2" #vu8(#xEF #xBB #xBF #x61)))
|
||||
|
||||
(pass-if-equal "BOM not discarded from UTF-16BE stream"
|
||||
"\uFEFF\x61"
|
||||
(bv-read-test "UTF-16BE" #vu8(#xFE #xFF #x00 #x61)))
|
||||
|
||||
(pass-if-equal "BOM not discarded from UTF-16LE stream"
|
||||
"\uFEFF\x61"
|
||||
(bv-read-test "UTF-16LE" #vu8(#xFF #xFE #x61 #x00)))
|
||||
|
||||
(pass-if-equal "BOM not discarded from UTF-32BE stream"
|
||||
"\uFEFF\x61"
|
||||
(bv-read-test "UTF-32BE" #vu8(#x00 #x00 #xFE #xFF
|
||||
#x00 #x00 #x00 #x61)))
|
||||
|
||||
(pass-if-equal "BOM not discarded from UTF-32LE stream"
|
||||
"\uFEFF\x61"
|
||||
(bv-read-test "UTF-32LE" #vu8(#xFF #xFE #x00 #x00
|
||||
#x61 #x00 #x00 #x00)))
|
||||
|
||||
(pass-if-equal "BOM not written to UTF-8 stream"
|
||||
#vu8(#x61)
|
||||
(bv-write-test "UTF-8" "a"))
|
||||
|
||||
(pass-if-equal "BOM not written to UTF-16BE stream"
|
||||
#vu8(#x00 #x61)
|
||||
(bv-write-test "UTF-16BE" "a"))
|
||||
|
||||
(pass-if-equal "BOM not written to UTF-16LE stream"
|
||||
#vu8(#x61 #x00)
|
||||
(bv-write-test "UTF-16LE" "a"))
|
||||
|
||||
(pass-if-equal "BOM not written to UTF-32BE stream"
|
||||
#vu8(#x00 #x00 #x00 #x61)
|
||||
(bv-write-test "UTF-32BE" "a"))
|
||||
|
||||
(pass-if-equal "BOM not written to UTF-32LE stream"
|
||||
#vu8(#x61 #x00 #x00 #x00)
|
||||
(bv-write-test "UTF-32LE" "a"))
|
||||
|
||||
(pass-if "Don't read from the port unless user asks to"
|
||||
(let* ((p (make-soft-port
|
||||
(vector
|
||||
(lambda (c) #f) ; write char
|
||||
(lambda (s) #f) ; write string
|
||||
(lambda () #f) ; flush
|
||||
(lambda () (throw 'fail)) ; read char
|
||||
(lambda () #f))
|
||||
"rw")))
|
||||
(set-port-encoding! p "UTF-16")
|
||||
(display "abc" p)
|
||||
(set-port-encoding! p "UTF-32")
|
||||
(display "def" p)
|
||||
#t))
|
||||
|
||||
;; TODO: test that input and output streams are independent when
|
||||
;; appropriate, and linked when appropriate.
|
||||
|
||||
(pass-if-equal "BOM discarded from start of UTF-8 stream"
|
||||
"a"
|
||||
(bv-read-test "Utf-8" #vu8(#xEF #xBB #xBF #x61)))
|
||||
|
||||
(pass-if-equal "BOM discarded from start of UTF-8 stream after seek to 0"
|
||||
'(#\a "a")
|
||||
(bv-read-test* "uTf-8" #vu8(#xEF #xBB #xBF #x61)
|
||||
(lambda (p)
|
||||
(let ((c (read-char p)))
|
||||
(seek p 0 SEEK_SET)
|
||||
(let ((s (read-string p)))
|
||||
(list c s))))))
|
||||
|
||||
(pass-if-equal "Only one BOM discarded from start of UTF-8 stream"
|
||||
"\uFEFFa"
|
||||
(bv-read-test "UTF-8" #vu8(#xEF #xBB #xBF #xEF #xBB #xBF #x61)))
|
||||
|
||||
(pass-if-equal "BOM not discarded from UTF-8 stream after seek to > 0"
|
||||
"\uFEFFb"
|
||||
(bv-read-test* "UTF-8" #vu8(#x61 #xEF #xBB #xBF #x62)
|
||||
(lambda (p)
|
||||
(seek p 1 SEEK_SET)
|
||||
(read-string p))))
|
||||
|
||||
(pass-if-equal "BOM not discarded unless at start of UTF-8 stream"
|
||||
"a\uFEFFb"
|
||||
(bv-read-test "UTF-8" #vu8(#x61 #xEF #xBB #xBF #x62)))
|
||||
|
||||
(pass-if-equal "BOM (BE) written to start of UTF-16 stream"
|
||||
#vu8(#xFE #xFF #x00 #x61 #x00 #x62)
|
||||
(bv-write-test "UTF-16" "ab"))
|
||||
|
||||
(pass-if-equal "BOM (BE) written to UTF-16 stream after set-port-encoding!"
|
||||
#vu8(#xFE #xFF #x00 #x61 #x00 #x62 #xFE #xFF #x00 #x63 #x00 #x64)
|
||||
(bv-write-test* "UTF-16"
|
||||
(lambda (p)
|
||||
(display "ab" p)
|
||||
(set-port-encoding! p "UTF-16")
|
||||
(display "cd" p))))
|
||||
|
||||
(pass-if-equal "BOM discarded from start of UTF-16 stream (BE)"
|
||||
"a"
|
||||
(bv-read-test "UTF-16" #vu8(#xFE #xFF #x00 #x61)))
|
||||
|
||||
(pass-if-equal "BOM discarded from start of UTF-16 stream (BE) after seek to 0"
|
||||
'(#\a "a")
|
||||
(bv-read-test* "utf-16" #vu8(#xFE #xFF #x00 #x61)
|
||||
(lambda (p)
|
||||
(let ((c (read-char p)))
|
||||
(seek p 0 SEEK_SET)
|
||||
(let ((s (read-string p)))
|
||||
(list c s))))))
|
||||
|
||||
(pass-if-equal "Only one BOM discarded from start of UTF-16 stream (BE)"
|
||||
"\uFEFFa"
|
||||
(bv-read-test "Utf-16" #vu8(#xFE #xFF #xFE #xFF #x00 #x61)))
|
||||
|
||||
(pass-if-equal "BOM not discarded from UTF-16 stream (BE) after seek to > 0"
|
||||
"\uFEFFa"
|
||||
(bv-read-test* "uTf-16" #vu8(#xFE #xFF #xFE #xFF #x00 #x61)
|
||||
(lambda (p)
|
||||
(seek p 2 SEEK_SET)
|
||||
(read-string p))))
|
||||
|
||||
(pass-if-equal "BOM not discarded unless at start of UTF-16 stream"
|
||||
"a\uFEFFb"
|
||||
(let ((be (bv-read-test "utf-16" #vu8(#x00 #x61 #xFE #xFF #x00 #x62)))
|
||||
(le (bv-read-test "utf-16" #vu8(#x61 #x00 #xFF #xFE #x62 #x00))))
|
||||
(if (char=? #\a (string-ref be 0))
|
||||
be
|
||||
le)))
|
||||
|
||||
(pass-if-equal "BOM discarded from start of UTF-16 stream (LE)"
|
||||
"a"
|
||||
(bv-read-test "UTF-16" #vu8(#xFF #xFE #x61 #x00)))
|
||||
|
||||
(pass-if-equal "BOM discarded from start of UTF-16 stream (LE) after seek to 0"
|
||||
'(#\a "a")
|
||||
(bv-read-test* "Utf-16" #vu8(#xFF #xFE #x61 #x00)
|
||||
(lambda (p)
|
||||
(let ((c (read-char p)))
|
||||
(seek p 0 SEEK_SET)
|
||||
(let ((s (read-string p)))
|
||||
(list c s))))))
|
||||
|
||||
(pass-if-equal "Only one BOM discarded from start of UTF-16 stream (LE)"
|
||||
"\uFEFFa"
|
||||
(bv-read-test "UTf-16" #vu8(#xFF #xFE #xFF #xFE #x61 #x00)))
|
||||
|
||||
(pass-if-equal "BOM discarded from start of UTF-32 stream (BE)"
|
||||
"a"
|
||||
(bv-read-test "UTF-32" #vu8(#x00 #x00 #xFE #xFF
|
||||
#x00 #x00 #x00 #x61)))
|
||||
|
||||
(pass-if-equal "BOM discarded from start of UTF-32 stream (BE) after seek to 0"
|
||||
'(#\a "a")
|
||||
(bv-read-test* "utF-32" #vu8(#x00 #x00 #xFE #xFF
|
||||
#x00 #x00 #x00 #x61)
|
||||
(lambda (p)
|
||||
(let ((c (read-char p)))
|
||||
(seek p 0 SEEK_SET)
|
||||
(let ((s (read-string p)))
|
||||
(list c s))))))
|
||||
|
||||
(pass-if-equal "Only one BOM discarded from start of UTF-32 stream (BE)"
|
||||
"\uFEFFa"
|
||||
(bv-read-test "UTF-32" #vu8(#x00 #x00 #xFE #xFF
|
||||
#x00 #x00 #xFE #xFF
|
||||
#x00 #x00 #x00 #x61)))
|
||||
|
||||
(pass-if-equal "BOM not discarded from UTF-32 stream (BE) after seek to > 0"
|
||||
"\uFEFFa"
|
||||
(bv-read-test* "UtF-32" #vu8(#x00 #x00 #xFE #xFF
|
||||
#x00 #x00 #xFE #xFF
|
||||
#x00 #x00 #x00 #x61)
|
||||
(lambda (p)
|
||||
(seek p 4 SEEK_SET)
|
||||
(read-string p))))
|
||||
|
||||
(pass-if-equal "BOM discarded within UTF-16 stream (BE) after set-port-encoding!"
|
||||
"ab"
|
||||
(bv-read-test* "UTF-16" #vu8(#x00 #x61 #xFE #xFF #x00 #x62)
|
||||
(lambda (p)
|
||||
(let ((a (read-char p)))
|
||||
(set-port-encoding! p "UTF-16")
|
||||
(string a (read-char p))))))
|
||||
|
||||
(pass-if-equal "BOM discarded within UTF-16 stream (LE,BE) after set-port-encoding!"
|
||||
"ab"
|
||||
(bv-read-test* "utf-16" #vu8(#x00 #x61 #xFF #xFE #x62 #x00)
|
||||
(lambda (p)
|
||||
(let ((a (read-char p)))
|
||||
(set-port-encoding! p "UTF-16")
|
||||
(string a (read-char p))))))
|
||||
|
||||
(pass-if-equal "BOM discarded within UTF-32 stream (BE) after set-port-encoding!"
|
||||
"ab"
|
||||
(bv-read-test* "UTF-32" #vu8(#x00 #x00 #x00 #x61
|
||||
#x00 #x00 #xFE #xFF
|
||||
#x00 #x00 #x00 #x62)
|
||||
(lambda (p)
|
||||
(let ((a (read-char p)))
|
||||
(set-port-encoding! p "UTF-32")
|
||||
(string a (read-char p))))))
|
||||
|
||||
(pass-if-equal "BOM discarded within UTF-32 stream (LE,BE) after set-port-encoding!"
|
||||
"ab"
|
||||
(bv-read-test* "UTF-32" #vu8(#x00 #x00 #x00 #x61
|
||||
#xFF #xFE #x00 #x00
|
||||
#x62 #x00 #x00 #x00)
|
||||
(lambda (p)
|
||||
(let ((a (read-char p)))
|
||||
(set-port-encoding! p "UTF-32")
|
||||
(string a (read-char p))))))
|
||||
|
||||
(pass-if-equal "BOM not discarded unless at start of UTF-32 stream"
|
||||
"a\uFEFFb"
|
||||
(let ((be (bv-read-test "UTF-32" #vu8(#x00 #x00 #x00 #x61
|
||||
#x00 #x00 #xFE #xFF
|
||||
#x00 #x00 #x00 #x62)))
|
||||
(le (bv-read-test "UTF-32" #vu8(#x61 #x00 #x00 #x00
|
||||
#xFF #xFE #x00 #x00
|
||||
#x62 #x00 #x00 #x00))))
|
||||
(if (char=? #\a (string-ref be 0))
|
||||
be
|
||||
le)))
|
||||
|
||||
(pass-if-equal "BOM discarded from start of UTF-32 stream (LE)"
|
||||
"a"
|
||||
(bv-read-test "UTF-32" #vu8(#xFF #xFE #x00 #x00
|
||||
#x61 #x00 #x00 #x00)))
|
||||
|
||||
(pass-if-equal "BOM discarded from start of UTF-32 stream (LE) after seek to 0"
|
||||
'(#\a "a")
|
||||
(bv-read-test* "UTf-32" #vu8(#xFF #xFE #x00 #x00
|
||||
#x61 #x00 #x00 #x00)
|
||||
(lambda (p)
|
||||
(let ((c (read-char p)))
|
||||
(seek p 0 SEEK_SET)
|
||||
(let ((s (read-string p)))
|
||||
(list c s))))))
|
||||
|
||||
(pass-if-equal "Only one BOM discarded from start of UTF-32 stream (LE)"
|
||||
"\uFEFFa"
|
||||
(bv-read-test "UTF-32" #vu8(#xFF #xFE #x00 #x00
|
||||
#xFF #xFE #x00 #x00
|
||||
#x61 #x00 #x00 #x00))))
|
||||
|
||||
|
||||
|
||||
(define-syntax-rule (with-load-path path body ...)
|
||||
(let ((new path)
|
||||
(old %load-path))
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue