mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-05-20 11:40:18 +02:00
detect and consume byte-order marks for textual ports
* libguile/ports.h: * libguile/ports.c (scm_consume_byte_order_mark): New procedure. * libguile/fports.c (scm_open_file): Call consume-byte-order-mark if we are opening a file in "r" mode. * libguile/read.c (scm_i_scan_for_encoding): Don't do anything about byte-order marks. * libguile/load.c (scm_primitive_load): Add a note about the duplicate encoding scan. * test-suite/tests/filesys.test: Add tests for UTF-8, UTF-16BE, and UTF-16LE BOM handling.
This commit is contained in:
parent
a8fa310b04
commit
b2cb557d75
6 changed files with 169 additions and 30 deletions
|
@ -1,5 +1,5 @@
|
||||||
/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
|
/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
|
||||||
* 2004, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
|
* 2004, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
|
||||||
*
|
*
|
||||||
* This library is free software; you can redistribute it and/or
|
* This library is free software; you can redistribute it and/or
|
||||||
* modify it under the terms of the GNU Lesser General Public License
|
* modify it under the terms of the GNU Lesser General Public License
|
||||||
|
@ -399,7 +399,7 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
|
||||||
#define FUNC_NAME s_scm_open_file
|
#define FUNC_NAME s_scm_open_file
|
||||||
{
|
{
|
||||||
SCM port;
|
SCM port;
|
||||||
int fdes, flags = 0, use_encoding = 1;
|
int fdes, flags = 0, scan_for_encoding = 0, consume_bom = 0, binary = 0;
|
||||||
unsigned int retries;
|
unsigned int retries;
|
||||||
char *file, *md, *ptr;
|
char *file, *md, *ptr;
|
||||||
|
|
||||||
|
@ -415,6 +415,8 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
|
||||||
{
|
{
|
||||||
case 'r':
|
case 'r':
|
||||||
flags |= O_RDONLY;
|
flags |= O_RDONLY;
|
||||||
|
consume_bom = 1;
|
||||||
|
scan_for_encoding = 1;
|
||||||
break;
|
break;
|
||||||
case 'w':
|
case 'w':
|
||||||
flags |= O_WRONLY | O_CREAT | O_TRUNC;
|
flags |= O_WRONLY | O_CREAT | O_TRUNC;
|
||||||
|
@ -432,9 +434,12 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
|
||||||
{
|
{
|
||||||
case '+':
|
case '+':
|
||||||
flags = (flags & ~(O_RDONLY | O_WRONLY)) | O_RDWR;
|
flags = (flags & ~(O_RDONLY | O_WRONLY)) | O_RDWR;
|
||||||
|
consume_bom = 0;
|
||||||
break;
|
break;
|
||||||
case 'b':
|
case 'b':
|
||||||
use_encoding = 0;
|
scan_for_encoding = 0;
|
||||||
|
consume_bom = 0;
|
||||||
|
binary = 1;
|
||||||
#if defined (O_BINARY)
|
#if defined (O_BINARY)
|
||||||
flags |= O_BINARY;
|
flags |= O_BINARY;
|
||||||
#endif
|
#endif
|
||||||
|
@ -473,21 +478,21 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
|
||||||
port = scm_i_fdes_to_port (fdes, scm_i_mode_bits (mode),
|
port = scm_i_fdes_to_port (fdes, scm_i_mode_bits (mode),
|
||||||
fport_canonicalize_filename (filename));
|
fport_canonicalize_filename (filename));
|
||||||
|
|
||||||
if (use_encoding)
|
if (consume_bom)
|
||||||
{
|
scm_consume_byte_order_mark (port);
|
||||||
/* If this file has a coding declaration, use that as the port
|
|
||||||
encoding. */
|
if (binary)
|
||||||
if (SCM_INPUT_PORT_P (port))
|
|
||||||
{
|
|
||||||
char *enc = scm_i_scan_for_encoding (port);
|
|
||||||
if (enc != NULL)
|
|
||||||
scm_i_set_port_encoding_x (port, enc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
/* If this is a binary file, use the binary-friendly ISO-8859-1
|
/* If this is a binary file, use the binary-friendly ISO-8859-1
|
||||||
encoding. */
|
encoding. */
|
||||||
scm_i_set_port_encoding_x (port, NULL);
|
scm_i_set_port_encoding_x (port, NULL);
|
||||||
|
else if (scan_for_encoding)
|
||||||
|
/* If this is an input port and the file has a coding declaration,
|
||||||
|
use that as the port encoding. */
|
||||||
|
{
|
||||||
|
char *enc = scm_i_scan_for_encoding (port);
|
||||||
|
if (enc != NULL)
|
||||||
|
scm_i_set_port_encoding_x (port, enc);
|
||||||
|
}
|
||||||
|
|
||||||
scm_dynwind_end ();
|
scm_dynwind_end ();
|
||||||
|
|
||||||
|
|
|
@ -106,6 +106,9 @@ SCM_DEFINE (scm_primitive_load, "primitive-load", 1, 0, 0,
|
||||||
scm_dynwind_begin (SCM_F_DYNWIND_REWINDABLE);
|
scm_dynwind_begin (SCM_F_DYNWIND_REWINDABLE);
|
||||||
scm_i_dynwind_current_load_port (port);
|
scm_i_dynwind_current_load_port (port);
|
||||||
|
|
||||||
|
/* FIXME: For better or for worse, scm_open_file already scans the
|
||||||
|
file for an encoding. This scans again; necessary for this
|
||||||
|
logic, but unnecessary overall. */
|
||||||
encoding = scm_i_scan_for_encoding (port);
|
encoding = scm_i_scan_for_encoding (port);
|
||||||
if (encoding)
|
if (encoding)
|
||||||
scm_i_set_port_encoding_x (port, encoding);
|
scm_i_set_port_encoding_x (port, encoding);
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
|
/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
|
||||||
* 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
|
* 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
|
||||||
*
|
*
|
||||||
* This library is free software; you can redistribute it and/or
|
* This library is free software; you can redistribute it and/or
|
||||||
* modify it under the terms of the GNU Lesser General Public License
|
* modify it under the terms of the GNU Lesser General Public License
|
||||||
|
@ -2153,6 +2153,89 @@ SCM_DEFINE (scm_set_port_filename_x, "set-port-filename!", 2, 0, 0,
|
||||||
}
|
}
|
||||||
#undef FUNC_NAME
|
#undef FUNC_NAME
|
||||||
|
|
||||||
|
SCM_DEFINE (scm_consume_byte_order_mark, "consume-byte-order-mark", 1, 0, 0,
|
||||||
|
(SCM port),
|
||||||
|
"Peek ahead in @var{port} for a byte-order mark (\\uFEFF) encoded\n"
|
||||||
|
"in UTF-8 or in UTF-16. If found, consume the byte-order mark\n"
|
||||||
|
"and set the port to the indicated encoding.\n"
|
||||||
|
"\n"
|
||||||
|
"As a special case, if the port's encoding is already UTF-16LE\n"
|
||||||
|
"or UTF-16BE (as opposed to UTF-16), we consider that the user\n"
|
||||||
|
"has already asked for an explicit byte order. In this case no\n"
|
||||||
|
"scan is performed, and the byte-order mark (if any) is left in\n"
|
||||||
|
"the port.\n"
|
||||||
|
"\n"
|
||||||
|
"Return @code{#t} if a byte-order mark was consumed, and\n"
|
||||||
|
"@code{#f} otherwise.")
|
||||||
|
#define FUNC_NAME s_scm_consume_byte_order_mark
|
||||||
|
{
|
||||||
|
scm_t_port *pt;
|
||||||
|
const char *enc;
|
||||||
|
|
||||||
|
SCM_VALIDATE_PORT (1, port);
|
||||||
|
|
||||||
|
pt = SCM_PTAB_ENTRY (port);
|
||||||
|
enc = pt->encoding;
|
||||||
|
|
||||||
|
if (enc && strcasecmp (enc, "UTF-16BE") == 0)
|
||||||
|
return SCM_BOOL_F;
|
||||||
|
|
||||||
|
if (enc && strcasecmp (enc, "UTF-16LE") == 0)
|
||||||
|
return SCM_BOOL_F;
|
||||||
|
|
||||||
|
switch (scm_peek_byte_or_eof (port))
|
||||||
|
{
|
||||||
|
case 0xEF:
|
||||||
|
scm_get_byte_or_eof (port);
|
||||||
|
switch (scm_peek_byte_or_eof (port))
|
||||||
|
{
|
||||||
|
case 0xBB:
|
||||||
|
scm_get_byte_or_eof (port);
|
||||||
|
switch (scm_peek_byte_or_eof (port))
|
||||||
|
{
|
||||||
|
case 0xBF:
|
||||||
|
scm_get_byte_or_eof (port);
|
||||||
|
scm_i_set_port_encoding_x (port, "UTF-8");
|
||||||
|
return SCM_BOOL_T;
|
||||||
|
default:
|
||||||
|
scm_unget_byte (0xBB, port);
|
||||||
|
scm_unget_byte (0xEF, port);
|
||||||
|
return SCM_BOOL_F;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
scm_unget_byte (0xEF, port);
|
||||||
|
return SCM_BOOL_F;
|
||||||
|
}
|
||||||
|
case 0xFE:
|
||||||
|
scm_get_byte_or_eof (port);
|
||||||
|
switch (scm_peek_byte_or_eof (port))
|
||||||
|
{
|
||||||
|
case 0xFF:
|
||||||
|
scm_get_byte_or_eof (port);
|
||||||
|
scm_i_set_port_encoding_x (port, "UTF-16BE");
|
||||||
|
return SCM_BOOL_T;
|
||||||
|
default:
|
||||||
|
scm_unget_byte (0xFE, port);
|
||||||
|
return SCM_BOOL_F;
|
||||||
|
}
|
||||||
|
case 0xFF:
|
||||||
|
scm_get_byte_or_eof (port);
|
||||||
|
switch (scm_peek_byte_or_eof (port))
|
||||||
|
{
|
||||||
|
case 0xFE:
|
||||||
|
scm_get_byte_or_eof (port);
|
||||||
|
scm_i_set_port_encoding_x (port, "UTF-16LE");
|
||||||
|
return SCM_BOOL_T;
|
||||||
|
default:
|
||||||
|
scm_unget_byte (0xFF, port);
|
||||||
|
return SCM_BOOL_F;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return SCM_BOOL_F;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#undef FUNC_NAME
|
||||||
|
|
||||||
/* A fluid specifying the default encoding for newly created ports. If it is
|
/* A fluid specifying the default encoding for newly created ports. If it is
|
||||||
a string, that is the encoding. If it is #f, it is in the "native"
|
a string, that is the encoding. If it is #f, it is in the "native"
|
||||||
(Latin-1) encoding. */
|
(Latin-1) encoding. */
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
#define SCM_PORTS_H
|
#define SCM_PORTS_H
|
||||||
|
|
||||||
/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
|
/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
|
||||||
* 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
|
* 2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
|
||||||
*
|
*
|
||||||
* This library is free software; you can redistribute it and/or
|
* This library is free software; you can redistribute it and/or
|
||||||
* modify it under the terms of the GNU Lesser General Public License
|
* modify it under the terms of the GNU Lesser General Public License
|
||||||
|
@ -303,6 +303,7 @@ SCM_API SCM scm_port_column (SCM port);
|
||||||
SCM_API SCM scm_set_port_column_x (SCM port, SCM line);
|
SCM_API SCM scm_set_port_column_x (SCM port, SCM line);
|
||||||
SCM_API SCM scm_port_filename (SCM port);
|
SCM_API SCM scm_port_filename (SCM port);
|
||||||
SCM_API SCM scm_set_port_filename_x (SCM port, SCM filename);
|
SCM_API SCM scm_set_port_filename_x (SCM port, SCM filename);
|
||||||
|
SCM_API SCM scm_consume_byte_order_mark (SCM port);
|
||||||
SCM_INTERNAL const char *scm_i_default_port_encoding (void);
|
SCM_INTERNAL const char *scm_i_default_port_encoding (void);
|
||||||
SCM_INTERNAL void scm_i_set_default_port_encoding (const char *);
|
SCM_INTERNAL void scm_i_set_default_port_encoding (const char *);
|
||||||
SCM_INTERNAL void scm_i_set_port_encoding_x (SCM port, const char *str);
|
SCM_INTERNAL void scm_i_set_port_encoding_x (SCM port, const char *str);
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
|
/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
|
||||||
* 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
|
* 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
|
||||||
*
|
*
|
||||||
* This library is free software; you can redistribute it and/or
|
* This library is free software; you can redistribute it and/or
|
||||||
* modify it under the terms of the GNU Lesser General Public License
|
* modify it under the terms of the GNU Lesser General Public License
|
||||||
|
@ -1985,7 +1985,6 @@ scm_i_scan_for_encoding (SCM port)
|
||||||
char header[SCM_ENCODING_SEARCH_SIZE+1];
|
char header[SCM_ENCODING_SEARCH_SIZE+1];
|
||||||
size_t bytes_read, encoding_length, i;
|
size_t bytes_read, encoding_length, i;
|
||||||
char *encoding = NULL;
|
char *encoding = NULL;
|
||||||
int utf8_bom = 0;
|
|
||||||
char *pos, *encoding_start;
|
char *pos, *encoding_start;
|
||||||
int in_comment;
|
int in_comment;
|
||||||
|
|
||||||
|
@ -2027,13 +2026,9 @@ scm_i_scan_for_encoding (SCM port)
|
||||||
|
|
||||||
bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
|
bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
|
||||||
header[bytes_read] = '\0';
|
header[bytes_read] = '\0';
|
||||||
scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
|
scm_seek (port, scm_from_int (-bytes_read), scm_from_int (SEEK_CUR));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (bytes_read > 3
|
|
||||||
&& header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
|
|
||||||
utf8_bom = 1;
|
|
||||||
|
|
||||||
/* search past "coding[:=]" */
|
/* search past "coding[:=]" */
|
||||||
pos = header;
|
pos = header;
|
||||||
while (1)
|
while (1)
|
||||||
|
@ -2102,11 +2097,6 @@ scm_i_scan_for_encoding (SCM port)
|
||||||
/* This wasn't in a comment */
|
/* This wasn't in a comment */
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
if (utf8_bom && strcmp(encoding, "UTF-8"))
|
|
||||||
scm_misc_error (NULL,
|
|
||||||
"the port input declares the encoding ~s but is encoded as UTF-8",
|
|
||||||
scm_list_1 (scm_from_locale_string (encoding)));
|
|
||||||
|
|
||||||
return encoding;
|
return encoding;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
;;;; filesys.test --- test file system functions -*- scheme -*-
|
;;;; filesys.test --- test file system functions -*- scheme -*-
|
||||||
;;;;
|
;;;;
|
||||||
;;;; Copyright (C) 2004, 2006 Free Software Foundation, Inc.
|
;;;; Copyright (C) 2004, 2006, 2013 Free Software Foundation, Inc.
|
||||||
;;;;
|
;;;;
|
||||||
;;;; This library is free software; you can redistribute it and/or
|
;;;; This library is free software; you can redistribute it and/or
|
||||||
;;;; modify it under the terms of the GNU Lesser General Public
|
;;;; modify it under the terms of the GNU Lesser General Public
|
||||||
|
@ -17,6 +17,8 @@
|
||||||
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
|
||||||
(define-module (test-suite test-filesys)
|
(define-module (test-suite test-filesys)
|
||||||
|
#:use-module (ice-9 rdelim)
|
||||||
|
#:use-module (ice-9 binary-ports)
|
||||||
#:use-module (test-suite lib)
|
#:use-module (test-suite lib)
|
||||||
#:use-module (test-suite guile-test))
|
#:use-module (test-suite guile-test))
|
||||||
|
|
||||||
|
@ -127,3 +129,58 @@
|
||||||
|
|
||||||
(delete-file (test-file))
|
(delete-file (test-file))
|
||||||
(delete-file (test-symlink))
|
(delete-file (test-symlink))
|
||||||
|
|
||||||
|
(let ((s "\ufeffHello, world!"))
|
||||||
|
(define* (test-encoding encoding #:optional (ambient "ISO-8859-1"))
|
||||||
|
(with-fluids ((%default-port-encoding ambient))
|
||||||
|
(let* ((bytes (catch 'misc-error
|
||||||
|
(lambda ()
|
||||||
|
(call-with-values open-bytevector-output-port
|
||||||
|
(lambda (port get-bytevector)
|
||||||
|
(set-port-encoding! port encoding)
|
||||||
|
(display s port)
|
||||||
|
(get-bytevector))))
|
||||||
|
(lambda args
|
||||||
|
(throw 'unresolved))))
|
||||||
|
(name (string-copy "myfile-XXXXXX"))
|
||||||
|
(port (mkstemp! name)))
|
||||||
|
(put-bytevector port bytes)
|
||||||
|
(close-port port)
|
||||||
|
(let ((contents (call-with-input-file name read-string)))
|
||||||
|
(delete-file name)
|
||||||
|
contents))))
|
||||||
|
|
||||||
|
(pass-if "UTF-8"
|
||||||
|
(equal? (test-encoding "UTF-8")
|
||||||
|
"Hello, world!"))
|
||||||
|
|
||||||
|
(pass-if "UTF-16BE"
|
||||||
|
(equal? (test-encoding "UTF-16BE")
|
||||||
|
"Hello, world!"))
|
||||||
|
|
||||||
|
(pass-if "UTF-16LE"
|
||||||
|
(equal? (test-encoding "UTF-16LE")
|
||||||
|
"Hello, world!"))
|
||||||
|
|
||||||
|
(pass-if "UTF-8 (ambient)"
|
||||||
|
(equal? (test-encoding "UTF-8" "UTF-8")
|
||||||
|
"Hello, world!"))
|
||||||
|
|
||||||
|
(pass-if "UTF-8 (UTF-16 ambient)"
|
||||||
|
(equal? (test-encoding "UTF-8" "UTF-16")
|
||||||
|
"Hello, world!"))
|
||||||
|
|
||||||
|
;; Unicode 6.2 section 16.8:
|
||||||
|
;;
|
||||||
|
;; For compatibility with versions of the Unicode Standard prior to
|
||||||
|
;; Version 3.2, the code point U+FEFF has the word-joining semantics
|
||||||
|
;; of zero width no-break space when it is not used as a BOM. [...]
|
||||||
|
;;
|
||||||
|
;; Where the byte order is explicitly specified, such as in UTF-16BE
|
||||||
|
;; or UTF-16LE, then all U+FEFF characters -- even at the very
|
||||||
|
;; beginning of the text -- are to be interpreted as zero width
|
||||||
|
;; no-break spaces.
|
||||||
|
;;
|
||||||
|
(pass-if "UTF-16LE (ambient)"
|
||||||
|
(equal? (test-encoding "UTF-16LE" "UTF-16LE")
|
||||||
|
"\ufeffHello, world!")))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue