1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-05-20 11:40:18 +02:00

Revert "detect and consume byte-order marks for textual ports"

This reverts commit b2cb557d75, which was
pushed accidentally.
This commit is contained in:
Andy Wingo 2013-01-30 15:30:31 +01:00
parent e10c250928
commit 419c873624
6 changed files with 30 additions and 169 deletions

View file

@ -1,5 +1,5 @@
/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
* 2004, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
* 2004, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
@ -399,7 +399,7 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
#define FUNC_NAME s_scm_open_file
{
SCM port;
int fdes, flags = 0, scan_for_encoding = 0, consume_bom = 0, binary = 0;
int fdes, flags = 0, use_encoding = 1;
unsigned int retries;
char *file, *md, *ptr;
@ -415,8 +415,6 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
{
case 'r':
flags |= O_RDONLY;
consume_bom = 1;
scan_for_encoding = 1;
break;
case 'w':
flags |= O_WRONLY | O_CREAT | O_TRUNC;
@ -434,12 +432,9 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
{
case '+':
flags = (flags & ~(O_RDONLY | O_WRONLY)) | O_RDWR;
consume_bom = 0;
break;
case 'b':
scan_for_encoding = 0;
consume_bom = 0;
binary = 1;
use_encoding = 0;
#if defined (O_BINARY)
flags |= O_BINARY;
#endif
@ -478,21 +473,21 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
port = scm_i_fdes_to_port (fdes, scm_i_mode_bits (mode),
fport_canonicalize_filename (filename));
if (consume_bom)
scm_consume_byte_order_mark (port);
if (binary)
if (use_encoding)
{
/* If this file has a coding declaration, use that as the port
encoding. */
if (SCM_INPUT_PORT_P (port))
{
char *enc = scm_i_scan_for_encoding (port);
if (enc != NULL)
scm_i_set_port_encoding_x (port, enc);
}
}
else
/* If this is a binary file, use the binary-friendly ISO-8859-1
encoding. */
scm_i_set_port_encoding_x (port, NULL);
else if (scan_for_encoding)
/* If this is an input port and the file has a coding declaration,
use that as the port encoding. */
{
char *enc = scm_i_scan_for_encoding (port);
if (enc != NULL)
scm_i_set_port_encoding_x (port, enc);
}
scm_dynwind_end ();

View file

@ -106,9 +106,6 @@ SCM_DEFINE (scm_primitive_load, "primitive-load", 1, 0, 0,
scm_dynwind_begin (SCM_F_DYNWIND_REWINDABLE);
scm_i_dynwind_current_load_port (port);
/* FIXME: For better or for worse, scm_open_file already scans the
file for an encoding. This scans again; necessary for this
logic, but unnecessary overall. */
encoding = scm_i_scan_for_encoding (port);
if (encoding)
scm_i_set_port_encoding_x (port, encoding);

View file

@ -1,5 +1,5 @@
/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
* 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
* 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
@ -2153,89 +2153,6 @@ SCM_DEFINE (scm_set_port_filename_x, "set-port-filename!", 2, 0, 0,
}
#undef FUNC_NAME
SCM_DEFINE (scm_consume_byte_order_mark, "consume-byte-order-mark", 1, 0, 0,
(SCM port),
"Peek ahead in @var{port} for a byte-order mark (\\uFEFF) encoded\n"
"in UTF-8 or in UTF-16. If found, consume the byte-order mark\n"
"and set the port to the indicated encoding.\n"
"\n"
"As a special case, if the port's encoding is already UTF-16LE\n"
"or UTF-16BE (as opposed to UTF-16), we consider that the user\n"
"has already asked for an explicit byte order. In this case no\n"
"scan is performed, and the byte-order mark (if any) is left in\n"
"the port.\n"
"\n"
"Return @code{#t} if a byte-order mark was consumed, and\n"
"@code{#f} otherwise.")
#define FUNC_NAME s_scm_consume_byte_order_mark
{
scm_t_port *pt;
const char *enc;
SCM_VALIDATE_PORT (1, port);
pt = SCM_PTAB_ENTRY (port);
enc = pt->encoding;
if (enc && strcasecmp (enc, "UTF-16BE") == 0)
return SCM_BOOL_F;
if (enc && strcasecmp (enc, "UTF-16LE") == 0)
return SCM_BOOL_F;
switch (scm_peek_byte_or_eof (port))
{
case 0xEF:
scm_get_byte_or_eof (port);
switch (scm_peek_byte_or_eof (port))
{
case 0xBB:
scm_get_byte_or_eof (port);
switch (scm_peek_byte_or_eof (port))
{
case 0xBF:
scm_get_byte_or_eof (port);
scm_i_set_port_encoding_x (port, "UTF-8");
return SCM_BOOL_T;
default:
scm_unget_byte (0xBB, port);
scm_unget_byte (0xEF, port);
return SCM_BOOL_F;
}
default:
scm_unget_byte (0xEF, port);
return SCM_BOOL_F;
}
case 0xFE:
scm_get_byte_or_eof (port);
switch (scm_peek_byte_or_eof (port))
{
case 0xFF:
scm_get_byte_or_eof (port);
scm_i_set_port_encoding_x (port, "UTF-16BE");
return SCM_BOOL_T;
default:
scm_unget_byte (0xFE, port);
return SCM_BOOL_F;
}
case 0xFF:
scm_get_byte_or_eof (port);
switch (scm_peek_byte_or_eof (port))
{
case 0xFE:
scm_get_byte_or_eof (port);
scm_i_set_port_encoding_x (port, "UTF-16LE");
return SCM_BOOL_T;
default:
scm_unget_byte (0xFF, port);
return SCM_BOOL_F;
}
default:
return SCM_BOOL_F;
}
}
#undef FUNC_NAME
/* A fluid specifying the default encoding for newly created ports. If it is
a string, that is the encoding. If it is #f, it is in the "native"
(Latin-1) encoding. */

View file

@ -4,7 +4,7 @@
#define SCM_PORTS_H
/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
* 2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
* 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
@ -303,7 +303,6 @@ SCM_API SCM scm_port_column (SCM port);
SCM_API SCM scm_set_port_column_x (SCM port, SCM line);
SCM_API SCM scm_port_filename (SCM port);
SCM_API SCM scm_set_port_filename_x (SCM port, SCM filename);
SCM_API SCM scm_consume_byte_order_mark (SCM port);
SCM_INTERNAL const char *scm_i_default_port_encoding (void);
SCM_INTERNAL void scm_i_set_default_port_encoding (const char *);
SCM_INTERNAL void scm_i_set_port_encoding_x (SCM port, const char *str);

View file

@ -1,5 +1,5 @@
/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
* 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
* 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
@ -1985,6 +1985,7 @@ scm_i_scan_for_encoding (SCM port)
char header[SCM_ENCODING_SEARCH_SIZE+1];
size_t bytes_read, encoding_length, i;
char *encoding = NULL;
int utf8_bom = 0;
char *pos, *encoding_start;
int in_comment;
@ -2026,9 +2027,13 @@ scm_i_scan_for_encoding (SCM port)
bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
header[bytes_read] = '\0';
scm_seek (port, scm_from_int (-bytes_read), scm_from_int (SEEK_CUR));
scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
}
if (bytes_read > 3
&& header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
utf8_bom = 1;
/* search past "coding[:=]" */
pos = header;
while (1)
@ -2097,6 +2102,11 @@ scm_i_scan_for_encoding (SCM port)
/* This wasn't in a comment */
return NULL;
if (utf8_bom && strcmp(encoding, "UTF-8"))
scm_misc_error (NULL,
"the port input declares the encoding ~s but is encoded as UTF-8",
scm_list_1 (scm_from_locale_string (encoding)));
return encoding;
}

View file

@ -1,6 +1,6 @@
;;;; filesys.test --- test file system functions -*- scheme -*-
;;;;
;;;; Copyright (C) 2004, 2006, 2013 Free Software Foundation, Inc.
;;;; Copyright (C) 2004, 2006 Free Software Foundation, Inc.
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
@ -17,8 +17,6 @@
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
(define-module (test-suite test-filesys)
#:use-module (ice-9 rdelim)
#:use-module (ice-9 binary-ports)
#:use-module (test-suite lib)
#:use-module (test-suite guile-test))
@ -129,58 +127,3 @@
(delete-file (test-file))
(delete-file (test-symlink))
(let ((s "\ufeffHello, world!"))
(define* (test-encoding encoding #:optional (ambient "ISO-8859-1"))
(with-fluids ((%default-port-encoding ambient))
(let* ((bytes (catch 'misc-error
(lambda ()
(call-with-values open-bytevector-output-port
(lambda (port get-bytevector)
(set-port-encoding! port encoding)
(display s port)
(get-bytevector))))
(lambda args
(throw 'unresolved))))
(name (string-copy "myfile-XXXXXX"))
(port (mkstemp! name)))
(put-bytevector port bytes)
(close-port port)
(let ((contents (call-with-input-file name read-string)))
(delete-file name)
contents))))
(pass-if "UTF-8"
(equal? (test-encoding "UTF-8")
"Hello, world!"))
(pass-if "UTF-16BE"
(equal? (test-encoding "UTF-16BE")
"Hello, world!"))
(pass-if "UTF-16LE"
(equal? (test-encoding "UTF-16LE")
"Hello, world!"))
(pass-if "UTF-8 (ambient)"
(equal? (test-encoding "UTF-8" "UTF-8")
"Hello, world!"))
(pass-if "UTF-8 (UTF-16 ambient)"
(equal? (test-encoding "UTF-8" "UTF-16")
"Hello, world!"))
;; Unicode 6.2 section 16.8:
;;
;; For compatibility with versions of the Unicode Standard prior to
;; Version 3.2, the code point U+FEFF has the word-joining semantics
;; of zero width no-break space when it is not used as a BOM. [...]
;;
;; Where the byte order is explicitly specified, such as in UTF-16BE
;; or UTF-16LE, then all U+FEFF characters -- even at the very
;; beginning of the text -- are to be interpreted as zero width
;; no-break spaces.
;;
(pass-if "UTF-16LE (ambient)"
(equal? (test-encoding "UTF-16LE" "UTF-16LE")
"\ufeffHello, world!")))