1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-05-20 11:40:18 +02:00

Port encodings are case-insensitive, but normalized to upper-case.

* libguile/ports.c (ascii_toupper, encoding_matches)
  (canonicalize_encoding): New helpers.

  (scm_c_make_port_with_encoding):
  (scm_i_set_default_port_encoding):
  (scm_i_set_port_encoding_x): Use the new helpers to be
  case-insensitive and also to canonicalize the internal representation
  to upper-case ASCII names.

  (scm_i_default_port_encoding): Never return NULL.
  (scm_port_encoding): The encoding is always a string.

* libguile/read.c (scm_i_scan_for_encoding): Use a locale-independent
  check instead of isalnum.  Don't upcase the result: the port code will
  handle that.

* test-suite/tests/web-response.test ("example-1"): Adapt test to expect
  normalized (upper-case) encoding for the response port.
This commit is contained in:
Andy Wingo 2013-01-15 14:31:49 +01:00
parent 08467a7e61
commit 93c4fa2174
3 changed files with 95 additions and 51 deletions

View file

@ -1,5 +1,5 @@
/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
* 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
* 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
@ -89,6 +89,56 @@
#define HAVE_FTRUNCATE 1
#endif
/* Port encoding names are ASCII strings compared without regard to
   case.  Map an ASCII lower-case letter to its upper-case counterpart
   and hand back every other character untouched.  Only plain character
   comparisons are used, so the result does not depend on the locale.  */
static char
ascii_toupper (char c)
{
  if (c >= 'a' && c <= 'z')
    return (char) (c - 'a' + 'A');

  return c;
}
/* Return non-zero if the encoding name ENC equals UPPER once ENC has
   been folded to upper-case ASCII, and zero otherwise.  UPPER itself is
   not folded, so it must already be spelled in upper-case.  A NULL ENC
   is treated as "ISO-8859-1".

   This is only needed for names that come from the user and have not
   been canonicalized yet; names stored on ports or in the default
   encoding fluid are already upper-case and can be compared with
   strcmp.  */
static int
encoding_matches (const char *enc, const char *upper)
{
  const char *p = enc ? enc : "ISO-8859-1";

  for (; *p != '\0'; p++, upper++)
    {
      /* If UPPER ends first, its NUL never equals the non-NUL folded
         character, so we stop here without reading past its end.  */
      if (ascii_toupper (*p) != *upper)
        return 0;
    }

  /* ENC is exhausted; it matches only if UPPER is exhausted too.  */
  return *upper == '\0';
}
/* Return the canonical spelling of the encoding name ENC: the same name
   with ASCII lower-case letters folded to upper-case.

   A NULL ENC yields the string literal "ISO-8859-1", which the caller
   must not modify.  Otherwise the result is a fresh copy of ENC made
   with scm_gc_strdup.  If ENC contains a byte outside the ASCII range,
   scm_misc_error is called with the offending name.  */
static char*
canonicalize_encoding (const char *enc)
{
  char *ret;
  int i;

  if (!enc)
    return "ISO-8859-1";

  ret = scm_gc_strdup (enc, "port");

  for (i = 0; ret[i]; i++)
    {
      /* Compare as unsigned char: where plain `char' is signed, bytes
         above 127 are negative, so `ret[i] > 127' could never be true
         and non-ASCII names would be accepted silently.  */
      if ((unsigned char) ret[i] > 127)
        /* Restrict to ASCII. */
        scm_misc_error (NULL, "invalid character encoding ~s",
                        scm_list_1 (scm_from_latin1_string (enc)));
      else
        ret[i] = ascii_toupper (ret[i]);
    }

  return ret;
}
/* The port kind table --- a dynamically resized array of port types. */
@ -603,13 +653,23 @@ scm_c_make_port_with_encoding (scm_t_bits tag, unsigned long mode_bits,
entry->rw_active = SCM_PORT_NEITHER;
entry->port = ret;
entry->stream = stream;
entry->encoding = encoding ? scm_gc_strdup (encoding, "port") : NULL;
if (encoding && strcmp (encoding, "UTF-8") == 0)
entry->encoding_mode = SCM_PORT_ENCODING_MODE_UTF8;
else if (!encoding || strcmp (encoding, "ISO-8859-1") == 0)
entry->encoding_mode = SCM_PORT_ENCODING_MODE_LATIN1;
if (encoding_matches (encoding, "UTF-8"))
{
entry->encoding_mode = SCM_PORT_ENCODING_MODE_UTF8;
entry->encoding = "UTF-8";
}
else if (encoding_matches (encoding, "ISO-8859-1"))
{
entry->encoding_mode = SCM_PORT_ENCODING_MODE_LATIN1;
entry->encoding = "ISO-8859-1";
}
else
entry->encoding_mode = SCM_PORT_ENCODING_MODE_ICONV;
{
entry->encoding_mode = SCM_PORT_ENCODING_MODE_ICONV;
entry->encoding = canonicalize_encoding (encoding);
}
entry->ilseq_handler = handler;
entry->iconv_descriptors = NULL;
@ -806,44 +866,30 @@ scm_i_set_default_port_encoding (const char *encoding)
scm_misc_error (NULL, "tried to set port encoding fluid before it is initialized",
SCM_EOL);
if (encoding == NULL
|| !strcmp (encoding, "ASCII")
|| !strcmp (encoding, "ANSI_X3.4-1968")
|| !strcmp (encoding, "ISO-8859-1"))
if (encoding_matches (encoding, "ASCII")
|| encoding_matches (encoding, "ANSI_X3.4-1968")
|| encoding_matches (encoding, "ISO-8859-1"))
scm_fluid_set_x (SCM_VARIABLE_REF (default_port_encoding_var), SCM_BOOL_F);
else
{
SCM str;
size_t i;
str = scm_from_latin1_string (encoding);
/* Restrict to ASCII. */
for (i = 0; encoding[i]; i++)
if (encoding[i] > 127)
scm_misc_error ("scm_i_set_default_port_encoding",
"invalid character encoding ~s", scm_list_1 (str));
scm_fluid_set_x (SCM_VARIABLE_REF (default_port_encoding_var), str);
}
scm_fluid_set_x (SCM_VARIABLE_REF (default_port_encoding_var),
scm_from_latin1_string (canonicalize_encoding (encoding)));
}
/* Return the name of the default encoding for newly created ports; a
return value of NULL means "ISO-8859-1". */
/* Return the name of the default encoding for newly created ports. */
const char *
scm_i_default_port_encoding (void)
{
if (!scm_port_encoding_init)
return NULL;
return "ISO-8859-1";
else if (!scm_is_fluid (SCM_VARIABLE_REF (default_port_encoding_var)))
return NULL;
return "ISO-8859-1";
else
{
SCM encoding;
encoding = scm_fluid_ref (SCM_VARIABLE_REF (default_port_encoding_var));
if (!scm_is_string (encoding))
return NULL;
return "ISO-8859-1";
else
return scm_i_string_chars (encoding);
}
@ -1041,13 +1087,13 @@ scm_i_set_port_encoding_x (SCM port, const char *encoding)
pt = SCM_PTAB_ENTRY (port);
prev = pt->iconv_descriptors;
if (encoding && strcmp (encoding, "UTF-8") == 0)
if (encoding_matches (encoding, "UTF-8"))
{
pt->encoding = "UTF-8";
pt->encoding_mode = SCM_PORT_ENCODING_MODE_UTF8;
pt->iconv_descriptors = NULL;
}
else if (!encoding || strcmp (encoding, "ISO-8859-1") == 0)
else if (encoding_matches (encoding, "ISO-8859-1"))
{
pt->encoding = "ISO-8859-1";
pt->encoding_mode = SCM_PORT_ENCODING_MODE_LATIN1;
@ -1056,11 +1102,12 @@ scm_i_set_port_encoding_x (SCM port, const char *encoding)
else
{
/* Open descriptors before mutating the port. */
char *gc_encoding = canonicalize_encoding (encoding);
pt->iconv_descriptors =
open_iconv_descriptors (encoding,
open_iconv_descriptors (gc_encoding,
SCM_INPUT_PORT_P (port),
SCM_OUTPUT_PORT_P (port));
pt->encoding = scm_gc_strdup (encoding, "port");
pt->encoding = gc_encoding;
pt->encoding_mode = SCM_PORT_ENCODING_MODE_ICONV;
}
@ -1074,17 +1121,9 @@ SCM_DEFINE (scm_port_encoding, "port-encoding", 1, 0, 0,
"uses to interpret its input and output.\n")
#define FUNC_NAME s_scm_port_encoding
{
scm_t_port *pt;
const char *enc;
SCM_VALIDATE_PORT (1, port);
pt = SCM_PTAB_ENTRY (port);
enc = pt->encoding;
if (enc)
return scm_from_latin1_string (pt->encoding);
else
return SCM_BOOL_F;
return scm_from_latin1_string (SCM_PTAB_ENTRY (port)->encoding);
}
#undef FUNC_NAME

View file

@ -25,7 +25,6 @@
#endif
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <unistd.h>
#include <unicase.h>
@ -1949,6 +1948,15 @@ scm_get_hash_procedure (int c)
#define SCM_ENCODING_SEARCH_SIZE (500)
/* Return non-zero if C may appear in an encoding name: an ASCII letter,
   an ASCII digit, or one of the characters in "_-.:/,+=()".  Only plain
   character comparisons are used, so the answer does not depend on the
   locale.  */
static int
is_encoding_char (char c)
{
  if (c >= 'a' && c <= 'z') return 1;
  if (c >= 'A' && c <= 'Z') return 1;
  if (c >= '0' && c <= '9') return 1;

  /* strchr treats the terminating NUL as part of the string, so
     searching for '\0' would succeed; a NUL byte is not a valid
     encoding character and must be rejected explicitly.  */
  return c != '\0' && strchr ("_-.:/,+=()", c) != NULL;
}
/* Search the first few hundred characters of a file for an Emacs-like coding
declaration. Returns either NULL or a string whose storage has been
allocated with `scm_gc_malloc ()'. */
@ -2034,8 +2042,7 @@ scm_i_scan_for_encoding (SCM port)
i = 0;
while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
&& encoding_start + i - header < bytes_read
&& (isalnum ((int) encoding_start[i])
|| strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
&& is_encoding_char (encoding_start[i]))
i++;
encoding_length = i;
@ -2043,8 +2050,6 @@ scm_i_scan_for_encoding (SCM port)
return NULL;
encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
for (i = 0; i < encoding_length; i++)
encoding[i] = toupper ((int) encoding[i]);
/* push backwards to make sure we were in a comment */
in_comment = 0;
@ -2076,7 +2081,7 @@ scm_i_scan_for_encoding (SCM port)
/* This wasn't in a comment */
return NULL;
if (utf8_bom && strcmp(encoding, "UTF-8"))
if (utf8_bom && strcasecmp (encoding, "UTF-8"))
scm_misc_error (NULL,
"the port input declares the encoding ~s but is encoded as UTF-8",
scm_list_1 (scm_from_locale_string (encoding)));

View file

@ -1,6 +1,6 @@
;;;; web-response.test --- HTTP responses -*- mode: scheme; coding: utf-8; -*-
;;;;
;;;; Copyright (C) 2010, 2011, 2012 Free Software Foundation, Inc.
;;;; Copyright (C) 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
@ -113,7 +113,7 @@ consectetur adipisicing elit,\r
(response-content-encoding r))
(pass-if-equal "response-body-port"
`("utf-8" ,body)
`("UTF-8" ,body)
(with-fluids ((%default-port-encoding #f))
(let* ((r (read-response (open-input-string example-1)))
(p (response-body-port r)))