1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-06-11 06:20:23 +02:00

Reader option for R6RS hex escapes

This adds a reader option 'r6rs-hex-escapes that modifies the
behavior of numeric escapes in characters and strings.  When enabled,
variable-length character hex escapes (#\xNNN) are allowed and become
the default output format for numerically-escaped characters.  Also,
string hex escapes switch to a semicolon-terminated hex escape (\xNNNN;).

* libguile/print.c (PRINT_CHAR_ESCAPE): new macro
  (iprin1): use new macro PRINT_CHAR_ESCAPE

* libguile/private-options.h (SCM_R6RS_ESCAPES_P): new #define

* libguile/read.c (scm_read_opts): add new option r6rs-hex-escapes
  (SCM_READ_HEX_ESCAPE): modify to take a terminator parameter
  (scm_read_string): parse R6RS hex string escapes
  (scm_read_character): parse R6RS hex character escapes

* test-suite/tests/chars.test (with-read-options): new procedure
  (R6RS hex escapes): new tests

* test-suite/tests/strings.test (with-read-options): new procedure
  (R6RS hex escapes): new tests
This commit is contained in:
Michael Gran 2010-01-12 21:02:41 -08:00
parent 8470b3f45b
commit dea901d66e
5 changed files with 274 additions and 68 deletions

View file

@ -409,6 +409,22 @@ SCM_GPROC(s_display, "display", 1, 1, 0, scm_display, g_display);
static void iprin1 (SCM exp, SCM port, scm_print_state *pstate); static void iprin1 (SCM exp, SCM port, scm_print_state *pstate);
/* Write the character value I to PORT as a numeric escape.  When
   SCM_R6RS_ESCAPES_P is true the escape is an `x' followed by the value
   in hexadecimal; otherwise it is the value in octal.  */
#define PRINT_CHAR_ESCAPE(i, port)              \
  do                                            \
    {                                           \
      if (SCM_R6RS_ESCAPES_P)                   \
        {                                       \
          scm_puts ("x", (port));               \
          scm_intprint ((i), 16, (port));       \
        }                                       \
      else                                      \
        scm_intprint ((i), 8, (port));          \
    }                                           \
  while (0)
void void
scm_iprin1 (SCM exp, SCM port, scm_print_state *pstate) scm_iprin1 (SCM exp, SCM port, scm_print_state *pstate)
{ {
@ -488,7 +504,7 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
else else
/* Character is graphic but unrepresentable in /* Character is graphic but unrepresentable in
this port's encoding. */ this port's encoding. */
scm_intprint (i, 8, port); PRINT_CHAR_ESCAPE (i, port);
} }
else else
{ {
@ -507,12 +523,12 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
else else
/* Character is graphic but unrepresentable in /* Character is graphic but unrepresentable in
this port's encoding. */ this port's encoding. */
scm_intprint (i, 8, port); PRINT_CHAR_ESCAPE (i, port);
} }
} }
else else
/* Character is a non-graphical character. */ /* Character is a non-graphical character. */
scm_intprint (i, 8, port); PRINT_CHAR_ESCAPE (i, port);
} }
else else
scm_i_charprint (i, port); scm_i_charprint (i, port);
@ -579,9 +595,9 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
case scm_tc7_string: case scm_tc7_string:
if (SCM_WRITINGP (pstate)) if (SCM_WRITINGP (pstate))
{ {
size_t i, j, len; size_t i, len;
static char const hex[] = "0123456789abcdef"; static char const hex[] = "0123456789abcdef";
char buf[8]; char buf[9];
scm_putc ('"', port); scm_putc ('"', port);
@ -647,37 +663,61 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
{ {
/* Character is graphic but unrepresentable in /* Character is graphic but unrepresentable in
this port's encoding or is not graphic. */ this port's encoding or is not graphic. */
if (ch <= 0xFF) if (!SCM_R6RS_ESCAPES_P)
{ {
buf[0] = '\\'; if (ch <= 0xFF)
buf[1] = 'x'; {
buf[2] = hex[ch / 16]; buf[0] = '\\';
buf[3] = hex[ch % 16]; buf[1] = 'x';
scm_lfwrite (buf, 4, port); buf[2] = hex[ch / 16];
buf[3] = hex[ch % 16];
scm_lfwrite (buf, 4, port);
}
else if (ch <= 0xFFFF)
{
buf[0] = '\\';
buf[1] = 'u';
buf[2] = hex[(ch & 0xF000) >> 12];
buf[3] = hex[(ch & 0xF00) >> 8];
buf[4] = hex[(ch & 0xF0) >> 4];
buf[5] = hex[(ch & 0xF)];
scm_lfwrite (buf, 6, port);
}
else if (ch > 0xFFFF)
{
buf[0] = '\\';
buf[1] = 'U';
buf[2] = hex[(ch & 0xF00000) >> 20];
buf[3] = hex[(ch & 0xF0000) >> 16];
buf[4] = hex[(ch & 0xF000) >> 12];
buf[5] = hex[(ch & 0xF00) >> 8];
buf[6] = hex[(ch & 0xF0) >> 4];
buf[7] = hex[(ch & 0xF)];
scm_lfwrite (buf, 8, port);
}
} }
else if (ch <= 0xFFFF) else
{ {
buf[0] = '\\'; scm_t_wchar ch2 = ch;
buf[1] = 'u';
buf[2] = hex[(ch & 0xF000) >> 12]; /* Print an R6RS variable-length hex escape: "\xNNNN;"
buf[3] = hex[(ch & 0xF00) >> 8]; */
buf[4] = hex[(ch & 0xF0) >> 4]; int i = 8;
buf[5] = hex[(ch & 0xF)]; buf[i] = ';';
scm_lfwrite (buf, 6, port); i --;
j = i + 1; if (ch == 0)
} buf[i--] = '0';
else if (ch > 0xFFFF) else
{ while (ch2 > 0)
buf[0] = '\\'; {
buf[1] = 'U'; buf[i] = hex[ch2 & 0xF];
buf[2] = hex[(ch & 0xF00000) >> 20]; ch2 >>= 4;
buf[3] = hex[(ch & 0xF0000) >> 16]; i --;
buf[4] = hex[(ch & 0xF000) >> 12]; }
buf[5] = hex[(ch & 0xF00) >> 8]; buf[i] = 'x';
buf[6] = hex[(ch & 0xF0) >> 4]; i --;
buf[7] = hex[(ch & 0xF)]; buf[i] = '\\';
scm_lfwrite (buf, 8, port); scm_lfwrite (buf + i, 9 - i, port);
j = i + 1;
} }
} }
} }

View file

@ -94,9 +94,13 @@ SCM_API scm_t_option scm_read_opts[];
/* The indices below must match the order of the entries in
   `scm_read_opts' (libguile/read.c).  The two Elisp options, when
   compiled in, occupy slots 4 and 5 and precede `r6rs-hex-escapes', so
   the index of the latter depends on SCM_ENABLE_ELISP.  */
#if SCM_ENABLE_ELISP
#define SCM_ELISP_VECTORS_P    scm_read_opts[4].val
#define SCM_ESCAPED_PARENS_P   scm_read_opts[5].val
#define SCM_R6RS_ESCAPES_P     scm_read_opts[6].val
#define SCM_N_READ_OPTIONS 7
#else
/* Without the Elisp options, `r6rs-hex-escapes' is the fifth entry;
   indexing slot 6 here would read past the end of the table.  */
#define SCM_R6RS_ESCAPES_P     scm_read_opts[4].val
#define SCM_N_READ_OPTIONS 5
#endif
#endif /* PRIVATE_OPTIONS */ #endif /* PRIVATE_OPTIONS */

View file

@ -76,6 +76,8 @@ scm_t_option scm_read_opts[] = {
{ SCM_OPTION_BOOLEAN, "elisp-strings", 0, { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
"Support `\\(' and `\\)' in strings."}, "Support `\\(' and `\\)' in strings."},
#endif #endif
{ SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
"Use R6RS variable-length character and string hex escapes."},
{ 0, }, { 0, },
}; };
@ -412,32 +414,37 @@ scm_read_sexp (scm_t_wchar chr, SCM port)
/* Read up to NDIGITS hexadecimal digits from PORT and put their value
   into the variable C.  If TERMINATOR is non-null, stop early when the
   TERMINATOR character is read after at least one digit has been seen.

   The macro expects `c' and `port' to be in scope at the point of use,
   and it jumps to the labels `str_eof' (on end of input) and
   `bad_escaped' (on a character that is neither a hex digit nor an
   accepted terminator; C then holds the offending character).

   NOTE(review): when NDIGITS digits are consumed without meeting
   TERMINATOR, the loop simply ends; the macro itself does not report a
   missing terminator.  */
#define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
  do                                                               \
    {                                                              \
      scm_t_wchar digit;                                           \
      size_t n_read;                                               \
      c = 0;                                                       \
      for (n_read = 0; n_read < (ndigits); n_read++)               \
        {                                                          \
          digit = scm_getc (port);                                 \
          if (digit == EOF)                                        \
            goto str_eof;                                          \
          /* A terminator only counts once a digit was read.  */   \
          if ((terminator)                                         \
              && n_read > 0                                        \
              && digit == (scm_t_wchar) (terminator))              \
            break;                                                 \
          if (digit >= '0' && digit <= '9')                        \
            digit -= '0';                                          \
          else if (digit >= 'A' && digit <= 'F')                   \
            digit = digit - 'A' + 10;                              \
          else if (digit >= 'a' && digit <= 'f')                   \
            digit = digit - 'a' + 10;                              \
          else                                                     \
            {                                                      \
              c = digit;                                           \
              goto bad_escaped;                                    \
            }                                                      \
          c = c * 16 + digit;                                      \
        }                                                          \
    }                                                              \
  while (0)
static SCM static SCM
@ -511,13 +518,16 @@ scm_read_string (int chr, SCM port)
c = '\010'; c = '\010';
break; break;
case 'x': case 'x':
SCM_READ_HEX_ESCAPE (2); if (SCM_R6RS_ESCAPES_P)
SCM_READ_HEX_ESCAPE (10, ';');
else
SCM_READ_HEX_ESCAPE (2, '\0');
break; break;
case 'u': case 'u':
SCM_READ_HEX_ESCAPE (4); SCM_READ_HEX_ESCAPE (4, '\0');
break; break;
case 'U': case 'U':
SCM_READ_HEX_ESCAPE (6); SCM_READ_HEX_ESCAPE (6, '\0');
break; break;
default: default:
bad_escaped: bad_escaped:
@ -828,6 +838,26 @@ scm_read_character (scm_t_wchar chr, SCM port)
} }
} }
if (cp == 'x' && (charname_len > 1) && SCM_R6RS_ESCAPES_P)
{
SCM p;
scm_t_wchar chr;
/* Convert from hex, skipping the initial 'x' character in CHARNAME */
p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
scm_from_uint (16));
if (SCM_I_INUMP (p))
{
scm_t_wchar c = SCM_I_INUM (p);
if (SCM_IS_UNICODE_CHAR (c))
return SCM_MAKE_CHAR (c);
else
scm_i_input_error (FUNC_NAME, port,
"out-of-range hex character escape: ~a",
scm_list_1 (charname));
}
}
/* The names of characters should never have non-Latin1 /* The names of characters should never have non-Latin1
characters. */ characters. */
if (scm_i_is_narrow_string (charname) if (scm_i_is_narrow_string (charname)

View file

@ -29,6 +29,16 @@
(cons #t "out-of-range")) (cons #t "out-of-range"))
;; Call THUNK with the reader options set to OPTS.  The reader options
;; that were in effect beforehand are put back when THUNK exits.
(define (with-read-options opts thunk)
  (let ((previous-options (read-options)))
    (dynamic-wind
      (lambda () (read-options opts))
      thunk
      (lambda () (read-options previous-options)))))
(with-test-prefix "basic char handling" (with-test-prefix "basic char handling"
(with-test-prefix "evaluator" (with-test-prefix "evaluator"
@ -313,3 +323,37 @@
(with-output-to-string (lambda () (write #\soh))) (with-output-to-string (lambda () (write #\soh)))
"#\\soh")))) "#\\soh"))))
;; Reading and writing characters while the `r6rs-hex-escapes' reader
;; option is enabled.
(with-test-prefix "R6RS hex escapes"

  (let ((read-with-r6rs-escapes
         ;; Read one datum from the string STR with `r6rs-hex-escapes' on.
         (lambda (str)
           (with-read-options '(r6rs-hex-escapes)
             (lambda ()
               (with-input-from-string str read))))))

    (pass-if "one-digit hex escape"
      (eqv? (read-with-r6rs-escapes "#\\xA")
            (integer->char #x0A)))

    (pass-if "two-digit hex escape"
      (eqv? (read-with-r6rs-escapes "#\\xFF")
            (integer->char #xFF)))

    (pass-if "four-digit hex escape"
      (eqv? (read-with-r6rs-escapes "#\\x00FF")
            (integer->char #xFF)))

    (pass-if "eight-digit hex escape"
      (eqv? (read-with-r6rs-escapes "#\\x00006587")
            (integer->char #x6587)))

    (pass-if "write R6RS escapes"
      (string=?
       (with-read-options '(r6rs-hex-escapes)
         (lambda ()
           (with-output-to-string
             (lambda ()
               (write (integer->char #x80))))))
       "#\\x80"))))

View file

@ -2,23 +2,24 @@
;;;; Jim Blandy <jimb@red-bean.com> --- August 1999 ;;;; Jim Blandy <jimb@red-bean.com> --- August 1999
;;;; ;;;;
;;;; Copyright (C) 1999, 2001, 2004, 2005, 2006, 2008, 2009 Free Software Foundation, Inc. ;;;; Copyright (C) 1999, 2001, 2004, 2005, 2006, 2008, 2009 Free Software Foundation, Inc.
;;;; ;;;;
;;;; This library is free software; you can redistribute it and/or ;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public ;;;; modify it under the terms of the GNU Lesser General Public
;;;; License as published by the Free Software Foundation; either ;;;; License as published by the Free Software Foundation; either
;;;; version 3 of the License, or (at your option) any later version. ;;;; version 3 of the License, or (at your option) any later version.
;;;; ;;;;
;;;; This library is distributed in the hope that it will be useful, ;;;; This library is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;;; Lesser General Public License for more details. ;;;; Lesser General Public License for more details.
;;;; ;;;;
;;;; You should have received a copy of the GNU Lesser General Public ;;;; You should have received a copy of the GNU Lesser General Public
;;;; License along with this library; if not, write to the Free Software ;;;; License along with this library; if not, write to the Free Software
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
(define-module (test-strings) (define-module (test-strings)
#:use-module (test-suite lib)) #:use-module (test-suite lib)
#:use-module (srfi srfi-1))
(define exception:read-only-string (define exception:read-only-string
(cons 'misc-error "^string is read-only")) (cons 'misc-error "^string is read-only"))
@ -29,6 +30,16 @@
(define exception:wrong-type-arg (define exception:wrong-type-arg
(cons #t "Wrong type")) (cons #t "Wrong type"))
;; Call THUNK with the reader options set to OPTS.  The reader options
;; that were in effect beforehand are put back when THUNK exits.
(define (with-read-options opts thunk)
  (let ((previous-options (read-options)))
    (dynamic-wind
      (lambda () (read-options opts))
      thunk
      (lambda () (read-options previous-options)))))
;; Create a string from integer char values, eg. (string-ints 65) => "A" ;; Create a string from integer char values, eg. (string-ints 65) => "A"
(define (string-ints . args) (define (string-ints . args)
(apply string (map integer->char args))) (apply string (map integer->char args)))
@ -229,6 +240,83 @@
(pass-if "Guile extensions backslash escapes" (pass-if "Guile extensions backslash escapes"
(string=? "\0" (string #\nul)))) (string=? "\0" (string #\nul))))
;; Reading and writing strings while the `r6rs-hex-escapes' reader
;; option is enabled: string hex escapes take the form \xNNNN; with a
;; variable number of digits and a mandatory trailing semicolon.
(with-test-prefix "R6RS hex escapes"

  ;; A non-hex character before the semicolon is rejected.
  (pass-if-exception "non-hex char in two-digit hex-escape"
    exception:illegal-escape
    (with-read-options '(r6rs-hex-escapes)
      (lambda ()
        (with-input-from-string "\"\\x0g;\"" read))))

  (pass-if-exception "non-hex char in four-digit hex-escape"
    exception:illegal-escape
    (with-read-options '(r6rs-hex-escapes)
      (lambda ()
        (with-input-from-string "\"\\x000g;\"" read))))

  (pass-if-exception "non-hex char in six-digit hex-escape"
    exception:illegal-escape
    (with-read-options '(r6rs-hex-escapes)
      (lambda ()
        (with-input-from-string "\"\\x00000g;\"" read))))

  ;; The closing double quote arrives where a digit or `;' is expected.
  (pass-if-exception "no semicolon at termination of one-digit hex-escape"
    exception:illegal-escape
    (with-read-options '(r6rs-hex-escapes)
      (lambda ()
        (with-input-from-string "\"\\x0\"" read))))

  (pass-if-exception "no semicolon at termination of three-digit hex-escape"
    exception:illegal-escape
    (with-read-options '(r6rs-hex-escapes)
      (lambda ()
        (with-input-from-string "\"\\x000\"" read))))

  (pass-if "two-digit hex escape"
    (eqv?
     (with-read-options '(r6rs-hex-escapes)
       (lambda ()
         (string-ref (with-input-from-string "\"--\\xff;--\"" read) 2)))
     (integer->char #xff)))

  (pass-if "four-digit hex escape"
    (eqv?
     (with-read-options '(r6rs-hex-escapes)
       (lambda ()
         (string-ref (with-input-from-string "\"--\\x0100;--\"" read) 2)))
     (integer->char #x0100)))

  (pass-if "six-digit hex escape"
    (eqv?
     (with-read-options '(r6rs-hex-escapes)
       (lambda ()
         (string-ref (with-input-from-string "\"--\\x010300;--\"" read) 2)))
     (integer->char #x010300)))

  (pass-if "escaped characters match non-escaped ASCII characters"
    (string=?
     (with-read-options '(r6rs-hex-escapes)
       (lambda ()
         (with-input-from-string "\"\\x41;\\x0042;\\x000043;\"" read)))
     "ABC"))

  (pass-if "write R6RS escapes"
    (let* ((s1 (apply string
                      (map integer->char '(#x8   ; backspace
                                           #x20  ; space
                                           #x30  ; zero
                                           #x40  ; at sign
                                           ))))
           (s2 (with-read-options '(r6rs-hex-escapes)
                 (lambda ()
                   (with-output-to-string
                     (lambda () (write s1)))))))
      ;; Compare the written text exactly.  A set comparison would also
      ;; accept reordered or repeated characters.
      (string=? s2 "\"\\x8; 0@\""))))
;; ;;
;; string? ;; string?
;; ;;