1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-04-30 03:40:34 +02:00

Improved support for Unicode title case in Guile's string and character APIs.

* doc/ref/api-data.texi (Characters): Documentation for `char-titlecase'.
* doc/ref/api-i18n.texi (Character Case Mapping): Documentation for
  `char-locale-titlecase' and `string-locale-titlecase'.

* libguile/chars.c, libguile/chars.h (scm_char_titlecase, scm_c_titlecase): New
  functions.

* libguile/i18n.c, libguile/i18n.h (chr_to_case, scm_char_locale_titlecase,
  str_to_case, scm_string_locale_titlecase): New functions.
* libguile/i18n.c (scm_char_locale_downcase, scm_char_locale_upcase,
  scm_string_locale_downcase, scm_string_locale_upcase): Refactor to share code
  via chr_to_case and str_to_case, as appropriate.
* module/ice-9/i18n.scm (char-locale-title-case, string-locale-titlecase): New
  functions.

* libguile/srfi-13.c (string_titlecase_x): Use uc_totitle instead of uc_toupper.

* test-suite/tests/chars.test: Tests for `char-titlecase'.
* test-suite/tests/i18n.test: Tests for `char-locale-titlecase' and
  `string-locale-titlecase'.
* test-suite/tests/srfi-13.test: Tests for `string-titlecase'.
This commit is contained in:
Julian Graham 2009-12-22 00:19:56 -05:00
parent 9b5a0d8460
commit 820f33aaed
11 changed files with 227 additions and 103 deletions

View file

@ -1901,6 +1901,19 @@ Return the uppercase character version of @var{chr}.
Return the lowercase character version of @var{chr}. Return the lowercase character version of @var{chr}.
@end deffn @end deffn
@rnindex char-titlecase
@deffn {Scheme Procedure} char-titlecase chr
@deffnx {C Function} scm_char_titlecase (chr)
Return the titlecase character version of @var{chr} if one exists;
otherwise return the uppercase version.
For most characters these will be the same, but the Unicode Standard
includes certain digraph compatibility characters, such as @code{U+01F3}
``dz'', for which the uppercase and titlecase characters are different
(@code{U+01F1} ``DZ'' and @code{U+01F2} ``Dz'' in this case,
respectively).
@end deffn
@node Character Sets @node Character Sets
@subsection Character Sets @subsection Character Sets

View file

@ -197,6 +197,12 @@ Return the uppercase character that corresponds to @var{chr} according
to either @var{locale} or the current locale. to either @var{locale} or the current locale.
@end deffn @end deffn
@deffn {Scheme Procedure} char-locale-titlecase chr [locale]
@deffnx {C Function} scm_char_locale_titlecase (chr, locale)
Return the titlecase character that corresponds to @var{chr} according
to either @var{locale} or the current locale.
@end deffn
@deffn {Scheme Procedure} string-locale-upcase str [locale] @deffn {Scheme Procedure} string-locale-upcase str [locale]
@deffnx {C Function} scm_string_locale_upcase (str, locale) @deffnx {C Function} scm_string_locale_upcase (str, locale)
Return a new string that is the uppercase version of @var{str} Return a new string that is the uppercase version of @var{str}
@ -209,6 +215,12 @@ Return a new string that is the down-case version of @var{str}
according to either @var{locale} or the current locale. according to either @var{locale} or the current locale.
@end deffn @end deffn
@deffn {Scheme Procedure} string-locale-titlecase str [locale]
@deffnx {C Function} scm_string_locale_titlecase (str, locale)
Return a new string that is the titlecase version of @var{str}
according to either @var{locale} or the current locale.
@end deffn
Note that in the current implementation Guile has no notion of Note that in the current implementation Guile has no notion of
multibyte characters and in a multibyte locale characters may not be multibyte characters and in a multibyte locale characters may not be
converted correctly. converted correctly.

View file

@ -391,7 +391,6 @@ SCM_DEFINE (scm_char_lower_case_p, "char-lower-case?", 1, 0, 0,
} }
#undef FUNC_NAME #undef FUNC_NAME
SCM_DEFINE (scm_char_is_both_p, "char-is-both?", 1, 0, 0, SCM_DEFINE (scm_char_is_both_p, "char-is-both?", 1, 0, 0,
(SCM chr), (SCM chr),
"Return @code{#t} iff @var{chr} is either uppercase or lowercase, else\n" "Return @code{#t} iff @var{chr} is either uppercase or lowercase, else\n"
@ -458,6 +457,16 @@ SCM_DEFINE (scm_char_downcase, "char-downcase", 1, 0, 0,
} }
#undef FUNC_NAME #undef FUNC_NAME
SCM_DEFINE (scm_char_titlecase, "char-titlecase", 1, 0, 0,
(SCM chr),
"Return the titlecase character version of @var{chr}.")
#define FUNC_NAME s_scm_char_titlecase
{
SCM_VALIDATE_CHAR (1, chr);
return SCM_MAKE_CHAR (scm_c_titlecase (SCM_CHAR(chr)));
}
#undef FUNC_NAME
@ -480,6 +489,12 @@ scm_c_downcase (scm_t_wchar c)
return uc_tolower ((int) c); return uc_tolower ((int) c);
} }
scm_t_wchar
scm_c_titlecase (scm_t_wchar c)
{
return uc_totitle ((int) c);
}
/* There are a few sets of character names: R5RS, Guile /* There are a few sets of character names: R5RS, Guile

View file

@ -80,8 +80,10 @@ SCM_API SCM scm_char_to_integer (SCM chr);
SCM_API SCM scm_integer_to_char (SCM n); SCM_API SCM scm_integer_to_char (SCM n);
SCM_API SCM scm_char_upcase (SCM chr); SCM_API SCM scm_char_upcase (SCM chr);
SCM_API SCM scm_char_downcase (SCM chr); SCM_API SCM scm_char_downcase (SCM chr);
SCM_API SCM scm_char_titlecase (SCM chr);
SCM_API scm_t_wchar scm_c_upcase (scm_t_wchar c); SCM_API scm_t_wchar scm_c_upcase (scm_t_wchar c);
SCM_API scm_t_wchar scm_c_downcase (scm_t_wchar c); SCM_API scm_t_wchar scm_c_downcase (scm_t_wchar c);
SCM_API scm_t_wchar scm_c_titlecase (scm_t_wchar c);
SCM_INTERNAL const char *scm_i_charname (SCM chr); SCM_INTERNAL const char *scm_i_charname (SCM chr);
SCM_INTERNAL SCM scm_i_charname_to_char (const char *charname, SCM_INTERNAL SCM scm_i_charname_to_char (const char *charname,
size_t charname_len); size_t charname_len);

View file

@ -1098,22 +1098,17 @@ u32_locale_tocase (const scm_t_uint32 *c_s1, size_t len,
} }
static SCM
SCM_DEFINE (scm_char_locale_downcase, "char-locale-downcase", 1, 1, 0, chr_to_case (SCM chr, scm_t_locale c_locale,
(SCM chr, SCM locale), scm_t_uint32 *(*func) (const scm_t_uint32 *, size_t, const char *,
"Return the lowercase character that corresponds to @var{chr} " uninorm_t, scm_t_uint32 *, size_t *),
"according to either @var{locale} or the current locale.") int *err)
#define FUNC_NAME s_scm_char_locale_downcase
{ {
int ret; int ret;
scm_t_locale c_locale;
scm_t_wchar *buf; scm_t_wchar *buf;
scm_t_uint32 *downbuf; scm_t_uint32 *convbuf;
size_t downlen; size_t convlen;
SCM str, downchar; SCM str, convchar;
SCM_VALIDATE_CHAR (1, chr);
SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale);
str = scm_i_make_wide_string (1, &buf); str = scm_i_make_wide_string (1, &buf);
buf[0] = SCM_CHAR (chr); buf[0] = SCM_CHAR (chr);
@ -1121,26 +1116,49 @@ SCM_DEFINE (scm_char_locale_downcase, "char-locale-downcase", 1, 1, 0,
if (c_locale != NULL) if (c_locale != NULL)
RUN_IN_LOCALE_SECTION (c_locale, ret = RUN_IN_LOCALE_SECTION (c_locale, ret =
u32_locale_tocase ((scm_t_uint32 *) buf, 1, u32_locale_tocase ((scm_t_uint32 *) buf, 1,
&downbuf, &convbuf,
&downlen, u32_tolower)); &convlen, func));
else else
ret = ret =
u32_locale_tocase ((scm_t_uint32 *) buf, 1, &downbuf, u32_locale_tocase ((scm_t_uint32 *) buf, 1, &convbuf,
&downlen, u32_tolower); &convlen, func);
if (SCM_UNLIKELY (ret != 0)) if (SCM_UNLIKELY (ret != 0))
{ {
errno = ret; *err = ret;
scm_syserror (FUNC_NAME); return NULL;
} }
if (downlen == 1) if (convlen == 1)
downchar = SCM_MAKE_CHAR ((scm_t_wchar) downbuf[0]); convchar = SCM_MAKE_CHAR ((scm_t_wchar) convbuf[0]);
else else
downchar = chr; convchar = chr;
free (downbuf); free (convbuf);
return downchar; return convchar;
}
SCM_DEFINE (scm_char_locale_downcase, "char-locale-downcase", 1, 1, 0,
(SCM chr, SCM locale),
"Return the lowercase character that corresponds to @var{chr} "
"according to either @var{locale} or the current locale.")
#define FUNC_NAME s_scm_char_locale_downcase
{
scm_t_locale c_locale;
SCM ret;
int err = 0;
SCM_VALIDATE_CHAR (1, chr);
SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale);
ret = chr_to_case (chr, c_locale, u32_tolower, &err);
if (err != 0)
{
errno = err;
scm_syserror (FUNC_NAME);
}
return ret;
} }
#undef FUNC_NAME #undef FUNC_NAME
@ -1150,59 +1168,60 @@ SCM_DEFINE (scm_char_locale_upcase, "char-locale-upcase", 1, 1, 0,
"according to either @var{locale} or the current locale.") "according to either @var{locale} or the current locale.")
#define FUNC_NAME s_scm_char_locale_upcase #define FUNC_NAME s_scm_char_locale_upcase
{ {
int ret;
scm_t_locale c_locale; scm_t_locale c_locale;
scm_t_wchar *buf; SCM ret;
scm_t_uint32 *upbuf; int err = 0;
size_t uplen;
SCM str, upchar;
SCM_VALIDATE_CHAR (1, chr); SCM_VALIDATE_CHAR (1, chr);
SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale); SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale);
str = scm_i_make_wide_string (1, &buf); ret = chr_to_case (chr, c_locale, u32_toupper, &err);
buf[0] = SCM_CHAR (chr);
if (c_locale != NULL) if (err != 0)
RUN_IN_LOCALE_SECTION (c_locale, ret =
u32_locale_tocase ((scm_t_uint32 *) buf, 1,
&upbuf,
&uplen, u32_toupper));
else
ret =
u32_locale_tocase ((scm_t_uint32 *) buf, 1, &upbuf,
&uplen, u32_toupper);
if (SCM_UNLIKELY (ret != 0))
{ {
errno = ret; errno = err;
scm_syserror (FUNC_NAME); scm_syserror (FUNC_NAME);
} }
if (uplen == 1) return ret;
upchar = SCM_MAKE_CHAR ((scm_t_wchar) upbuf[0]);
else
upchar = chr;
free (upbuf);
return upchar;
} }
#undef FUNC_NAME #undef FUNC_NAME
SCM_DEFINE (scm_string_locale_upcase, "string-locale-upcase", 1, 1, 0, SCM_DEFINE (scm_char_locale_titlecase, "char-locale-titlecase", 1, 1, 0,
(SCM str, SCM locale), (SCM chr, SCM locale),
"Return a new string that is the uppercase version of " "Return the titlecase character that corresponds to @var{chr} "
"@var{str} according to either @var{locale} or the current " "according to either @var{locale} or the current locale.")
"locale.") #define FUNC_NAME s_scm_char_locale_titlecase
#define FUNC_NAME s_scm_string_locale_upcase {
scm_t_locale c_locale;
SCM ret;
int err = 0;
SCM_VALIDATE_CHAR (1, chr);
SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale);
ret = chr_to_case (chr, c_locale, u32_totitle, &err);
if (err != 0)
{
errno = err;
scm_syserror (FUNC_NAME);
}
return ret;
}
#undef FUNC_NAME
static SCM
str_to_case (SCM str, scm_t_locale c_locale,
scm_t_uint32 *(*func) (const scm_t_uint32 *, size_t, const char *,
uninorm_t, scm_t_uint32 *, size_t *),
int *err)
{ {
scm_t_wchar *c_str, *c_buf; scm_t_wchar *c_str, *c_buf;
scm_t_uint32 *c_upstr; scm_t_uint32 *c_convstr;
size_t len, uplen; size_t len, convlen;
int ret; int ret;
scm_t_locale c_locale; SCM convstr;
SCM upstr;
SCM_VALIDATE_STRING (1, str);
SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale);
len = scm_i_string_length (str); len = scm_i_string_length (str);
if (len == 0) if (len == 0)
return scm_nullstr; return scm_nullstr;
@ -1211,28 +1230,52 @@ SCM_DEFINE (scm_string_locale_upcase, "string-locale-upcase", 1, 1, 0,
if (c_locale) if (c_locale)
RUN_IN_LOCALE_SECTION (c_locale, ret = RUN_IN_LOCALE_SECTION (c_locale, ret =
u32_locale_tocase ((scm_t_uint32 *) c_str, len, u32_locale_tocase ((scm_t_uint32 *) c_str, len,
&c_upstr, &c_convstr,
&uplen, u32_toupper)); &convlen, func));
else else
ret = ret =
u32_locale_tocase ((scm_t_uint32 *) c_str, len, u32_locale_tocase ((scm_t_uint32 *) c_str, len,
&c_upstr, &uplen, u32_toupper); &c_convstr, &convlen, func);
scm_remember_upto_here (str); scm_remember_upto_here (str);
if (SCM_UNLIKELY (ret != 0)) if (SCM_UNLIKELY (ret != 0))
{ {
errno = ret; *err = ret;
scm_syserror (FUNC_NAME); return NULL;
} }
upstr = scm_i_make_wide_string (uplen, &c_buf); convstr = scm_i_make_wide_string (convlen, &c_buf);
memcpy (c_buf, c_upstr, uplen * sizeof (scm_t_wchar)); memcpy (c_buf, c_convstr, convlen * sizeof (scm_t_wchar));
free (c_upstr); free (c_convstr);
scm_i_try_narrow_string (upstr); scm_i_try_narrow_string (convstr);
return upstr; return convstr;
}
SCM_DEFINE (scm_string_locale_upcase, "string-locale-upcase", 1, 1, 0,
(SCM str, SCM locale),
"Return a new string that is the uppercase version of "
"@var{str} according to either @var{locale} or the current "
"locale.")
#define FUNC_NAME s_scm_string_locale_upcase
{
scm_t_locale c_locale;
SCM ret;
int err = 0;
SCM_VALIDATE_STRING (1, str);
SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale);
ret = str_to_case (str, c_locale, u32_toupper, &err);
if (err != 0)
{
errno = err;
scm_syserror (FUNC_NAME);
}
return ret;
} }
#undef FUNC_NAME #undef FUNC_NAME
@ -1243,45 +1286,46 @@ SCM_DEFINE (scm_string_locale_downcase, "string-locale-downcase", 1, 1, 0,
"locale.") "locale.")
#define FUNC_NAME s_scm_string_locale_downcase #define FUNC_NAME s_scm_string_locale_downcase
{ {
scm_t_wchar *c_str, *c_buf;
scm_t_uint32 *c_downstr;
size_t len, downlen;
int ret;
scm_t_locale c_locale; scm_t_locale c_locale;
SCM downstr; SCM ret;
int err = 0;
SCM_VALIDATE_STRING (1, str); SCM_VALIDATE_STRING (1, str);
SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale); SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale);
len = scm_i_string_length (str);
if (len == 0)
return scm_nullstr;
SCM_STRING_TO_U32_BUF (str, c_str);
if (c_locale) ret = str_to_case (str, c_locale, u32_tolower, &err);
RUN_IN_LOCALE_SECTION (c_locale, ret =
u32_locale_tocase ((scm_t_uint32 *) c_str, len,
&c_downstr,
&downlen, u32_tolower));
else
ret =
u32_locale_tocase ((scm_t_uint32 *) c_str, len,
&c_downstr, &downlen, u32_tolower);
scm_remember_upto_here (str); if (err != 0)
if (SCM_UNLIKELY (ret != 0))
{ {
errno = ret; errno = err;
scm_syserror (FUNC_NAME); scm_syserror (FUNC_NAME);
} }
return ret;
}
#undef FUNC_NAME
downstr = scm_i_make_wide_string (downlen, &c_buf); SCM_DEFINE (scm_string_locale_titlecase, "string-locale-titlecase", 1, 1, 0,
memcpy (c_buf, c_downstr, downlen * sizeof (scm_t_wchar)); (SCM str, SCM locale),
free (c_downstr); "Return a new string that is the title-case version of "
"@var{str} according to either @var{locale} or the current "
"locale.")
#define FUNC_NAME s_scm_string_locale_titlecase
{
scm_t_locale c_locale;
SCM ret;
int err = 0;
scm_i_try_narrow_string (downstr); SCM_VALIDATE_STRING (1, str);
SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale);
return downstr; ret = str_to_case (str, c_locale, u32_totitle, &err);
if (err != 0)
{
errno = err;
scm_syserror (FUNC_NAME);
}
return ret;
} }
#undef FUNC_NAME #undef FUNC_NAME

View file

@ -38,8 +38,10 @@ SCM_API SCM scm_char_locale_ci_gt (SCM c1, SCM c2, SCM locale);
SCM_API SCM scm_char_locale_ci_eq (SCM c1, SCM c2, SCM locale); SCM_API SCM scm_char_locale_ci_eq (SCM c1, SCM c2, SCM locale);
SCM_API SCM scm_char_locale_upcase (SCM chr, SCM locale); SCM_API SCM scm_char_locale_upcase (SCM chr, SCM locale);
SCM_API SCM scm_char_locale_downcase (SCM chr, SCM locale); SCM_API SCM scm_char_locale_downcase (SCM chr, SCM locale);
SCM_API SCM scm_char_locale_titlecase (SCM chr, SCM locale);
SCM_API SCM scm_string_locale_upcase (SCM chr, SCM locale); SCM_API SCM scm_string_locale_upcase (SCM chr, SCM locale);
SCM_API SCM scm_string_locale_downcase (SCM chr, SCM locale); SCM_API SCM scm_string_locale_downcase (SCM chr, SCM locale);
SCM_API SCM scm_string_locale_titlecase (SCM chr, SCM locale);
SCM_API SCM scm_locale_string_to_integer (SCM str, SCM base, SCM locale); SCM_API SCM scm_locale_string_to_integer (SCM str, SCM base, SCM locale);
SCM_API SCM scm_locale_string_to_inexact (SCM str, SCM locale); SCM_API SCM scm_locale_string_to_inexact (SCM str, SCM locale);

View file

@ -2198,7 +2198,7 @@ string_titlecase_x (SCM str, size_t start, size_t end)
{ {
if (!in_word) if (!in_word)
{ {
scm_i_string_set_x (str, i, uc_toupper (SCM_CHAR (ch))); scm_i_string_set_x (str, i, uc_totitle (SCM_CHAR (ch)));
in_word = 1; in_word = 1;
} }
else else

View file

@ -42,8 +42,8 @@
char-locale-ci<? char-locale-ci>? char-locale-ci=? char-locale-ci<? char-locale-ci>? char-locale-ci=?
;; character mapping ;; character mapping
char-locale-downcase char-locale-upcase char-locale-downcase char-locale-upcase char-locale-titlecase
string-locale-downcase string-locale-upcase string-locale-downcase string-locale-upcase string-locale-titlecase
;; reading numbers ;; reading numbers
locale-string->integer locale-string->inexact locale-string->integer locale-string->inexact

View file

@ -245,7 +245,11 @@
(eqv? (char-upcase #\a) #\A)) (eqv? (char-upcase #\a) #\A))
(pass-if "char-downcase" (pass-if "char-downcase"
(eqv? (char-downcase #\A) #\a))) (eqv? (char-downcase #\A) #\a))
(pass-if "char-titlecase"
(and (eqv? (char-titlecase #\a) #\A)
(eqv? (char-titlecase #\763) #\762))))
(with-test-prefix "charnames" (with-test-prefix "charnames"

View file

@ -202,6 +202,14 @@
(and (eq? #\Z (char-locale-upcase #\z)) (and (eq? #\Z (char-locale-upcase #\z))
(eq? #\Z (char-locale-upcase #\z (make-locale LC_ALL "C"))))) (eq? #\Z (char-locale-upcase #\z (make-locale LC_ALL "C")))))
(pass-if "char-locale-titlecase"
(and (eq? #\T (char-locale-titlecase #\t))
(eq? #\T (char-locale-titlecase #\t (make-locale LC_ALL "C")))))
(pass-if "char-locale-titlecase Dž"
(and (eq? #\762 (char-locale-titlecase #\763))
(eq? #\762 (char-locale-titlecase #\763 (make-locale LC_ALL "C")))))
(pass-if "char-locale-upcase Turkish" (pass-if "char-locale-upcase Turkish"
(under-turkish-utf8-locale-or-unresolved (under-turkish-utf8-locale-or-unresolved
(lambda () (lambda ()
@ -229,6 +237,11 @@
(and (string=? "Z" (string-locale-upcase "z")) (and (string=? "Z" (string-locale-upcase "z"))
(string=? "Z" (string-locale-upcase "z" (make-locale LC_ALL "C"))))) (string=? "Z" (string-locale-upcase "z" (make-locale LC_ALL "C")))))
(pass-if "string-locale-titlecase"
(and (string=? "Hello, World" (string-locale-titlecase "hello, world"))
(string=? "Hello, World" (string-locale-titlecase
"hello, world" (make-locale LC_ALL "C")))))
(pass-if "string-locale-upcase Turkish" (pass-if "string-locale-upcase Turkish"
(under-turkish-utf8-locale-or-unresolved (under-turkish-utf8-locale-or-unresolved
(lambda () (lambda ()

View file

@ -147,6 +147,25 @@
(pass-if "more than one match, start and end index" (pass-if "more than one match, start and end index"
(string-any char-upper-case? "abCDE" 1 4)))) (string-any char-upper-case? "abCDE" 1 4))))
;;;
;;; string-titlecase
;;;
(with-test-prefix "string-titlecase"
(pass-if "all-lower"
(string=? "Foo" (string-titlecase "foo")))
(pass-if "all-upper"
(string=? "Foo" (string-titlecase "FOO")))
(pass-if "two-words"
(string=? "Hello, World!" (string-titlecase "hello, world!")))
(pass-if "titlecase-characters"
(string=? (list->string '(#\762))
(string-titlecase (list->string '(#\763))))))
;;; ;;;
;;; string-append/shared ;;; string-append/shared
;;; ;;;