1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-05-01 04:10:18 +02:00

Support for Unicode string normalization functions

* libguile/strings.c, libguile/strings.h (normalize_str,
  scm_string_normalize_nfc, scm_string_normalize_nfd, scm_string_normalize_nfkc,
  scm_string_normalize_nfkd): New functions.
* test-suite/tests/strings.test: Unit tests for `string-normalize-nfc',
  `string-normalize-nfd', `string-normalize-nfkc', and
  `string-normalize-nfkd'.
* doc/ref/api-data.texi (String Comparison): Documentation for normalization
  functions.
This commit is contained in:
Julian Graham 2010-01-03 01:06:05 -05:00
parent 441891f376
commit edb7bb4766
4 changed files with 182 additions and 0 deletions

View file

@ -25,6 +25,7 @@
#include <string.h>
#include <stdio.h>
#include <ctype.h>
#include <errno.h>
#include <uninorm.h>
#include <unistr.h>
#include <uniconv.h>
@ -1736,6 +1737,78 @@ scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
return len;
}
/* Return a newly made wide string holding STRING normalized to FORM, where
   FORM is one of the UNINORM_* normalization forms from <uninorm.h>.

   The widening of narrow strings below is a partial clone of
   SCM_STRING_TO_U32_BUF from libguile/i18n.c.  The temporary widened copy
   is kept on the heap rather than in alloca storage, because its size is
   proportional to the length of STRING and a long narrow string could
   otherwise overflow the C stack.

   If u32_normalize fails, a system error is signalled with the errno value
   it left behind instead of copying from a NULL result.  */
static SCM
normalize_str (SCM string, uninorm_t form)
{
  SCM ret;
  const scm_t_uint32 *src;
  scm_t_uint32 *widened = NULL;
  scm_t_uint32 *normalized;
  scm_t_wchar *cbuf;
  size_t rlen, len = scm_i_string_length (string);
  int saved_errno;

  if (scm_i_is_narrow_string (string))
    {
      size_t i;
      const char *buf = scm_i_string_chars (string);

      /* LEN + 1 leaves room for the terminator and keeps the request
         non-zero for the empty string.  scm_malloc is expected to signal
         an error itself when memory is exhausted, so no NULL check is
         made here.  */
      widened = scm_malloc (sizeof (scm_t_uint32) * (len + 1));

      /* Go through unsigned char so that bytes above 0x7f are not
         sign-extended when widened.  */
      for (i = 0; i < len; i++)
        widened[i] = (unsigned char) buf[i];
      widened[len] = 0;
      src = widened;
    }
  else
    src = (const scm_t_uint32 *) scm_i_string_wide_chars (string);

  /* With a NULL result buffer, u32_normalize returns freshly malloc'd
     storage, or NULL with errno set on failure.  */
  normalized = u32_normalize (form, src, len, NULL, &rlen);

  /* Release the temporary before any non-local exit; keep errno intact in
     case free disturbs it.  free (NULL) is a no-op for wide strings.  */
  saved_errno = errno;
  free (widened);

  if (normalized == NULL)
    {
      errno = saved_errno;
      scm_syserror ("string-normalize");
    }

  ret = scm_i_make_wide_string (rlen, &cbuf);
  u32_cpy ((scm_t_uint32 *) cbuf, normalized, rlen);
  free (normalized);

  return ret;
}
/* Scheme procedure `string-normalize-nfc': check that the argument is a
   string, then delegate to normalize_str with the UNINORM_NFC form.  */
SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
            (SCM string),
            "Returns the NFC normalized form of @var{string}.")
#define FUNC_NAME s_scm_string_normalize_nfc
{
  SCM nfc_result;

  SCM_VALIDATE_STRING (1, string);
  nfc_result = normalize_str (string, UNINORM_NFC);

  return nfc_result;
}
#undef FUNC_NAME
/* Scheme procedure `string-normalize-nfd': check that the argument is a
   string, then delegate to normalize_str with the UNINORM_NFD form.  */
SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
            (SCM string),
            "Returns the NFD normalized form of @var{string}.")
#define FUNC_NAME s_scm_string_normalize_nfd
{
  SCM nfd_result;

  SCM_VALIDATE_STRING (1, string);
  nfd_result = normalize_str (string, UNINORM_NFD);

  return nfd_result;
}
#undef FUNC_NAME
/* Scheme procedure `string-normalize-nfkc': check that the argument is a
   string, then delegate to normalize_str with the UNINORM_NFKC form.  */
SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
            (SCM string),
            "Returns the NFKC normalized form of @var{string}.")
#define FUNC_NAME s_scm_string_normalize_nfkc
{
  SCM nfkc_result;

  SCM_VALIDATE_STRING (1, string);
  nfkc_result = normalize_str (string, UNINORM_NFKC);

  return nfkc_result;
}
#undef FUNC_NAME
/* Scheme procedure `string-normalize-nfkd': check that the argument is a
   string, then delegate to normalize_str with the UNINORM_NFKD form.  */
SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
            (SCM string),
            "Returns the NFKD normalized form of @var{string}.")
#define FUNC_NAME s_scm_string_normalize_nfkd
{
  SCM nfkd_result;

  SCM_VALIDATE_STRING (1, string);
  nfkd_result = normalize_str (string, UNINORM_NFKD);

  return nfkd_result;
}
#undef FUNC_NAME
/* converts C scm_array of strings to SCM scm_list of strings. */
/* If argc < 0, a null terminated scm_array is assumed. */
SCM