mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-04-30 11:50:28 +02:00
Support for Unicode string normalization functions
* libguile/strings.c, libguile/strings.h (normalize_str, scm_string_normalize_nfc, scm_string_normalize_nfd, scm_string_normalize_nfkc, scm_string_normalize_nfkd): New functions. * test-suite/tests/strings.test: Unit tests for `string-normalize-nfc', `string-normalize-nfd', `string-normalize-nfkc', and `string-normalize-nfkd'. * doc/ref/api-data.texi (String Comparison): Documentation for normalization functions.
This commit is contained in:
parent
441891f376
commit
edb7bb4766
4 changed files with 182 additions and 0 deletions
|
@ -3273,6 +3273,70 @@ Compute a hash value for @var{S}. the optional argument @var{bound} is a non-ne
|
||||||
Compute a hash value for @var{S}. The optional argument @var{bound} is a non-negative exact integer specifying the range of the hash function. A positive value restricts the return value to the range [0,bound).
|
Compute a hash value for @var{S}. The optional argument @var{bound} is a non-negative exact integer specifying the range of the hash function. A positive value restricts the return value to the range [0,bound).
|
||||||
@end deffn
|
@end deffn
|
||||||
|
|
||||||
|
Because the same visual appearance of an abstract Unicode character can
|
||||||
|
be obtained via multiple sequences of Unicode characters, even the
|
||||||
|
case-insensitive string comparison functions described above may return
|
||||||
|
@code{#f} when presented with strings containing different
|
||||||
|
representations of the same character. For example, the Unicode
|
||||||
|
character ``LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE'' can be
|
||||||
|
represented with a single character (U+1E69) or by the character ``LATIN
|
||||||
|
SMALL LETTER S'' (U+0073) followed by the combining marks ``COMBINING
|
||||||
|
DOT BELOW'' (U+0323) and ``COMBINING DOT ABOVE'' (U+0307).
|
||||||
|
|
||||||
|
For this reason, it is often desirable to ensure that the strings
|
||||||
|
to be compared are using a mutually consistent representation for every
|
||||||
|
character. The Unicode standard defines two methods of normalizing the
|
||||||
|
contents of strings: Decomposition, which breaks composite characters
|
||||||
|
into a set of constituent characters with an ordering defined by the
|
||||||
|
Unicode Standard; and composition, which performs the converse.
|
||||||
|
|
||||||
|
There are two decomposition operations. ``Canonical decomposition''
|
||||||
|
produces character sequences that share the same visual appearance as
|
||||||
|
the original characters, while ``compatibility decomposition'' produces
|
||||||
|
ones whose visual appearances may differ from the originals but which
|
||||||
|
represent the same abstract character.
|
||||||
|
|
||||||
|
These operations are encapsulated in the following set of normalization
|
||||||
|
forms:
|
||||||
|
|
||||||
|
@table @dfn
|
||||||
|
@item NFD
|
||||||
|
Characters are decomposed to their canonical forms.
|
||||||
|
|
||||||
|
@item NFKD
|
||||||
|
Characters are decomposed to their compatibility forms.
|
||||||
|
|
||||||
|
@item NFC
|
||||||
|
Characters are decomposed to their canonical forms, then composed.
|
||||||
|
|
||||||
|
@item NFKC
|
||||||
|
Characters are decomposed to their compatibility forms, then composed.
|
||||||
|
|
||||||
|
@end table
|
||||||
|
|
||||||
|
The functions below put their arguments into one of the forms described
|
||||||
|
above.
|
||||||
|
|
||||||
|
@deffn {Scheme Procedure} string-normalize-nfd s
|
||||||
|
@deffnx {C Function} scm_string_normalize_nfd (s)
|
||||||
|
Return the @code{NFD} normalized form of @var{s}.
|
||||||
|
@end deffn
|
||||||
|
|
||||||
|
@deffn {Scheme Procedure} string-normalize-nfkd s
|
||||||
|
@deffnx {C Function} scm_string_normalize_nfkd (s)
|
||||||
|
Return the @code{NFKD} normalized form of @var{s}.
|
||||||
|
@end deffn
|
||||||
|
|
||||||
|
@deffn {Scheme Procedure} string-normalize-nfc s
|
||||||
|
@deffnx {C Function} scm_string_normalize_nfc (s)
|
||||||
|
Return the @code{NFC} normalized form of @var{s}.
|
||||||
|
@end deffn
|
||||||
|
|
||||||
|
@deffn {Scheme Procedure} string-normalize-nfkc s
|
||||||
|
@deffnx {C Function} scm_string_normalize_nfkc (s)
|
||||||
|
Return the @code{NFKC} normalized form of @var{s}.
|
||||||
|
@end deffn
|
||||||
|
|
||||||
@node String Searching
|
@node String Searching
|
||||||
@subsubsection String Searching
|
@subsubsection String Searching
|
||||||
|
|
||||||
|
|
|
@ -25,6 +25,7 @@
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
|
#include <uninorm.h>
|
||||||
#include <unistr.h>
|
#include <unistr.h>
|
||||||
#include <uniconv.h>
|
#include <uniconv.h>
|
||||||
|
|
||||||
|
@ -1736,6 +1737,78 @@ scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* This function is a partial clone of SCM_STRING_TO_U32_BUF from
|
||||||
|
libguile/i18n.c. It would be useful to have this factored out into a more
|
||||||
|
convenient location, but its use of alloca makes that tricky to do. */
|
||||||
|
|
||||||
|
static SCM
|
||||||
|
normalize_str (SCM string, uninorm_t form)
|
||||||
|
{
|
||||||
|
SCM ret;
|
||||||
|
scm_t_uint32 *w_str;
|
||||||
|
scm_t_wchar *cbuf;
|
||||||
|
size_t rlen, len = scm_i_string_length (string);
|
||||||
|
|
||||||
|
if (scm_i_is_narrow_string (string))
|
||||||
|
{
|
||||||
|
size_t i;
|
||||||
|
const char *buf = scm_i_string_chars (string);
|
||||||
|
|
||||||
|
w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
|
||||||
|
|
||||||
|
for (i = 0; i < len; i ++)
|
||||||
|
w_str[i] = (unsigned char) buf[i];
|
||||||
|
w_str[len] = 0;
|
||||||
|
}
|
||||||
|
else w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
|
||||||
|
w_str = u32_normalize (form, w_str, len, NULL, &rlen);
|
||||||
|
|
||||||
|
ret = scm_i_make_wide_string (rlen, &cbuf);
|
||||||
|
u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
|
||||||
|
free (w_str);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Scheme primitive `string-normalize-nfc': check that the argument is
   a string, then hand it to normalize_str with UNINORM_NFC.  */
SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
            (SCM string),
            "Returns the NFC normalized form of @var{string}.")
#define FUNC_NAME s_scm_string_normalize_nfc
{
  SCM_VALIDATE_STRING (1, string);
  return normalize_str (string, UNINORM_NFC);
}
#undef FUNC_NAME
|
||||||
|
|
||||||
|
/* Scheme primitive `string-normalize-nfd': check that the argument is
   a string, then hand it to normalize_str with UNINORM_NFD.  */
SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
            (SCM string),
            "Returns the NFD normalized form of @var{string}.")
#define FUNC_NAME s_scm_string_normalize_nfd
{
  SCM_VALIDATE_STRING (1, string);
  return normalize_str (string, UNINORM_NFD);
}
#undef FUNC_NAME
|
||||||
|
|
||||||
|
/* Scheme primitive `string-normalize-nfkc': check that the argument is
   a string, then hand it to normalize_str with UNINORM_NFKC.  */
SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
            (SCM string),
            "Returns the NFKC normalized form of @var{string}.")
#define FUNC_NAME s_scm_string_normalize_nfkc
{
  SCM_VALIDATE_STRING (1, string);
  return normalize_str (string, UNINORM_NFKC);
}
#undef FUNC_NAME
|
||||||
|
|
||||||
|
/* Scheme primitive `string-normalize-nfkd': check that the argument is
   a string, then hand it to normalize_str with UNINORM_NFKD.  */
SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
            (SCM string),
            "Returns the NFKD normalized form of @var{string}.")
#define FUNC_NAME s_scm_string_normalize_nfkd
{
  SCM_VALIDATE_STRING (1, string);
  return normalize_str (string, UNINORM_NFKD);
}
#undef FUNC_NAME
|
||||||
|
|
||||||
/* converts C scm_array of strings to SCM scm_list of strings. */
|
/* converts C scm_array of strings to SCM scm_list of strings. */
|
||||||
/* If argc < 0, a null terminated scm_array is assumed. */
|
/* If argc < 0, a null terminated scm_array is assumed. */
|
||||||
SCM
|
SCM
|
||||||
|
|
|
@ -142,6 +142,11 @@ SCM_INTERNAL char *scm_to_stringn (SCM str, size_t *lenp,
|
||||||
SCM_INTERNAL scm_t_uint8 *scm_i_to_utf8_string (SCM str);
|
SCM_INTERNAL scm_t_uint8 *scm_i_to_utf8_string (SCM str);
|
||||||
SCM_API size_t scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len);
|
SCM_API size_t scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len);
|
||||||
|
|
||||||
|
SCM_API SCM scm_string_normalize_nfd (SCM str);
|
||||||
|
SCM_API SCM scm_string_normalize_nfkd (SCM str);
|
||||||
|
SCM_API SCM scm_string_normalize_nfc (SCM str);
|
||||||
|
SCM_API SCM scm_string_normalize_nfkc (SCM str);
|
||||||
|
|
||||||
SCM_API SCM scm_makfromstrs (int argc, char **argv);
|
SCM_API SCM scm_makfromstrs (int argc, char **argv);
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -385,6 +385,46 @@
|
||||||
(eq? (char-ci>=? (integer->char 0) (integer->char 255))
|
(eq? (char-ci>=? (integer->char 0) (integer->char 255))
|
||||||
(string-ci>=? (string-ints 0) (string-ints 255)))))
|
(string-ci>=? (string-ints 0) (string-ints 255)))))
|
||||||
|
|
||||||
|
;;
|
||||||
|
;; Unicode string normalization forms
|
||||||
|
;;
|
||||||
|
|
||||||
|
;;
|
||||||
|
;; string-normalize-nfd
|
||||||
|
;;
|
||||||
|
|
||||||
|
(with-test-prefix "string-normalize-nfd"
|
||||||
|
|
||||||
|
(pass-if "canonical decomposition is equal?"
|
||||||
|
(equal? (string-normalize-nfd "\xe9") "\x65\u0301")))
|
||||||
|
|
||||||
|
;;
|
||||||
|
;; string-normalize-nfkd
|
||||||
|
;;
|
||||||
|
|
||||||
|
(with-test-prefix "string-normalize-nfkd"
|
||||||
|
|
||||||
|
(pass-if "compatibility decomposition is equal?"
|
||||||
|
(equal? (string-normalize-nfkd "\u1e9b\u0323") "s\u0323\u0307")))
|
||||||
|
|
||||||
|
;;
|
||||||
|
;; string-normalize-nfc
|
||||||
|
;;
|
||||||
|
|
||||||
|
(with-test-prefix "string-normalize-nfc"
|
||||||
|
|
||||||
|
(pass-if "canonical composition is equal?"
|
||||||
|
(equal? (string-normalize-nfc "\x65\u0301") "\xe9")))
|
||||||
|
|
||||||
|
;;
|
||||||
|
;; string-normalize-nfkc
|
||||||
|
;;
|
||||||
|
|
||||||
|
(with-test-prefix "string-normalize-nfkc"
|
||||||
|
|
||||||
|
(pass-if "compatibility composition is equal?"
|
||||||
|
(equal? (string-normalize-nfkc "\u1e9b\u0323") "\u1e69")))
|
||||||
|
|
||||||
;;
|
;;
|
||||||
;; string-ref
|
;; string-ref
|
||||||
;;
|
;;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue