diff --git a/doc/ref/api-data.texi b/doc/ref/api-data.texi index e847c9caa..8e797acc3 100755 --- a/doc/ref/api-data.texi +++ b/doc/ref/api-data.texi @@ -3273,6 +3273,70 @@ Compute a hash value for @var{S}. the optional argument @var{bound} is a non-ne Compute a hash value for @var{S}. the optional argument @var{bound} is a non-negative exact integer specifying the range of the hash function. A positive value restricts the return value to the range [0,bound). @end deffn +Because the same visual appearance of an abstract Unicode character can +be obtained via multiple sequences of Unicode characters, even the +case-insensitive string comparison functions described above may return +@code{#f} when presented with strings containing different +representations of the same character. For example, the Unicode +character ``LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE'' can be +represented with a single character (U+1E69) or by the character ``LATIN +SMALL LETTER S'' (U+0073) followed by the combining marks ``COMBINING +DOT BELOW'' (U+0323) and ``COMBINING DOT ABOVE'' (U+0307). + +For this reason, it is often desirable to ensure that the strings +to be compared are using a mutually consistent representation for every +character. The Unicode standard defines two methods of normalizing the +contents of strings: decomposition, which breaks composite characters +into a set of constituent characters with an ordering defined by the +Unicode Standard; and composition, which performs the converse. + +There are two decomposition operations. ``Canonical decomposition'' +produces character sequences that share the same visual appearance as +the original characters, while ``compatibility decomposition'' produces +ones whose visual appearances may differ from the originals but which +represent the same abstract character. + +These operations are encapsulated in the following set of normalization +forms: + +@table @dfn +@item NFD +Characters are decomposed to their canonical forms. 
+ +@item NFKD +Characters are decomposed to their compatibility forms. + +@item NFC +Characters are decomposed to their canonical forms, then composed. + +@item NFKC +Characters are decomposed to their compatibility forms, then composed. + +@end table + +The functions below put their arguments into one of the forms described +above. + +@deffn {Scheme Procedure} string-normalize-nfd s +@deffnx {C Function} scm_string_normalize_nfd (s) +Return the @code{NFD} normalized form of @var{s}. +@end deffn + +@deffn {Scheme Procedure} string-normalize-nfkd s +@deffnx {C Function} scm_string_normalize_nfkd (s) +Return the @code{NFKD} normalized form of @var{s}. +@end deffn + +@deffn {Scheme Procedure} string-normalize-nfc s +@deffnx {C Function} scm_string_normalize_nfc (s) +Return the @code{NFC} normalized form of @var{s}. +@end deffn + +@deffn {Scheme Procedure} string-normalize-nfkc s +@deffnx {C Function} scm_string_normalize_nfkc (s) +Return the @code{NFKC} normalized form of @var{s}. +@end deffn + @node String Searching @subsubsection String Searching diff --git a/libguile/strings.c b/libguile/strings.c index 3151bbeb1..0cbab3e51 100644 --- a/libguile/strings.c +++ b/libguile/strings.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -1736,6 +1737,78 @@ scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len) return len; } +/* This function is a partial clone of SCM_STRING_TO_U32_BUF from + libguile/i18n.c. It would be useful to have this factored out into a more + convenient location, but its use of alloca makes that tricky to do. 
*/ + +static SCM +normalize_str (SCM string, uninorm_t form) +{ + SCM ret; + scm_t_uint32 *w_str; + scm_t_wchar *cbuf; + size_t rlen, len = scm_i_string_length (string); + + if (scm_i_is_narrow_string (string)) + { + size_t i; + const char *buf = scm_i_string_chars (string); + + w_str = alloca (sizeof (scm_t_wchar) * (len + 1)); + + for (i = 0; i < len; i ++) + w_str[i] = (unsigned char) buf[i]; + w_str[len] = 0; + } + else w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string); + w_str = u32_normalize (form, w_str, len, NULL, &rlen); + + ret = scm_i_make_wide_string (rlen, &cbuf); + u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen); + free (w_str); + return ret; +} + +SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0, + (SCM string), + "Returns the NFC normalized form of @var{string}.") +#define FUNC_NAME s_scm_string_normalize_nfc +{ + SCM_VALIDATE_STRING (1, string); + return normalize_str (string, UNINORM_NFC); +} +#undef FUNC_NAME + +SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0, + (SCM string), + "Returns the NFD normalized form of @var{string}.") +#define FUNC_NAME s_scm_string_normalize_nfd +{ + SCM_VALIDATE_STRING (1, string); + return normalize_str (string, UNINORM_NFD); +} +#undef FUNC_NAME + +SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0, + (SCM string), + "Returns the NFKC normalized form of @var{string}.") +#define FUNC_NAME s_scm_string_normalize_nfkc +{ + SCM_VALIDATE_STRING (1, string); + return normalize_str (string, UNINORM_NFKC); +} +#undef FUNC_NAME + +SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0, + (SCM string), + "Returns the NFKD normalized form of @var{string}.") +#define FUNC_NAME s_scm_string_normalize_nfkd +{ + SCM_VALIDATE_STRING (1, string); + return normalize_str (string, UNINORM_NFKD); +} +#undef FUNC_NAME + /* converts C scm_array of strings to SCM scm_list of strings. */ /* If argc < 0, a null terminated scm_array is assumed. 
*/ SCM diff --git a/libguile/strings.h b/libguile/strings.h index edff0f825..6eafafa5d 100644 --- a/libguile/strings.h +++ b/libguile/strings.h @@ -142,6 +142,11 @@ SCM_INTERNAL char *scm_to_stringn (SCM str, size_t *lenp, SCM_INTERNAL scm_t_uint8 *scm_i_to_utf8_string (SCM str); SCM_API size_t scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len); +SCM_API SCM scm_string_normalize_nfd (SCM str); /* NFD: canonical decomposition */ +SCM_API SCM scm_string_normalize_nfkd (SCM str); /* NFKD: compatibility decomposition */ +SCM_API SCM scm_string_normalize_nfc (SCM str); /* NFC: canonical decomposition, then composition */ +SCM_API SCM scm_string_normalize_nfkc (SCM str); /* NFKC: compatibility decomposition, then composition */ + SCM_API SCM scm_makfromstrs (int argc, char **argv); diff --git a/test-suite/tests/strings.test b/test-suite/tests/strings.test index 013c1a863..984178d72 100644 --- a/test-suite/tests/strings.test +++ b/test-suite/tests/strings.test @@ -385,6 +385,46 @@ (eq? (char-ci>=? (integer->char 0) (integer->char 255)) (string-ci>=? (string-ints 0) (string-ints 255))))) +;; +;; Unicode string normalization forms +;; + +;; +;; string-normalize-nfd +;; + +(with-test-prefix "string-normalize-nfd" + + (pass-if "canonical decomposition is equal?" + (equal? (string-normalize-nfd "\xe9") "\x65\u0301"))) + +;; +;; string-normalize-nfkd +;; + +(with-test-prefix "string-normalize-nfkd" + + (pass-if "compatibility decomposition is equal?" + (equal? (string-normalize-nfkd "\u1e9b\u0323") "s\u0323\u0307"))) + +;; +;; string-normalize-nfc +;; + +(with-test-prefix "string-normalize-nfc" + + (pass-if "canonical composition is equal?" + (equal? (string-normalize-nfc "\x65\u0301") "\xe9"))) + +;; +;; string-normalize-nfkc +;; + +(with-test-prefix "string-normalize-nfkc" + + (pass-if "compatibility composition is equal?" + (equal? (string-normalize-nfkc "\u1e9b\u0323") "\u1e69"))) + ;; ;; string-ref ;;