mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-04-30 03:40:34 +02:00
Reimplement 'unidata_to_charset.pl' in Awk.
* libguile/unidata_to_charset.pl: Delete file. * libguile/unidata_to_charset.awk: New file. * libguile/Makefile.am (EXTRA_DIST): Adjust accordingly. Signed-off-by: Ludovic Courtès <ludo@gnu.org>
This commit is contained in:
parent
6e82a4516a
commit
63886aeda2
3 changed files with 410 additions and 402 deletions
|
@ -728,7 +728,7 @@ EXTRA_DIST = ChangeLog-scm ChangeLog-threads \
|
||||||
guile-func-name-check \
|
guile-func-name-check \
|
||||||
cpp-E.syms cpp-E.c cpp-SIG.syms cpp-SIG.c \
|
cpp-E.syms cpp-E.c cpp-SIG.syms cpp-SIG.c \
|
||||||
c-tokenize.lex \
|
c-tokenize.lex \
|
||||||
scmconfig.h.top libgettext.h unidata_to_charset.pl libguile.map \
|
scmconfig.h.top libgettext.h unidata_to_charset.awk libguile.map \
|
||||||
vm-operations.h libguile-@GUILE_EFFECTIVE_VERSION@-gdb.scm \
|
vm-operations.h libguile-@GUILE_EFFECTIVE_VERSION@-gdb.scm \
|
||||||
$(lightening_c_files) $(lightening_extra_files)
|
$(lightening_c_files) $(lightening_extra_files)
|
||||||
# $(DOT_DOC_FILES) $(EXTRA_DOT_DOC_FILES) \
|
# $(DOT_DOC_FILES) $(EXTRA_DOT_DOC_FILES) \
|
||||||
|
|
409
libguile/unidata_to_charset.awk
Normal file
409
libguile/unidata_to_charset.awk
Normal file
|
@ -0,0 +1,409 @@
|
||||||
|
# unidata_to_charset.awk --- Compute SRFI-14 charsets from UnicodeData.txt
|
||||||
|
#
|
||||||
|
# Copyright (C) 2009, 2010, 2022 Free Software Foundation, Inc.
|
||||||
|
#
|
||||||
|
# This library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 3 of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with this library; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
|
||||||
|
# Utilities
|
||||||
|
###########
|
||||||
|
|
||||||
|
# Print MESSAGE to standard error, and exit with STATUS.
|
||||||
|
function die(status, message) {
    # Record the failure first so that the END rule, which Awk still
    # runs after 'exit', can notice it and bail out without printing.
    exit_status = status;

    # POSIX Awk has no portable name for standard error, so the
    # message is piped through a shell command that redirects it.
    print "unidata_to_charset.awk:", message | "cat 1>&2";

    exit status;
}
|
||||||
|
|
||||||
|
# Parse the string S as a hexadecimal number and return its value.
# Only the digits 0-9 and the uppercase letters A-F are accepted; an
# empty string or any other character is a fatal error.  Note that R,
# C, B, and I are local variables that need not be set by callers (I
# must be local too: the rules below use a global 'i' as their own
# loop counter).  Most Awk implementations have an 'strtonum' function
# that we could use, but it is not part of POSIX.
function hex(s, r, c, b, i) {
    if (length(s) == 0) {
        die(1, "Cannot parse empty string as hexadecimal.");
    }
    r = 0;
    for (i = 1; i <= length(s); i++) {
        c = substr(s, i, 1);
        # The 1-based position of C in the digit string is its value
        # plus one; 0 means C is not a hexadecimal digit.
        b = index("0123456789ABCDEF", c);
        if (b == 0) {
            die(1, "Invalid hexadecimal character: " c);
        }
        r = r * 16 + (b - 1);
    }
    return r;
}
|
||||||
|
|
||||||
|
# Program initialization
|
||||||
|
########################
|
||||||
|
|
||||||
|
BEGIN {
    # UnicodeData.txt separates its columns with semicolons.
    FS = ";";

    # Set to a non-zero value by die(); checked again in the END rule.
    exit_status = 0;

    # Names of the charsets, in the order in which END prints them.
    all_charsets_count = 0;
    all_charsets[all_charsets_count++] = "lower_case";
    all_charsets[all_charsets_count++] = "upper_case";
    all_charsets[all_charsets_count++] = "title_case";
    all_charsets[all_charsets_count++] = "letter";
    all_charsets[all_charsets_count++] = "digit";
    all_charsets[all_charsets_count++] = "hex_digit";
    all_charsets[all_charsets_count++] = "letter_plus_digit";
    all_charsets[all_charsets_count++] = "graphic";
    all_charsets[all_charsets_count++] = "whitespace";
    all_charsets[all_charsets_count++] = "printing";
    all_charsets[all_charsets_count++] = "iso_control";
    all_charsets[all_charsets_count++] = "punctuation";
    all_charsets[all_charsets_count++] = "symbol";
    all_charsets[all_charsets_count++] = "blank";
    all_charsets[all_charsets_count++] = "ascii";
    all_charsets[all_charsets_count++] = "empty";
    all_charsets[all_charsets_count++] = "designated";

    # Per-charset state: the range currently being accumulated
    # ("start"/"end", -1 meaning none yet) and the number of completed
    # ranges stored so far ("count").
    for (i = 0; i < all_charsets_count; i++) {
        cs = all_charsets[i];
        state[cs, "count"] = 0;
        state[cs, "start"] = -1;
        state[cs, "end"] = -1;
    }
}
|
||||||
|
|
||||||
|
# Record initialization
|
||||||
|
#######################
|
||||||
|
|
||||||
|
# In this block we give names to each field, and do some basic
|
||||||
|
# initialization.
|
||||||
|
{
    # No charset has claimed this record yet.
    charset_count = 0;

    # Name the UnicodeData.txt columns that the predicates use.
    name = $2;
    category = $3;
    uppercase = $13;
    lowercase = $14;

    # A record describes a single code point unless the range rule
    # below widens 'codepoint_end'.
    codepoint = hex($1);
    codepoint_end = codepoint;
}
|
||||||
|
|
||||||
|
# Some pairs of lines in UnicodeData.txt delimit ranges of
|
||||||
|
# characters.
|
||||||
|
name ~ /First>$/ {
    # Pull in the matching "Last>" record.  A plain 'getline' replaces
    # $0 and the fields but leaves the variables set by the record
    # initialization rule (name, category, ...) untouched.  It returns
    # 0 at end of input and -1 on error, both of which mean the range
    # was never closed.
    if ((getline) <= 0) {
        die(1, "Missing end of range in Unicode data.");
    }

    # The closing record must carry the same name with "First>"
    # replaced by "Last>".
    last_name = name;
    sub(/First>$/, "Last>", last_name);
    if (last_name != $2) {
        die(1, "Invalid range in Unicode data.");
    }

    codepoint_end = hex($1);
}
|
||||||
|
|
||||||
|
# Character set predicates
|
||||||
|
##########################
|
||||||
|
|
||||||
|
## The lower_case character set
|
||||||
|
###############################
|
||||||
|
|
||||||
|
# For Unicode, we follow Java's specification: a character is
|
||||||
|
# lowercase if
|
||||||
|
# * it is not in the range [U+2000,U+2FFF] ([8192,12287]), and
|
||||||
|
# * the Unicode attribute table does not give a lowercase mapping
|
||||||
|
# for it, and
|
||||||
|
# * at least one of the following is true:
|
||||||
|
# o the Unicode attribute table gives a mapping to uppercase
|
||||||
|
# for the character, or
|
||||||
|
# o the name for the character in the Unicode attribute table
|
||||||
|
# contains the words "SMALL LETTER" or "SMALL LIGATURE".
|
||||||
|
|
||||||
|
lowercase == "" &&
!(codepoint >= 8192 && codepoint <= 12287) &&
(name ~ /(SMALL LETTER|SMALL LIGATURE)/ || uppercase != "") {
    # No lowercase mapping, outside [8192,12287], and either an
    # uppercase mapping or a "small" name.
    charsets[charset_count] = "lower_case";
    charset_count++;
}
|
||||||
|
|
||||||
|
## The upper_case character set
|
||||||
|
###############################
|
||||||
|
|
||||||
|
# For Unicode, we follow Java's specification: a character is
|
||||||
|
# uppercase if
|
||||||
|
# * it is not in the range [U+2000,U+2FFF] ([8192,12287]), and
|
||||||
|
# * the Unicode attribute table does not give an uppercase mapping
|
||||||
|
# for it (this excludes titlecase characters), and
|
||||||
|
# * at least one of the following is true:
|
||||||
|
# o the Unicode attribute table gives a mapping to lowercase
|
||||||
|
# for the character, or
|
||||||
|
# o the name for the character in the Unicode attribute table
|
||||||
|
# contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE".
|
||||||
|
|
||||||
|
uppercase == "" &&
!(codepoint >= 8192 && codepoint <= 12287) &&
(name ~ /(CAPITAL LETTER|CAPITAL LIGATURE)/ || lowercase != "") {
    # No uppercase mapping, outside [8192,12287], and either a
    # lowercase mapping or a "capital" name.
    charsets[charset_count] = "upper_case";
    charset_count++;
}
|
||||||
|
|
||||||
|
## The title_case character set
|
||||||
|
###############################
|
||||||
|
|
||||||
|
# A character is titlecase if it has the category Lt in the character
|
||||||
|
# attribute database.
|
||||||
|
|
||||||
|
category == "Lt" {
    # Category Lt is exactly the titlecase letters.
    charsets[charset_count] = "title_case";
    charset_count++;
}
|
||||||
|
|
||||||
|
## The letter character set
|
||||||
|
###########################
|
||||||
|
|
||||||
|
# A letter is any character with one of the letter categories (Lu, Ll,
|
||||||
|
# Lt, Lm, Lo) in the Unicode character database.
|
||||||
|
|
||||||
|
category ~ /^L[ultmo]$/ {
    # Exactly the categories Lu, Ll, Lt, Lm and Lo.  Every letter also
    # belongs to the "letter_plus_digit" union.
    charsets[charset_count] = "letter";
    charset_count++;
    charsets[charset_count] = "letter_plus_digit";
    charset_count++;
}
|
||||||
|
|
||||||
|
## The digit character set
|
||||||
|
##########################
|
||||||
|
|
||||||
|
# A character is a digit if it has the category Nd in the character
|
||||||
|
# attribute database. In Latin-1 and ASCII, the only such characters
|
||||||
|
# are 0123456789. In Unicode, there are other digit characters in
|
||||||
|
# other code blocks, such as Gujarati digits and Tibetan digits.
|
||||||
|
|
||||||
|
category == "Nd" {
    # Every decimal digit also belongs to the "letter_plus_digit"
    # union.
    charsets[charset_count] = "digit";
    charset_count++;
    charsets[charset_count] = "letter_plus_digit";
    charset_count++;
}
|
||||||
|
|
||||||
|
## The hex_digit character set
|
||||||
|
##############################
|
||||||
|
|
||||||
|
# The only hex digits are 0123456789abcdefABCDEF.
|
||||||
|
|
||||||
|
(48 <= codepoint && codepoint <= 57) ||
(65 <= codepoint && codepoint <= 70) ||
(97 <= codepoint && codepoint <= 102) {
    # The three ranges are '0'-'9', 'A'-'F' and 'a'-'f'.
    charsets[charset_count] = "hex_digit";
    charset_count++;
}
|
||||||
|
|
||||||
|
## The graphic character set
|
||||||
|
############################
|
||||||
|
|
||||||
|
# Characters that would 'use ink' when printed
|
||||||
|
|
||||||
|
category ~ /L|M|N|P|S/ {
    # Letters, marks, numbers, punctuation and symbols.  Every graphic
    # character also belongs to the "printing" union.
    charsets[charset_count] = "graphic";
    charset_count++;
    charsets[charset_count] = "printing";
    charset_count++;
}
|
||||||
|
|
||||||
|
## The whitespace character set
|
||||||
|
###############################
|
||||||
|
|
||||||
|
# A whitespace character is either
|
||||||
|
# * a character with one of the space, line, or paragraph separator
|
||||||
|
# categories (Zs, Zl or Zp) of the Unicode character database.
|
||||||
|
# * U+0009 (09) Horizontal tabulation (\t control-I)
|
||||||
|
# * U+000A (10) Line feed (\n control-J)
|
||||||
|
# * U+000B (11) Vertical tabulation (\v control-K)
|
||||||
|
# * U+000C (12) Form feed (\f control-L)
|
||||||
|
# * U+000D (13) Carriage return (\r control-M)
|
||||||
|
|
||||||
|
(codepoint >= 9 && codepoint <= 13) ||
category ~ /Zs|Zl|Zp/ {
    # Code points 9 through 13 plus the separator categories.  Every
    # whitespace character also belongs to the "printing" union.
    charsets[charset_count] = "whitespace";
    charset_count++;
    charsets[charset_count] = "printing";
    charset_count++;
}
|
||||||
|
|
||||||
|
## The iso_control character set
|
||||||
|
################################
|
||||||
|
|
||||||
|
# The ISO control characters are the Unicode/Latin-1 characters in the
|
||||||
|
# ranges [U+0000,U+001F] ([0,31]) and [U+007F,U+009F] ([127,159]).
|
||||||
|
|
||||||
|
(codepoint >= 127 && codepoint <= 159) ||
(codepoint >= 0 && codepoint <= 31) {
    # The two control ranges [0,31] and [127,159].
    charsets[charset_count] = "iso_control";
    charset_count++;
}
|
||||||
|
|
||||||
|
## The punctuation character set
|
||||||
|
################################
|
||||||
|
|
||||||
|
# A punctuation character is any character that has one of the
|
||||||
|
# punctuation categories in the Unicode character database (Pc, Pd,
|
||||||
|
# Ps, Pe, Pi, Pf, or Po.)
|
||||||
|
|
||||||
|
# Note that srfi-14 gives conflicting requirements!! It claims that
|
||||||
|
# only the Unicode punctuation is necessary, but, explicitly calls out
|
||||||
|
# the soft hyphen character (U+00AD) as punctuation. Current versions
|
||||||
|
# of Unicode consider U+00AD to be a formatting character, not
|
||||||
|
# punctuation.
|
||||||
|
|
||||||
|
category ~ /P/ {
    # Any category code containing "P".
    charsets[charset_count] = "punctuation";
    charset_count++;
}
|
||||||
|
|
||||||
|
## The symbol character set
|
||||||
|
###########################
|
||||||
|
|
||||||
|
# A symbol is any character that has one of the symbol categories in
|
||||||
|
# the Unicode character database (Sm, Sc, Sk, or So).
|
||||||
|
|
||||||
|
category ~ /S/ {
    # Any category code containing "S".
    charsets[charset_count] = "symbol";
    charset_count++;
}
|
||||||
|
|
||||||
|
## The blank character set
|
||||||
|
##########################
|
||||||
|
|
||||||
|
# Blank chars are horizontal whitespace. A blank character is either
|
||||||
|
# * a character with the space separator category (Zs) in the
|
||||||
|
# Unicode character database.
|
||||||
|
# * U+0009 (9) Horizontal tabulation (\t control-I)
|
||||||
|
|
||||||
|
codepoint == 9 || category ~ /Zs/ {
    # Horizontal tabulation plus the space separators.
    charsets[charset_count] = "blank";
    charset_count++;
}
|
||||||
|
|
||||||
|
## The ascii character set
|
||||||
|
##########################
|
||||||
|
|
||||||
|
codepoint < 128 {
    # Code points are integers, so this is the range [0,127].
    charsets[charset_count] = "ascii";
    charset_count++;
}
|
||||||
|
|
||||||
|
## The designated character set
|
||||||
|
###############################
|
||||||
|
|
||||||
|
# Designated -- All characters except for the surrogates
|
||||||
|
|
||||||
|
!(category ~ /Cs/) {
    # Everything whose category is not the surrogate category.
    charsets[charset_count] = "designated";
    charset_count++;
}
|
||||||
|
|
||||||
|
## Other character sets
|
||||||
|
#######################
|
||||||
|
|
||||||
|
# Note that the "letter_plus_digit" and "printing" character sets, which
|
||||||
|
# are unions of other character sets, are included in the patterns
|
||||||
|
# matching their constituent parts (i.e., the "letter_plus_digit"
|
||||||
|
# character set is included as part of the "letter" and "digit"
|
||||||
|
# patterns).
|
||||||
|
#
|
||||||
|
# Also, the "empty" character set is computed by doing precisely nothing!
|
||||||
|
|
||||||
|
# Keeping track of state
|
||||||
|
########################
|
||||||
|
|
||||||
|
# Update the state for each charset.
|
||||||
|
{
    # Fold [codepoint, codepoint_end] into every charset that claimed
    # this record.
    for (i = 0; i < charset_count; i++) {
        cs = charsets[i];

        # The record directly follows the range being accumulated:
        # just extend that range.
        if (state[cs, "start"] != -1 && state[cs, "end"] + 1 == codepoint) {
            state[cs, "end"] = codepoint_end;
            continue;
        }

        # There is a gap: store the accumulated range, if any...
        if (state[cs, "start"] != -1) {
            count = state[cs, "count"]++;
            state[cs, "ranges", count, 0] = state[cs, "start"];
            state[cs, "ranges", count, 1] = state[cs, "end"];
        }

        # ...and start a new one at this record.
        state[cs, "start"] = codepoint;
        state[cs, "end"] = codepoint_end;
    }
}
|
||||||
|
|
||||||
|
# Printing and error handling
|
||||||
|
#############################
|
||||||
|
|
||||||
|
END {
    # An 'exit' statement still runs the END rule before the program
    # actually terminates.  If die() recorded a failure, reissue the
    # exit here so that nothing below is printed.
    if (exit_status != 0) {
        exit exit_status;
    }

    # Write a bit of a header.
    print("/* srfi-14.i.c -- standard SRFI-14 character set data */");
    print("");
    print("/* This file is #include'd by srfi-14.c. */");
    print("");
    print("/* This file was generated from");
    print(" https://unicode.org/Public/UNIDATA/UnicodeData.txt");
    print(" with the unidata_to_charset.awk script. */");
    print("");

    for (i = 0; i < all_charsets_count; i++) {
        cs = all_charsets[i];

        # The range still being accumulated has not been stored yet;
        # store it so that the last range is included.
        if (state[cs, "start"] != -1) {
            count = state[cs, "count"]++;
            state[cs, "ranges", count, 0] = state[cs, "start"];
            state[cs, "ranges", count, 1] = state[cs, "end"];
        }
        count = state[cs, "count"];

        # The range table: every entry but the last is followed by a
        # comma.
        print("static const scm_t_char_range cs_" cs "_ranges[] = {");
        for (j = 0; j + 1 < count; j++) {
            printf(" {0x%04x, 0x%04x},\n",
                   state[cs, "ranges", j, 0], state[cs, "ranges", j, 1]);
        }
        if (count > 0) {
            printf(" {0x%04x, 0x%04x}\n",
                   state[cs, "ranges", count - 1, 0],
                   state[cs, "ranges", count - 1, 1]);
        }
        print("};");
        print("");

        # The table length, with a blank line between charsets.
        printf("static const size_t cs_%s_len = %d;\n", cs, count);
        if (i + 1 < all_charsets_count) {
            print("");
        }
    }
}
|
||||||
|
|
||||||
|
# And we're done.
|
|
@ -1,401 +0,0 @@
|
||||||
#!/usr/bin/perl
|
|
||||||
# unidata_to_charset.pl --- Compute SRFI-14 charsets from UnicodeData.txt
|
|
||||||
#
|
|
||||||
# Copyright (C) 2009, 2010, 2022 Free Software Foundation, Inc.
|
|
||||||
#
|
|
||||||
# This library is free software; you can redistribute it and/or
|
|
||||||
# modify it under the terms of the GNU Lesser General Public
|
|
||||||
# License as published by the Free Software Foundation; either
|
|
||||||
# version 3 of the License, or (at your option) any later version.
|
|
||||||
#
|
|
||||||
# This library is distributed in the hope that it will be useful,
|
|
||||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
# Lesser General Public License for more details.
|
|
||||||
#
|
|
||||||
# You should have received a copy of the GNU Lesser General Public
|
|
||||||
# License along with this library; if not, write to the Free Software
|
|
||||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
||||||
|
|
||||||
# Input: the Unicode character data.  compute() rewinds and rereads
# this handle once per charset.
open(my $in, "<", "UnicodeData.txt")
    or die "Can't open UnicodeData.txt: $!";

# Output: the generated C fragment.
open(my $out, ">", "srfi-14.i.c")
    or die "Can't open srfi-14.i.c: $!";
|
|
||||||
|
|
||||||
# For Unicode, we follow Java's specification: a character is
|
|
||||||
# lowercase if
|
|
||||||
# * it is not in the range [U+2000,U+2FFF], and
|
|
||||||
# * the Unicode attribute table does not give a lowercase mapping
|
|
||||||
# for it, and
|
|
||||||
# * at least one of the following is true:
|
|
||||||
# o the Unicode attribute table gives a mapping to uppercase
|
|
||||||
# for the character, or
|
|
||||||
# o the name for the character in the Unicode attribute table
|
|
||||||
# contains the words "SMALL LETTER" or "SMALL LIGATURE".
|
|
||||||
|
|
||||||
sub lower_case {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # Nothing in [U+2000,U+2FFF] counts as lowercase.
    return 0 if $codepoint >= 0x2000 && $codepoint <= 0x2FFF;

    # A character that has a lowercase mapping is not itself lowercase.
    return 0 if defined($lowercase) && $lowercase ne "";

    # Otherwise an uppercase mapping or a "small" name qualifies it.
    return 1 if defined($uppercase) && $uppercase ne "";
    return 1 if $name =~ /(SMALL LETTER|SMALL LIGATURE)/;

    return 0;
}
|
|
||||||
|
|
||||||
# For Unicode, we follow Java's specification: a character is
|
|
||||||
# uppercase if
|
|
||||||
# * it is not in the range [U+2000,U+2FFF], and
|
|
||||||
# * the Unicode attribute table does not give an uppercase mapping
|
|
||||||
# for it (this excludes titlecase characters), and
|
|
||||||
# * at least one of the following is true:
|
|
||||||
# o the Unicode attribute table gives a mapping to lowercase
|
|
||||||
# for the character, or
|
|
||||||
# o the name for the character in the Unicode attribute table
|
|
||||||
# contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE".
|
|
||||||
|
|
||||||
sub upper_case {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # Nothing in [U+2000,U+2FFF] counts as uppercase.
    return 0 if $codepoint >= 0x2000 && $codepoint <= 0x2FFF;

    # A character that has an uppercase mapping is not itself
    # uppercase (this also rules out titlecase characters).
    return 0 if defined($uppercase) && $uppercase ne "";

    # Otherwise a lowercase mapping or a "capital" name qualifies it.
    return 1 if defined($lowercase) && $lowercase ne "";
    return 1 if $name =~ /(CAPITAL LETTER|CAPITAL LIGATURE)/;

    return 0;
}
|
|
||||||
|
|
||||||
# A character is titlecase if it has the category Lt in the character
|
|
||||||
# attribute database.
|
|
||||||
|
|
||||||
sub title_case {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # Only the general category matters; a missing category is not Lt.
    return (defined($category) && $category eq "Lt") ? 1 : 0;
}
|
|
||||||
|
|
||||||
# A letter is any character with one of the letter categories (Lu, Ll,
|
|
||||||
# Lt, Lm, Lo) in the Unicode character database.
|
|
||||||
|
|
||||||
sub letter {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # The five letter categories.
    my %is_letter_category = map { $_ => 1 } qw(Lu Ll Lt Lm Lo);

    return 0 unless defined($category);
    return $is_letter_category{$category} ? 1 : 0;
}
|
|
||||||
|
|
||||||
# A character is a digit if it has the category Nd in the character
|
|
||||||
# attribute database. In Latin-1 and ASCII, the only such characters
|
|
||||||
# are 0123456789. In Unicode, there are other digit characters in
|
|
||||||
# other code blocks, such as Gujarati digits and Tibetan digits.
|
|
||||||
|
|
||||||
sub digit {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # Only the general category matters; a missing category is not Nd.
    return (defined($category) && $category eq "Nd") ? 1 : 0;
}
|
|
||||||
|
|
||||||
# The only hex digits are 0123456789abcdefABCDEF.
|
|
||||||
|
|
||||||
sub hex_digit {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # The code point ranges for 0-9, A-F and a-f.
    for my $range ([0x30, 0x39], [0x41, 0x46], [0x61, 0x66]) {
        my ($low, $high) = @{$range};
        return 1 if $codepoint >= $low && $codepoint <= $high;
    }

    return 0;
}
|
|
||||||
|
|
||||||
# The union of char-set:letter and char-set:digit.
|
|
||||||
|
|
||||||
sub letter_plus_digit {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # Union of the two sets: accept as soon as either predicate does.
    return 1 if letter($codepoint, $name, $category, $uppercase, $lowercase);
    return 1 if digit($codepoint, $name, $category, $uppercase, $lowercase);

    return 0;
}
|
|
||||||
|
|
||||||
# Characters that would 'use ink' when printed
|
|
||||||
sub graphic {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # Letters, marks, numbers, punctuation and symbols.
    return $category =~ /L|M|N|P|S/ ? 1 : 0;
}
|
|
||||||
|
|
||||||
# A whitespace character is either
|
|
||||||
# * a character with one of the space, line, or paragraph separator
|
|
||||||
# categories (Zs, Zl or Zp) of the Unicode character database.
|
|
||||||
# * U+0009 Horizontal tabulation (\t control-I)
|
|
||||||
# * U+000A Line feed (\n control-J)
|
|
||||||
# * U+000B Vertical tabulation (\v control-K)
|
|
||||||
# * U+000C Form feed (\f control-L)
|
|
||||||
# * U+000D Carriage return (\r control-M)
|
|
||||||
|
|
||||||
sub whitespace {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # Space, line and paragraph separators.
    return 1 if $category =~ /Zs|Zl|Zp/;

    # Tab, line feed, vertical tab, form feed and carriage return.
    return (grep { $codepoint == $_ } 0x9, 0xA, 0xB, 0xC, 0xD) ? 1 : 0;
}
|
|
||||||
|
|
||||||
# A printing character is one that would occupy space when printed,
|
|
||||||
# i.e., a graphic character or a space character. char-set:printing is
|
|
||||||
# the union of char-set:whitespace and char-set:graphic.
|
|
||||||
|
|
||||||
sub printing {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # Union of the two sets: accept as soon as either predicate does.
    return 1 if whitespace($codepoint, $name, $category, $uppercase, $lowercase);
    return 1 if graphic($codepoint, $name, $category, $uppercase, $lowercase);

    return 0;
}
|
|
||||||
|
|
||||||
# The ISO control characters are the Unicode/Latin-1 characters in the
|
|
||||||
# ranges [U+0000,U+001F] and [U+007F,U+009F].
|
|
||||||
|
|
||||||
sub iso_control {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # The two control ranges [U+0000,U+001F] and [U+007F,U+009F].
    return 1 if $codepoint >= 0x00 && $codepoint <= 0x1F;
    return 1 if $codepoint >= 0x7F && $codepoint <= 0x9F;

    return 0;
}
|
|
||||||
|
|
||||||
# A punctuation character is any character that has one of the
|
|
||||||
# punctuation categories in the Unicode character database (Pc, Pd,
|
|
||||||
# Ps, Pe, Pi, Pf, or Po.)
|
|
||||||
|
|
||||||
# Note that srfi-14 gives conflicting requirements!! It claims that
|
|
||||||
# only the Unicode punctuation is necessary, but, explicitly calls out
|
|
||||||
# the soft hyphen character (U+00AD) as punctuation. Current versions
|
|
||||||
# of Unicode consider U+00AD to be a formatting character, not
|
|
||||||
# punctuation.
|
|
||||||
|
|
||||||
sub punctuation {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # Any category code containing "P".
    return $category =~ /P/ ? 1 : 0;
}
|
|
||||||
|
|
||||||
# A symbol is any character that has one of the symbol categories in
|
|
||||||
# the Unicode character database (Sm, Sc, Sk, or So).
|
|
||||||
|
|
||||||
sub symbol {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # Any category code containing "S".
    return $category =~ /S/ ? 1 : 0;
}
|
|
||||||
|
|
||||||
# Blank chars are horizontal whitespace. A blank character is either
|
|
||||||
# * a character with the space separator category (Zs) in the
|
|
||||||
# Unicode character database.
|
|
||||||
# * U+0009 Horizontal tabulation (\t control-I)
|
|
||||||
sub blank {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # Space separators, plus horizontal tabulation.
    return 1 if $category =~ /Zs/;
    return 1 if $codepoint == 0x9;

    return 0;
}
|
|
||||||
|
|
||||||
# ASCII
|
|
||||||
sub ascii {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # Everything up to and including U+007F.
    return $codepoint <= 0x7F ? 1 : 0;
}
|
|
||||||
|
|
||||||
# Empty
|
|
||||||
sub empty {
    # Takes the same arguments as the other predicates but ignores
    # them: no character belongs to the empty set.
    return 0;
}
|
|
||||||
|
|
||||||
# Designated -- All characters except for the surrogates
|
|
||||||
sub designated {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # Only the surrogate category is excluded.
    return $category =~ /Cs/ ? 0 : 1;
}
|
|
||||||
|
|
||||||
|
|
||||||
# The procedure generates the two C definitions necessary to describe
# a given category: the table of code point ranges and its length.
#
# $f is the name of one of the predicate subs above.  The input handle
# $in is rewound and read in full, the charset name is echoed on
# standard output as a progress indicator, and the C text is appended
# to the output handle $out.
sub compute {
    my ($f) = @_;
    my $start = -1;
    my $end = -1;
    my $len = 0;
    my @rstart = (-1);
    my @rend = (-1);

    # Look the predicate up by name once.  Taking a reference with
    # \&{...} is the one by-name lookup Perl allows even under
    # 'strict refs'.
    my $predicate = \&{$f};

    seek($in, 0, 0) or die "Can't seek to beginning of file: $!";

    print "$f\n";

    # A lexical record variable keeps the loop from clobbering the
    # global $_.
    while (my $record = <$in>) {
        # Parse the 14 column, semicolon-delimited UnicodeData.txt
        # file.  Trailing empty columns are dropped by split, so the
        # case mappings may be undefined.
        chomp $record;
        my @fields = split(/;/, $record);

        # The codepoint: an integer
        my $codepoint = hex($fields[0]);

        # If this is a character range, the last character in this
        # range
        my $codepoint_end = $codepoint;

        # The name of the character
        my $name = $fields[1];

        # A two-character category code, such as Ll (lower-case
        # letter)
        my $category = $fields[2];

        # The codepoint of the uppercase version of this char
        my $uppercase = $fields[12];

        # The codepoint of the lowercase version of this char
        my $lowercase = $fields[13];

        my $pass = $predicate->($codepoint, $name, $category,
                                $uppercase, $lowercase);
        next unless $pass == 1;

        # Some pairs of lines in UnicodeData.txt delimit ranges of
        # characters: "<..., First>" followed by "<..., Last>".
        if ($name =~ /First>$/) {
            my $line = <$in>;
            # $! is only meaningful right after a failed call, so test
            # the result of the read instead of a possibly stale errno.
            defined($line)
                or die "Unexpected end of input after '$name'";
            $codepoint_end = hex((split(/;/, $line))[0]);
        }

        # Compute ranges of characters [start:end] that meet the
        # criteria.  Store the ranges.
        if ($start == -1) {
            $start = $codepoint;
            $end = $codepoint_end;
        }
        elsif ($end + 1 == $codepoint) {
            $end = $codepoint_end;
        }
        else {
            $rstart[$len] = $start;
            $rend[$len] = $end;
            $len++;
            $start = $codepoint;
            $end = $codepoint_end;
        }
    }

    # Extra logic to ensure that the last range is included
    if ($start != -1) {
        if ($len > 0 && $rstart[-1] != $start) {
            $rstart[$len] = $start;
            $rend[$len] = $end;
            $len++;
        }
        elsif ($len == 0) {
            $rstart[0] = $start;
            $rend[0] = $end;
            $len++;
        }
    }

    # Print the C array that contains the range list.  Every entry but
    # the last is followed by a comma; an empty charset prints none.
    print $out "static const scm_t_char_range cs_" . $f . "_ranges[] = {\n";
    if ($rstart[0] != -1) {
        for my $i (0 .. $#rstart - 1) {
            printf $out " {0x%04x, 0x%04x},\n", $rstart[$i], $rend[$i];
        }
        printf $out " {0x%04x, 0x%04x}\n", $rstart[-1], $rend[-1];
    }
    print $out "};\n\n";

    # Print the C constant that holds the range list length.
    print $out "static const size_t cs_${f}_len = $len;\n\n";
}
|
|
||||||
|
|
||||||
# Write a bit of a header
print $out "/* srfi-14.i.c -- standard SRFI-14 character set data */\n\n";
print $out "/* This file is #include'd by srfi-14.c. */\n\n";
print $out "/* This file was generated from\n";
print $out " http://unicode.org/Public/UNIDATA/UnicodeData.txt\n";
print $out " with the unidata_to_charset.pl script. */\n\n";

# Write the C structs for each SRFI-14 charset, in this fixed order.
for my $charset (qw(lower_case upper_case title_case letter digit
                    hex_digit letter_plus_digit graphic whitespace
                    printing iso_control punctuation symbol blank
                    ascii empty designated)) {
    compute($charset);
}

close $in;

# Buffered write errors only surface when the handle is closed, so a
# failed close means the generated file is incomplete.
close $out or die "Can't close srfi-14.i.c: $!";

# Replace this process with 'indent' to reformat the generated file;
# exec only returns if the command could not be started.
exec ('indent srfi-14.i.c') or print STDERR "call to 'indent' failed: $!";
|
|
||||||
|
|
||||||
# And we're done.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue