1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-04-29 19:30:36 +02:00
guile/libguile/unidata_to_charset.awk
Timothy Sample 9f8e05e513 Create 'srfi-14.i.c' during build.
* libguile/srfi-14.i.c: Delete file.
* libguile/UnicodeData.txt: New file.
* libguile/unidata_to_charset.awk: Skip comments.
* libguile/Makefile.am (EXTRA_DIST): Add 'UnicodeData.txt'.
(srfi-14.i.c): New rule.
(noinst_HEADERS): Remove 'srfi-14.i.c'.
(BUILT_INCLUDES, CLEANFILES): Add 'srfi-14.i.c'.

Signed-off-by: Ludovic Courtès <ludo@gnu.org>
2022-03-24 14:28:46 +01:00

417 lines
13 KiB
Awk

# unidata_to_charset.awk --- Compute SRFI-14 charsets from UnicodeData.txt
#
# Copyright (C) 2009, 2010, 2022 Free Software Foundation, Inc.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
# Utilities
###########
# Print MESSAGE to standard error, and exit with STATUS.
function die(status, message) {
print "unidata_to_charset.awk:", message | "cat 1>&2";
exit_status = status;
exit exit_status;
}
# Parse the string S as a hexadecimal number. Note that R, C, and B are
# local variables that need not be set by callers. Most Awk
# implementations have an 'strtonum' function that we could use, but it
# is not part of POSIX.
function hex(s, r, c, b) {
if (length(s) == 0) {
die(1, "Cannot parse empty string as hexadecimal.");
}
r = 0;
for (i = 1; i <= length(s); i++) {
c = substr(s, i, 1);
b = 0;
if (c == "0") { b = 0; }
else if (c == "1") { b = 1; }
else if (c == "2") { b = 2; }
else if (c == "3") { b = 3; }
else if (c == "4") { b = 4; }
else if (c == "5") { b = 5; }
else if (c == "6") { b = 6; }
else if (c == "7") { b = 7; }
else if (c == "8") { b = 8; }
else if (c == "9") { b = 9; }
else if (c == "A") { b = 10; }
else if (c == "B") { b = 11; }
else if (c == "C") { b = 12; }
else if (c == "D") { b = 13; }
else if (c == "E") { b = 14; }
else if (c == "F") { b = 15; }
else { die(1, "Invalid hexadecimal character: " c); }
r *= 16;
r += b;
}
return r;
}
# Program initialization
########################
BEGIN {
# The columns are separated by semicolons.
FS = ";";
# This will help us handle errors.
exit_status = 0;
# List of charsets.
all_charsets_count = 0;
all_charsets[all_charsets_count++] = "lower_case";
all_charsets[all_charsets_count++] = "upper_case";
all_charsets[all_charsets_count++] = "title_case";
all_charsets[all_charsets_count++] = "letter";
all_charsets[all_charsets_count++] = "digit";
all_charsets[all_charsets_count++] = "hex_digit";
all_charsets[all_charsets_count++] = "letter_plus_digit";
all_charsets[all_charsets_count++] = "graphic";
all_charsets[all_charsets_count++] = "whitespace";
all_charsets[all_charsets_count++] = "printing";
all_charsets[all_charsets_count++] = "iso_control";
all_charsets[all_charsets_count++] = "punctuation";
all_charsets[all_charsets_count++] = "symbol";
all_charsets[all_charsets_count++] = "blank";
all_charsets[all_charsets_count++] = "ascii";
all_charsets[all_charsets_count++] = "empty";
all_charsets[all_charsets_count++] = "designated";
# Initialize charset state table.
for (i in all_charsets) {
cs = all_charsets[i];
state[cs, "start"] = -1;
state[cs, "end"] = -1;
state[cs, "count"] = 0;
}
}
# Comments
##########
# Skip comments so we can include a copyright notice in the data file.
/^#/ {
next;
}
# Record initialization
#######################
# In this block we give names to each field, and do some basic
# initialization.
{
codepoint = hex($1);
name = $2;
category = $3;
uppercase = $13;
lowercase = $14;
codepoint_end = codepoint;
charset_count = 0;
}
# Some pairs of lines in UnicodeData.txt delimit ranges of
# characters.
name ~ /First>$/ {
getline;
last_name = name;
sub(/First>$/, "Last>", last_name);
if (last_name != $2) {
die(1, "Invalid range in Unicode data.");
exit_status = 1;
exit 1;
}
codepoint_end = hex($1);
}
# Character set predicates
##########################
## The lower_case character set
###############################
# For Unicode, we follow Java's specification: a character is
# lowercase if
# * it is not in the range [U+2000,U+2FFF] ([8192,12287]), and
# * the Unicode attribute table does not give a lowercase mapping
# for it, and
# * at least one of the following is true:
# o the Unicode attribute table gives a mapping to uppercase
# for the character, or
# o the name for the character in the Unicode attribute table
# contains the words "SMALL LETTER" or "SMALL LIGATURE".
(codepoint < 8192 || codepoint > 12287) &&
lowercase == "" &&
(uppercase != "" || name ~ /(SMALL LETTER|SMALL LIGATURE)/) {
charsets[charset_count++] = "lower_case";
}
## The upper_case character set
###############################
# For Unicode, we follow Java's specification: a character is
# uppercase if
# * it is not in the range [U+2000,U+2FFF] ([8192,12287]), and
# * the Unicode attribute table does not give an uppercase mapping
# for it (this excludes titlecase characters), and
# * at least one of the following is true:
# o the Unicode attribute table gives a mapping to lowercase
# for the character, or
# o the name for the character in the Unicode attribute table
# contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE".
(codepoint < 8192 || codepoint > 12287) &&
uppercase == "" &&
(lowercase != "" || name ~ /(CAPITAL LETTER|CAPITAL LIGATURE)/) {
charsets[charset_count++] = "upper_case";
}
## The title_case character set
###############################
# A character is titlecase if it has the category Lt in the character
# attribute database.
category == "Lt" {
charsets[charset_count++] = "title_case";
}
## The letter character set
###########################
# A letter is any character with one of the letter categories (Lu, Ll,
# Lt, Lm, Lo) in the Unicode character database.
category == "Lu" ||
category == "Ll" ||
category == "Lt" ||
category == "Lm" ||
category == "Lo" {
charsets[charset_count++] = "letter";
charsets[charset_count++] = "letter_plus_digit";
}
## The digit character set
##########################
# A character is a digit if it has the category Nd in the character
# attribute database. In Latin-1 and ASCII, the only such characters
# are 0123456789. In Unicode, there are other digit characters in
# other code blocks, such as Gujarati digits and Tibetan digits.
category == "Nd" {
charsets[charset_count++] = "digit";
charsets[charset_count++] = "letter_plus_digit";
}
## The hex_digit character set
##############################
# The only hex digits are 0123456789abcdefABCDEF.
(codepoint >= 48 && codepoint <= 57) ||
(codepoint >= 65 && codepoint <= 70) ||
(codepoint >= 97 && codepoint <= 102) {
charsets[charset_count++] = "hex_digit";
}
## The graphic character set
############################
# Characters that would 'use ink' when printed
category ~ /L|M|N|P|S/ {
charsets[charset_count++] = "graphic";
charsets[charset_count++] = "printing";
}
## The whitespace character set
###############################
# A whitespace character is either
# * a character with one of the space, line, or paragraph separator
# categories (Zs, Zl or Zp) of the Unicode character database.
# * U+0009 (09) Horizontal tabulation (\t control-I)
# * U+000A (10) Line feed (\n control-J)
# * U+000B (11) Vertical tabulation (\v control-K)
# * U+000C (12) Form feed (\f control-L)
# * U+000D (13) Carriage return (\r control-M)
category ~ /Zs|Zl|Zp/ ||
(codepoint >= 9 && codepoint <= 13) {
charsets[charset_count++] = "whitespace";
charsets[charset_count++] = "printing";
}
## The iso_control character set
################################
# The ISO control characters are the Unicode/Latin-1 characters in the
# ranges [U+0000,U+001F] ([0,31]) and [U+007F,U+009F] ([127,159]).
(codepoint >= 0 && codepoint <= 31) ||
(codepoint >= 127 && codepoint <= 159) {
charsets[charset_count++] = "iso_control";
}
## The punctuation character set
################################
# A punctuation character is any character that has one of the
# punctuation categories in the Unicode character database (Pc, Pd,
# Ps, Pe, Pi, Pf, or Po.)
# Note that srfi-14 gives conflicting requirements!! It claims that
# only the Unicode punctuation is necessary, but, explicitly calls out
# the soft hyphen character (U+00AD) as punctution. Current versions
# of Unicode consider U+00AD to be a formatting character, not
# punctuation.
category ~ /P/ {
charsets[charset_count++] = "punctuation";
}
## The symbol character set
###########################
# A symbol is any character that has one of the symbol categories in
# the Unicode character database (Sm, Sc, Sk, or So).
category ~ /S/ {
charsets[charset_count++] = "symbol";
}
## The blank character set
##########################
# Blank chars are horizontal whitespace. A blank character is either
# * a character with the space separator category (Zs) in the
# Unicode character database.
# * U+0009 (9) Horizontal tabulation (\t control-I)
category ~ /Zs/ || codepoint == 9 {
charsets[charset_count++] = "blank";
}
## The ascii character set
##########################
codepoint <= 127 {
charsets[charset_count++] = "ascii";
}
## The designated character set
###############################
# Designated -- All characters except for the surrogates
category !~ /Cs/ {
charsets[charset_count++] = "designated";
}
## Other character sets
#######################
# Note that the "letter_plus_digit" and "printing" character sets, which
# are unions of other character sets, are included in the patterns
# matching their constituent parts (i.e., the "letter_plus_digit"
# character set is included as part of the "letter" and "digit"
# patterns).
#
# Also, the "empty" character is computed by doing precisely nothing!
# Keeping track of state
########################
# Update the state for each charset.
{
for (i = 0; i < charset_count; i++) {
cs = charsets[i];
if (state[cs, "start"] == -1) {
state[cs, "start"] = codepoint;
state[cs, "end"] = codepoint_end;
} else if (state[cs, "end"] + 1 == codepoint) {
state[cs, "end"] = codepoint_end;
} else {
count = state[cs, "count"];
state[cs, "count"]++;
state[cs, "ranges", count, 0] = state[cs, "start"];
state[cs, "ranges", count, 1] = state[cs, "end"];
state[cs, "start"] = codepoint;
state[cs, "end"] = codepoint_end;
}
}
}
# Printing and error handling
#############################
END {
# Normally, an exit statement runs all the 'END' blocks before
# actually exiting. We use the 'exit_status' variable to short
# circuit the rest of the 'END' block by reissuing the exit
# statement.
if (exit_status != 0) {
exit exit_status;
}
# Write a bit of a header.
print("/* srfi-14.i.c -- standard SRFI-14 character set data */");
print("");
print("/* This file is #include'd by srfi-14.c. */");
print("");
print("/* This file was generated from");
print(" https://unicode.org/Public/UNIDATA/UnicodeData.txt");
print(" with the unidata_to_charset.awk script. */");
print("");
for (i = 0; i < all_charsets_count; i++) {
cs = all_charsets[i];
# Extra logic to ensure that the last range is included.
if (state[cs, "start"] != -1) {
count = state[cs, "count"];
state[cs, "count"]++;
state[cs, "ranges", count, 0] = state[cs, "start"];
state[cs, "ranges", count, 1] = state[cs, "end"];
}
count = state[cs, "count"];
print("static const scm_t_char_range cs_" cs "_ranges[] = {");
for (j = 0; j < count; j++) {
rstart = state[cs, "ranges", j, 0];
rend = state[cs, "ranges", j, 1];
if (j + 1 < count) {
printf(" {0x%04x, 0x%04x},\n", rstart, rend);
} else {
printf(" {0x%04x, 0x%04x}\n", rstart, rend);
}
}
print("};");
print("");
count = state[cs, "count"];
printf("static const size_t cs_%s_len = %d;\n", cs, count);
if (i + 1 < all_charsets_count) {
print("");
}
}
}
# And we're done.