#!/usr/bin/perl
# unidata_to_charset.pl --- Compute SRFI-14 charsets from UnicodeData.txt
#
# Copyright (C) 2009 Free Software Foundation, Inc.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

# Report suspicious constructs (undefined values, unintended numeric
# comparisons, ...) while the tables are being generated.
use warnings;

# Both files are addressed relative to the current working directory:
# $in is the Unicode character database that is scanned, and $out
# receives the generated C source.  A failure to open either one is
# fatal.
open(my $in, "<", "UnicodeData.txt") or die "Can't open UnicodeData.txt: $!";
open(my $out, ">", "srfi-14.i.c") or die "Can't open srfi-14.i.c: $!";

# For Unicode, we follow Java's specification: a character is
# lowercase if
#    * it is not in the range [U+2000,U+2FFF], and
#    * the Unicode attribute table does not give a lowercase mapping
#      for it, and
#    * at least one of the following is true:
#          o the Unicode attribute table gives a mapping to uppercase
#            for the character, or
#          o the name for the character in the Unicode attribute table
#            contains the words "SMALL LETTER" or "SMALL LIGATURE".
sub lower_case {
    # Arguments, in order: the codepoint (an integer), the character
    # name, the general category code, the uppercase mapping and the
    # lowercase mapping.  Either mapping may be undefined or empty.
    # Returns 1 for a lowercase character and 0 otherwise.
    my ($cp, $char_name, $cat, $upper_map, $lower_map) = @_;

    # Nothing inside [U+2000,U+2FFF] qualifies.
    return 0 if $cp >= 0x2000 && $cp <= 0x2FFF;

    # A character that has a lowercase mapping is not itself lowercase.
    return 0 if defined($lower_map) && $lower_map ne "";

    # An uppercase mapping settles the question ...
    return 1 if defined($upper_map) && $upper_map ne "";

    # ... otherwise the character name decides.
    return 1 if $char_name =~ /(SMALL LETTER|SMALL LIGATURE)/;
    return 0;
}

# For Unicode, we follow Java's specification: a character is
# uppercase if
#    * it is not in the range [U+2000,U+2FFF], and
#    * the Unicode attribute table gives no uppercase mapping for it
#      (this excludes titlecase characters), and
#    * at least one of the following is true:
#          o the Unicode attribute table gives a mapping to lowercase
#            for the character, or
#          o the name for the character in the Unicode attribute table
#            contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE".

sub upper_case {
    # Same argument list as lower_case.  Returns 1 for an uppercase
    # character and 0 otherwise.
    my ($cp, $char_name, $cat, $upper_map, $lower_map) = @_;

    # Nothing inside [U+2000,U+2FFF] qualifies.
    return 0 if $cp >= 0x2000 && $cp <= 0x2FFF;

    # A character that has an uppercase mapping is not itself uppercase.
    return 0 if defined($upper_map) && $upper_map ne "";

    # A lowercase mapping settles the question ...
    return 1 if defined($lower_map) && $lower_map ne "";

    # ... otherwise the character name decides.
    return 1 if $char_name =~ /(CAPITAL LETTER|CAPITAL LIGATURE)/;
    return 0;
}

# Titlecase characters are exactly those whose category in the
# character attribute database is Lt.

sub title_case {
    my ($cp, $char_name, $cat, $upper_map, $lower_map) = @_;
    return (defined($cat) && $cat eq "Lt") ? 1 : 0;
}

# Letters are the characters that carry one of the letter categories
# (Lu, Ll, Lt, Lm, Lo) in the Unicode character database.

sub letter {
    my ($cp, $char_name, $cat, $upper_map, $lower_map) = @_;

    # An entry without a category is never a letter.
    return 0 unless defined $cat;

    my $hits = grep { $cat eq $_ } qw(Lu Ll Lt Lm Lo);
    return $hits ? 1 : 0;
}

# A character is a digit if it has the category Nd in the character
# attribute database.  In Latin-1 and ASCII, the only such characters
# are 0123456789.
# In Unicode, there are other digit characters in
# other code blocks, such as Gujarati digits and Tibetan digits.

sub digit {
    # Arguments, in order: codepoint, character name, general category,
    # uppercase mapping, lowercase mapping.  Returns 1 or 0.
    my ($cp, $char_name, $cat, $upper_map, $lower_map) = @_;
    return (defined($cat) && $cat eq "Nd") ? 1 : 0;
}

# Hex digits are limited to 0123456789abcdefABCDEF.

sub hex_digit {
    my ($cp, $char_name, $cat, $upper_map, $lower_map) = @_;

    # Inclusive codepoint spans for '0'-'9', 'A'-'F' and 'a'-'f'.
    for my $span ([0x30, 0x39], [0x41, 0x46], [0x61, 0x66]) {
        my ($low, $high) = @{$span};
        return 1 if $cp >= $low && $cp <= $high;
    }
    return 0;
}

# char-set:letter and char-set:digit combined.

sub letter_plus_digit {
    # Hand the same five fields on to each of the two predicates.
    my @entry = @_[0 .. 4];
    return 1 if letter(@entry);
    return 1 if digit(@entry);
    return 0;
}

# Characters that would 'use ink' when printed: the category code is
# searched for one of the letters L, M, N, P or S.
sub graphic {
    my ($cp, $char_name, $cat, $upper_map, $lower_map) = @_;
    return 1 if $cat =~ /L|M|N|P|S/;
    return 0;
}

# Whitespace consists of
#    * every character in one of the space, line, or paragraph
#      separator categories (Zs, Zl or Zp) of the Unicode character
#      database, and
#    * U+0009 Horizontal tabulation (\t control-I)
#    * U+000A Line feed (\n control-J)
#    * U+000B Vertical tabulation (\v control-K)
#    * U+000C Form feed (\f control-L)
#    * U+000D Carriage return (\r control-M)

sub whitespace {
    my ($cp, $char_name, $cat, $upper_map, $lower_map) = @_;

    # Separator categories first ...
    return 1 if $cat =~ /Zs|Zl|Zp/;

    # ... then the five control characters listed above.
    my $hits = grep { $cp == $_ } (0x9, 0xA, 0xB, 0xC, 0xD);
    return $hits ? 1 : 0;
}

# A printing character is one that would occupy space when printed,
# i.e., a graphic character or a space character.  char-set:printing is
# the union of char-set:whitespace and char-set:graphic.
sub printing {
    # Arguments, in order: codepoint, character name, general category,
    # uppercase mapping, lowercase mapping.  Returns 1 when either the
    # whitespace or the graphic predicate accepts the entry, else 0.
    my @entry = @_[0 .. 4];
    return (whitespace(@entry) || graphic(@entry)) ? 1 : 0;
}

# The ISO control characters are the Unicode/Latin-1 characters in the
# ranges [U+0000,U+001F] and [U+007F,U+009F].

sub iso_control {
    my ($cp, $char_name, $cat, $upper_map, $lower_map) = @_;
    return 1 if $cp >= 0x00 && $cp <= 0x1F;
    return 1 if $cp >= 0x7F && $cp <= 0x9F;
    return 0;
}

# A punctuation character is any character that has one of the
# punctuation categories in the Unicode character database (Pc, Pd,
# Ps, Pe, Pi, Pf, or Po).
#
# Note that SRFI-14 gives conflicting requirements here.  It claims
# that only the Unicode punctuation is necessary, yet it explicitly
# calls out the soft hyphen character (U+00AD) as punctuation.  Current
# versions of Unicode consider U+00AD to be a formatting character,
# not punctuation.  This predicate looks at the category code only.

sub punctuation {
    my ($cp, $char_name, $cat, $upper_map, $lower_map) = @_;
    return 1 if $cat =~ /P/;
    return 0;
}

# Symbols are the characters that carry one of the symbol categories
# of the Unicode character database (Sm, Sc, Sk, or So).

sub symbol {
    my ($cp, $char_name, $cat, $upper_map, $lower_map) = @_;
    return 1 if $cat =~ /S/;
    return 0;
}

# Blank chars are horizontal whitespace.  A blank character is either
#    * a character with the space separator category (Zs) in the
#      Unicode character database.
#    * U+0009 Horizontal tabulation (\t control-I)
sub blank {
    # Arguments, in order: codepoint, character name, general category,
    # uppercase mapping, lowercase mapping.  Returns 1 or 0.
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    return 1 if defined($category) && $category =~ /Zs/;
    return $codepoint == 0x9 ? 1 : 0;
}

# ASCII: every codepoint up to and including U+007F.
sub ascii {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    return $codepoint <= 0x7F ? 1 : 0;
}

# Empty -- no characters at all.
sub empty {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    return 0;
}

# Full -- All characters.
sub full {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    return 1;
}


# Generate the two C definitions that describe one charset and append
# them to the output file:
#    * "scm_t_char_range cs_<name>_ranges[]", the inclusive codepoint
#      ranges accepted by the predicate, in the order they occur in
#      the input;
#    * "scm_t_char_set cs_<name>", holding the number of ranges and a
#      pointer to the range list.
#
# Argument:
#    $f   the name of a predicate sub defined in this file.  The same
#         string is used to build the C identifiers.
#
# Side effects: rewinds and re-reads the input handle $in, writes to
# the output handle $out, and prints the charset name on STDOUT.
#
# Dies if $f does not name a sub, if the input cannot be rewound, or if
# the input ends directly after a "First" range marker.
#
# NOTE(review): a charset with no members is still emitted with an
# empty initializer list, as before.  That form is not accepted by
# strict ISO C compilers -- confirm the generated file is only built
# with a compiler that allows it.
sub compute {
    my ($f) = @_;

    # Look the predicate up once, so that a misspelt name is reported
    # immediately rather than through a symbolic call inside the loop.
    my $predicate = __PACKAGE__->can($f)
        or die "compute: no predicate sub named '$f'\n";

    seek($in, 0, 0) or die "Can't seek to beginning of file: $!";

    print "$f\n";

    my @ranges;            # finished ranges, each stored as [first, last]
    my ($start, $end);     # range being extended; undefined until one opens

    while (my $record = <$in>) {
        # Parse one semicolon-delimited UnicodeData.txt record.
        chomp $record;
        next if $record eq "";
        my @fields = split(/;/, $record);

        # The codepoint: an integer
        my $codepoint = hex($fields[0]);

        # The name of the character
        my $name = $fields[1];

        # A two-character category code, such as Ll (lower-case
        # letter)
        my $category = $fields[2];

        # The codepoint of the uppercase version of this char
        my $uppercase = $fields[12];

        # The codepoint of the lowercase version of this char
        my $lowercase = $fields[13];

        next unless $predicate->($codepoint, $name, $category,
                                 $uppercase, $lowercase);

        # Some pairs of lines in UnicodeData.txt delimit ranges of
        # characters: the line after a "First" entry supplies the last
        # codepoint of the range.
        my $codepoint_end = $codepoint;
        if (defined($name) && $name =~ /First/) {
            my $closing = <$in>;

            # Test the value that was read.  $! is only meaningful
            # right after a failed call and may hold a stale error.
            defined($closing)
                or die "UnicodeData.txt ends after range marker '$name'\n";
            $codepoint_end = hex((split(/;/, $closing))[0]);
        }

        if (defined($start) && $end + 1 == $codepoint) {
            # Directly follows the open range: extend it.
            $end = $codepoint_end;
        } else {
            # A gap: store the open range, if any, and start a new one.
            push @ranges, [$start, $end] if defined $start;
            ($start, $end) = ($codepoint, $codepoint_end);
        }
    }

    # The range still open at end of input has not been stored yet.
    # This also covers a charset that consists of a single range, so
    # that it is counted in the length printed below.
    push @ranges, [$start, $end] if defined $start;

    # Print the C struct that contains the range list.  Every entry
    # except the last is followed by a comma.
    print $out "scm_t_char_range cs_" . $f . "_ranges[] = {\n";
    for my $i (0 .. $#ranges) {
        my $separator = $i < $#ranges ? "," : "";
        printf $out " {0x%04x, 0x%04x}%s\n", @{ $ranges[$i] }, $separator;
    }
    print $out "};\n\n";

    # Print the C struct that contains the range list length and
    # pointer to the range list.
    my $len = scalar @ranges;
    print $out "scm_t_char_set cs_${f} = {\n";
    print $out " $len,\n";
    print $out " cs_" . $f . "_ranges\n";
    print $out "};\n\n";
}

# Write a bit of a header
print $out "/* srfi-14.i.c -- standard SRFI-14 character set data */\n\n";
print $out "/* This file is #include'd by srfi-14.c. */\n\n";
print $out "/* This file was generated from\n";
print $out " http://unicode.org/Public/UNIDATA/UnicodeData.txt\n";
print $out " with the unidata_to_charset.pl script. */\n\n";

# Write the C structs for each SRFI-14 charset
for my $charset (qw(lower_case upper_case title_case letter digit
                    hex_digit letter_plus_digit graphic whitespace
                    printing iso_control punctuation symbol blank
                    ascii empty full)) {
    compute($charset);
}

close($in);

# Buffered write errors only show up when the handle is closed.
close($out) or die "Can't close srfi-14.i.c: $!";

# Hand over to indent(1) to tidy the generated file.  exec only
# returns on failure, and then the unindented file is left in place.
exec('indent', 'srfi-14.i.c')
    or print STDERR "call to 'indent' failed: $!\n";

# And we're done.