Reimplement 'unidata_to_charset.pl' in Awk.

* libguile/unidata_to_charset.pl: Delete file. * libguile/unidata_to_charset.awk: New file. * libguile/Makefile.am (EXTRA_DIST): Adjust accordingly. Signed-off-by: Ludovic Courtès <ludo@gnu.org>
2025-06-21 11:10:21 +02:00 · 2022-03-16 21:13:45 -06:00 · 2022-03-16 21:13:45 -06:00 · 63886aeda2
commit 63886aeda2
parent 6e82a4516a
3 changed files with 410 additions and 402 deletions
--- a/libguile/Makefile.am
+++ b/libguile/Makefile.am
@ -728,7 +728,7 @@ EXTRA_DIST = ChangeLog-scm ChangeLog-threads				\
    guile-func-name-check						\
    cpp-E.syms cpp-E.c cpp-SIG.syms cpp-SIG.c				\
    c-tokenize.lex							\
-    scmconfig.h.top libgettext.h unidata_to_charset.pl libguile.map	\
+    scmconfig.h.top libgettext.h unidata_to_charset.awk libguile.map	\
    vm-operations.h libguile-@GUILE_EFFECTIVE_VERSION@-gdb.scm		\
    $(lightening_c_files) $(lightening_extra_files)
 #    $(DOT_DOC_FILES) $(EXTRA_DOT_DOC_FILES) \
--- a/libguile/unidata_to_charset.awk
+++ b/libguile/unidata_to_charset.awk
@ -0,0 +1,409 @@
 # unidata_to_charset.awk --- Compute SRFI-14 charsets from UnicodeData.txt
 #
 # Copyright (C) 2009, 2010, 2022 Free Software Foundation, Inc.
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
 # License as published by the Free Software Foundation; either
 # version 3 of the License, or (at your option) any later version.
 #
 # This library is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 # Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 # Utilities
 ###########
 # Print MESSAGE to standard error, and exit with STATUS.
 function die(status, message) {
    # Record the status first so that the END rule can see the failure.
    exit_status = status;
    # Route the diagnostic to standard error by piping it through cat.
    print "unidata_to_charset.awk:", message | "cat 1>&2";
    exit status;
 }
 # Parse the string S as a hexadecimal number and return its value.
 # Digits may be upper or lower case.  An empty string or a character
 # that is not a hexadecimal digit aborts the program through die().
 # Note that R, C, B, and I are local variables that need not be set by
 # callers; I in particular must be local so that this function does not
 # clobber a loop index named 'i' in the code that calls it.  Most Awk
 # implementations have an 'strtonum' function that we could use, but it
 # is not part of POSIX.
 function hex(s, r, c, b, i) {
    if (length(s) == 0) {
        die(1, "Cannot parse empty string as hexadecimal.");
    }
    r = 0;
    for (i = 1; i <= length(s); i++) {
        c = substr(s, i, 1);
        # index() is 1-based and returns 0 when C is not a digit, so
        # subtracting one gives the digit value, or -1 for "invalid".
        b = index("0123456789ABCDEF", toupper(c)) - 1;
        if (b < 0) {
            die(1, "Invalid hexadecimal character: " c);
        }
        r = r * 16 + b;
    }
    return r;
 }
 # Program initialization
 ########################
 BEGIN {
    # UnicodeData.txt separates its columns with semicolons.
    FS = ";";
    # Set to a non-zero value by die(); the END rule checks it.
    exit_status = 0;
    # The charsets, in the order in which the END rule prints them.
    all_charsets[0] = "lower_case";
    all_charsets[1] = "upper_case";
    all_charsets[2] = "title_case";
    all_charsets[3] = "letter";
    all_charsets[4] = "digit";
    all_charsets[5] = "hex_digit";
    all_charsets[6] = "letter_plus_digit";
    all_charsets[7] = "graphic";
    all_charsets[8] = "whitespace";
    all_charsets[9] = "printing";
    all_charsets[10] = "iso_control";
    all_charsets[11] = "punctuation";
    all_charsets[12] = "symbol";
    all_charsets[13] = "blank";
    all_charsets[14] = "ascii";
    all_charsets[15] = "empty";
    all_charsets[16] = "designated";
    all_charsets_count = 17;
    # Every charset starts with no open range (-1) and no stored ranges.
    for (i = 0; i < all_charsets_count; i++) {
        cs = all_charsets[i];
        state[cs, "count"] = 0;
        state[cs, "start"] = -1;
        state[cs, "end"] = -1;
    }
 }
 # Record initialization
 #######################
 # In this block we give names to each field, and do some basic
 # initialization.
 {
    # No charset has matched this record yet.
    charset_count = 0;
    # Give names to the columns that the rules below look at.
    name = $2;
    category = $3;
    uppercase = $13;
    lowercase = $14;
    # A record describes a single code point unless the range rule
    # below widens it.
    codepoint = hex($1);
    codepoint_end = codepoint;
 }
 # Some pairs of lines in UnicodeData.txt delimit ranges of
 # characters.
 name ~ /First>$/ {
    # The record that closes the range must carry the same name with
    # "First>" replaced by "Last>".
    last_name = name;
    sub(/First>$/, "Last>", last_name);
    # getline loads the next record into $0 and the fields.  Fail if
    # there is no such record or if it does not close this range;
    # die() exits, so nothing more is needed on the error path.
    if ((getline) <= 0 || $2 != last_name) {
        die(1, "Invalid range in Unicode data.");
    }
    # The range ends at the code point of the closing record.
    codepoint_end = hex($1);
 }
 # Character set predicates
 ##########################
 ## The lower_case character set
 ###############################
 # For Unicode, we follow Java's specification: a character is
 # lowercase if
 #    * it is not in the range [U+2000,U+2FFF] ([8192,12287]), and
 #    * the Unicode attribute table does not give a lowercase mapping
 #      for it, and
 #    * at least one of the following is true:
 #          o the Unicode attribute table gives a mapping to uppercase
 #            for the character, or
 #          o the name for the character in the Unicode attribute table
 #            contains the words "SMALL LETTER" or "SMALL LIGATURE".
 lowercase == "" &&
 !(codepoint >= 8192 && codepoint <= 12287) &&
 (uppercase != "" || name ~ /(SMALL LETTER|SMALL LIGATURE)/) {
    # All three conditions hold: tag the record as lower_case.
    charsets[charset_count] = "lower_case";
    charset_count++;
 }
 ## The upper_case character set
 ###############################
 # For Unicode, we follow Java's specification: a character is
 # uppercase if
 #    * it is not in the range [U+2000,U+2FFF] ([8192,12287]), and
 #    * the Unicode attribute table does not give an uppercase mapping
 #      for it (this excludes titlecase characters), and
 #    * at least one of the following is true:
 #          o the Unicode attribute table gives a mapping to lowercase
 #            for the character, or
 #          o the name for the character in the Unicode attribute table
 #            contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE".
 uppercase == "" &&
 !(codepoint >= 8192 && codepoint <= 12287) &&
 (lowercase != "" || name ~ /(CAPITAL LETTER|CAPITAL LIGATURE)/) {
    # All three conditions hold: tag the record as upper_case.
    charsets[charset_count] = "upper_case";
    charset_count++;
 }
 ## The title_case character set
 ###############################
 # A character is titlecase if it has the category Lt in the character
 # attribute database.
 {
    # Only the Lt category counts as titlecase.
    if (category == "Lt") {
        charsets[charset_count] = "title_case";
        charset_count++;
    }
 }
 ## The letter character set
 ###########################
 # A letter is any character with one of the letter categories (Lu, Ll,
 # Lt, Lm, Lo) in the Unicode character database.
 {
    # Any of the five letter categories makes the record a letter, and
    # every letter also belongs to the letter_plus_digit union.
    if (category == "Lu" || category == "Ll" || category == "Lt" ||
        category == "Lm" || category == "Lo") {
        charsets[charset_count] = "letter";
        charsets[charset_count + 1] = "letter_plus_digit";
        charset_count += 2;
    }
 }
 ## The digit character set
 ##########################
 # A character is a digit if it has the category Nd in the character
 # attribute database. In Latin-1 and ASCII, the only such characters
 # are 0123456789. In Unicode, there are other digit characters in
 # other code blocks, such as Gujarati digits and Tibetan digits.
 {
    # Decimal digits (Nd) also belong to the letter_plus_digit union.
    if (category == "Nd") {
        charsets[charset_count] = "digit";
        charsets[charset_count + 1] = "letter_plus_digit";
        charset_count += 2;
    }
 }
 ## The hex_digit character set
 ##############################
 # The only hex digits are 0123456789abcdefABCDEF.
 (48 <= codepoint && codepoint <= 57) ||
 (65 <= codepoint && codepoint <= 70) ||
 (97 <= codepoint && codepoint <= 102) {
    # The code point lies in one of the ranges 0-9, A-F, or a-f.
    charsets[charset_count] = "hex_digit";
    charset_count++;
 }
 ## The graphic character set
 ############################
 # Characters that would 'use ink' when printed
 {
    # A category containing L, M, N, P, or S marks a graphic character,
    # and every graphic character is also a printing character.
    if (category ~ /L|M|N|P|S/) {
        charsets[charset_count] = "graphic";
        charsets[charset_count + 1] = "printing";
        charset_count += 2;
    }
 }
 ## The whitespace character set
 ###############################
 # A whitespace character is either
 #    * a character with one of the space, line, or paragraph separator
 #      categories (Zs, Zl or Zp) of the Unicode character database.
 #    * U+0009 (09) Horizontal tabulation (\t control-I)
 #    * U+000A (10) Line feed (\n control-J)
 #    * U+000B (11) Vertical tabulation (\v control-K)
 #    * U+000C (12) Form feed (\f control-L)
 #    * U+000D (13) Carriage return (\r control-M)
 (9 <= codepoint && codepoint <= 13) ||
 category ~ /Zs|Zl|Zp/ {
    # Whitespace is also part of the printing union.
    charsets[charset_count] = "whitespace";
    charsets[charset_count + 1] = "printing";
    charset_count += 2;
 }
 ## The iso_control character set
 ################################
 # The ISO control characters are the Unicode/Latin-1 characters in the
 # ranges [U+0000,U+001F] ([0,31]) and [U+007F,U+009F] ([127,159]).
 {
    # The two control ranges are [0,31] and [127,159].
    if ((0 <= codepoint && codepoint <= 31) ||
        (127 <= codepoint && codepoint <= 159)) {
        charsets[charset_count] = "iso_control";
        charset_count++;
    }
 }
 ## The punctuation character set
 ################################
 # A punctuation character is any character that has one of the
 # punctuation categories in the Unicode character database (Pc, Pd,
 # Ps, Pe, Pi, Pf, or Po.)
 # Note that srfi-14 gives conflicting requirements!!  It claims that
 # only the Unicode punctuation is necessary, but, explicitly calls out
 # the soft hyphen character (U+00AD) as punctuation.  Current versions
 # of Unicode consider U+00AD to be a formatting character, not
 # punctuation.
 {
    # Any category containing P is treated as punctuation.
    if (category ~ /P/) {
        charsets[charset_count] = "punctuation";
        charset_count++;
    }
 }
 ## The symbol character set
 ###########################
 # A symbol is any character that has one of the symbol categories in
 # the Unicode character database (Sm, Sc, Sk, or So).
 (category ~ /S/) {
    # Any category containing S is treated as a symbol.
    charsets[charset_count] = "symbol";
    charset_count++;
 }
 ## The blank character set
 ##########################
 # Blank chars are horizontal whitespace.  A blank character is either
 #    * a character with the space separator category (Zs) in the
 #      Unicode character database.
 #    * U+0009 (9) Horizontal tabulation (\t control-I)
 codepoint == 9 || category ~ /Zs/ {
    # Horizontal tab or a space separator.
    charsets[charset_count] = "blank";
    charset_count++;
 }
 ## The ascii character set
 ##########################
 127 >= codepoint {
    # Code points up to 127 make up the ascii charset.
    charsets[charset_count] = "ascii";
    charset_count++;
 }
 ## The designated character set
 ###############################
 # Designated -- All characters except for the surrogates
 !(category ~ /Cs/) {
    # Everything but the surrogates (Cs) is designated.
    charsets[charset_count] = "designated";
    charset_count++;
 }
 ## Other character sets
 #######################
 # Note that the "letter_plus_digit" and "printing" character sets, which
 # are unions of other character sets, are included in the patterns
 # matching their constituent parts (i.e., the "letter_plus_digit"
 # character set is included as part of the "letter" and "digit"
 # patterns).
 #
 # Also, the "empty" character set is computed by doing precisely nothing!
 # Keeping track of state
 ########################
 # Update the state for each charset.
 {
    for (i = 0; i < charset_count; i++) {
        cs = charsets[i];
        # A range is open but this record does not continue it: store
        # the open range and mark the charset as having none open.
        if (state[cs, "start"] != -1 && state[cs, "end"] + 1 != codepoint) {
            count = state[cs, "count"]++;
            state[cs, "ranges", count, 0] = state[cs, "start"];
            state[cs, "ranges", count, 1] = state[cs, "end"];
            state[cs, "start"] = -1;
        }
        # With no range open, this record begins a new one.
        if (state[cs, "start"] == -1) {
            state[cs, "start"] = codepoint;
        }
        # In every case the open range now ends with this record.
        state[cs, "end"] = codepoint_end;
    }
 }
 # Printing and error handling
 #############################
 END {
    # An 'exit' statement still runs the 'END' rules.  When die() has
    # recorded a failure, reissue the exit here so that no output is
    # written.
    if (exit_status != 0) {
        exit exit_status;
    }
    # Write a bit of a header.
    print "/* srfi-14.i.c -- standard SRFI-14 character set data */";
    print "";
    print "/* This file is #include'd by srfi-14.c.  */";
    print "";
    print "/* This file was generated from";
    print "   https://unicode.org/Public/UNIDATA/UnicodeData.txt";
    print "   with the unidata_to_charset.awk script.  */";
    print "";
    for (i = 0; i < all_charsets_count; i++) {
        cs = all_charsets[i];
        # Store the range that was still open when the input ended.
        if (state[cs, "start"] != -1) {
            count = state[cs, "count"]++;
            state[cs, "ranges", count, 0] = state[cs, "start"];
            state[cs, "ranges", count, 1] = state[cs, "end"];
        }
        count = state[cs, "count"];
        print "static const scm_t_char_range cs_" cs "_ranges[] = {";
        # Every entry but the last is followed by a comma.
        for (j = 0; j + 1 < count; j++) {
            rstart = state[cs, "ranges", j, 0];
            rend = state[cs, "ranges", j, 1];
            printf("  {0x%04x, 0x%04x},\n", rstart, rend);
        }
        # The loop leaves J at the last entry, if there is one.
        if (count > 0) {
            rstart = state[cs, "ranges", j, 0];
            rend = state[cs, "ranges", j, 1];
            printf("  {0x%04x, 0x%04x}\n", rstart, rend);
        }
        print "};";
        print "";
        printf("static const size_t cs_%s_len = %d;\n", cs, count);
        # Separate the charsets with a blank line.
        if (all_charsets_count > i + 1) {
            print "";
        }
    }
 }
 # And we're done.
--- a/libguile/unidata_to_charset.pl
+++ b/libguile/unidata_to_charset.pl
@ -1,401 +0,0 @@
 #!/usr/bin/perl
 # unidata_to_charset.pl --- Compute SRFI-14 charsets from UnicodeData.txt
 #
 # Copyright (C) 2009, 2010, 2022 Free Software Foundation, Inc.
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
 # License as published by the Free Software Foundation; either
 # version 3 of the License, or (at your option) any later version.
 #
 # This library is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 # Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 # Input: the Unicode character database.  Output: the generated C file.
 # Both handles are used by compute() and by the driver code below.
 open my $in, "<", "UnicodeData.txt"
     or die "Can't open UnicodeData.txt: $!";
 open my $out, ">", "srfi-14.i.c"
     or die "Can't open srfi-14.i.c: $!";
 # For Unicode, we follow Java's specification: a character is
 # lowercase if
 #    * it is not in the range [U+2000,U+2FFF], and
 #    * the Unicode attribute table does not give a lowercase mapping
 #      for it, and
 #    * at least one of the following is true:
 #          o the Unicode attribute table gives a mapping to uppercase
 #            for the character, or
 #          o the name for the character in the Unicode attribute table
 #            contains the words "SMALL LETTER" or "SMALL LIGATURE".
 sub lower_case {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    # Nothing in [U+2000,U+2FFF] is lowercase.
    return 0 if $codepoint >= 0x2000 && $codepoint <= 0x2FFF;
    # A character with a lowercase mapping is not itself lowercase.
    return 0 if defined($lowercase) && $lowercase ne "";
    # An uppercase mapping is enough; otherwise fall back to the name.
    return 1 if defined($uppercase) && $uppercase ne "";
    return ($name =~ /(SMALL LETTER|SMALL LIGATURE)/) ? 1 : 0;
 }
 # For Unicode, we follow Java's specification: a character is
 # uppercase if
 #    * it is not in the range [U+2000,U+2FFF], and
 #    * the Unicode attribute table does not give an uppercase mapping
 #      for it (this excludes titlecase characters), and
 #    * at least one of the following is true:
 #          o the Unicode attribute table gives a mapping to lowercase
 #            for the character, or
 #          o the name for the character in the Unicode attribute table
 #            contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE".
 sub upper_case {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    # Nothing in [U+2000,U+2FFF] is uppercase.
    return 0 if $codepoint >= 0x2000 && $codepoint <= 0x2FFF;
    # A character with an uppercase mapping is not itself uppercase.
    return 0 if defined($uppercase) && $uppercase ne "";
    # A lowercase mapping is enough; otherwise fall back to the name.
    return 1 if defined($lowercase) && $lowercase ne "";
    return ($name =~ /(CAPITAL LETTER|CAPITAL LIGATURE)/) ? 1 : 0;
 }
 # A character is titlecase if it has the category Lt in the character
 # attribute database.
 sub title_case {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    # Titlecase means category Lt; a missing category never matches.
    return (defined($category) && $category eq "Lt") ? 1 : 0;
 }
 # A letter is any character with one of the letter categories (Lu, Ll,
 # Lt, Lm, Lo) in the Unicode character database.
 sub letter {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    # A missing category is never a letter.
    return 0 unless defined($category);
    # Compare against each of the five letter categories in turn.
    for my $letter_category ("Lu", "Ll", "Lt", "Lm", "Lo") {
        return 1 if $category eq $letter_category;
    }
    return 0;
 }
 # A character is a digit if it has the category Nd in the character
 # attribute database. In Latin-1 and ASCII, the only such characters
 # are 0123456789. In Unicode, there are other digit characters in
 # other code blocks, such as Gujarati digits and Tibetan digits.
 sub digit {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    # A missing category is never a digit.
    return 0 unless defined($category);
    # Only the decimal digit category counts.
    return 1 if $category eq "Nd";
    return 0;
 }
 # The only hex digits are 0123456789abcdefABCDEF. 
 sub hex_digit {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    # 0-9
    return 1 if $codepoint >= 0x30 && $codepoint <= 0x39;
    # A-F
    return 1 if $codepoint >= 0x41 && $codepoint <= 0x46;
    # a-f
    return 1 if $codepoint >= 0x61 && $codepoint <= 0x66;
    return 0;
 }
 # The union of char-set:letter and char-set:digit.
 sub letter_plus_digit {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    # Member of the union when either constituent predicate accepts it.
    return 1 if letter($codepoint, $name, $category, $uppercase, $lowercase);
    return 1 if digit($codepoint, $name, $category, $uppercase, $lowercase);
    return 0;
 }
 # Characters that would 'use ink' when printed
 sub graphic {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    # A category containing L, M, N, P, or S marks a graphic character.
    return ($category =~ (/L|M|N|P|S/)) ? 1 : 0;
 }
 # A whitespace character is either
 #    * a character with one of the space, line, or paragraph separator
 #      categories (Zs, Zl or Zp) of the Unicode character database.
 #    * U+0009 Horizontal tabulation (\t control-I)
 #    * U+000A Line feed (\n control-J)
 #    * U+000B Vertical tabulation (\v control-K)
 #    * U+000C Form feed (\f control-L)
 #    * U+000D Carriage return (\r control-M)
 sub whitespace {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    # Space, line, and paragraph separators.
    return 1 if $category =~ (/Zs|Zl|Zp/);
    # The five whitespace control characters, tab through carriage return.
    for my $control (0x9, 0xA, 0xB, 0xC, 0xD) {
        return 1 if $codepoint == $control;
    }
    return 0;
 }
 # A printing character is one that would occupy space when printed,
 # i.e., a graphic character or a space character. char-set:printing is
 # the union of char-set:whitespace and char-set:graphic.
 sub printing {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    # Member of the union when either constituent predicate accepts it.
    return 1 if whitespace($codepoint, $name, $category, $uppercase, $lowercase);
    return 1 if graphic($codepoint, $name, $category, $uppercase, $lowercase);
    return 0;
 }
 # The ISO control characters are the Unicode/Latin-1 characters in the
 # ranges [U+0000,U+001F] and [U+007F,U+009F].
 sub iso_control {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    # First control range: [U+0000,U+001F].
    return 1 if $codepoint >= 0x00 && $codepoint <= 0x1F;
    # Second control range: [U+007F,U+009F].
    return 1 if $codepoint >= 0x7F && $codepoint <= 0x9F;
    return 0;
 }
 # A punctuation character is any character that has one of the
 # punctuation categories in the Unicode character database (Pc, Pd,
 # Ps, Pe, Pi, Pf, or Po.)
 # Note that srfi-14 gives conflicting requirements!!  It claims that
 # only the Unicode punctuation is necessary, but, explicitly calls out
 # the soft hyphen character (U+00AD) as punctuation.  Current versions
 # of Unicode consider U+00AD to be a formatting character, not
 # punctuation.
 sub punctuation {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    # Any category containing P is treated as punctuation.
    return ($category =~ (/P/)) ? 1 : 0;
 }
 # A symbol is any character that has one of the symbol categories in
 # the Unicode character database (Sm, Sc, Sk, or So).
 sub symbol {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    # Any category containing S is treated as a symbol.
    return 1 if $category =~ (/S/);
    return 0;
 }
 # Blank chars are horizontal whitespace.  A blank character is either
 #    * a character with the space separator category (Zs) in the
 #      Unicode character database.
 #    * U+0009 Horizontal tabulation (\t control-I) 
 sub blank {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    # Space separators, plus the horizontal tab.
    return 1 if $category =~ (/Zs/);
    return ($codepoint == 0x9) ? 1 : 0;
 }
 # ASCII
 sub ascii {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    # ASCII covers the code points up to and including U+007F.
    return ($codepoint <= 0x7F) ? 1 : 0;
 }
 # Empty
 sub empty {
    # No character belongs to the empty charset, whatever its attributes.
    return 0;
 }
 # Designated -- All characters except for the surrogates
 sub designated {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    # Surrogates (category Cs) are the only characters left out.
    return ($category =~ (/Cs/)) ? 0 : 1;
 }
 # Generate the two C definitions that describe the charset named $f:
 # the array cs_<name>_ranges[] and the constant cs_<name>_len.
 #
 # $f is the name of one of the predicate subs in this file.  The
 # input handle $in is rewound and read in full; every character the
 # predicate accepts is merged into [start:end] ranges of consecutive
 # code points, and the result is written to the output handle $out.
 # The charset name is also printed to standard output as a progress
 # message.  Dies if the input cannot be rewound or if it ends right
 # after the first line of a character range.
 sub compute {
    my($f) = @_;
    # Look the predicate up once rather than on every input line.
    my $predicate = \&{$f};
    # The range being accumulated; -1 means that no range is open.
    my $start = -1;
    my $end = -1;
    # The number of finished ranges stored in @rstart and @rend.
    my $len = 0;
    # The -1 in slot 0 marks "no ranges" for the printing code below.
    my @rstart = (-1);
    my @rend = (-1);
    seek($in, 0, 0) or die "Can't seek to beginning of file: $!";
    print "$f\n";
    # Read into a lexical so that the caller's $_ is left alone.
    while (my $record = <$in>) {
        # Parse the 14 column, semicolon-delimited UnicodeData.txt
        # file
        chomp($record);
        my(@fields) = split(/;/, $record);
        # The codepoint: an integer
        my $codepoint = hex($fields[0]);
        # If this is a character range, the last character in this
        # range
        my $codepoint_end = $codepoint;
        # The name of the character
        my $name = $fields[1];
        # A two-character category code, such as Ll (lower-case
        # letter)
        my $category = $fields[2];
        # The codepoint of the uppercase version of this char
        my $uppercase = $fields[12];
        # The codepoint of the lowercase version of this char
        my $lowercase = $fields[13];
        my $pass = $predicate->($codepoint, $name, $category,
                                $uppercase, $lowercase);
        if ($pass == 1) {
            # Some pairs of lines in UnicodeData.txt delimit ranges of
            # characters.
            if ($name =~ /First/) {
                my $line = <$in>;
                # Test the read itself: $! is only meaningful right
                # after a failure and may hold a stale value here.
                defined($line)
                    or die "Unexpected end of input after range start: $name";
                $codepoint_end = hex( (split(/;/, $line))[0] );
            }
            # Compute ranges of characters [start:end] that meet the
            # criteria.  Store the ranges.
            if ($start == -1) {
                $start = $codepoint;
                $end = $codepoint_end;
            } elsif ($end + 1 == $codepoint) {
                $end = $codepoint_end;
            } else {
                $rstart[$len] = $start;
                $rend[$len] = $end;
                $len++;
                $start = $codepoint;
                $end = $codepoint_end;
            }
        }
    }
    # Extra logic to ensure that the last range is included
    if ($start != -1) {
        if ($len > 0 && $rstart[@rstart-1] != $start) {
            $rstart[$len] = $start;
            $rend[$len] = $end;
            $len++;
        } elsif ($len == 0) {
            $rstart[0] = $start;
            $rend[0] = $end;
            $len++;
        }
    }
    # Print the C struct that contains the range list.  Every entry
    # but the last is followed by a comma.
    print $out "static const scm_t_char_range cs_" . $f . "_ranges[] = {\n";
    if ($rstart[0] != -1) {
        for (my $i=0; $i<@rstart-1; $i++) {
            printf $out "  {0x%04x, 0x%04x},\n", $rstart[$i], $rend[$i];
        }
        printf $out "  {0x%04x, 0x%04x}\n", $rstart[@rstart-1], $rend[@rstart-1];
    }
    print $out "};\n\n";
    # Print the constant that holds the length of the range list.
    print $out "static const size_t cs_${f}_len = $len;\n\n";
 }
 # Start the generated file with a comment that says what it is and
 # where it came from.
 print {$out} "/* srfi-14.i.c -- standard SRFI-14 character set data */\n\n";
 print {$out} "/* This file is #include'd by srfi-14.c.  */\n\n";
 print {$out} "/* This file was generated from\n";
 print {$out} "   http://unicode.org/Public/UNIDATA/UnicodeData.txt\n";
 print {$out} "   with the unidata_to_charset.pl script.  */\n\n";
 # Write the C structs for each SRFI-14 charset, in this order.  A
 # named loop variable is used because compute() reads lines into $_.
 for my $charset ("lower_case",
                  "upper_case",
                  "title_case",
                  "letter",
                  "digit",
                  "hex_digit",
                  "letter_plus_digit",
                  "graphic",
                  "whitespace",
                  "printing",
                  "iso_control",
                  "punctuation",
                  "symbol",
                  "blank",
                  "ascii",
                  "empty",
                  "designated") {
    compute($charset);
 }
 close $in;
 close $out;
 # Hand over to 'indent' to tidy the generated file; exec only returns
 # when it could not start the command.
 exec('indent srfi-14.i.c') or print STDERR "call to 'indent' failed: $!";
 # And we're done.