In string-split, add support for character sets and predicates.

* libguile/srfi-13.c (string-split): Add support for splitting on character sets and predicates, like string-index and others. * test-suite/tests/strings.test (string-split): Add tests covering the new argument types. * doc/ref/api-data.texi (string-split): Update. Signed-off-by: Mark H Weaver <mhw@netris.org>
2025-06-18 09:40:25 +02:00 · 2012-10-08 18:35:00 +08:00 · 2012-10-08 18:35:00 +08:00 · 5f085775ab
commit 5f085775ab
parent 8b22ced1c9
4 changed files with 146 additions and 37 deletions
--- a/doc/ref/api-data.texi
+++ b/doc/ref/api-data.texi
@ -3152,12 +3152,24 @@ These procedures are useful for similar tasks.
 Convert the string @var{str} into a list of characters.
@end deffn
-@deffn {Scheme Procedure} string-split str chr
+@deffn {Scheme Procedure} string-split str char_pred
-@deffnx {C Function} scm_string_split (str, chr)
+@deffnx {C Function} scm_string_split (str, char_pred)
 Split the string @var{str} into a list of substrings delimited
-by appearances of the character @var{chr}.  Note that an empty substring
+by appearances of characters that
-between separator characters will result in an empty string in the
+
-result list.
+@itemize @bullet
@item
 equal @var{char_pred}, if it is a character,
@item
 satisfy the predicate @var{char_pred}, if it is a procedure,
@item
 are in the set @var{char_pred}, if it is a character set.
@end itemize
 Note that an empty substring between separator characters will result in
 an empty string in the result list.
@lisp
 (string-split "root:x:0:0:root:/root:/bin/bash" #\:)
--- a/libguile/srfi-13.c
+++ b/libguile/srfi-13.c
@ -2993,11 +2993,22 @@ SCM_DEFINE (scm_string_tokenize, "string-tokenize", 1, 3, 0,
 #undef FUNC_NAME
 SCM_DEFINE (scm_string_split, "string-split", 2, 0, 0,
-	    (SCM str, SCM chr),
+	    (SCM str, SCM char_pred),
 	    "Split the string @var{str} into a list of the substrings delimited\n"
-	    "by appearances of the character @var{chr}.  Note that an empty substring\n"
+            "by appearances of characters that\n"
-	    "between separator characters will result in an empty string in the\n"
+            "\n"
-	    "result list.\n"
+            "@itemize @bullet\n"
            "@item\n"
            "equal @var{char_pred}, if it is a character,\n"
            "\n"
            "@item\n"
            "satisfy the predicate @var{char_pred}, if it is a procedure,\n"
            "\n"
            "@item\n"
            "are in the set @var{char_pred}, if it is a character set.\n"
            "@end itemize\n\n"
            "Note that an empty substring between separator characters\n"
            "will result in an empty string in the result list.\n"
 	    "\n"
 	    "@lisp\n"
 	    "(string-split \"root:x:0:0:root:/root:/bin/bash\" #\\:)\n"
@ -3014,12 +3025,14 @@ SCM_DEFINE (scm_string_split, "string-split", 2, 0, 0,
 	    "@end lisp")
 #define FUNC_NAME s_scm_string_split
 {
  long idx, last_idx;
  int narrow;
  SCM res = SCM_EOL;
  SCM_VALIDATE_STRING (1, str);
-  SCM_VALIDATE_CHAR (2, chr);
+  
  if (SCM_CHARP (char_pred))
    {
      long idx, last_idx;
      int narrow;
      /* This is explicit wide/narrow logic (instead of using
         scm_i_string_ref) is a speed optimization.  */
@ -3031,7 +3044,7 @@ SCM_DEFINE (scm_string_split, "string-split", 2, 0, 0,
          while (idx >= 0)
            {
              last_idx = idx;
-          while (idx > 0 && buf[idx-1] != (char) SCM_CHAR(chr))
+              while (idx > 0 && buf[idx-1] != (char) SCM_CHAR(char_pred))
                idx--;
              if (idx >= 0)
                {
@ -3046,7 +3059,7 @@ SCM_DEFINE (scm_string_split, "string-split", 2, 0, 0,
          while (idx >= 0)
            {
              last_idx = idx;
-          while (idx > 0 && buf[idx-1] != SCM_CHAR(chr))
+              while (idx > 0 && buf[idx-1] != SCM_CHAR(char_pred))
                idx--;
              if (idx >= 0)
                {
@ -3055,6 +3068,30 @@ SCM_DEFINE (scm_string_split, "string-split", 2, 0, 0,
                }
            }
        }
    }
  else
    {
      SCM sidx, slast_idx;
      if (!SCM_CHARSETP (char_pred))
        SCM_ASSERT (scm_is_true (scm_procedure_p (char_pred)),
                    char_pred, SCM_ARG2, FUNC_NAME);
      /* Supporting predicates and character sets involves handling SCM
         values so there is less chance to optimize. */
      slast_idx = scm_string_length (str);
      for (;;)
        {
          sidx = scm_string_index_right (str, char_pred, SCM_INUM0, slast_idx);
          if (scm_is_false (sidx))
            break;
          res = scm_cons (scm_substring (str, scm_oneplus (sidx), slast_idx), res);
          slast_idx = sidx;
        }
      res = scm_cons (scm_substring (str, SCM_INUM0, slast_idx), res);
    }
  scm_remember_upto_here_1 (str);
  return res;
 }
--- a/libguile/srfi-13.h
+++ b/libguile/srfi-13.h
@ -110,7 +110,7 @@ SCM_API SCM scm_xsubstring (SCM s, SCM from, SCM to, SCM start, SCM end);
 SCM_API SCM scm_string_xcopy_x (SCM target, SCM tstart, SCM s, SCM sfrom, SCM sto, SCM start, SCM end);
 SCM_API SCM scm_string_replace (SCM s1, SCM s2, SCM start1, SCM end1, SCM start2, SCM end2);
 SCM_API SCM scm_string_tokenize (SCM s, SCM token_char, SCM start, SCM end);
-SCM_API SCM scm_string_split (SCM s, SCM chr);
+SCM_API SCM scm_string_split (SCM s, SCM char_pred);
 SCM_API SCM scm_string_filter (SCM char_pred, SCM s, SCM start, SCM end);
 SCM_API SCM scm_string_delete (SCM char_pred, SCM s, SCM start, SCM end);
--- a/test-suite/tests/strings.test
+++ b/test-suite/tests/strings.test
@ -557,7 +557,67 @@
  (pass-if "char 255"
    (equal? '("a" "b")
 	    (string-split (string #\a (integer->char 255) #\b)
-			  (integer->char 255)))))
+			  (integer->char 255))))
  (pass-if "empty string - char"
    (equal? '("")
            (string-split "" #\:)))
  (pass-if "non-empty - char - no delimiters"
    (equal? '("foobarfrob")
            (string-split "foobarfrob" #\:)))
  (pass-if "non-empty - char - delimiters"
    (equal? '("foo" "bar" "frob")
            (string-split "foo:bar:frob" #\:)))
  (pass-if "non-empty - char - leading delimiters"
    (equal? '("" "" "foo" "bar" "frob")
            (string-split "::foo:bar:frob" #\:)))
  (pass-if "non-empty - char - trailing delimiters"
    (equal? '("foo" "bar" "frob" "" "")
            (string-split "foo:bar:frob::" #\:)))
  (pass-if "empty string - charset"
    (equal? '("")
            (string-split "" (char-set #\:))))
  (pass-if "non-empty - charset - no delimiters"
    (equal? '("foobarfrob")
            (string-split "foobarfrob" (char-set #\:))))
  (pass-if "non-empty - charset - delimiters"
    (equal? '("foo" "bar" "frob")
            (string-split "foo:bar:frob" (char-set #\:))))
  (pass-if "non-empty - charset - leading delimiters"
    (equal? '("" "" "foo" "bar" "frob")
            (string-split "::foo:bar:frob" (char-set #\:))))
  (pass-if "non-empty - charset - trailing delimiters"
    (equal? '("foo" "bar" "frob" "" "")
            (string-split "foo:bar:frob::" (char-set #\:))))
  (pass-if "empty string - pred"
    (equal? '("")
            (string-split "" (negate char-alphabetic?))))
  (pass-if "non-empty - pred - no delimiters"
    (equal? '("foobarfrob")
            (string-split "foobarfrob" (negate char-alphabetic?))))
  (pass-if "non-empty - pred - delimiters"
    (equal? '("foo" "bar" "frob")
            (string-split "foo:bar:frob" (negate char-alphabetic?))))
  (pass-if "non-empty - pred - leading delimiters"
    (equal? '("" "" "foo" "bar" "frob")
            (string-split "::foo:bar:frob" (negate char-alphabetic?))))
  (pass-if "non-empty - pred - trailing delimiters"
    (equal? '("foo" "bar" "frob" "" "")
            (string-split "foo:bar:frob::" (negate char-alphabetic?)))))
 (with-test-prefix "substring-move!"