From 77b33170f4113c1d37f62c66a4807996187d2e24 Mon Sep 17 00:00:00 2001 From: Michael Gran Date: Tue, 17 Apr 2018 08:22:18 -0700 Subject: [PATCH] Let read-line handle alternate line endings Adds CRLF, NEL, PS and LS as line endings. %read-line will return these. In the case of CRLF, %read-line will return a string "\r\n" as the line ending. * libguile/rdelim.c (scm_read_line): handle more line delimiters * test-suite/tests/rdelim.test ("two lines, split, CRLF"): new test ("two long lines, split, CRLF", "two lines, split, NEL"): new tests ("two lines, split, LS", "two lines, split, PS"): new tests --- libguile/rdelim.c | 41 ++++++++++++++++++++++++------ test-suite/tests/rdelim.test | 49 ++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 7 deletions(-) diff --git a/libguile/rdelim.c b/libguile/rdelim.c index 80962bc5e..62795b9ba 100644 --- a/libguile/rdelim.c +++ b/libguile/rdelim.c @@ -126,6 +126,7 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0, SCM line, strings, result; scm_t_wchar buf[LINE_BUFFER_SIZE], delim; size_t index; + int cr = 0; if (SCM_UNBNDP (port)) port = scm_current_input_port (); @@ -151,12 +152,25 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0, buf[index] = scm_getc (port); switch (buf[index]) { - case EOF: case '\n': delim = buf[index]; - break; + break; + + case EOF: + case 0x85: + case 0x2028: + case 0x2029: + cr = 0; + delim = buf[index]; + break; + + case '\r': + cr = 1; + index ++; + break; default: + cr = 0; index++; } } @@ -164,20 +178,33 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0, while (delim == 0); if (SCM_LIKELY (scm_is_false (strings))) - /* The fast path. */ - line = scm_from_utf32_stringn (buf, index); + { + /* The fast path. */ + if (cr) + line = scm_from_utf32_stringn (buf, index - 1); + else + line = scm_from_utf32_stringn (buf, index); + } else { /* Aggregate the intermediary results. */ - strings = scm_cons (scm_from_utf32_stringn (buf, index), strings); + if (cr) + strings = scm_cons (scm_from_utf32_stringn (buf, index - 1), strings); + else + strings = scm_cons (scm_from_utf32_stringn (buf, index), strings); line = scm_string_concatenate (scm_reverse (strings)); } if (delim == EOF && scm_i_string_length (line) == 0) result = scm_cons (SCM_EOF_VAL, SCM_EOF_VAL); else - result = scm_cons (line, - delim == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (delim)); + { + if (cr) + result = scm_cons (line, scm_from_latin1_string("\r\n")); + else + result = scm_cons (line, + delim == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (delim)); + } return result; #undef LINE_BUFFER_SIZE diff --git a/test-suite/tests/rdelim.test b/test-suite/tests/rdelim.test index 3aaa0b253..7f9117b1e 100644 --- a/test-suite/tests/rdelim.test +++ b/test-suite/tests/rdelim.test @@ -62,6 +62,55 @@ (read-line p 'split))) (eof-object? (read-line p))))) + (pass-if "two lines, split, CRLF" + (let* ((s "foo\r\nbar\r\n") + (p (open-input-string s))) + (and (equal? '(("foo" . "\r\n") + ("bar" . "\r\n")) + (list (read-line p 'split) + (read-line p 'split))) + (eof-object? (read-line p))))) + + (pass-if "two long lines, split, CRLF" + ;; Must be longer than 256 codepoints + (let* ((text0 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") + (text1 (string-append text0 text0 text0 text0 text0)) + (text2 (string-append text1 "\r\n" text1 "\r\n"))) + (let* ((s text2) + (p (open-input-string s))) + (and (equal? `((,text1 . "\r\n") + (,text1 . "\r\n")) + (list (read-line p 'split) + (read-line p 'split))) + (eof-object? (read-line p)))))) + + (pass-if "two lines, split, NEL" + (let* ((s "foo\x85bar\x85") + (p (open-input-string s))) + (and (equal? '(("foo" . #\x85) + ("bar" . #\x85)) + (list (read-line p 'split) + (read-line p 'split))) + (eof-object? (read-line p))))) + + (pass-if "two lines, split, LS" + (let* ((s "foo\u2028bar\u2028") + (p (open-input-string s))) + (and (equal? '(("foo" . #\x2028) + ("bar" . #\x2028)) + (list (read-line p 'split) + (read-line p 'split))) + (eof-object? (read-line p))))) + + (pass-if "two lines, split, PS" + (let* ((s "foo\u2029bar\u2029") + (p (open-input-string s))) + (and (equal? '(("foo" . #\x2029) + ("bar" . #\x2029)) + (list (read-line p 'split) + (read-line p 'split))) + (eof-object? (read-line p))))) + (pass-if "two Greek lines, trim" (let* ((s "λαμβδα\nμυ\n") (p (open-input-string s)))