From 9bb8793a6cea35969fefab8fd535fcab16219e5e Mon Sep 17 00:00:00 2001 From: Michael Gran Date: Sun, 25 Jun 2023 08:16:34 -0700 Subject: [PATCH] Allow read-line to handle "\r\n" as a line terminator Adds CRLF as a line ending. %read-line will return these. In the case of CRLF, %read-line will return a string "\r\n" as the line ending. * libguile/rdelim.c (scm_read_line): handle CRLF line delimiter * module/ice-9/suspendable-ports.scm (%read-line): modify to handle CRLF line delimiter (read-line): use modified %read-line * test-suite/tests/rdelim.test ("two lines, split, CRLF"): new test ("two long lines, split, CRLF"): new test * doc/ref/api-io.texi: update read-line documentation --- NEWS | 11 +++++++ doc/ref/api-io.texi | 6 ++-- libguile/rdelim.c | 38 +++++++++++++++++++----- module/ice-9/suspendable-ports.scm | 46 ++++++++++++++++++++++++++---- test-suite/tests/rdelim.test | 22 ++++++++++++++ 5 files changed, 109 insertions(+), 14 deletions(-) diff --git a/NEWS b/NEWS index dfcd4fe0d..3d13bda74 100644 --- a/NEWS +++ b/NEWS @@ -109,6 +109,17 @@ eval-in-sandbox is modified so that #:time-limit accepts #f to disable the time limit. Systems without SIGALRM can use eval-in-sandbox if the time limit is disabled. +** 'read-line' detects '\r\n' as a line delimiter + +read-line is updated to detect a carriage return / line feed pair as a +line delimiter. When CRLF is detected at the end of a line, the +returned line delimiter is "\r\n". Previously, when a line terminated +with '\r\n' was read, the return character was appended to the string, +and the returned delimiter was '\n'. + +Carriage return / line feed is a common line deliminator for Windows +text files. + * Bug fixes ** `basename` now checks the suffix against the base name, not the full path diff --git a/doc/ref/api-io.texi b/doc/ref/api-io.texi index 1bc5c0836..70e30dc48 100644 --- a/doc/ref/api-io.texi +++ b/doc/ref/api-io.texi @@ -894,8 +894,10 @@ a specified set of characters. @deffn {Scheme Procedure} read-line [port] [handle-delim] Return a line of text from @var{port} if specified, otherwise from the -value returned by @code{(current-input-port)}. Under Unix, a line of text -is terminated by the first end-of-line character or by end-of-file. +value returned by @code{(current-input-port)}. A line of +text is terminated by a single linefeed character, a return followed +by a linefeed, or by end-of-file. + If @var{handle-delim} is specified, it should be one of the following symbols: diff --git a/libguile/rdelim.c b/libguile/rdelim.c index 9d41712dd..c0ae0f377 100644 --- a/libguile/rdelim.c +++ b/libguile/rdelim.c @@ -127,6 +127,7 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0, SCM line, strings, result; scm_t_wchar buf[LINE_BUFFER_SIZE], delim; size_t index; + int cr = 0; if (SCM_UNBNDP (port)) port = scm_current_input_port (); @@ -152,12 +153,22 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0, buf[index] = scm_getc (port); switch (buf[index]) { - case EOF: case '\n': delim = buf[index]; - break; + break; + + case EOF: + cr = 0; + delim = buf[index]; + break; + + case '\r': + cr = 1; + index ++; + break; default: + cr = 0; index++; } } @@ -165,20 +176,33 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0, while (delim == 0); if (SCM_LIKELY (scm_is_false (strings))) - /* The fast path. */ - line = scm_from_utf32_stringn (buf, index); + { + /* The fast path. */ + if (cr) + line = scm_from_utf32_stringn (buf, index - 1); + else + line = scm_from_utf32_stringn (buf, index); + } else { /* Aggregate the intermediary results. */ - strings = scm_cons (scm_from_utf32_stringn (buf, index), strings); + if (cr) + strings = scm_cons (scm_from_utf32_stringn (buf, index - 1), strings); + else + strings = scm_cons (scm_from_utf32_stringn (buf, index), strings); line = scm_string_concatenate (scm_reverse (strings)); } if (delim == EOF && scm_i_string_length (line) == 0) result = scm_cons (SCM_EOF_VAL, SCM_EOF_VAL); else - result = scm_cons (line, - delim == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (delim)); + { + if (cr) + result = scm_cons (line, scm_from_latin1_string("\r\n")); + else + result = scm_cons (line, + delim == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (delim)); + } return result; #undef LINE_BUFFER_SIZE diff --git a/module/ice-9/suspendable-ports.scm b/module/ice-9/suspendable-ports.scm index 720f10c0f..6645bd874 100644 --- a/module/ice-9/suspendable-ports.scm +++ b/module/ice-9/suspendable-ports.scm @@ -1,5 +1,5 @@ ;;; Ports, implemented in Scheme -;;; Copyright (C) 2016, 2019 Free Software Foundation, Inc. +;;; Copyright (C) 2016, 2018, 2019 Free Software Foundation, Inc. ;;; ;;; This library is free software: you can redistribute it and/or modify ;;; it under the terms of the GNU Lesser General Public License as @@ -691,12 +691,48 @@ (or (eqv? char (string-ref delims i)) (lp (1+ i))))))))))) +(define* (%read-line port) + (let* ((return-flag-delim-and-chars + (let loop ((cr #f) + (chars '()) + (c (read-char port))) + (cond + ((eof-object? c) + (list #f c chars)) + ((char=? c #\newline) + (list cr c chars)) + ((char=? c #\return) + (loop #t (cons c chars) (read-char port))) + (else + (loop #f (cons c chars) (read-char port)))))) + (return-flag (car return-flag-delim-and-chars)) + (delim (cadr return-flag-delim-and-chars)) + (chars (caddr return-flag-delim-and-chars))) + + (if (and (eof-object? delim) + (null? chars)) + (cons the-eof-object the-eof-object) + ;; Else + (if return-flag + (cons (list->string (reverse (cdr chars))) "\r\n") + (cons (list->string (reverse chars)) delim))))) + (define* (read-line #:optional (port (current-input-port)) (handle-delim 'trim)) - (read-delimited "\n" port handle-delim)) - -(define* (%read-line port) - (read-line port 'split)) + (let* ((line/delim (%read-line port)) + (line (car line/delim)) + (delim (cdr line/delim))) + (case handle-delim + ((trim) line) + ((split) line/delim) + ((concat) (if (and (string? line) (char? delim)) + (string-append line (string delim)) + line)) + ((peek) (if (char? delim) + (unread-char delim port)) + line) + (else + (error "unexpected handle-delim value: " handle-delim))))) (define* (put-string port str #:optional (start 0) (count (- (string-length str) start))) diff --git a/test-suite/tests/rdelim.test b/test-suite/tests/rdelim.test index ad44278d2..80ea86e76 100644 --- a/test-suite/tests/rdelim.test +++ b/test-suite/tests/rdelim.test @@ -63,6 +63,28 @@ (read-line p 'split))) (eof-object? (read-line p))))) + (pass-if "two lines, split, CRLF" + (let* ((s "foo\r\nbar\r\n") + (p (open-input-string s))) + (and (equal? '(("foo" . "\r\n") + ("bar" . "\r\n")) + (list (read-line p 'split) + (read-line p 'split))) + (eof-object? (read-line p))))) + + (pass-if "two long lines, split, CRLF" + ;; Must be longer than 256 codepoints + (let* ((text0 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") + (text1 (string-append text0 text0 text0 text0 text0)) + (text2 (string-append text1 "\r\n" text1 "\r\n"))) + (let* ((s text2) + (p (open-input-string s))) + (and (equal? `((,text1 . "\r\n") + (,text1 . "\r\n")) + (list (read-line p 'split) + (read-line p 'split))) + (eof-object? (read-line p)))))) + (pass-if "two Greek lines, trim" (let* ((s "λαμβδα\nμυ\n") (p (open-input-string s)))