1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-04-30 03:40:34 +02:00

Revert "Handle CRLF and Unicode line endings in read-line"

This reverts commit 0f983e3db0.

After discussing with Mike we are going to punt the read-line changes
for now.  Open the port in O_TEXT mode if you want to chomp the CR in
CFLF sequences.
This commit is contained in:
Andy Wingo 2021-03-12 22:06:56 +01:00
parent 1c472fef54
commit e30ee90478
6 changed files with 25 additions and 171 deletions

6
NEWS
View file

@ -166,12 +166,6 @@ See the newly reorganized "Foreign Function Interface", for details.
These new interfaces replace `dynamic-link', `dynamic-pointer' and These new interfaces replace `dynamic-link', `dynamic-pointer' and
similar, which will eventually be deprecated. similar, which will eventually be deprecated.
** `read-line'
This now accepts return + newline and the Unicode line separator and
paragraph separator as line separators, as well as the newline and <eof>
line separators it handled before.
** `read-syntax' ** `read-syntax'
See "Annotated Scheme Read" in the manual. See "Annotated Scheme Read" in the manual.

View file

@ -755,10 +755,8 @@ a specified set of characters.
@deffn {Scheme Procedure} read-line [port] [handle-delim] @deffn {Scheme Procedure} read-line [port] [handle-delim]
Return a line of text from @var{port} if specified, otherwise from the Return a line of text from @var{port} if specified, otherwise from the
value returned by @code{(current-input-port)}. Under Unix, a line of value returned by @code{(current-input-port)}. Under Unix, a line of text
text is terminated by the first end-of-line character or by end-of-file. is terminated by the first end-of-line character or by end-of-file.
The end-of-line characters handled are newline, carriage return plus
newline, or the Unicode line or paragraph separators.
If @var{handle-delim} is specified, it should be one of the following If @var{handle-delim} is specified, it should be one of the following
symbols: symbols:
@ -773,9 +771,7 @@ Append the terminating delimiter (if any) to the returned string.
Push the terminating delimiter (if any) back on to the port. Push the terminating delimiter (if any) back on to the port.
@item split @item split
Return a pair containing the string read from the port and the Return a pair containing the string read from the port and the
terminating delimiter or end-of-file object. The delimiter will either terminating delimiter or end-of-file object.
be a single character for newline or the Unicode line or paragraph
separators, or it will be the string @code{"\r\n"}.
@end table @end table
@end deffn @end deffn

View file

@ -112,11 +112,10 @@ SCM_DEFINE (scm_read_delimited_x, "%read-delimited!", 3, 3, 0,
SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0, SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0,
(SCM port), (SCM port),
"Read a line from @var{port}, allocating storage as necessary.\n" "Read a newline-terminated line from @var{port}, allocating storage as\n"
"The terminator (if any) is removed from the string,\n" "necessary. The newline terminator (if any) is removed from the string,\n"
"and a pair consisting of the line and its delimiter is returned. The\n" "and a pair consisting of the line and its delimiter is returned. The\n"
"delimiter may be either a newline, return + newline, the Unicode\n" "delimiter may be either a newline or the @var{eof-object}; if\n"
"line or paragraph separators, or the @var{eof-object}; if\n"
"@code{%read-line} is called at the end of file, it returns the pair\n" "@code{%read-line} is called at the end of file, it returns the pair\n"
"@code{(#<eof> . #<eof>)}.") "@code{(#<eof> . #<eof>)}.")
#define FUNC_NAME s_scm_read_line #define FUNC_NAME s_scm_read_line
@ -128,7 +127,6 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0,
SCM line, strings, result; SCM line, strings, result;
scm_t_wchar buf[LINE_BUFFER_SIZE], delim; scm_t_wchar buf[LINE_BUFFER_SIZE], delim;
size_t index; size_t index;
int cr = 0;
if (SCM_UNBNDP (port)) if (SCM_UNBNDP (port))
port = scm_current_input_port (); port = scm_current_input_port ();
@ -154,24 +152,12 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0,
buf[index] = scm_getc (port); buf[index] = scm_getc (port);
switch (buf[index]) switch (buf[index])
{ {
case EOF:
case '\n': case '\n':
delim = buf[index]; delim = buf[index];
break; break;
case EOF:
case 0x2028: /* U+2028 LINE SEPARATOR */
case 0x2029: /* U+2029 PARAGRAPH SEPARATOR */
cr = 0;
delim = buf[index];
break;
case '\r':
cr = 1;
index ++;
break;
default: default:
cr = 0;
index++; index++;
} }
} }
@ -179,33 +165,20 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0,
while (delim == 0); while (delim == 0);
if (SCM_LIKELY (scm_is_false (strings))) if (SCM_LIKELY (scm_is_false (strings)))
{
/* The fast path. */ /* The fast path. */
if (cr)
line = scm_from_utf32_stringn (buf, index - 1);
else
line = scm_from_utf32_stringn (buf, index); line = scm_from_utf32_stringn (buf, index);
}
else else
{ {
/* Aggregate the intermediary results. */ /* Aggregate the intermediary results. */
if (cr)
strings = scm_cons (scm_from_utf32_stringn (buf, index - 1), strings);
else
strings = scm_cons (scm_from_utf32_stringn (buf, index), strings); strings = scm_cons (scm_from_utf32_stringn (buf, index), strings);
line = scm_string_concatenate (scm_reverse (strings)); line = scm_string_concatenate (scm_reverse (strings));
} }
if (delim == EOF && scm_i_string_length (line) == 0) if (delim == EOF && scm_i_string_length (line) == 0)
result = scm_cons (SCM_EOF_VAL, SCM_EOF_VAL); result = scm_cons (SCM_EOF_VAL, SCM_EOF_VAL);
else
{
if (cr)
result = scm_cons (line, scm_from_latin1_string("\r\n"));
else else
result = scm_cons (line, result = scm_cons (line,
delim == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (delim)); delim == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (delim));
}
return result; return result;
#undef LINE_BUFFER_SIZE #undef LINE_BUFFER_SIZE

View file

@ -1,5 +1,5 @@
;;; Ports, implemented in Scheme ;;; Ports, implemented in Scheme
;;; Copyright (C) 2016, 2018, 2021 Free Software Foundation, Inc. ;;; Copyright (C) 2016, 2019 Free Software Foundation, Inc.
;;; ;;;
;;; This library is free software: you can redistribute it and/or modify ;;; This library is free software: you can redistribute it and/or modify
;;; it under the terms of the GNU Lesser General Public License as ;;; it under the terms of the GNU Lesser General Public License as
@ -689,81 +689,10 @@
(define* (read-line #:optional (port (current-input-port)) (define* (read-line #:optional (port (current-input-port))
(handle-delim 'trim)) (handle-delim 'trim))
(let* ((line/delim (%read-line port)) (read-delimited "\n" port handle-delim))
(line (car line/delim))
(delim (cdr line/delim)))
(case handle-delim
((trim) line)
((split) line/delim)
((concat) (if (and (string? line) (char? delim))
(string-append line (string delim))
line))
((peek) (if (char? delim)
(unread-char delim port))
line)
(else
(error "unexpected handle-delim value: " handle-delim)))))
(define* (%read-line port) (define* (%read-line port)
(let ((LINE_BUFFER_SIZE 256)) (read-line port 'split))
(let ((strings #f)
(result #f)
(buf (make-string LINE_BUFFER_SIZE #\nul))
(delim #f)
(index 0)
(cr #f)
(go #t))
(cond
((not (input-port? port))
(error "Not an input port." port))
(else
(while go
(cond
((>= index LINE_BUFFER_SIZE)
(set! strings (cons (substring buf 0 index)
(or strings '())))
(set! index 0))
(else
(let ((c (read-char port)))
(cond
((or (eof-object? c)
(char=? c #\x2028) ; U+2028 LINE SEPARATOR
(char=? c #\x2029)) ; U+2029 PARAGRAPH SEPARATOR
(set! cr #f)
(set! delim c))
((char=? c #\newline)
(set! delim c))
((char=? c #\return)
(set! cr #t)
(string-set! buf index c)
(set! index (1+ index)))
(else
(set! cr #f)
(string-set! buf index c)
(set! index (1+ index)))))))
(if (or (eof-object? delim)
(char? delim))
(set! go #f)))
(let ((line (if (not strings)
;; A short string.
(if cr
(substring buf 0 (1- index))
(substring buf 0 index))
;; Else, aggregate the intermediary results.
(begin
(if cr
(set! strings (cons (substring buf 0 (1- index)) strings))
(set! strings (cons (substring buf 0 index) strings)))
(apply string-append (reverse strings))))))
(if (and (eof-object? delim)
(zero? (string-length line)))
(cons the-eof-object the-eof-object)
;; Else
(if cr
(cons line "\r\n")
(cons line delim)))))))))
(define* (put-string port str #:optional (start 0) (define* (put-string port str #:optional (start 0)
(count (- (string-length str) start))) (count (- (string-length str) start)))

View file

@ -157,12 +157,13 @@ The default writer will call put-string."
Raise a 'bad-header' exception if the line does not end in CRLF or LF, Raise a 'bad-header' exception if the line does not end in CRLF or LF,
or if EOF is reached." or if EOF is reached."
(match (%read-line port) (match (%read-line port)
(((? string? line) . "\r\n")
line)
(((? string? line) . #\newline) (((? string? line) . #\newline)
;; We are more tolerant than the RFC in that we tolerate LF-only ;; '%read-line' does not consider #\return a delimiter; so if it's
;; endings. ;; there, remove it. We are more tolerant than the RFC in that we
line) ;; tolerate LF-only endings.
(if (string-suffix? "\r" line)
(string-drop-right line 1)
line))
((line . _) ;EOF or missing delimiter ((line . _) ;EOF or missing delimiter
(bad-header 'read-header-line line)))) (bad-header 'read-header-line line))))
@ -183,7 +184,8 @@ was known but the value was invalid.
Returns the end-of-file object for both values if the end of the message Returns the end-of-file object for both values if the end of the message
body was reached (i.e., a blank line)." body was reached (i.e., a blank line)."
(let ((line (read-header-line port))) (let ((line (read-header-line port)))
(if (string-null? line) (if (or (string-null? line)
(string=? line "\r"))
(values *eof* *eof*) (values *eof* *eof*)
(let* ((delim (or (string-index line #\:) (let* ((delim (or (string-index line #\:)
(bad-header '%read line))) (bad-header '%read line)))

View file

@ -1,7 +1,7 @@
;;;; rdelim.test --- Delimited I/O. -*- mode: scheme; coding: utf-8; -*- ;;;; rdelim.test --- Delimited I/O. -*- mode: scheme; coding: utf-8; -*-
;;;; Ludovic Courtès <ludo@gnu.org> ;;;; Ludovic Courtès <ludo@gnu.org>
;;;; ;;;;
;;;; Copyright (C) 2011, 2013, 2014, 2021 Free Software Foundation, Inc. ;;;; Copyright (C) 2011, 2013, 2014 Free Software Foundation, Inc.
;;;; ;;;;
;;;; This library is free software; you can redistribute it and/or ;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public ;;;; modify it under the terms of the GNU Lesser General Public
@ -62,46 +62,6 @@
(read-line p 'split))) (read-line p 'split)))
(eof-object? (read-line p))))) (eof-object? (read-line p)))))
(pass-if "two lines, split, CRLF"
(let* ((s "foo\r\nbar\r\n")
(p (open-input-string s)))
(and (equal? '(("foo" . "\r\n")
("bar" . "\r\n"))
(list (read-line p 'split)
(read-line p 'split)))
(eof-object? (read-line p)))))
(pass-if "two long lines, split, CRLF"
;; Must be longer than 256 codepoints
(let* ((text0 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
(text1 (string-append text0 text0 text0 text0 text0))
(text2 (string-append text1 "\r\n" text1 "\r\n")))
(let* ((s text2)
(p (open-input-string s)))
(and (equal? `((,text1 . "\r\n")
(,text1 . "\r\n"))
(list (read-line p 'split)
(read-line p 'split)))
(eof-object? (read-line p))))))
(pass-if "two lines, split, LS"
(let* ((s "foo\u2028bar\u2028")
(p (open-input-string s)))
(and (equal? '(("foo" . #\x2028)
("bar" . #\x2028))
(list (read-line p 'split)
(read-line p 'split)))
(eof-object? (read-line p)))))
(pass-if "two lines, split, PS"
(let* ((s "foo\u2029bar\u2029")
(p (open-input-string s)))
(and (equal? '(("foo" . #\x2029)
("bar" . #\x2029))
(list (read-line p 'split)
(read-line p 'split)))
(eof-object? (read-line p)))))
(pass-if "two Greek lines, trim" (pass-if "two Greek lines, trim"
(let* ((s "λαμβδα\nμυ\n") (let* ((s "λαμβδα\nμυ\n")
(p (open-input-string s))) (p (open-input-string s)))