1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-04-29 19:30:36 +02:00

Handle CRLF and Unicode line endings in read-line

* libguile/rdelim.c (scm_read_line): handle CRLF, LS and PS
* module/ice-9/suspendable-ports.scm (read-line): handle CRLF, LS, and PS
* module/web/http.scm (read-header-line): take advantage of CRLF in read-line
   (read-header): don't need to test for \return
* test-suite/tests/rdelim.test: new tests for read-line CRLF, LS and PS
* doc/ref/api-io.texi: update doc for read-line
This commit is contained in:
Mike Gran 2021-03-11 19:42:33 -08:00
parent a744f98dcc
commit 0f983e3db0
5 changed files with 165 additions and 25 deletions

View file

@ -755,8 +755,10 @@ a specified set of characters.
@deffn {Scheme Procedure} read-line [port] [handle-delim]
Return a line of text from @var{port} if specified, otherwise from the
value returned by @code{(current-input-port)}. Under Unix, a line of text
is terminated by the first end-of-line character or by end-of-file.
value returned by @code{(current-input-port)}. Under Unix, a line of
text is terminated by the first end-of-line character or by end-of-file.
The end-of-line characters handled are newline, carriage return plus
newline, or the Unicode line or paragraph separators.
If @var{handle-delim} is specified, it should be one of the following
symbols:
@ -771,7 +773,9 @@ Append the terminating delimiter (if any) to the returned string.
Push the terminating delimiter (if any) back on to the port.
@item split
Return a pair containing the string read from the port and the
terminating delimiter or end-of-file object.
terminating delimiter or end-of-file object. The delimiter will either
be a single character for newline or the Unicode line or paragraph
separators, or it will be the string @code{"\r\n"}.
@end table
@end deffn

View file

@ -112,10 +112,11 @@ SCM_DEFINE (scm_read_delimited_x, "%read-delimited!", 3, 3, 0,
SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0,
(SCM port),
"Read a newline-terminated line from @var{port}, allocating storage as\n"
"necessary. The newline terminator (if any) is removed from the string,\n"
"Read a line from @var{port}, allocating storage as necessary.\n"
"The terminator (if any) is removed from the string,\n"
"and a pair consisting of the line and its delimiter is returned. The\n"
"delimiter may be either a newline or the @var{eof-object}; if\n"
"delimiter may be either a newline, return + newline, the Unicode\n"
"line or paragraph separators, or the @var{eof-object}; if\n"
"@code{%read-line} is called at the end of file, it returns the pair\n"
"@code{(#<eof> . #<eof>)}.")
#define FUNC_NAME s_scm_read_line
@ -127,6 +128,7 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0,
SCM line, strings, result;
scm_t_wchar buf[LINE_BUFFER_SIZE], delim;
size_t index;
int cr = 0;
if (SCM_UNBNDP (port))
port = scm_current_input_port ();
@ -152,12 +154,24 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0,
buf[index] = scm_getc (port);
switch (buf[index])
{
case EOF:
case '\n':
delim = buf[index];
break;
break;
case EOF:
case 0x2028: /* U+2028 LINE SEPARATOR */
case 0x2029: /* U+2029 PARAGRAPH SEPARATOR */
cr = 0;
delim = buf[index];
break;
case '\r':
cr = 1;
index ++;
break;
default:
cr = 0;
index++;
}
}
@ -165,20 +179,33 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0,
while (delim == 0);
if (SCM_LIKELY (scm_is_false (strings)))
/* The fast path. */
line = scm_from_utf32_stringn (buf, index);
{
/* The fast path. */
if (cr)
line = scm_from_utf32_stringn (buf, index - 1);
else
line = scm_from_utf32_stringn (buf, index);
}
else
{
/* Aggregate the intermediary results. */
strings = scm_cons (scm_from_utf32_stringn (buf, index), strings);
if (cr)
strings = scm_cons (scm_from_utf32_stringn (buf, index - 1), strings);
else
strings = scm_cons (scm_from_utf32_stringn (buf, index), strings);
line = scm_string_concatenate (scm_reverse (strings));
}
if (delim == EOF && scm_i_string_length (line) == 0)
result = scm_cons (SCM_EOF_VAL, SCM_EOF_VAL);
else
result = scm_cons (line,
delim == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (delim));
{
if (cr)
result = scm_cons (line, scm_from_latin1_string("\r\n"));
else
result = scm_cons (line,
delim == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (delim));
}
return result;
#undef LINE_BUFFER_SIZE

View file

@ -1,5 +1,5 @@
;;; Ports, implemented in Scheme
;;; Copyright (C) 2016, 2019 Free Software Foundation, Inc.
;;; Copyright (C) 2016, 2018, 2021 Free Software Foundation, Inc.
;;;
;;; This library is free software: you can redistribute it and/or modify
;;; it under the terms of the GNU Lesser General Public License as
@ -689,10 +689,81 @@
(define* (read-line #:optional (port (current-input-port))
(handle-delim 'trim))
(read-delimited "\n" port handle-delim))
(let* ((line/delim (%read-line port))
(line (car line/delim))
(delim (cdr line/delim)))
(case handle-delim
((trim) line)
((split) line/delim)
((concat) (if (and (string? line) (char? delim))
(string-append line (string delim))
line))
((peek) (if (char? delim)
(unread-char delim port))
line)
(else
(error "unexpected handle-delim value: " handle-delim)))))
(define* (%read-line port)
(read-line port 'split))
(let ((LINE_BUFFER_SIZE 256))
(let ((strings #f)
(result #f)
(buf (make-string LINE_BUFFER_SIZE #\nul))
(delim #f)
(index 0)
(cr #f)
(go #t))
(cond
((not (input-port? port))
(error "Not an input port." port))
(else
(while go
(cond
((>= index LINE_BUFFER_SIZE)
(set! strings (cons (substring buf 0 index)
(or strings '())))
(set! index 0))
(else
(let ((c (read-char port)))
(cond
((or (eof-object? c)
(char=? c #\x2028) ; U+2028 LINE SEPARATOR
(char=? c #\x2029)) ; U+2029 PARAGRAPH SEPARATOR
(set! cr #f)
(set! delim c))
((char=? c #\newline)
(set! delim c))
((char=? c #\return)
(set! cr #t)
(string-set! buf index c)
(set! index (1+ index)))
(else
(set! cr #f)
(string-set! buf index c)
(set! index (1+ index)))))))
(if (or (eof-object? delim)
(char? delim))
(set! go #f)))
(let ((line (if (not strings)
;; A short string.
(if cr
(substring buf 0 (1- index))
(substring buf 0 index))
;; Else, aggregate the intermediary results.
(begin
(if cr
(set! strings (cons (substring buf 0 (1- index)) strings))
(set! strings (cons (substring buf 0 index) strings)))
(apply string-append (reverse strings))))))
(if (and (eof-object? delim)
(zero? (string-length line)))
(cons the-eof-object the-eof-object)
;; Else
(if cr
(cons line "\r\n")
(cons line delim)))))))))
(define* (put-string port str #:optional (start 0)
(count (- (string-length str) start)))

View file

@ -157,13 +157,12 @@ The default writer will call put-string."
Raise a 'bad-header' exception if the line does not end in CRLF or LF,
or if EOF is reached."
(match (%read-line port)
(((? string? line) . "\r\n")
line)
(((? string? line) . #\newline)
;; '%read-line' does not consider #\return a delimiter; so if it's
;; there, remove it. We are more tolerant than the RFC in that we
;; tolerate LF-only endings.
(if (string-suffix? "\r" line)
(string-drop-right line 1)
line))
;; We are more tolerant than the RFC in that we tolerate LF-only
;; endings.
line)
((line . _) ;EOF or missing delimiter
(bad-header 'read-header-line line))))
@ -184,8 +183,7 @@ was known but the value was invalid.
Returns the end-of-file object for both values if the end of the message
body was reached (i.e., a blank line)."
(let ((line (read-header-line port)))
(if (or (string-null? line)
(string=? line "\r"))
(if (string-null? line)
(values *eof* *eof*)
(let* ((delim (or (string-index line #\:)
(bad-header '%read line)))

View file

@ -1,7 +1,7 @@
;;;; rdelim.test --- Delimited I/O. -*- mode: scheme; coding: utf-8; -*-
;;;; Ludovic Courtès <ludo@gnu.org>
;;;;
;;;; Copyright (C) 2011, 2013, 2014 Free Software Foundation, Inc.
;;;; Copyright (C) 2011, 2013, 2014, 2021 Free Software Foundation, Inc.
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
@ -62,6 +62,46 @@
(read-line p 'split)))
(eof-object? (read-line p)))))
(pass-if "two lines, split, CRLF"
(let* ((s "foo\r\nbar\r\n")
(p (open-input-string s)))
(and (equal? '(("foo" . "\r\n")
("bar" . "\r\n"))
(list (read-line p 'split)
(read-line p 'split)))
(eof-object? (read-line p)))))
(pass-if "two long lines, split, CRLF"
;; Must be longer than 256 codepoints
(let* ((text0 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
(text1 (string-append text0 text0 text0 text0 text0))
(text2 (string-append text1 "\r\n" text1 "\r\n")))
(let* ((s text2)
(p (open-input-string s)))
(and (equal? `((,text1 . "\r\n")
(,text1 . "\r\n"))
(list (read-line p 'split)
(read-line p 'split)))
(eof-object? (read-line p))))))
(pass-if "two lines, split, LS"
(let* ((s "foo\u2028bar\u2028")
(p (open-input-string s)))
(and (equal? '(("foo" . #\x2028)
("bar" . #\x2028))
(list (read-line p 'split)
(read-line p 'split)))
(eof-object? (read-line p)))))
(pass-if "two lines, split, PS"
(let* ((s "foo\u2029bar\u2029")
(p (open-input-string s)))
(and (equal? '(("foo" . #\x2029)
("bar" . #\x2029))
(list (read-line p 'split)
(read-line p 'split)))
(eof-object? (read-line p)))))
(pass-if "two Greek lines, trim"
(let* ((s "λαμβδα\nμυ\n")
(p (open-input-string s)))