1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-04-30 03:40:34 +02:00

Handle CRLF and Unicode line endings in read-line

* libguile/rdelim.c (scm_read_line): handle CRLF, LS and PS
* module/ice-9/suspendable-ports.scm (read-line): handle CRLF, LS, and PS
* module/web/http.scm (read-header-line): take advantage of CRLF in read-line
   (read-header): don't need to test for \return
* test-suite/tests/rdelim.test: new tests for read-line CRLF, LS and PS
* doc/ref/api-io.texi: update doc for read-line
This commit is contained in:
Mike Gran 2021-03-11 19:42:33 -08:00
parent a744f98dcc
commit 0f983e3db0
5 changed files with 165 additions and 25 deletions

View file

@ -755,8 +755,10 @@ a specified set of characters.
@deffn {Scheme Procedure} read-line [port] [handle-delim] @deffn {Scheme Procedure} read-line [port] [handle-delim]
Return a line of text from @var{port} if specified, otherwise from the Return a line of text from @var{port} if specified, otherwise from the
value returned by @code{(current-input-port)}. Under Unix, a line of text value returned by @code{(current-input-port)}. Under Unix, a line of
is terminated by the first end-of-line character or by end-of-file. text is terminated by the first end-of-line character or by end-of-file.
The end-of-line characters handled are newline, carriage return plus
newline, or the Unicode line or paragraph separators.
If @var{handle-delim} is specified, it should be one of the following If @var{handle-delim} is specified, it should be one of the following
symbols: symbols:
@ -771,7 +773,9 @@ Append the terminating delimiter (if any) to the returned string.
Push the terminating delimiter (if any) back on to the port. Push the terminating delimiter (if any) back on to the port.
@item split @item split
Return a pair containing the string read from the port and the Return a pair containing the string read from the port and the
terminating delimiter or end-of-file object. terminating delimiter or end-of-file object. The delimiter will either
be a single character for newline or the Unicode line or paragraph
separators, or it will be the string @code{"\r\n"}.
@end table @end table
@end deffn @end deffn

View file

@ -112,10 +112,11 @@ SCM_DEFINE (scm_read_delimited_x, "%read-delimited!", 3, 3, 0,
SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0, SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0,
(SCM port), (SCM port),
"Read a newline-terminated line from @var{port}, allocating storage as\n" "Read a line from @var{port}, allocating storage as necessary.\n"
"necessary. The newline terminator (if any) is removed from the string,\n" "The terminator (if any) is removed from the string,\n"
"and a pair consisting of the line and its delimiter is returned. The\n" "and a pair consisting of the line and its delimiter is returned. The\n"
"delimiter may be either a newline or the @var{eof-object}; if\n" "delimiter may be either a newline, return + newline, the Unicode\n"
"line or paragraph separators, or the @var{eof-object}; if\n"
"@code{%read-line} is called at the end of file, it returns the pair\n" "@code{%read-line} is called at the end of file, it returns the pair\n"
"@code{(#<eof> . #<eof>)}.") "@code{(#<eof> . #<eof>)}.")
#define FUNC_NAME s_scm_read_line #define FUNC_NAME s_scm_read_line
@ -127,6 +128,7 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0,
SCM line, strings, result; SCM line, strings, result;
scm_t_wchar buf[LINE_BUFFER_SIZE], delim; scm_t_wchar buf[LINE_BUFFER_SIZE], delim;
size_t index; size_t index;
int cr = 0;
if (SCM_UNBNDP (port)) if (SCM_UNBNDP (port))
port = scm_current_input_port (); port = scm_current_input_port ();
@ -152,12 +154,24 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0,
buf[index] = scm_getc (port); buf[index] = scm_getc (port);
switch (buf[index]) switch (buf[index])
{ {
case EOF:
case '\n': case '\n':
delim = buf[index]; delim = buf[index];
break; break;
case EOF:
case 0x2028: /* U+2028 LINE SEPARATOR */
case 0x2029: /* U+2029 PARAGRAPH SEPARATOR */
cr = 0;
delim = buf[index];
break;
case '\r':
cr = 1;
index ++;
break;
default: default:
cr = 0;
index++; index++;
} }
} }
@ -165,20 +179,33 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0,
while (delim == 0); while (delim == 0);
if (SCM_LIKELY (scm_is_false (strings))) if (SCM_LIKELY (scm_is_false (strings)))
{
/* The fast path. */ /* The fast path. */
if (cr)
line = scm_from_utf32_stringn (buf, index - 1);
else
line = scm_from_utf32_stringn (buf, index); line = scm_from_utf32_stringn (buf, index);
}
else else
{ {
/* Aggregate the intermediary results. */ /* Aggregate the intermediary results. */
if (cr)
strings = scm_cons (scm_from_utf32_stringn (buf, index - 1), strings);
else
strings = scm_cons (scm_from_utf32_stringn (buf, index), strings); strings = scm_cons (scm_from_utf32_stringn (buf, index), strings);
line = scm_string_concatenate (scm_reverse (strings)); line = scm_string_concatenate (scm_reverse (strings));
} }
if (delim == EOF && scm_i_string_length (line) == 0) if (delim == EOF && scm_i_string_length (line) == 0)
result = scm_cons (SCM_EOF_VAL, SCM_EOF_VAL); result = scm_cons (SCM_EOF_VAL, SCM_EOF_VAL);
else
{
if (cr)
result = scm_cons (line, scm_from_latin1_string("\r\n"));
else else
result = scm_cons (line, result = scm_cons (line,
delim == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (delim)); delim == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (delim));
}
return result; return result;
#undef LINE_BUFFER_SIZE #undef LINE_BUFFER_SIZE

View file

@ -1,5 +1,5 @@
;;; Ports, implemented in Scheme ;;; Ports, implemented in Scheme
;;; Copyright (C) 2016, 2019 Free Software Foundation, Inc. ;;; Copyright (C) 2016, 2018, 2021 Free Software Foundation, Inc.
;;; ;;;
;;; This library is free software: you can redistribute it and/or modify ;;; This library is free software: you can redistribute it and/or modify
;;; it under the terms of the GNU Lesser General Public License as ;;; it under the terms of the GNU Lesser General Public License as
@ -689,10 +689,81 @@
(define* (read-line #:optional (port (current-input-port)) (define* (read-line #:optional (port (current-input-port))
(handle-delim 'trim)) (handle-delim 'trim))
(read-delimited "\n" port handle-delim)) (let* ((line/delim (%read-line port))
(line (car line/delim))
(delim (cdr line/delim)))
(case handle-delim
((trim) line)
((split) line/delim)
((concat) (if (and (string? line) (char? delim))
(string-append line (string delim))
line))
((peek) (if (char? delim)
(unread-char delim port))
line)
(else
(error "unexpected handle-delim value: " handle-delim)))))
(define* (%read-line port) (define* (%read-line port)
(read-line port 'split)) (let ((LINE_BUFFER_SIZE 256))
(let ((strings #f)
(result #f)
(buf (make-string LINE_BUFFER_SIZE #\nul))
(delim #f)
(index 0)
(cr #f)
(go #t))
(cond
((not (input-port? port))
(error "Not an input port." port))
(else
(while go
(cond
((>= index LINE_BUFFER_SIZE)
(set! strings (cons (substring buf 0 index)
(or strings '())))
(set! index 0))
(else
(let ((c (read-char port)))
(cond
((or (eof-object? c)
(char=? c #\x2028) ; U+2028 LINE SEPARATOR
(char=? c #\x2029)) ; U+2029 PARAGRAPH SEPARATOR
(set! cr #f)
(set! delim c))
((char=? c #\newline)
(set! delim c))
((char=? c #\return)
(set! cr #t)
(string-set! buf index c)
(set! index (1+ index)))
(else
(set! cr #f)
(string-set! buf index c)
(set! index (1+ index)))))))
(if (or (eof-object? delim)
(char? delim))
(set! go #f)))
(let ((line (if (not strings)
;; A short string.
(if cr
(substring buf 0 (1- index))
(substring buf 0 index))
;; Else, aggregate the intermediary results.
(begin
(if cr
(set! strings (cons (substring buf 0 (1- index)) strings))
(set! strings (cons (substring buf 0 index) strings)))
(apply string-append (reverse strings))))))
(if (and (eof-object? delim)
(zero? (string-length line)))
(cons the-eof-object the-eof-object)
;; Else
(if cr
(cons line "\r\n")
(cons line delim)))))))))
(define* (put-string port str #:optional (start 0) (define* (put-string port str #:optional (start 0)
(count (- (string-length str) start))) (count (- (string-length str) start)))

View file

@ -157,13 +157,12 @@ The default writer will call put-string."
Raise a 'bad-header' exception if the line does not end in CRLF or LF, Raise a 'bad-header' exception if the line does not end in CRLF or LF,
or if EOF is reached." or if EOF is reached."
(match (%read-line port) (match (%read-line port)
(((? string? line) . "\r\n")
line)
(((? string? line) . #\newline) (((? string? line) . #\newline)
;; '%read-line' does not consider #\return a delimiter; so if it's ;; We are more tolerant than the RFC in that we tolerate LF-only
;; there, remove it. We are more tolerant than the RFC in that we ;; endings.
;; tolerate LF-only endings. line)
(if (string-suffix? "\r" line)
(string-drop-right line 1)
line))
((line . _) ;EOF or missing delimiter ((line . _) ;EOF or missing delimiter
(bad-header 'read-header-line line)))) (bad-header 'read-header-line line))))
@ -184,8 +183,7 @@ was known but the value was invalid.
Returns the end-of-file object for both values if the end of the message Returns the end-of-file object for both values if the end of the message
body was reached (i.e., a blank line)." body was reached (i.e., a blank line)."
(let ((line (read-header-line port))) (let ((line (read-header-line port)))
(if (or (string-null? line) (if (string-null? line)
(string=? line "\r"))
(values *eof* *eof*) (values *eof* *eof*)
(let* ((delim (or (string-index line #\:) (let* ((delim (or (string-index line #\:)
(bad-header '%read line))) (bad-header '%read line)))

View file

@ -1,7 +1,7 @@
;;;; rdelim.test --- Delimited I/O. -*- mode: scheme; coding: utf-8; -*- ;;;; rdelim.test --- Delimited I/O. -*- mode: scheme; coding: utf-8; -*-
;;;; Ludovic Courtès <ludo@gnu.org> ;;;; Ludovic Courtès <ludo@gnu.org>
;;;; ;;;;
;;;; Copyright (C) 2011, 2013, 2014 Free Software Foundation, Inc. ;;;; Copyright (C) 2011, 2013, 2014, 2021 Free Software Foundation, Inc.
;;;; ;;;;
;;;; This library is free software; you can redistribute it and/or ;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public ;;;; modify it under the terms of the GNU Lesser General Public
@ -62,6 +62,46 @@
(read-line p 'split))) (read-line p 'split)))
(eof-object? (read-line p))))) (eof-object? (read-line p)))))
(pass-if "two lines, split, CRLF"
(let* ((s "foo\r\nbar\r\n")
(p (open-input-string s)))
(and (equal? '(("foo" . "\r\n")
("bar" . "\r\n"))
(list (read-line p 'split)
(read-line p 'split)))
(eof-object? (read-line p)))))
(pass-if "two long lines, split, CRLF"
;; Must be longer than 256 codepoints
(let* ((text0 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
(text1 (string-append text0 text0 text0 text0 text0))
(text2 (string-append text1 "\r\n" text1 "\r\n")))
(let* ((s text2)
(p (open-input-string s)))
(and (equal? `((,text1 . "\r\n")
(,text1 . "\r\n"))
(list (read-line p 'split)
(read-line p 'split)))
(eof-object? (read-line p))))))
(pass-if "two lines, split, LS"
(let* ((s "foo\u2028bar\u2028")
(p (open-input-string s)))
(and (equal? '(("foo" . #\x2028)
("bar" . #\x2028))
(list (read-line p 'split)
(read-line p 'split)))
(eof-object? (read-line p)))))
(pass-if "two lines, split, PS"
(let* ((s "foo\u2029bar\u2029")
(p (open-input-string s)))
(and (equal? '(("foo" . #\x2029)
("bar" . #\x2029))
(list (read-line p 'split)
(read-line p 'split)))
(eof-object? (read-line p)))))
(pass-if "two Greek lines, trim" (pass-if "two Greek lines, trim"
(let* ((s "λαμβδα\nμυ\n") (let* ((s "λαμβδα\nμυ\n")
(p (open-input-string s))) (p (open-input-string s)))