From 0f983e3db0c43ad7c89f57ea84f792ede373ba0c Mon Sep 17 00:00:00 2001 From: Mike Gran Date: Thu, 11 Mar 2021 19:42:33 -0800 Subject: [PATCH] Handle CRLF and Unicode line endings in read-line * libguile/rdelim.c (scm_read_line): handle CRLF, LS and PS * module/ice-9/suspendable-ports.scm (read-line): handle CRLF, LS, and PS * module/web/http.scm (read-header-line): take advantage of CRLF in read-line (read-header): don't need to test for \return * test-suite/tests/rdelim.test: new tests for read-line CRLF, LS and PS * doc/ref/api-io.texi: update doc for read-line --- doc/ref/api-io.texi | 10 ++-- libguile/rdelim.c | 47 ++++++++++++++---- module/ice-9/suspendable-ports.scm | 77 ++++++++++++++++++++++++++++-- module/web/http.scm | 14 +++--- test-suite/tests/rdelim.test | 42 +++++++++++++++- 5 files changed, 165 insertions(+), 25 deletions(-) diff --git a/doc/ref/api-io.texi b/doc/ref/api-io.texi index 777f282e9..2345f043c 100644 --- a/doc/ref/api-io.texi +++ b/doc/ref/api-io.texi @@ -755,8 +755,10 @@ a specified set of characters. @deffn {Scheme Procedure} read-line [port] [handle-delim] Return a line of text from @var{port} if specified, otherwise from the -value returned by @code{(current-input-port)}. Under Unix, a line of text -is terminated by the first end-of-line character or by end-of-file. +value returned by @code{(current-input-port)}. Under Unix, a line of +text is terminated by the first end-of-line character or by end-of-file. +The end-of-line characters handled are newline, carriage return plus +newline, or the Unicode line or paragraph separators. If @var{handle-delim} is specified, it should be one of the following symbols: @@ -771,7 +773,9 @@ Append the terminating delimiter (if any) to the returned string. Push the terminating delimiter (if any) back on to the port. @item split Return a pair containing the string read from the port and the -terminating delimiter or end-of-file object. +terminating delimiter or end-of-file object. The delimiter will either +be a single character for newline or the Unicode line or paragraph +separators, or it will be the string @code{"\r\n"}. @end table @end deffn diff --git a/libguile/rdelim.c b/libguile/rdelim.c index 4a0b20954..c1b92023a 100644 --- a/libguile/rdelim.c +++ b/libguile/rdelim.c @@ -112,10 +112,11 @@ SCM_DEFINE (scm_read_delimited_x, "%read-delimited!", 3, 3, 0, SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0, (SCM port), - "Read a newline-terminated line from @var{port}, allocating storage as\n" - "necessary. The newline terminator (if any) is removed from the string,\n" + "Read a line from @var{port}, allocating storage as necessary.\n" + "The terminator (if any) is removed from the string,\n" "and a pair consisting of the line and its delimiter is returned. The\n" - "delimiter may be either a newline or the @var{eof-object}; if\n" + "delimiter may be either a newline, return + newline, the Unicode\n" + "line or paragraph separators, or the @var{eof-object}; if\n" "@code{%read-line} is called at the end of file, it returns the pair\n" "@code{(# . #)}.") #define FUNC_NAME s_scm_read_line @@ -127,6 +128,7 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0, SCM line, strings, result; scm_t_wchar buf[LINE_BUFFER_SIZE], delim; size_t index; + int cr = 0; if (SCM_UNBNDP (port)) port = scm_current_input_port (); @@ -152,12 +154,24 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0, buf[index] = scm_getc (port); switch (buf[index]) { - case EOF: case '\n': delim = buf[index]; - break; + break; + + case EOF: + case 0x2028: /* U+2028 LINE SEPARATOR */ + case 0x2029: /* U+2029 PARAGRAPH SEPARATOR */ + cr = 0; + delim = buf[index]; + break; + + case '\r': + cr = 1; + index ++; + break; default: + cr = 0; index++; } } @@ -165,20 +179,33 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0, while (delim == 0); if (SCM_LIKELY (scm_is_false (strings))) - /* The fast path. */ - line = scm_from_utf32_stringn (buf, index); + { + /* The fast path. */ + if (cr) + line = scm_from_utf32_stringn (buf, index - 1); + else + line = scm_from_utf32_stringn (buf, index); + } else { /* Aggregate the intermediary results. */ - strings = scm_cons (scm_from_utf32_stringn (buf, index), strings); + if (cr) + strings = scm_cons (scm_from_utf32_stringn (buf, index - 1), strings); + else + strings = scm_cons (scm_from_utf32_stringn (buf, index), strings); line = scm_string_concatenate (scm_reverse (strings)); } if (delim == EOF && scm_i_string_length (line) == 0) result = scm_cons (SCM_EOF_VAL, SCM_EOF_VAL); else - result = scm_cons (line, - delim == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (delim)); + { + if (cr) + result = scm_cons (line, scm_from_latin1_string("\r\n")); + else + result = scm_cons (line, + delim == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (delim)); + } return result; #undef LINE_BUFFER_SIZE diff --git a/module/ice-9/suspendable-ports.scm b/module/ice-9/suspendable-ports.scm index f5f005cca..ba8d225f5 100644 --- a/module/ice-9/suspendable-ports.scm +++ b/module/ice-9/suspendable-ports.scm @@ -1,5 +1,5 @@ ;;; Ports, implemented in Scheme -;;; Copyright (C) 2016, 2019 Free Software Foundation, Inc. +;;; Copyright (C) 2016, 2018, 2021 Free Software Foundation, Inc. ;;; ;;; This library is free software: you can redistribute it and/or modify ;;; it under the terms of the GNU Lesser General Public License as @@ -689,10 +689,81 @@ (define* (read-line #:optional (port (current-input-port)) (handle-delim 'trim)) - (read-delimited "\n" port handle-delim)) + (let* ((line/delim (%read-line port)) + (line (car line/delim)) + (delim (cdr line/delim))) + (case handle-delim + ((trim) line) + ((split) line/delim) + ((concat) (if (and (string? line) (char? delim)) + (string-append line (string delim)) + line)) + ((peek) (if (char? delim) + (unread-char delim port)) + line) + (else + (error "unexpected handle-delim value: " handle-delim))))) (define* (%read-line port) - (read-line port 'split)) + (let ((LINE_BUFFER_SIZE 256)) + (let ((strings #f) + (result #f) + (buf (make-string LINE_BUFFER_SIZE #\nul)) + (delim #f) + (index 0) + (cr #f) + (go #t)) + (cond + ((not (input-port? port)) + (error "Not an input port." port)) + (else + (while go + (cond + ((>= index LINE_BUFFER_SIZE) + (set! strings (cons (substring buf 0 index) + (or strings '()))) + (set! index 0)) + (else + (let ((c (read-char port))) + (cond + ((or (eof-object? c) + (char=? c #\x2028) ; U+2028 LINE SEPARATOR + (char=? c #\x2029)) ; U+2029 PARAGRAPH SEPARATOR + (set! cr #f) + (set! delim c)) + ((char=? c #\newline) + (set! delim c)) + ((char=? c #\return) + (set! cr #t) + (string-set! buf index c) + (set! index (1+ index))) + (else + (set! cr #f) + (string-set! buf index c) + (set! index (1+ index))))))) + + (if (or (eof-object? delim) + (char? delim)) + (set! go #f))) + (let ((line (if (not strings) + ;; A short string. + (if cr + (substring buf 0 (1- index)) + (substring buf 0 index)) + ;; Else, aggregate the intermediary results. + (begin + (if cr + (set! strings (cons (substring buf 0 (1- index)) strings)) + (set! strings (cons (substring buf 0 index) strings))) + (apply string-append (reverse strings)))))) + + (if (and (eof-object? delim) + (zero? (string-length line))) + (cons the-eof-object the-eof-object) + ;; Else + (if cr + (cons line "\r\n") + (cons line delim))))))))) (define* (put-string port str #:optional (start 0) (count (- (string-length str) start))) diff --git a/module/web/http.scm b/module/web/http.scm index 4276e1744..32a3093f1 100644 --- a/module/web/http.scm +++ b/module/web/http.scm @@ -157,13 +157,12 @@ The default writer will call ‘put-string’." Raise a 'bad-header' exception if the line does not end in CRLF or LF, or if EOF is reached." (match (%read-line port) + (((? string? line) . "\r\n") + line) (((? string? line) . #\newline) - ;; '%read-line' does not consider #\return a delimiter; so if it's - ;; there, remove it. We are more tolerant than the RFC in that we - ;; tolerate LF-only endings. - (if (string-suffix? "\r" line) - (string-drop-right line 1) - line)) + ;; We are more tolerant than the RFC in that we tolerate LF-only + ;; endings. + line) ((line . _) ;EOF or missing delimiter (bad-header 'read-header-line line)))) @@ -184,8 +183,7 @@ was known but the value was invalid. Returns the end-of-file object for both values if the end of the message body was reached (i.e., a blank line)." (let ((line (read-header-line port))) - (if (or (string-null? line) - (string=? line "\r")) + (if (string-null? line) (values *eof* *eof*) (let* ((delim (or (string-index line #\:) (bad-header '%read line))) diff --git a/test-suite/tests/rdelim.test b/test-suite/tests/rdelim.test index 3aaa0b253..1060d5c17 100644 --- a/test-suite/tests/rdelim.test +++ b/test-suite/tests/rdelim.test @@ -1,7 +1,7 @@ ;;;; rdelim.test --- Delimited I/O. -*- mode: scheme; coding: utf-8; -*- ;;;; Ludovic Courtès ;;;; -;;;; Copyright (C) 2011, 2013, 2014 Free Software Foundation, Inc. +;;;; Copyright (C) 2011, 2013, 2014, 2021 Free Software Foundation, Inc. ;;;; ;;;; This library is free software; you can redistribute it and/or ;;;; modify it under the terms of the GNU Lesser General Public @@ -62,6 +62,46 @@ (read-line p 'split))) (eof-object? (read-line p))))) + (pass-if "two lines, split, CRLF" + (let* ((s "foo\r\nbar\r\n") + (p (open-input-string s))) + (and (equal? '(("foo" . "\r\n") + ("bar" . "\r\n")) + (list (read-line p 'split) + (read-line p 'split))) + (eof-object? (read-line p))))) + + (pass-if "two long lines, split, CRLF" + ;; Must be longer than 256 codepoints + (let* ((text0 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") + (text1 (string-append text0 text0 text0 text0 text0)) + (text2 (string-append text1 "\r\n" text1 "\r\n"))) + (let* ((s text2) + (p (open-input-string s))) + (and (equal? `((,text1 . "\r\n") + (,text1 . "\r\n")) + (list (read-line p 'split) + (read-line p 'split))) + (eof-object? (read-line p)))))) + + (pass-if "two lines, split, LS" + (let* ((s "foo\u2028bar\u2028") + (p (open-input-string s))) + (and (equal? '(("foo" . #\x2028) + ("bar" . #\x2028)) + (list (read-line p 'split) + (read-line p 'split))) + (eof-object? (read-line p))))) + + (pass-if "two lines, split, PS" + (let* ((s "foo\u2029bar\u2029") + (p (open-input-string s))) + (and (equal? '(("foo" . #\x2029) + ("bar" . #\x2029)) + (list (read-line p 'split) + (read-line p 'split))) + (eof-object? (read-line p))))) + (pass-if "two Greek lines, trim" (let* ((s "λαμβδα\nμυ\n") (p (open-input-string s)))