1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-04-30 03:40:34 +02:00

Optimize `peek-char'.

This makes `peek-char' 40x faster on a port whose encoding is
faster on a UTF-8 port containing multi-byte codepoints.

The `xml->sxml' procedure is 4x faster on a 2.7 MiB XML file.

* libguile/ports.c (get_codepoint): New procedure, moved here from
  `scm_getc', with the additional BUF and LEN parameters.
  (scm_getc): Use it.
  (scm_peek_char): Use it instead of the `scm_getc'/`scm_ungetc'
  sequence.

* test-suite/tests/ports.test ("string ports")["peek-char [latin-1]",
  "peek-char [utf-8]"]: New tests.

* benchmark-suite/Makefile.am (SCM_BENCHMARKS): Add
  `benchmarks/ports.bm'.

* benchmark-suite/benchmarks/ports.bm: New file.
This commit is contained in:
Ludovic Courtès 2010-09-15 18:38:57 +02:00
parent e9c3018cec
commit fd5eec2b6e
4 changed files with 132 additions and 14 deletions

View file

@ -4,6 +4,7 @@ SCM_BENCHMARKS = benchmarks/0-reference.bm \
benchmarks/continuations.bm \ benchmarks/continuations.bm \
benchmarks/if.bm \ benchmarks/if.bm \
benchmarks/logand.bm \ benchmarks/logand.bm \
benchmarks/ports.bm \
benchmarks/read.bm \ benchmarks/read.bm \
benchmarks/srfi-1.bm \ benchmarks/srfi-1.bm \
benchmarks/srfi-13.bm \ benchmarks/srfi-13.bm \

View file

@ -0,0 +1,67 @@
;;; ports.bm --- Port I/O. -*- mode: scheme; coding: utf-8; -*-
;;;
;;; Copyright (C) 2010 Free Software Foundation, Inc.
;;;
;;; This program is free software; you can redistribute it and/or
;;; modify it under the terms of the GNU Lesser General Public License
;;; as published by the Free Software Foundation; either version 3, or
;;; (at your option) any later version.
;;;
;;; This program is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;; GNU Lesser General Public License for more details.
;;;
;;; You should have received a copy of the GNU Lesser General Public
;;; License along with this software; see the file COPYING.LESSER. If
;;; not, write to the Free Software Foundation, Inc., 51 Franklin
;;; Street, Fifth Floor, Boston, MA 02110-1301 USA
(define-module (benchmarks ports)
#:use-module (benchmark-suite lib))
(define %latin1-port
(with-fluids ((%default-port-encoding #f))
(open-input-string "hello, world")))
(define %utf8/ascii-port
(with-fluids ((%default-port-encoding "UTF-8"))
(open-input-string "hello, world")))
(define %utf8/wide-port
(with-fluids ((%default-port-encoding "UTF-8"))
(open-input-string "안녕하세요")))
(with-benchmark-prefix "peek-char"
(benchmark "latin-1 port" 700000
(peek-char %latin1-port))
(benchmark "utf-8 port, ascii character" 700000
(peek-char %utf8/ascii-port))
(benchmark "utf-8 port, Korean character" 700000
(peek-char %utf8/wide-port)))
(with-benchmark-prefix "read-char"
(benchmark "latin-1 port" 10000000
(read-char %latin1-port))
(benchmark "utf-8 port, ascii character" 10000000
(read-char %utf8/ascii-port))
(benchmark "utf-8 port, Korean character" 10000000
(read-char %utf8/wide-port)))
(with-benchmark-prefix "char-ready?"
(benchmark "latin-1 port" 10000000
(char-ready? %latin1-port))
(benchmark "utf-8 port, ascii character" 10000000
(char-ready? %utf8/ascii-port))
(benchmark "utf-8 port, Korean character" 10000000
(char-ready? %utf8/wide-port)))

View file

@ -1023,13 +1023,15 @@ SCM_DEFINE (scm_read_char, "read-char", 0, 1, 0,
#define SCM_MBCHAR_BUF_SIZE (4) #define SCM_MBCHAR_BUF_SIZE (4)
/* Get one codepoint from a file, using the port's encoding. */ /* Read a codepoint from PORT and return it. Fill BUF with the byte
scm_t_wchar representation of the codepoint in PORT's encoding, and set *LEN to
scm_getc (SCM port) the length in bytes of that representation. Raise an error on
failure. */
static scm_t_wchar
get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
{ {
int c; int c;
unsigned int bufcount = 0; size_t bufcount = 0;
char buf[SCM_MBCHAR_BUF_SIZE];
scm_t_uint32 result_buf; scm_t_uint32 result_buf;
scm_t_wchar codepoint = 0; scm_t_wchar codepoint = 0;
scm_t_uint32 *u32; scm_t_uint32 *u32;
@ -1133,6 +1135,8 @@ scm_getc (SCM port)
break; break;
} }
*len = bufcount;
return codepoint; return codepoint;
failure: failure:
@ -1155,6 +1159,15 @@ scm_getc (SCM port)
return 0; return 0;
} }
/* Read a codepoint from PORT and return it. */
scm_t_wchar
scm_getc (SCM port)
{
size_t len;
char buf[SCM_MBCHAR_BUF_SIZE];
return get_codepoint (port, buf, &len);
}
/* this should only be called when the read buffer is empty. it /* this should only be called when the read buffer is empty. it
tries to refill the read buffer. it returns the first char from tries to refill the read buffer. it returns the first char from
@ -1635,18 +1648,37 @@ SCM_DEFINE (scm_peek_char, "peek-char", 0, 1, 0,
"to @code{read-char} would have hung.") "to @code{read-char} would have hung.")
#define FUNC_NAME s_scm_peek_char #define FUNC_NAME s_scm_peek_char
{ {
scm_t_wchar c, column; SCM result;
scm_t_wchar c;
char bytes[SCM_MBCHAR_BUF_SIZE];
long column, line;
size_t len;
if (SCM_UNBNDP (port)) if (SCM_UNBNDP (port))
port = scm_current_input_port (); port = scm_current_input_port ();
else else
SCM_VALIDATE_OPINPORT (1, port); SCM_VALIDATE_OPINPORT (1, port);
column = SCM_COL(port);
c = scm_getc (port); column = SCM_COL (port);
if (EOF == c) line = SCM_LINUM (port);
return SCM_EOF_VAL;
scm_ungetc (c, port); c = get_codepoint (port, bytes, &len);
SCM_COL(port) = column; if (c == EOF)
return SCM_MAKE_CHAR (c); result = SCM_EOF_VAL;
else
{
long i;
result = SCM_MAKE_CHAR (c);
for (i = len - 1; i >= 0; i--)
scm_unget_byte (bytes[i], port);
SCM_COL (port) = column;
SCM_LINUM (port) = line;
}
return result;
} }
#undef FUNC_NAME #undef FUNC_NAME

View file

@ -422,7 +422,25 @@
(and (eq? faulty-str str) (and (eq? faulty-str str)
(string=? from "UTF-32") (string=? from "UTF-32")
(string=? to "ISO-8859-1") (string=? to "ISO-8859-1")
(string? (strerror errno)))))))) (string? (strerror errno)))))))
(pass-if "peek-char [latin-1]"
(let ((p (with-fluids ((%default-port-encoding #f))
(open-input-string "hello, world"))))
(and (char=? (peek-char p) #\h)
(char=? (peek-char p) #\h)
(char=? (peek-char p) #\h)
(= (port-line p) 0)
(= (port-column p) 0))))
(pass-if "peek-char [utf-8]"
(let ((p (with-fluids ((%default-port-encoding "UTF-8"))
(open-input-string "안녕하세요"))))
(and (char=? (peek-char p) #\안)
(char=? (peek-char p) #\안)
(char=? (peek-char p) #\안)
(= (port-line p) 0)
(= (port-column p) 0)))))
(with-test-prefix "call-with-output-string" (with-test-prefix "call-with-output-string"