1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-04-30 03:40:34 +02:00

Optimize `peek-char'.

This makes `peek-char' 40x faster on a port whose encoding is
faster on a UTF-8 port containing multi-byte codepoints.

The `xml->sxml' procedure is 4x faster on a 2.7 MiB XML file.

* libguile/ports.c (get_codepoint): New procedure, moved here from
  `scm_getc', with the additional BUF and LEN parameters.
  (scm_getc): Use it.
  (scm_peek_char): Use it instead of the `scm_getc'/`scm_ungetc'
  sequence.

* test-suite/tests/ports.test ("string ports")["peek-char [latin-1]",
  "peek-char [utf-8]"]: New tests.

* benchmark-suite/Makefile.am (SCM_BENCHMARKS): Add
  `benchmarks/ports.bm'.

* benchmark-suite/benchmarks/ports.bm: New file.
This commit is contained in:
Ludovic Courtès 2010-09-15 18:38:57 +02:00
parent e9c3018cec
commit fd5eec2b6e
4 changed files with 132 additions and 14 deletions

View file

@ -4,6 +4,7 @@ SCM_BENCHMARKS = benchmarks/0-reference.bm \
benchmarks/continuations.bm \
benchmarks/if.bm \
benchmarks/logand.bm \
benchmarks/ports.bm \
benchmarks/read.bm \
benchmarks/srfi-1.bm \
benchmarks/srfi-13.bm \

View file

@ -0,0 +1,67 @@
;;; ports.bm --- Port I/O. -*- mode: scheme; coding: utf-8; -*-
;;;
;;; Copyright (C) 2010 Free Software Foundation, Inc.
;;;
;;; This program is free software; you can redistribute it and/or
;;; modify it under the terms of the GNU Lesser General Public License
;;; as published by the Free Software Foundation; either version 3, or
;;; (at your option) any later version.
;;;
;;; This program is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;; GNU Lesser General Public License for more details.
;;;
;;; You should have received a copy of the GNU Lesser General Public
;;; License along with this software; see the file COPYING.LESSER. If
;;; not, write to the Free Software Foundation, Inc., 51 Franklin
;;; Street, Fifth Floor, Boston, MA 02110-1301 USA
(define-module (benchmarks ports)
#:use-module (benchmark-suite lib))
(define %latin1-port
(with-fluids ((%default-port-encoding #f))
(open-input-string "hello, world")))
(define %utf8/ascii-port
(with-fluids ((%default-port-encoding "UTF-8"))
(open-input-string "hello, world")))
(define %utf8/wide-port
(with-fluids ((%default-port-encoding "UTF-8"))
(open-input-string "안녕하세요")))
(with-benchmark-prefix "peek-char"
(benchmark "latin-1 port" 700000
(peek-char %latin1-port))
(benchmark "utf-8 port, ascii character" 700000
(peek-char %utf8/ascii-port))
(benchmark "utf-8 port, Korean character" 700000
(peek-char %utf8/wide-port)))
(with-benchmark-prefix "read-char"
(benchmark "latin-1 port" 10000000
(read-char %latin1-port))
(benchmark "utf-8 port, ascii character" 10000000
(read-char %utf8/ascii-port))
(benchmark "utf-8 port, Korean character" 10000000
(read-char %utf8/wide-port)))
(with-benchmark-prefix "char-ready?"
(benchmark "latin-1 port" 10000000
(char-ready? %latin1-port))
(benchmark "utf-8 port, ascii character" 10000000
(char-ready? %utf8/ascii-port))
(benchmark "utf-8 port, Korean character" 10000000
(char-ready? %utf8/wide-port)))

View file

@ -1023,13 +1023,15 @@ SCM_DEFINE (scm_read_char, "read-char", 0, 1, 0,
#define SCM_MBCHAR_BUF_SIZE (4)
/* Get one codepoint from a file, using the port's encoding. */
scm_t_wchar
scm_getc (SCM port)
/* Read a codepoint from PORT and return it. Fill BUF with the byte
representation of the codepoint in PORT's encoding, and set *LEN to
the length in bytes of that representation. Raise an error on
failure. */
static scm_t_wchar
get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
{
int c;
unsigned int bufcount = 0;
char buf[SCM_MBCHAR_BUF_SIZE];
size_t bufcount = 0;
scm_t_uint32 result_buf;
scm_t_wchar codepoint = 0;
scm_t_uint32 *u32;
@ -1133,6 +1135,8 @@ scm_getc (SCM port)
break;
}
*len = bufcount;
return codepoint;
failure:
@ -1155,6 +1159,15 @@ scm_getc (SCM port)
return 0;
}
/* Read a codepoint from PORT and return it. */
scm_t_wchar
scm_getc (SCM port)
{
size_t len;
char buf[SCM_MBCHAR_BUF_SIZE];
return get_codepoint (port, buf, &len);
}
/* this should only be called when the read buffer is empty. it
tries to refill the read buffer. it returns the first char from
@ -1635,18 +1648,37 @@ SCM_DEFINE (scm_peek_char, "peek-char", 0, 1, 0,
"to @code{read-char} would have hung.")
#define FUNC_NAME s_scm_peek_char
{
scm_t_wchar c, column;
SCM result;
scm_t_wchar c;
char bytes[SCM_MBCHAR_BUF_SIZE];
long column, line;
size_t len;
if (SCM_UNBNDP (port))
port = scm_current_input_port ();
else
SCM_VALIDATE_OPINPORT (1, port);
column = SCM_COL (port);
c = scm_getc (port);
if (EOF == c)
return SCM_EOF_VAL;
scm_ungetc (c, port);
line = SCM_LINUM (port);
c = get_codepoint (port, bytes, &len);
if (c == EOF)
result = SCM_EOF_VAL;
else
{
long i;
result = SCM_MAKE_CHAR (c);
for (i = len - 1; i >= 0; i--)
scm_unget_byte (bytes[i], port);
SCM_COL (port) = column;
return SCM_MAKE_CHAR (c);
SCM_LINUM (port) = line;
}
return result;
}
#undef FUNC_NAME

View file

@ -422,7 +422,25 @@
(and (eq? faulty-str str)
(string=? from "UTF-32")
(string=? to "ISO-8859-1")
(string? (strerror errno))))))))
(string? (strerror errno)))))))
(pass-if "peek-char [latin-1]"
(let ((p (with-fluids ((%default-port-encoding #f))
(open-input-string "hello, world"))))
(and (char=? (peek-char p) #\h)
(char=? (peek-char p) #\h)
(char=? (peek-char p) #\h)
(= (port-line p) 0)
(= (port-column p) 0))))
(pass-if "peek-char [utf-8]"
(let ((p (with-fluids ((%default-port-encoding "UTF-8"))
(open-input-string "안녕하세요"))))
(and (char=? (peek-char p) #\안)
(char=? (peek-char p) #\안)
(char=? (peek-char p) #\안)
(= (port-line p) 0)
(= (port-column p) 0)))))
(with-test-prefix "call-with-output-string"