Merge remote-tracking branch 'origin/stable-2.0'

2025-06-22 03:30:22 +02:00 · 2011-05-09 00:13:04 +02:00 · 2011-05-09 00:13:04 +02:00 · e690a3cbf2
commit e690a3cbf2
parent 059a588fed 8f6a4b248b
20 changed files with 1726 additions and 359 deletions
--- a/libguile/foreign.c
+++ b/libguile/foreign.c
@ -965,9 +965,12 @@ unpack (const ffi_type *type, void *loc, SCM x)
 }
 #undef FUNC_NAME

-/* Return a Scheme representation of the foreign value at LOC of type TYPE.  */
+/* Return a Scheme representation of the foreign value at LOC of type
+   TYPE.  When RETURN_VALUE_P is true, LOC is assumed to point to a
+   return value buffer; otherwise LOC is assumed to point to an
+   argument buffer.  */
 static SCM
-pack (const ffi_type * type, const void *loc)
+pack (const ffi_type * type, const void *loc, int return_value_p)
 {
  switch (type->type)
    {
@ -977,22 +980,48 @@ pack (const ffi_type * type, const void *loc)
      return scm_from_double (*(float *) loc);
    case FFI_TYPE_DOUBLE:
      return scm_from_double (*(double *) loc);
+
+      /* For integer return values smaller than `int', libffi stores the
+	 result in an `ffi_arg'-long buffer, of which only the
+	 significant bits must be kept---hence the pair of casts below.
+	 See <http://thread.gmane.org/gmane.comp.lib.ffi.general/406>
+	 for details.  */
+
    case FFI_TYPE_UINT8:
-      return scm_from_uint8 (*(scm_t_uint8 *) loc);
+      if (return_value_p)
+	return scm_from_uint8 ((scm_t_uint8) *(ffi_arg *) loc);
+      else
+	return scm_from_uint8 (* (scm_t_uint8 *) loc);
    case FFI_TYPE_SINT8:
-      return scm_from_int8 (*(scm_t_int8 *) loc);
+      if (return_value_p)
+	return scm_from_int8 ((scm_t_int8) *(ffi_arg *) loc);
+      else
+	return scm_from_int8 (* (scm_t_int8 *) loc);
    case FFI_TYPE_UINT16:
-      return scm_from_uint16 (*(scm_t_uint16 *) loc);
+      if (return_value_p)
+	return scm_from_uint16 ((scm_t_uint16) *(ffi_arg *) loc);
+      else
+	return scm_from_uint16 (* (scm_t_uint16 *) loc);
    case FFI_TYPE_SINT16:
-      return scm_from_int16 (*(scm_t_int16 *) loc);
+      if (return_value_p)
+	return scm_from_int16 ((scm_t_int16) *(ffi_arg *) loc);
+      else
+	return scm_from_int16 (* (scm_t_int16 *) loc);
    case FFI_TYPE_UINT32:
-      return scm_from_uint32 (*(scm_t_uint32 *) loc);
+      if (return_value_p)
+	return scm_from_uint32 ((scm_t_uint32) *(ffi_arg *) loc);
+      else
+	return scm_from_uint32 (* (scm_t_uint32 *) loc);
    case FFI_TYPE_SINT32:
-      return scm_from_int32 (*(scm_t_int32 *) loc);
+      if (return_value_p)
+	return scm_from_int32 ((scm_t_int32) *(ffi_arg *) loc);
+      else
+	return scm_from_int32 (* (scm_t_int32 *) loc);
    case FFI_TYPE_UINT64:
      return scm_from_uint64 (*(scm_t_uint64 *) loc);
    case FFI_TYPE_SINT64:
      return scm_from_int64 (*(scm_t_int64 *) loc);
+
    case FFI_TYPE_STRUCT:
      {
 	void *mem = scm_gc_malloc_pointerless (type->size, "foreign");
@ -1060,7 +1089,7 @@ scm_i_foreign_call (SCM foreign, const SCM *argv)
  /* off we go! */
  ffi_call (cif, func, rvalue, args);

-  return pack (cif->rtype, rvalue);
+  return pack (cif->rtype, rvalue, 1);
 }


@ -1082,7 +1111,7 @@ invoke_closure (ffi_cif *cif, void *ret, void **args, void *data)

  /* Pack ARGS to SCM values, setting ARGV pointers.  */
  for (i = 0; i < cif->nargs; i++)
-    argv[i] = pack (cif->arg_types[i], args[i]);
+    argv[i] = pack (cif->arg_types[i], args[i], 0);

  result = scm_call_n (proc, argv, cif->nargs);

--- a/libguile/hash.c
+++ b/libguile/hash.c
@ -26,6 +26,7 @@
 #include <wchar.h>
 #endif

+#include <math.h>
 #include <unistr.h>

 #include "libguile/_scm.h"
@ -192,7 +193,7 @@ scm_hasher(SCM obj, unsigned long n, size_t d)
      case scm_tc16_real:
 	{
 	  double r = SCM_REAL_VALUE (obj);
-	  if (floor (r) == r) 
+	  if (floor (r) == r && !isinf (r) && !isnan (r))
 	    {
 	      obj = scm_inexact_to_exact (obj);
 	      return scm_to_ulong (scm_modulo (obj, scm_from_ulong (n)));
--- a/libguile/inline.h
+++ b/libguile/inline.h
@ -3,7 +3,8 @@
 #ifndef SCM_INLINE_H
 #define SCM_INLINE_H

-/* Copyright (C) 2001, 2002, 2003, 2004, 2006, 2008, 2009, 2010 Free Software Foundation, Inc.
+/* Copyright (C) 2001, 2002, 2003, 2004, 2006, 2008, 2009, 2010,
+ *   2011 Free Software Foundation, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
@ -98,6 +99,7 @@ SCM_API int scm_is_pair (SCM x);
 SCM_API int scm_is_string (SCM x);

 SCM_API int scm_get_byte_or_eof (SCM port);
+SCM_API int scm_peek_byte_or_eof (SCM port);
 SCM_API void scm_putc (char c, SCM port);
 SCM_API void scm_puts (const char *str_data, SCM port);

@ -362,7 +364,7 @@ scm_get_byte_or_eof (SCM port)

  if (pt->read_pos >= pt->read_end)
    {
-      if (scm_fill_input (port) == EOF)
+      if (SCM_UNLIKELY (scm_fill_input (port) == EOF))
 	return EOF;
    }

@ -371,6 +373,34 @@ scm_get_byte_or_eof (SCM port)
  return c;
 }

+/* Like `scm_get_byte_or_eof' but does not change PORT's `read_pos'.  */
+#ifndef SCM_INLINE_C_INCLUDING_INLINE_H
+SCM_C_EXTERN_INLINE
+#endif
+int
+scm_peek_byte_or_eof (SCM port)
+{
+  int c;
+  scm_t_port *pt = SCM_PTAB_ENTRY (port);
+
+  if (pt->rw_active == SCM_PORT_WRITE)
+    /* may be marginally faster than calling scm_flush.  */
+    scm_ptobs[SCM_PTOBNUM (port)].flush (port);
+
+  if (pt->rw_random)
+    pt->rw_active = SCM_PORT_READ;
+
+  if (pt->read_pos >= pt->read_end)
+    {
+      if (SCM_UNLIKELY (scm_fill_input (port) == EOF))
+	return EOF;
+    }
+
+  c = *pt->read_pos;
+
+  return c;
+}
+
 #ifndef SCM_INLINE_C_INCLUDING_INLINE_H
 SCM_C_EXTERN_INLINE
 #endif
--- a/libguile/ports.c
+++ b/libguile/ports.c
@ -1057,6 +1057,7 @@ update_port_lf (scm_t_wchar c, SCM port)
  switch (c)
    {
    case '\a':
+    case EOF:
      break;
    case '\b':
      SCM_DECCOL (port);
@ -1115,23 +1116,154 @@ utf8_to_codepoint (const scm_t_uint8 *utf8_buf, size_t size)
  return codepoint;
 }

-/* Read a codepoint from PORT and return it in *CODEPOINT.  Fill BUF
-   with the byte representation of the codepoint in PORT's encoding, and
-   set *LEN to the length in bytes of that representation.  Return 0 on
-   success and an errno value on error.  */
+/* Read a UTF-8 sequence from PORT.  On success, return 0 and set
+   *CODEPOINT to the codepoint that was read, fill BUF with its UTF-8
+   representation, and set *LEN to the length in bytes.  Return
+   `EILSEQ' on error.  */
 static int
-get_codepoint (SCM port, scm_t_wchar *codepoint,
-	       char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
+		    scm_t_uint8 buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
 {
+#define ASSERT_NOT_EOF(b)			\
+  if (SCM_UNLIKELY ((b) == EOF))		\
+    goto invalid_seq
+#define CONSUME_PEEKED_BYTE()				\
+  pt->read_pos++
+
+  int byte;
+  scm_t_port *pt;
+
+  *len = 0;
+  pt = SCM_PTAB_ENTRY (port);
+
+  byte = scm_get_byte_or_eof (port);
+  if (byte == EOF)
+    {
+      *codepoint = EOF;
+      return 0;
+    }
+
+  buf[0] = (scm_t_uint8) byte;
+  *len = 1;
+
+  if (buf[0] <= 0x7f)
+    /* 1-byte form.  */
+    *codepoint = buf[0];
+  else if (buf[0] >= 0xc2 && buf[0] <= 0xdf)
+    {
+      /* 2-byte form.  */
+      byte = scm_peek_byte_or_eof (port);
+      ASSERT_NOT_EOF (byte);
+
+      if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
+	goto invalid_seq;
+
+      CONSUME_PEEKED_BYTE ();
+      buf[1] = (scm_t_uint8) byte;
+      *len = 2;
+
+      *codepoint = ((scm_t_wchar) buf[0] & 0x1f) << 6UL
+	| (buf[1] & 0x3f);
+    }
+  else if ((buf[0] & 0xf0) == 0xe0)
+    {
+      /* 3-byte form.  */
+      byte = scm_peek_byte_or_eof (port);
+      ASSERT_NOT_EOF (byte);
+
+      if (SCM_UNLIKELY ((byte & 0xc0) != 0x80
+			|| (buf[0] == 0xe0 && byte < 0xa0)
+			|| (buf[0] == 0xed && byte > 0x9f)))
+	goto invalid_seq;
+
+      CONSUME_PEEKED_BYTE ();
+      buf[1] = (scm_t_uint8) byte;
+      *len = 2;
+
+      byte = scm_peek_byte_or_eof (port);
+      ASSERT_NOT_EOF (byte);
+
+      if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
+	goto invalid_seq;
+
+      CONSUME_PEEKED_BYTE ();
+      buf[2] = (scm_t_uint8) byte;
+      *len = 3;
+
+      *codepoint = ((scm_t_wchar) buf[0] & 0x0f) << 12UL
+	| ((scm_t_wchar) buf[1] & 0x3f) << 6UL
+	| (buf[2] & 0x3f);
+    }
+  else if (buf[0] >= 0xf0 && buf[0] <= 0xf4)
+    {
+      /* 4-byte form.  */
+      byte = scm_peek_byte_or_eof (port);
+      ASSERT_NOT_EOF (byte);
+
+      if (SCM_UNLIKELY (((byte & 0xc0) != 0x80)
+			|| (buf[0] == 0xf0 && byte < 0x90)
+			|| (buf[0] == 0xf4 && byte > 0x8f)))
+	goto invalid_seq;
+
+      CONSUME_PEEKED_BYTE ();
+      buf[1] = (scm_t_uint8) byte;
+      *len = 2;
+
+      byte = scm_peek_byte_or_eof (port);
+      ASSERT_NOT_EOF (byte);
+
+      if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
+	goto invalid_seq;
+
+      CONSUME_PEEKED_BYTE ();
+      buf[2] = (scm_t_uint8) byte;
+      *len = 3;
+
+      byte = scm_peek_byte_or_eof (port);
+      ASSERT_NOT_EOF (byte);
+
+      if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
+	goto invalid_seq;
+
+      CONSUME_PEEKED_BYTE ();
+      buf[3] = (scm_t_uint8) byte;
+      *len = 4;
+
+      *codepoint = ((scm_t_wchar) buf[0] & 0x07) << 18UL
+	| ((scm_t_wchar) buf[1] & 0x3f) << 12UL
+	| ((scm_t_wchar) buf[2] & 0x3f) << 6UL
+	| (buf[3] & 0x3f);
+    }
+  else
+    goto invalid_seq;
+
+  return 0;
+
+ invalid_seq:
+  /* Here we could choose the consume the faulty byte when it's not a
+     valid starting byte, but it's not a requirement.  What Section 3.9
+     of Unicode 6.0.0 mandates, though, is to not consume a byte that
+     would otherwise be a valid starting byte.  */
+
+  return EILSEQ;
+
+#undef CONSUME_PEEKED_BYTE
+#undef ASSERT_NOT_EOF
+}
+
+/* Likewise, read a byte sequence from PORT, passing it through its
+   input conversion descriptor.  */
+static int
+get_iconv_codepoint (SCM port, scm_t_wchar *codepoint,
+		     char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+{
+  scm_t_port *pt;
  int err, byte_read;
  size_t bytes_consumed, output_size;
  char *output;
  scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
-  scm_t_port *pt = SCM_PTAB_ENTRY (port);

-  if (SCM_UNLIKELY (pt->input_cd == (iconv_t) -1))
-    /* Initialize the conversion descriptors.  */
-    scm_i_set_port_encoding_x (port, pt->encoding);
+  pt = SCM_PTAB_ENTRY (port);

  for (output_size = 0, output = (char *) utf8_buf,
 	 bytes_consumed = 0, err = 0;
@ -1177,30 +1309,45 @@ get_codepoint (SCM port, scm_t_wchar *codepoint,
  if (SCM_UNLIKELY (output_size == 0))
    /* An unterminated sequence.  */
    err = EILSEQ;
-
-  if (SCM_UNLIKELY (err != 0))
-    {
-      /* Reset the `iconv' state.  */
-      iconv (pt->input_cd, NULL, NULL, NULL, NULL);
-
-      if (pt->ilseq_handler == SCM_ICONVEH_QUESTION_MARK)
-	{
-	  *codepoint = '?';
-	  err = 0;
-	}
-
-      /* Fail when the strategy is SCM_ICONVEH_ERROR or
-	 SCM_ICONVEH_ESCAPE_SEQUENCE (the latter doesn't make sense for
-	 input encoding errors.)  */
-    }
-  else
+  else if (SCM_LIKELY (err == 0))
    {
      /* Convert the UTF8_BUF sequence to a Unicode code point.  */
      *codepoint = utf8_to_codepoint (utf8_buf, output_size);
-      update_port_lf (*codepoint, port);
+      *len = bytes_consumed;
    }

-  *len = bytes_consumed;
+  return err;
+}
+
+/* Read a codepoint from PORT and return it in *CODEPOINT.  Fill BUF
+   with the byte representation of the codepoint in PORT's encoding, and
+   set *LEN to the length in bytes of that representation.  Return 0 on
+   success and an errno value on error.  */
+static int
+get_codepoint (SCM port, scm_t_wchar *codepoint,
+	       char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+{
+  int err;
+  scm_t_port *pt = SCM_PTAB_ENTRY (port);
+
+  if (pt->input_cd == (iconv_t) -1)
+    /* Initialize the conversion descriptors, if needed.  */
+    scm_i_set_port_encoding_x (port, pt->encoding);
+
+  /* FIXME: In 2.1, add a flag to determine whether a port is UTF-8.  */
+  if (pt->input_cd == (iconv_t) -1)
+    err = get_utf8_codepoint (port, codepoint, (scm_t_uint8 *) buf, len);
+  else
+    err = get_iconv_codepoint (port, codepoint, buf, len);
+
+  if (SCM_LIKELY (err == 0))
+    update_port_lf (*codepoint, port);
+  else if (pt->ilseq_handler == SCM_ICONVEH_QUESTION_MARK)
+    {
+      *codepoint = '?';
+      err = 0;
+      update_port_lf (*codepoint, port);
+    }

  return err;
 }
@ -2031,28 +2178,35 @@ scm_i_set_port_encoding_x (SCM port, const char *encoding)
  if (encoding == NULL)
    encoding = "ISO-8859-1";

-  pt->encoding = scm_gc_strdup (encoding, "port");
+  if (pt->encoding != encoding)
+    pt->encoding = scm_gc_strdup (encoding, "port");

-  if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
+  /* If ENCODING is UTF-8, then no conversion descriptor is opened
+     because we do I/O ourselves.  This saves 100+ KiB for each
+     descriptor.  */
+  if (strcmp (encoding, "UTF-8"))
    {
-      /* Open an input iconv conversion descriptor, from ENCODING
-	 to UTF-8.  We choose UTF-8, not UTF-32, because iconv
-	 implementations can typically convert from anything to
-	 UTF-8, but not to UTF-32 (see
-	 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>).  */
-      new_input_cd = iconv_open ("UTF-8", encoding);
-      if (new_input_cd == (iconv_t) -1)
-	goto invalid_encoding;
-    }
-
-  if (SCM_CELL_WORD_0 (port) & SCM_WRTNG)
-    {
-      new_output_cd = iconv_open (encoding, "UTF-8");
-      if (new_output_cd == (iconv_t) -1)
+      if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
 	{
-	  if (new_input_cd != (iconv_t) -1)
-	    iconv_close (new_input_cd);
-	  goto invalid_encoding;
+	  /* Open an input iconv conversion descriptor, from ENCODING
+	     to UTF-8.  We choose UTF-8, not UTF-32, because iconv
+	     implementations can typically convert from anything to
+	     UTF-8, but not to UTF-32 (see
+	     <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>).  */
+	  new_input_cd = iconv_open ("UTF-8", encoding);
+	  if (new_input_cd == (iconv_t) -1)
+	    goto invalid_encoding;
+	}
+
+      if (SCM_CELL_WORD_0 (port) & SCM_WRTNG)
+	{
+	  new_output_cd = iconv_open (encoding, "UTF-8");
+	  if (new_output_cd == (iconv_t) -1)
+	    {
+	      if (new_input_cd != (iconv_t) -1)
+		iconv_close (new_input_cd);
+	      goto invalid_encoding;
+	    }
 	}
    }

--- a/libguile/print.c
+++ b/libguile/print.c
@ -821,31 +821,57 @@ codepoint_to_utf8 (scm_t_wchar ch, scm_t_uint8 utf8[4])
  return len;
 }

-/* Display the LEN codepoints in STR to PORT according to STRATEGY;
-   return the number of codepoints successfully displayed.  If NARROW_P,
-   then STR is interpreted as a sequence of `char', denoting a Latin-1
-   string; otherwise it's interpreted as a sequence of
-   `scm_t_wchar'.  */
-static size_t
-display_string (const void *str, int narrow_p,
-		size_t len, SCM port,
-		scm_t_string_failed_conversion_handler strategy)
-
-{
 #define STR_REF(s, x)				\
  (narrow_p					\
   ? (scm_t_wchar) ((unsigned char *) (s))[x]	\
   : ((scm_t_wchar *) (s))[x])

+/* Write STR to PORT as UTF-8.  STR is a LEN-codepoint string; it is
+   narrow if NARROW_P is true, wide otherwise.  Return LEN.  */
+static size_t
+display_string_as_utf8 (const void *str, int narrow_p, size_t len,
+			SCM port)
+{
+  size_t printed = 0;
+
+  while (len > printed)
+    {
+      size_t utf8_len, i;
+      char *input, utf8_buf[256];
+
+      /* Convert STR to UTF-8.  */
+      for (i = printed, utf8_len = 0, input = utf8_buf;
+	   i < len && utf8_len + 4 < sizeof (utf8_buf);
+	   i++)
+	{
+	  utf8_len += codepoint_to_utf8 (STR_REF (str, i),
+					 (scm_t_uint8 *) input);
+	  input = utf8_buf + utf8_len;
+	}
+
+      /* INPUT was successfully converted, entirely; print the
+	 result.  */
+      scm_lfwrite (utf8_buf, utf8_len, port);
+      printed += i - printed;
+    }
+
+  assert (printed == len);
+
+  return len;
+}
+
+/* Convert STR through PORT's output conversion descriptor and write the
+   output to PORT.  Return the number of codepoints written.  */
+static size_t
+display_string_using_iconv (const void *str, int narrow_p, size_t len,
+			    SCM port,
+			    scm_t_string_failed_conversion_handler strategy)
+{
  size_t printed;
  scm_t_port *pt;

  pt = SCM_PTAB_ENTRY (port);

-  if (SCM_UNLIKELY (pt->output_cd == (iconv_t) -1))
-    /* Initialize the conversion descriptors.  */
-    scm_i_set_port_encoding_x (port, pt->encoding);
-
  printed = 0;

  while (len > printed)
@ -928,7 +954,35 @@ display_string (const void *str, int narrow_p,
    }

  return printed;
+}
+
 #undef STR_REF
+
+/* Display the LEN codepoints in STR to PORT according to STRATEGY;
+   return the number of codepoints successfully displayed.  If NARROW_P,
+   then STR is interpreted as a sequence of `char', denoting a Latin-1
+   string; otherwise it's interpreted as a sequence of
+   `scm_t_wchar'.  */
+static size_t
+display_string (const void *str, int narrow_p,
+		size_t len, SCM port,
+		scm_t_string_failed_conversion_handler strategy)
+
+{
+  scm_t_port *pt;
+
+  pt = SCM_PTAB_ENTRY (port);
+
+  if (pt->output_cd == (iconv_t) -1)
+    /* Initialize the conversion descriptors, if needed.  */
+    scm_i_set_port_encoding_x (port, pt->encoding);
+
+  /* FIXME: In 2.1, add a flag to determine whether a port is UTF-8.  */
+  if (pt->output_cd == (iconv_t) -1)
+    return display_string_as_utf8 (str, narrow_p, len, port);
+  else
+    return display_string_using_iconv (str, narrow_p, len,
+				       port, strategy);
 }

 /* Attempt to display CH to PORT according to STRATEGY.  Return non-zero
--- a/libguile/r6rs-ports.c
+++ b/libguile/r6rs-ports.c
@ -460,14 +460,11 @@ SCM_DEFINE (scm_lookahead_u8, "lookahead-u8", 1, 0, 0,

  SCM_VALIDATE_BINARY_INPUT_PORT (1, port);

-  u8 = scm_get_byte_or_eof (port);
+  u8 = scm_peek_byte_or_eof (port);
  if (u8 == EOF)
    result = SCM_EOF_VAL;
  else
-    {
-      scm_unget_byte (u8, port);
-      result = SCM_I_MAKINUM ((scm_t_uint8) u8);
-    }
+    result = SCM_I_MAKINUM ((scm_t_uint8) u8);

  return result;
 }
--- a/libguile/read.c
+++ b/libguile/read.c
@ -1135,7 +1135,7 @@ scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
  return SCM_UNSPECIFIED;
 }

-static inline SCM
+static SCM
 scm_read_shebang (scm_t_wchar chr, SCM port)
 {
  int c = 0;