mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-04-29 19:30:36 +02:00
* .x-sc_prohibit_S_IS_definition: New file. * m4/gnulib-cache.m4: Add `sys_stat'. * libguile/filesys.c: Remove `S_IS*' macro definitions for Ultrix and MinGW.
1187 lines
38 KiB
C
1187 lines
38 KiB
C
/* Character set conversion with error handling.
|
|
Copyright (C) 2001-2009 Free Software Foundation, Inc.
|
|
Written by Bruno Haible and Simon Josefsson.
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU Lesser General Public License as published by
|
|
the Free Software Foundation; either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public License
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>. */
|
|
|
|
#include <config.h>
|
|
|
|
/* Specification. */
|
|
#include "striconveh.h"
|
|
|
|
#include <errno.h>
|
|
#include <stdbool.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#if HAVE_ICONV
|
|
# include <iconv.h>
|
|
# include "unistr.h"
|
|
#endif
|
|
|
|
#include "c-strcase.h"
|
|
#include "c-strcaseeq.h"
|
|
|
|
#ifndef SIZE_MAX
|
|
# define SIZE_MAX ((size_t) -1)
|
|
#endif
|
|
|
|
|
|
#if HAVE_ICONV
|
|
|
|
/* The caller must provide an iconveh_t, not just an iconv_t, because when a
|
|
conversion error occurs, we may have to determine the Unicode representation
|
|
of the inconvertible character. */
|
|
|
|
int
|
|
iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
|
|
{
|
|
iconv_t cd;
|
|
iconv_t cd1;
|
|
iconv_t cd2;
|
|
|
|
/* Avoid glibc-2.1 bug with EUC-KR. */
|
|
# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
|
|
if (c_strcasecmp (from_codeset, "EUC-KR") == 0
|
|
|| c_strcasecmp (to_codeset, "EUC-KR") == 0)
|
|
{
|
|
errno = EINVAL;
|
|
return -1;
|
|
}
|
|
# endif
|
|
|
|
cd = iconv_open (to_codeset, from_codeset);
|
|
|
|
if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
|
|
cd1 = (iconv_t)(-1);
|
|
else
|
|
{
|
|
cd1 = iconv_open ("UTF-8", from_codeset);
|
|
if (cd1 == (iconv_t)(-1))
|
|
{
|
|
int saved_errno = errno;
|
|
if (cd != (iconv_t)(-1))
|
|
iconv_close (cdp->cd);
|
|
errno = saved_errno;
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
|
|
# if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
|
|
|| c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
|
|
# endif
|
|
)
|
|
cd2 = (iconv_t)(-1);
|
|
else
|
|
{
|
|
cd2 = iconv_open (to_codeset, "UTF-8");
|
|
if (cd2 == (iconv_t)(-1))
|
|
{
|
|
int saved_errno = errno;
|
|
if (cd1 != (iconv_t)(-1))
|
|
iconv_close (cd1);
|
|
if (cd != (iconv_t)(-1))
|
|
iconv_close (cd);
|
|
errno = saved_errno;
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
cdp->cd = cd;
|
|
cdp->cd1 = cd1;
|
|
cdp->cd2 = cd2;
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
iconveh_close (const iconveh_t *cd)
|
|
{
|
|
if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
|
|
{
|
|
/* Return -1, but preserve the errno from iconv_close. */
|
|
int saved_errno = errno;
|
|
if (cd->cd1 != (iconv_t)(-1))
|
|
iconv_close (cd->cd1);
|
|
if (cd->cd != (iconv_t)(-1))
|
|
iconv_close (cd->cd);
|
|
errno = saved_errno;
|
|
return -1;
|
|
}
|
|
if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
|
|
{
|
|
/* Return -1, but preserve the errno from iconv_close. */
|
|
int saved_errno = errno;
|
|
if (cd->cd != (iconv_t)(-1))
|
|
iconv_close (cd->cd);
|
|
errno = saved_errno;
|
|
return -1;
|
|
}
|
|
if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
|
|
return -1;
|
|
return 0;
|
|
}
|
|
|
|
/* iconv_carefully is like iconv, except that it stops as soon as it encounters
|
|
a conversion error, and it returns in *INCREMENTED a boolean telling whether
|
|
it has incremented the input pointers past the error location. */
|
|
# if !defined _LIBICONV_VERSION && !defined __GLIBC__
|
|
/* Irix iconv() inserts a NUL byte if it cannot convert.
|
|
NetBSD iconv() inserts a question mark if it cannot convert.
|
|
Only GNU libiconv and GNU libc are known to prefer to fail rather
|
|
than doing a lossy conversion. */
|
|
static size_t
|
|
iconv_carefully (iconv_t cd,
|
|
const char **inbuf, size_t *inbytesleft,
|
|
char **outbuf, size_t *outbytesleft,
|
|
bool *incremented)
|
|
{
|
|
const char *inptr = *inbuf;
|
|
const char *inptr_end = inptr + *inbytesleft;
|
|
char *outptr = *outbuf;
|
|
size_t outsize = *outbytesleft;
|
|
const char *inptr_before;
|
|
size_t res;
|
|
|
|
do
|
|
{
|
|
size_t insize;
|
|
|
|
inptr_before = inptr;
|
|
res = (size_t)(-1);
|
|
|
|
for (insize = 1; inptr + insize <= inptr_end; insize++)
|
|
{
|
|
res = iconv (cd,
|
|
(ICONV_CONST char **) &inptr, &insize,
|
|
&outptr, &outsize);
|
|
if (!(res == (size_t)(-1) && errno == EINVAL))
|
|
break;
|
|
/* iconv can eat up a shift sequence but give EINVAL while attempting
|
|
to convert the first character. E.g. libiconv does this. */
|
|
if (inptr > inptr_before)
|
|
{
|
|
res = 0;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (res == 0)
|
|
{
|
|
*outbuf = outptr;
|
|
*outbytesleft = outsize;
|
|
}
|
|
}
|
|
while (res == 0 && inptr < inptr_end);
|
|
|
|
*inbuf = inptr;
|
|
*inbytesleft = inptr_end - inptr;
|
|
if (res != (size_t)(-1) && res > 0)
|
|
{
|
|
/* iconv() has already incremented INPTR. We cannot go back to a
|
|
previous INPTR, otherwise the state inside CD would become invalid,
|
|
if FROM_CODESET is a stateful encoding. So, tell the caller that
|
|
*INBUF has already been incremented. */
|
|
*incremented = (inptr > inptr_before);
|
|
errno = EILSEQ;
|
|
return (size_t)(-1);
|
|
}
|
|
else
|
|
{
|
|
*incremented = false;
|
|
return res;
|
|
}
|
|
}
|
|
# else
|
|
# define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
|
|
(*(incremented) = false, \
|
|
iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
|
|
# endif
|
|
|
|
/* iconv_carefully_1 is like iconv_carefully, except that it stops after
|
|
converting one character or one shift sequence. */
|
|
static size_t
|
|
iconv_carefully_1 (iconv_t cd,
|
|
const char **inbuf, size_t *inbytesleft,
|
|
char **outbuf, size_t *outbytesleft,
|
|
bool *incremented)
|
|
{
|
|
const char *inptr_before = *inbuf;
|
|
const char *inptr = inptr_before;
|
|
const char *inptr_end = inptr_before + *inbytesleft;
|
|
char *outptr = *outbuf;
|
|
size_t outsize = *outbytesleft;
|
|
size_t res = (size_t)(-1);
|
|
size_t insize;
|
|
|
|
for (insize = 1; inptr_before + insize <= inptr_end; insize++)
|
|
{
|
|
inptr = inptr_before;
|
|
res = iconv (cd,
|
|
(ICONV_CONST char **) &inptr, &insize,
|
|
&outptr, &outsize);
|
|
if (!(res == (size_t)(-1) && errno == EINVAL))
|
|
break;
|
|
/* iconv can eat up a shift sequence but give EINVAL while attempting
|
|
to convert the first character. E.g. libiconv does this. */
|
|
if (inptr > inptr_before)
|
|
{
|
|
res = 0;
|
|
break;
|
|
}
|
|
}
|
|
|
|
*inbuf = inptr;
|
|
*inbytesleft = inptr_end - inptr;
|
|
# if !defined _LIBICONV_VERSION && !defined __GLIBC__
|
|
/* Irix iconv() inserts a NUL byte if it cannot convert.
|
|
NetBSD iconv() inserts a question mark if it cannot convert.
|
|
Only GNU libiconv and GNU libc are known to prefer to fail rather
|
|
than doing a lossy conversion. */
|
|
if (res != (size_t)(-1) && res > 0)
|
|
{
|
|
/* iconv() has already incremented INPTR. We cannot go back to a
|
|
previous INPTR, otherwise the state inside CD would become invalid,
|
|
if FROM_CODESET is a stateful encoding. So, tell the caller that
|
|
*INBUF has already been incremented. */
|
|
*incremented = (inptr > inptr_before);
|
|
errno = EILSEQ;
|
|
return (size_t)(-1);
|
|
}
|
|
# endif
|
|
|
|
if (res != (size_t)(-1))
|
|
{
|
|
*outbuf = outptr;
|
|
*outbytesleft = outsize;
|
|
}
|
|
*incremented = false;
|
|
return res;
|
|
}
|
|
|
|
/* utf8conv_carefully is like iconv, except that
|
|
- it converts from UTF-8 to UTF-8,
|
|
- it stops as soon as it encounters a conversion error, and it returns
|
|
in *INCREMENTED a boolean telling whether it has incremented the input
|
|
pointers past the error location,
|
|
- if one_character_only is true, it stops after converting one
|
|
character. */
|
|
static size_t
|
|
utf8conv_carefully (bool one_character_only,
|
|
const char **inbuf, size_t *inbytesleft,
|
|
char **outbuf, size_t *outbytesleft,
|
|
bool *incremented)
|
|
{
|
|
const char *inptr = *inbuf;
|
|
size_t insize = *inbytesleft;
|
|
char *outptr = *outbuf;
|
|
size_t outsize = *outbytesleft;
|
|
size_t res;
|
|
|
|
res = 0;
|
|
do
|
|
{
|
|
ucs4_t uc;
|
|
int n;
|
|
int m;
|
|
|
|
n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
|
|
if (n < 0)
|
|
{
|
|
errno = (n == -2 ? EINVAL : EILSEQ);
|
|
n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
|
|
inptr += n;
|
|
insize -= n;
|
|
res = (size_t)(-1);
|
|
*incremented = true;
|
|
break;
|
|
}
|
|
if (outsize == 0)
|
|
{
|
|
errno = E2BIG;
|
|
res = (size_t)(-1);
|
|
*incremented = false;
|
|
break;
|
|
}
|
|
m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
|
|
if (m == -2)
|
|
{
|
|
errno = E2BIG;
|
|
res = (size_t)(-1);
|
|
*incremented = false;
|
|
break;
|
|
}
|
|
inptr += n;
|
|
insize -= n;
|
|
if (m == -1)
|
|
{
|
|
errno = EILSEQ;
|
|
res = (size_t)(-1);
|
|
*incremented = true;
|
|
break;
|
|
}
|
|
outptr += m;
|
|
outsize -= m;
|
|
}
|
|
while (!one_character_only && insize > 0);
|
|
|
|
*inbuf = inptr;
|
|
*inbytesleft = insize;
|
|
*outbuf = outptr;
|
|
*outbytesleft = outsize;
|
|
return res;
|
|
}
|
|
|
|
static int
|
|
mem_cd_iconveh_internal (const char *src, size_t srclen,
|
|
iconv_t cd, iconv_t cd1, iconv_t cd2,
|
|
enum iconv_ilseq_handler handler,
|
|
size_t extra_alloc,
|
|
size_t *offsets,
|
|
char **resultp, size_t *lengthp)
|
|
{
|
|
/* When a conversion error occurs, we cannot start using CD1 and CD2 at
|
|
this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
|
|
Instead, we have to start afresh from the beginning of SRC. */
|
|
/* Use a temporary buffer, so that for small strings, a single malloc()
|
|
call will be sufficient. */
|
|
# define tmpbufsize 4096
|
|
/* The alignment is needed when converting e.g. to glibc's WCHAR_T or
|
|
libiconv's UCS-4-INTERNAL encoding. */
|
|
union { unsigned int align; char buf[tmpbufsize]; } tmp;
|
|
# define tmpbuf tmp.buf
|
|
|
|
char *initial_result;
|
|
char *result;
|
|
size_t allocated;
|
|
size_t length;
|
|
size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
|
|
|
|
if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
|
|
{
|
|
initial_result = *resultp;
|
|
allocated = *lengthp;
|
|
}
|
|
else
|
|
{
|
|
initial_result = tmpbuf;
|
|
allocated = sizeof (tmpbuf);
|
|
}
|
|
result = initial_result;
|
|
|
|
/* Test whether a direct conversion is possible at all. */
|
|
if (cd == (iconv_t)(-1))
|
|
goto indirectly;
|
|
|
|
if (offsets != NULL)
|
|
{
|
|
size_t i;
|
|
|
|
for (i = 0; i < srclen; i++)
|
|
offsets[i] = (size_t)(-1);
|
|
|
|
last_length = (size_t)(-1);
|
|
}
|
|
length = 0;
|
|
|
|
/* First, try a direct conversion, and see whether a conversion error
|
|
occurs at all. */
|
|
{
|
|
const char *inptr = src;
|
|
size_t insize = srclen;
|
|
|
|
/* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
|
|
# if defined _LIBICONV_VERSION \
|
|
|| !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
|
|
/* Set to the initial state. */
|
|
iconv (cd, NULL, NULL, NULL, NULL);
|
|
# endif
|
|
|
|
while (insize > 0)
|
|
{
|
|
char *outptr = result + length;
|
|
size_t outsize = allocated - extra_alloc - length;
|
|
bool incremented;
|
|
size_t res;
|
|
bool grow;
|
|
|
|
if (offsets != NULL)
|
|
{
|
|
if (length != last_length) /* ensure that offset[] be increasing */
|
|
{
|
|
offsets[inptr - src] = length;
|
|
last_length = length;
|
|
}
|
|
res = iconv_carefully_1 (cd,
|
|
&inptr, &insize,
|
|
&outptr, &outsize,
|
|
&incremented);
|
|
}
|
|
else
|
|
/* Use iconv_carefully instead of iconv here, because:
|
|
- If TO_CODESET is UTF-8, we can do the error handling in this
|
|
loop, no need for a second loop,
|
|
- With iconv() implementations other than GNU libiconv and GNU
|
|
libc, if we use iconv() in a big swoop, checking for an E2BIG
|
|
return, we lose the number of irreversible conversions. */
|
|
res = iconv_carefully (cd,
|
|
&inptr, &insize,
|
|
&outptr, &outsize,
|
|
&incremented);
|
|
|
|
length = outptr - result;
|
|
grow = (length + extra_alloc > allocated / 2);
|
|
if (res == (size_t)(-1))
|
|
{
|
|
if (errno == E2BIG)
|
|
grow = true;
|
|
else if (errno == EINVAL)
|
|
break;
|
|
else if (errno == EILSEQ && handler != iconveh_error)
|
|
{
|
|
if (cd2 == (iconv_t)(-1))
|
|
{
|
|
/* TO_CODESET is UTF-8. */
|
|
/* Error handling can produce up to 1 byte of output. */
|
|
if (length + 1 + extra_alloc > allocated)
|
|
{
|
|
char *memory;
|
|
|
|
allocated = 2 * allocated;
|
|
if (length + 1 + extra_alloc > allocated)
|
|
abort ();
|
|
if (result == initial_result)
|
|
memory = (char *) malloc (allocated);
|
|
else
|
|
memory = (char *) realloc (result, allocated);
|
|
if (memory == NULL)
|
|
{
|
|
if (result != initial_result)
|
|
free (result);
|
|
errno = ENOMEM;
|
|
return -1;
|
|
}
|
|
if (result == initial_result)
|
|
memcpy (memory, initial_result, length);
|
|
result = memory;
|
|
grow = false;
|
|
}
|
|
/* The input is invalid in FROM_CODESET. Eat up one byte
|
|
and emit a question mark. */
|
|
if (!incremented)
|
|
{
|
|
if (insize == 0)
|
|
abort ();
|
|
inptr++;
|
|
insize--;
|
|
}
|
|
result[length] = '?';
|
|
length++;
|
|
}
|
|
else
|
|
goto indirectly;
|
|
}
|
|
else
|
|
{
|
|
if (result != initial_result)
|
|
{
|
|
int saved_errno = errno;
|
|
free (result);
|
|
errno = saved_errno;
|
|
}
|
|
return -1;
|
|
}
|
|
}
|
|
if (insize == 0)
|
|
break;
|
|
if (grow)
|
|
{
|
|
char *memory;
|
|
|
|
allocated = 2 * allocated;
|
|
if (result == initial_result)
|
|
memory = (char *) malloc (allocated);
|
|
else
|
|
memory = (char *) realloc (result, allocated);
|
|
if (memory == NULL)
|
|
{
|
|
if (result != initial_result)
|
|
free (result);
|
|
errno = ENOMEM;
|
|
return -1;
|
|
}
|
|
if (result == initial_result)
|
|
memcpy (memory, initial_result, length);
|
|
result = memory;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Now get the conversion state back to the initial state.
|
|
But avoid glibc-2.1 bug and Solaris 2.7 bug. */
|
|
#if defined _LIBICONV_VERSION \
|
|
|| !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
|
|
for (;;)
|
|
{
|
|
char *outptr = result + length;
|
|
size_t outsize = allocated - extra_alloc - length;
|
|
size_t res;
|
|
|
|
res = iconv (cd, NULL, NULL, &outptr, &outsize);
|
|
length = outptr - result;
|
|
if (res == (size_t)(-1))
|
|
{
|
|
if (errno == E2BIG)
|
|
{
|
|
char *memory;
|
|
|
|
allocated = 2 * allocated;
|
|
if (result == initial_result)
|
|
memory = (char *) malloc (allocated);
|
|
else
|
|
memory = (char *) realloc (result, allocated);
|
|
if (memory == NULL)
|
|
{
|
|
if (result != initial_result)
|
|
free (result);
|
|
errno = ENOMEM;
|
|
return -1;
|
|
}
|
|
if (result == initial_result)
|
|
memcpy (memory, initial_result, length);
|
|
result = memory;
|
|
}
|
|
else
|
|
{
|
|
if (result != initial_result)
|
|
{
|
|
int saved_errno = errno;
|
|
free (result);
|
|
errno = saved_errno;
|
|
}
|
|
return -1;
|
|
}
|
|
}
|
|
else
|
|
break;
|
|
}
|
|
#endif
|
|
|
|
/* The direct conversion succeeded. */
|
|
goto done;
|
|
|
|
indirectly:
|
|
/* The direct conversion failed.
|
|
Use a conversion through UTF-8. */
|
|
if (offsets != NULL)
|
|
{
|
|
size_t i;
|
|
|
|
for (i = 0; i < srclen; i++)
|
|
offsets[i] = (size_t)(-1);
|
|
|
|
last_length = (size_t)(-1);
|
|
}
|
|
length = 0;
|
|
{
|
|
const bool slowly = (offsets != NULL || handler == iconveh_error);
|
|
# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
|
|
char utf8buf[utf8bufsize + 1];
|
|
size_t utf8len = 0;
|
|
const char *in1ptr = src;
|
|
size_t in1size = srclen;
|
|
bool do_final_flush1 = true;
|
|
bool do_final_flush2 = true;
|
|
|
|
/* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
|
|
# if defined _LIBICONV_VERSION \
|
|
|| !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
|
|
/* Set to the initial state. */
|
|
if (cd1 != (iconv_t)(-1))
|
|
iconv (cd1, NULL, NULL, NULL, NULL);
|
|
if (cd2 != (iconv_t)(-1))
|
|
iconv (cd2, NULL, NULL, NULL, NULL);
|
|
# endif
|
|
|
|
while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
|
|
{
|
|
char *out1ptr = utf8buf + utf8len;
|
|
size_t out1size = utf8bufsize - utf8len;
|
|
bool incremented1;
|
|
size_t res1;
|
|
int errno1;
|
|
|
|
/* Conversion step 1: from FROM_CODESET to UTF-8. */
|
|
if (in1size > 0)
|
|
{
|
|
if (offsets != NULL
|
|
&& length != last_length) /* ensure that offset[] be increasing */
|
|
{
|
|
offsets[in1ptr - src] = length;
|
|
last_length = length;
|
|
}
|
|
if (cd1 != (iconv_t)(-1))
|
|
{
|
|
if (slowly)
|
|
res1 = iconv_carefully_1 (cd1,
|
|
&in1ptr, &in1size,
|
|
&out1ptr, &out1size,
|
|
&incremented1);
|
|
else
|
|
res1 = iconv_carefully (cd1,
|
|
&in1ptr, &in1size,
|
|
&out1ptr, &out1size,
|
|
&incremented1);
|
|
}
|
|
else
|
|
{
|
|
/* FROM_CODESET is UTF-8. */
|
|
res1 = utf8conv_carefully (slowly,
|
|
&in1ptr, &in1size,
|
|
&out1ptr, &out1size,
|
|
&incremented1);
|
|
}
|
|
}
|
|
else if (do_final_flush1)
|
|
{
|
|
/* Now get the conversion state of CD1 back to the initial state.
|
|
But avoid glibc-2.1 bug and Solaris 2.7 bug. */
|
|
# if defined _LIBICONV_VERSION \
|
|
|| !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
|
|
if (cd1 != (iconv_t)(-1))
|
|
res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
|
|
else
|
|
# endif
|
|
res1 = 0;
|
|
do_final_flush1 = false;
|
|
incremented1 = true;
|
|
}
|
|
else
|
|
{
|
|
res1 = 0;
|
|
incremented1 = true;
|
|
}
|
|
if (res1 == (size_t)(-1)
|
|
&& !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
|
|
{
|
|
if (result != initial_result)
|
|
{
|
|
int saved_errno = errno;
|
|
free (result);
|
|
errno = saved_errno;
|
|
}
|
|
return -1;
|
|
}
|
|
if (res1 == (size_t)(-1)
|
|
&& errno == EILSEQ && handler != iconveh_error)
|
|
{
|
|
/* The input is invalid in FROM_CODESET. Eat up one byte and
|
|
emit a question mark. Room for the question mark was allocated
|
|
at the end of utf8buf. */
|
|
if (!incremented1)
|
|
{
|
|
if (in1size == 0)
|
|
abort ();
|
|
in1ptr++;
|
|
in1size--;
|
|
}
|
|
*out1ptr++ = '?';
|
|
res1 = 0;
|
|
}
|
|
errno1 = errno;
|
|
utf8len = out1ptr - utf8buf;
|
|
|
|
if (offsets != NULL
|
|
|| in1size == 0
|
|
|| utf8len > utf8bufsize / 2
|
|
|| (res1 == (size_t)(-1) && errno1 == E2BIG))
|
|
{
|
|
/* Conversion step 2: from UTF-8 to TO_CODESET. */
|
|
const char *in2ptr = utf8buf;
|
|
size_t in2size = utf8len;
|
|
|
|
while (in2size > 0
|
|
|| (in1size == 0 && !do_final_flush1 && do_final_flush2))
|
|
{
|
|
char *out2ptr = result + length;
|
|
size_t out2size = allocated - extra_alloc - length;
|
|
bool incremented2;
|
|
size_t res2;
|
|
bool grow;
|
|
|
|
if (in2size > 0)
|
|
{
|
|
if (cd2 != (iconv_t)(-1))
|
|
res2 = iconv_carefully (cd2,
|
|
&in2ptr, &in2size,
|
|
&out2ptr, &out2size,
|
|
&incremented2);
|
|
else
|
|
/* TO_CODESET is UTF-8. */
|
|
res2 = utf8conv_carefully (false,
|
|
&in2ptr, &in2size,
|
|
&out2ptr, &out2size,
|
|
&incremented2);
|
|
}
|
|
else /* in1size == 0 && !do_final_flush1
|
|
&& in2size == 0 && do_final_flush2 */
|
|
{
|
|
/* Now get the conversion state of CD1 back to the initial
|
|
state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
|
|
# if defined _LIBICONV_VERSION \
|
|
|| !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
|
|
if (cd2 != (iconv_t)(-1))
|
|
res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
|
|
else
|
|
# endif
|
|
res2 = 0;
|
|
do_final_flush2 = false;
|
|
incremented2 = true;
|
|
}
|
|
|
|
length = out2ptr - result;
|
|
grow = (length + extra_alloc > allocated / 2);
|
|
if (res2 == (size_t)(-1))
|
|
{
|
|
if (errno == E2BIG)
|
|
grow = true;
|
|
else if (errno == EINVAL)
|
|
break;
|
|
else if (errno == EILSEQ && handler != iconveh_error)
|
|
{
|
|
/* Error handling can produce up to 10 bytes of ASCII
|
|
output. But TO_CODESET may be UCS-2, UTF-16 or
|
|
UCS-4, so use CD2 here as well. */
|
|
char scratchbuf[10];
|
|
size_t scratchlen;
|
|
ucs4_t uc;
|
|
const char *inptr;
|
|
size_t insize;
|
|
size_t res;
|
|
|
|
if (incremented2)
|
|
{
|
|
if (u8_prev (&uc, (const uint8_t *) in2ptr,
|
|
(const uint8_t *) utf8buf)
|
|
== NULL)
|
|
abort ();
|
|
}
|
|
else
|
|
{
|
|
int n;
|
|
if (in2size == 0)
|
|
abort ();
|
|
n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
|
|
in2size);
|
|
in2ptr += n;
|
|
in2size -= n;
|
|
}
|
|
|
|
if (handler == iconveh_escape_sequence)
|
|
{
|
|
static char hex[16] = "0123456789ABCDEF";
|
|
scratchlen = 0;
|
|
scratchbuf[scratchlen++] = '\\';
|
|
if (uc < 0x10000)
|
|
scratchbuf[scratchlen++] = 'u';
|
|
else
|
|
{
|
|
scratchbuf[scratchlen++] = 'U';
|
|
scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
|
|
scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
|
|
scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
|
|
scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
|
|
}
|
|
scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
|
|
scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
|
|
scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
|
|
scratchbuf[scratchlen++] = hex[uc & 15];
|
|
}
|
|
else
|
|
{
|
|
scratchbuf[0] = '?';
|
|
scratchlen = 1;
|
|
}
|
|
|
|
inptr = scratchbuf;
|
|
insize = scratchlen;
|
|
if (cd2 != (iconv_t)(-1))
|
|
res = iconv (cd2,
|
|
(ICONV_CONST char **) &inptr, &insize,
|
|
&out2ptr, &out2size);
|
|
else
|
|
{
|
|
/* TO_CODESET is UTF-8. */
|
|
if (out2size >= insize)
|
|
{
|
|
memcpy (out2ptr, inptr, insize);
|
|
out2ptr += insize;
|
|
out2size -= insize;
|
|
inptr += insize;
|
|
insize = 0;
|
|
res = 0;
|
|
}
|
|
else
|
|
{
|
|
errno = E2BIG;
|
|
res = (size_t)(-1);
|
|
}
|
|
}
|
|
length = out2ptr - result;
|
|
if (res == (size_t)(-1) && errno == E2BIG)
|
|
{
|
|
char *memory;
|
|
|
|
allocated = 2 * allocated;
|
|
if (length + 1 + extra_alloc > allocated)
|
|
abort ();
|
|
if (result == initial_result)
|
|
memory = (char *) malloc (allocated);
|
|
else
|
|
memory = (char *) realloc (result, allocated);
|
|
if (memory == NULL)
|
|
{
|
|
if (result != initial_result)
|
|
free (result);
|
|
errno = ENOMEM;
|
|
return -1;
|
|
}
|
|
if (result == initial_result)
|
|
memcpy (memory, initial_result, length);
|
|
result = memory;
|
|
grow = false;
|
|
|
|
out2ptr = result + length;
|
|
out2size = allocated - extra_alloc - length;
|
|
if (cd2 != (iconv_t)(-1))
|
|
res = iconv (cd2,
|
|
(ICONV_CONST char **) &inptr,
|
|
&insize,
|
|
&out2ptr, &out2size);
|
|
else
|
|
{
|
|
/* TO_CODESET is UTF-8. */
|
|
if (!(out2size >= insize))
|
|
abort ();
|
|
memcpy (out2ptr, inptr, insize);
|
|
out2ptr += insize;
|
|
out2size -= insize;
|
|
inptr += insize;
|
|
insize = 0;
|
|
res = 0;
|
|
}
|
|
length = out2ptr - result;
|
|
}
|
|
# if !defined _LIBICONV_VERSION && !defined __GLIBC__
|
|
/* Irix iconv() inserts a NUL byte if it cannot convert.
|
|
NetBSD iconv() inserts a question mark if it cannot
|
|
convert.
|
|
Only GNU libiconv and GNU libc are known to prefer
|
|
to fail rather than doing a lossy conversion. */
|
|
if (res != (size_t)(-1) && res > 0)
|
|
{
|
|
errno = EILSEQ;
|
|
res = (size_t)(-1);
|
|
}
|
|
# endif
|
|
if (res == (size_t)(-1))
|
|
{
|
|
/* Failure converting the ASCII replacement. */
|
|
if (result != initial_result)
|
|
{
|
|
int saved_errno = errno;
|
|
free (result);
|
|
errno = saved_errno;
|
|
}
|
|
return -1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (result != initial_result)
|
|
{
|
|
int saved_errno = errno;
|
|
free (result);
|
|
errno = saved_errno;
|
|
}
|
|
return -1;
|
|
}
|
|
}
|
|
if (!(in2size > 0
|
|
|| (in1size == 0 && !do_final_flush1 && do_final_flush2)))
|
|
break;
|
|
if (grow)
|
|
{
|
|
char *memory;
|
|
|
|
allocated = 2 * allocated;
|
|
if (result == initial_result)
|
|
memory = (char *) malloc (allocated);
|
|
else
|
|
memory = (char *) realloc (result, allocated);
|
|
if (memory == NULL)
|
|
{
|
|
if (result != initial_result)
|
|
free (result);
|
|
errno = ENOMEM;
|
|
return -1;
|
|
}
|
|
if (result == initial_result)
|
|
memcpy (memory, initial_result, length);
|
|
result = memory;
|
|
}
|
|
}
|
|
|
|
/* Move the remaining bytes to the beginning of utf8buf. */
|
|
if (in2size > 0)
|
|
memmove (utf8buf, in2ptr, in2size);
|
|
utf8len = in2size;
|
|
}
|
|
|
|
if (res1 == (size_t)(-1))
|
|
{
|
|
if (errno1 == EINVAL)
|
|
in1size = 0;
|
|
else if (errno1 == EILSEQ)
|
|
{
|
|
if (result != initial_result)
|
|
free (result);
|
|
errno = errno1;
|
|
return -1;
|
|
}
|
|
}
|
|
}
|
|
# undef utf8bufsize
|
|
}
|
|
|
|
done:
|
|
/* Now the final memory allocation. */
|
|
if (result == tmpbuf)
|
|
{
|
|
size_t memsize = length + extra_alloc;
|
|
char *memory;
|
|
|
|
memory = (char *) malloc (memsize > 0 ? memsize : 1);
|
|
if (memory != NULL)
|
|
{
|
|
memcpy (memory, tmpbuf, length);
|
|
result = memory;
|
|
}
|
|
else
|
|
{
|
|
errno = ENOMEM;
|
|
return -1;
|
|
}
|
|
}
|
|
else if (result != *resultp && length + extra_alloc < allocated)
|
|
{
|
|
/* Shrink the allocated memory if possible. */
|
|
size_t memsize = length + extra_alloc;
|
|
char *memory;
|
|
|
|
memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
|
|
if (memory != NULL)
|
|
result = memory;
|
|
}
|
|
*resultp = result;
|
|
*lengthp = length;
|
|
return 0;
|
|
# undef tmpbuf
|
|
# undef tmpbufsize
|
|
}
|
|
|
|
int
|
|
mem_cd_iconveh (const char *src, size_t srclen,
|
|
const iconveh_t *cd,
|
|
enum iconv_ilseq_handler handler,
|
|
size_t *offsets,
|
|
char **resultp, size_t *lengthp)
|
|
{
|
|
return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
|
|
handler, 0, offsets, resultp, lengthp);
|
|
}
|
|
|
|
char *
|
|
str_cd_iconveh (const char *src,
|
|
const iconveh_t *cd,
|
|
enum iconv_ilseq_handler handler)
|
|
{
|
|
/* For most encodings, a trailing NUL byte in the input will be converted
|
|
to a trailing NUL byte in the output. But not for UTF-7. So that this
|
|
function is usable for UTF-7, we have to exclude the NUL byte from the
|
|
conversion and add it by hand afterwards. */
|
|
char *result = NULL;
|
|
size_t length = 0;
|
|
int retval = mem_cd_iconveh_internal (src, strlen (src),
|
|
cd->cd, cd->cd1, cd->cd2, handler, 1,
|
|
NULL, &result, &length);
|
|
|
|
if (retval < 0)
|
|
{
|
|
if (result != NULL)
|
|
{
|
|
int saved_errno = errno;
|
|
free (result);
|
|
errno = saved_errno;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
/* Add the terminating NUL byte. */
|
|
result[length] = '\0';
|
|
|
|
return result;
|
|
}
|
|
|
|
#endif
|
|
|
|
int
|
|
mem_iconveh (const char *src, size_t srclen,
|
|
const char *from_codeset, const char *to_codeset,
|
|
enum iconv_ilseq_handler handler,
|
|
size_t *offsets,
|
|
char **resultp, size_t *lengthp)
|
|
{
|
|
if (srclen == 0)
|
|
{
|
|
/* Nothing to convert. */
|
|
*lengthp = 0;
|
|
return 0;
|
|
}
|
|
else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
|
|
{
|
|
char *result;
|
|
|
|
if (*resultp != NULL && *lengthp >= srclen)
|
|
result = *resultp;
|
|
else
|
|
{
|
|
result = (char *) malloc (srclen);
|
|
if (result == NULL)
|
|
{
|
|
errno = ENOMEM;
|
|
return -1;
|
|
}
|
|
}
|
|
memcpy (result, src, srclen);
|
|
*resultp = result;
|
|
*lengthp = srclen;
|
|
return 0;
|
|
}
|
|
else
|
|
{
|
|
#if HAVE_ICONV
|
|
iconveh_t cd;
|
|
char *result;
|
|
size_t length;
|
|
int retval;
|
|
|
|
if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
|
|
return -1;
|
|
|
|
result = *resultp;
|
|
length = *lengthp;
|
|
retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
|
|
&result, &length);
|
|
|
|
if (retval < 0)
|
|
{
|
|
/* Close cd, but preserve the errno from str_cd_iconv. */
|
|
int saved_errno = errno;
|
|
iconveh_close (&cd);
|
|
errno = saved_errno;
|
|
}
|
|
else
|
|
{
|
|
if (iconveh_close (&cd) < 0)
|
|
{
|
|
/* Return -1, but free the allocated memory, and while doing
|
|
that, preserve the errno from iconveh_close. */
|
|
int saved_errno = errno;
|
|
if (result != *resultp && result != NULL)
|
|
free (result);
|
|
errno = saved_errno;
|
|
return -1;
|
|
}
|
|
*resultp = result;
|
|
*lengthp = length;
|
|
}
|
|
return retval;
|
|
#else
|
|
/* This is a different error code than if iconv_open existed but didn't
|
|
support from_codeset and to_codeset, so that the caller can emit
|
|
an error message such as
|
|
"iconv() is not supported. Installing GNU libiconv and
|
|
then reinstalling this package would fix this." */
|
|
errno = ENOSYS;
|
|
return -1;
|
|
#endif
|
|
}
|
|
}
|
|
|
|
char *
|
|
str_iconveh (const char *src,
|
|
const char *from_codeset, const char *to_codeset,
|
|
enum iconv_ilseq_handler handler)
|
|
{
|
|
if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
|
|
{
|
|
char *result = strdup (src);
|
|
|
|
if (result == NULL)
|
|
errno = ENOMEM;
|
|
return result;
|
|
}
|
|
else
|
|
{
|
|
#if HAVE_ICONV
|
|
iconveh_t cd;
|
|
char *result;
|
|
|
|
if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
|
|
return NULL;
|
|
|
|
result = str_cd_iconveh (src, &cd, handler);
|
|
|
|
if (result == NULL)
|
|
{
|
|
/* Close cd, but preserve the errno from str_cd_iconv. */
|
|
int saved_errno = errno;
|
|
iconveh_close (&cd);
|
|
errno = saved_errno;
|
|
}
|
|
else
|
|
{
|
|
if (iconveh_close (&cd) < 0)
|
|
{
|
|
/* Return NULL, but free the allocated memory, and while doing
|
|
that, preserve the errno from iconveh_close. */
|
|
int saved_errno = errno;
|
|
free (result);
|
|
errno = saved_errno;
|
|
return NULL;
|
|
}
|
|
}
|
|
return result;
|
|
#else
|
|
/* This is a different error code than if iconv_open existed but didn't
|
|
support from_codeset and to_codeset, so that the caller can emit
|
|
an error message such as
|
|
"iconv() is not supported. Installing GNU libiconv and
|
|
then reinstalling this package would fix this." */
|
|
errno = ENOSYS;
|
|
return NULL;
|
|
#endif
|
|
}
|
|
}
|