371 lines
14 KiB
C
371 lines
14 KiB
C
/* Copyright (C) 2001 by Eric Kidd. All rights reserved.
|
|
**
|
|
** Redistribution and use in source and binary forms, with or without
|
|
** modification, are permitted provided that the following conditions
|
|
** are met:
|
|
** 1. Redistributions of source code must retain the above copyright
|
|
** notice, this list of conditions and the following disclaimer.
|
|
** 2. Redistributions in binary form must reproduce the above copyright
|
|
** notice, this list of conditions and the following disclaimer in the
|
|
** documentation and/or other materials provided with the distribution.
|
|
** 3. The name of the author may not be used to endorse or promote products
|
|
** derived from this software without specific prior written permission.
|
|
**
|
|
** THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
** ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
** ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
** FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
** DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
** OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
** HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
** LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
** OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
** SUCH DAMAGE. */
|
|
|
|
|
|
/*=========================================================================
|
|
** XML-RPC UTF-8 Utilities
|
|
**=========================================================================
|
|
** Routines for validating, encoding and decoding UTF-8 data. We try to
|
|
** be very, very strict about invalid UTF-8 data.
|
|
**
|
|
** All of the code in this file assumes that your machine represents
|
|
** wchar_t as a 16-bit (or wider) character containing UCS-2 data. If this
|
|
** assumption is incorrect, you may need to replace this file.
|
|
**
|
|
** For lots of information on Unicode and UTF-8 decoding, see:
|
|
** http://www.cl.cam.ac.uk/~mgk25/unicode.html
|
|
*/
|
|
|
|
#include "xmlrpc_config.h"
|
|
|
|
#include "xmlrpc.h"
|
|
|
|
#ifdef HAVE_UNICODE_WCHAR
|
|
|
|
/*=========================================================================
|
|
** Tables and Constants
|
|
**=========================================================================
|
|
** We use a variety of tables and constants to help decode and validate
|
|
** UTF-8 data.
|
|
*/
|
|
|
|
/* The number of bytes in a UTF-8 sequence starting with the character used
** as the array index.  A zero entry indicates an illegal initial byte
** (a bare continuation byte 0x80-0xBF, or 0xFE/0xFF).
** This table was generated using a Perl script and information from the
** UTF-8 standard.
**
** Note that 0xC0 and 0xC1 are listed as two-byte lead bytes even though
** they can only start overlong sequences; those are rejected later by the
** minimum-value check, not by this table.
**
** Fredrik Lundh's UTF-8 decoder in Python 2.0 uses a similar table.  But
** since Python 2.0 has the icky CNRI license, I regenerated this
** table from scratch and wrote my own decoder.
**
** The table is never written to, so it is declared const. */
static const unsigned char utf8_seq_length[256] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
|
|
|
|
/* The minimum legal character value for a UTF-8 sequence of the given
** length (the array index is the sequence length in bytes).  We have to
** check this to avoid accepting "overlong" UTF-8 sequences, which use more
** bytes than necessary to encode a given character.  Such sequences are
** commonly used by evil people to bypass filters and security checks.
** This table is based on the UTF-8-test.txt file by
** Markus Kuhn <mkuhn@acm.org>.
**
** The table is never written to, so it is declared const. */
static const wchar_t utf8_min_char_for_length[4] = {
    0,          /* Length 0: Not used (meaningless) */
    0x0000,     /* Length 1: Not used (special-cased) */
    0x0080,     /* Length 2 */
    0x0800,     /* Length 3 */

#if 0
    /* These are only useful on systems where wchar_t is 32-bits wide
    ** and supports full UCS-4.  If this section is ever enabled, the
    ** array dimension above must grow from 4 to 7 as well. */
    0x00010000, /* Length 4 */
    0x00200000, /* Length 5 */
    0x04000000  /* Length 6 */
#endif
};
|
|
|
|
/* Largest code point accepted as a legal 16-bit (UCS-2) character.
** Anything above it is rejected by the decoder.  The value comes from
** UTF-8-test.txt. */
#define UCS2_MAX_LEGAL_CHARACTER    (0xFFFD)

/* Bounds (inclusive) of the UTF-16 surrogate range.  Surrogates are *not*
** legal UCS-2 characters: UTF-16 uses them in pairs to code for UCS-4
** characters, and they should never appear in decoded UTF-8 data.  Like
** overlong sequences, they could hypothetically be used to bypass security
** measures on some machines.  Based on UTF-8-test.txt. */
#define UTF16_FIRST_SURROGATE       (0xD800)
#define UTF16_LAST_SURROGATE        (0xDFFF)

/* True when byte 'c' has the bit pattern 10xxxxxx, i.e. it is a UTF-8
** continuation byte rather than the start of a sequence. */
#define IS_CONTINUATION(c)          (((c) & 0xC0) == 0x80)

/* Upper bound on the number of UTF-8 bytes produced for one supported
** (UCS-2) character. */
#define MAX_ENCODED_BYTES           (3)
|
|
|
|
|
|
/*=========================================================================
|
|
** decode_utf8
|
|
**=========================================================================
|
|
** Internal routine which decodes (or validates) a UTF-8 string.
|
|
** To validate, set io_buff and out_buff_len to NULL. To decode, allocate
|
|
** a sufficiently large buffer, pass it as io_buff, and pass a pointer as
|
|
** as out_buff_len. The data will be written to the buffer, and the
|
|
** length to out_buff_len.
|
|
**
|
|
** We assume that wchar_t holds a single UCS-2 character in native-endian
|
|
** byte ordering.
|
|
*/
|
|
|
|
static void
|
|
decode_utf8(xmlrpc_env * const env,
|
|
const char * const utf8_data,
|
|
size_t const utf8_len,
|
|
wchar_t * const io_buff,
|
|
size_t * const out_buff_len) {
|
|
|
|
size_t i, length, out_pos;
|
|
char init, con1, con2;
|
|
wchar_t wc;
|
|
|
|
XMLRPC_ASSERT_ENV_OK(env);
|
|
XMLRPC_ASSERT_PTR_OK(utf8_data);
|
|
XMLRPC_ASSERT((!io_buff && !out_buff_len) ||
|
|
(io_buff && out_buff_len));
|
|
|
|
/* Suppress GCC warning about possibly undefined variable. */
|
|
wc = 0;
|
|
|
|
i = 0;
|
|
out_pos = 0;
|
|
while (i < utf8_len) {
|
|
init = utf8_data[i];
|
|
if ((init & 0x80) == 0x00) {
|
|
/* Convert ASCII character to wide character. */
|
|
wc = init;
|
|
i++;
|
|
} else {
|
|
/* Look up the length of this UTF-8 sequence. */
|
|
length = utf8_seq_length[(unsigned char) init];
|
|
|
|
/* Check to make sure we have enough bytes to convert. */
|
|
if (i + length > utf8_len)
|
|
XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
|
|
"Truncated UTF-8 sequence");
|
|
|
|
/* Decode a multibyte UTF-8 sequence. */
|
|
switch (length) {
|
|
case 0:
|
|
XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
|
|
"Invalid UTF-8 initial byte");
|
|
|
|
case 2:
|
|
/* 110xxxxx 10xxxxxx */
|
|
con1 = utf8_data[i+1];
|
|
if (!IS_CONTINUATION(con1))
|
|
XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
|
|
"UTF-8 sequence too short");
|
|
wc = ((((wchar_t) (init & 0x1F)) << 6) |
|
|
(((wchar_t) (con1 & 0x3F))));
|
|
break;
|
|
|
|
case 3:
|
|
/* 1110xxxx 10xxxxxx 10xxxxxx */
|
|
con1 = utf8_data[i+1];
|
|
con2 = utf8_data[i+2];
|
|
if (!IS_CONTINUATION(con1) || !IS_CONTINUATION(con2))
|
|
XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
|
|
"UTF-8 sequence too short");
|
|
wc = ((((wchar_t) (init & 0x0F)) << 12) |
|
|
(((wchar_t) (con1 & 0x3F)) << 6) |
|
|
(((wchar_t) (con2 & 0x3F))));
|
|
break;
|
|
|
|
case 4:
|
|
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
|
case 5:
|
|
/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
|
case 6:
|
|
/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
|
XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
|
|
"UCS-4 characters not supported");
|
|
|
|
default:
|
|
XMLRPC_ASSERT("Error in UTF-8 decoder tables");
|
|
}
|
|
|
|
/* Advance to the end of the sequence. */
|
|
i += length;
|
|
|
|
/* Check for illegal UCS-2 characters. */
|
|
if (wc > UCS2_MAX_LEGAL_CHARACTER)
|
|
XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
|
|
"UCS-2 characters > U+FFFD are illegal");
|
|
|
|
/* Check for UTF-16 surrogates. */
|
|
if (UTF16_FIRST_SURROGATE <= wc && wc <= UTF16_LAST_SURROGATE)
|
|
XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
|
|
"UTF-16 surrogates may not appear in UTF-8 data");
|
|
|
|
/* Check for overlong sequences. */
|
|
if (wc < utf8_min_char_for_length[length])
|
|
XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR,
|
|
"Overlong UTF-8 sequence not allowed");
|
|
}
|
|
|
|
/* If we have a buffer, write our character to it. */
|
|
if (io_buff) {
|
|
io_buff[out_pos++] = wc;
|
|
}
|
|
}
|
|
|
|
/* Record the number of characters we found. */
|
|
if (out_buff_len)
|
|
*out_buff_len = out_pos;
|
|
|
|
cleanup:
|
|
if (env->fault_occurred) {
|
|
if (out_buff_len)
|
|
*out_buff_len = 0;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/*=========================================================================
|
|
** xmlrpc_validate_utf8
|
|
**=========================================================================
|
|
** Make sure that a UTF-8 string is valid.
|
|
*/
|
|
|
|
void
|
|
xmlrpc_validate_utf8 (xmlrpc_env * const env,
|
|
const char * const utf8_data,
|
|
size_t const utf8_len) {
|
|
|
|
decode_utf8(env, utf8_data, utf8_len, NULL, NULL);
|
|
}
|
|
|
|
|
|
/*=========================================================================
|
|
** xmlrpc_utf8_to_wcs
|
|
**=========================================================================
|
|
** Decode UTF-8 string to a "wide character string". This function
|
|
** returns an xmlrpc_mem_block with an element type of wchar_t. Don't
|
|
** try to intepret the block in a bytewise fashion--it won't work in
|
|
** any useful or portable fashion.
|
|
*/
|
|
|
|
xmlrpc_mem_block *xmlrpc_utf8_to_wcs (xmlrpc_env *env,
|
|
char *utf8_data,
|
|
size_t utf8_len)
|
|
{
|
|
xmlrpc_mem_block *output;
|
|
size_t wcs_length;
|
|
|
|
/* Allocate a memory block large enough to hold any possible output.
|
|
** We assume that each byte of the input may decode to a whcar_t. */
|
|
output = XMLRPC_TYPED_MEM_BLOCK_NEW(wchar_t, env, utf8_len);
|
|
XMLRPC_FAIL_IF_FAULT(env);
|
|
|
|
/* Decode the UTF-8 data. */
|
|
decode_utf8(env, utf8_data, utf8_len,
|
|
XMLRPC_TYPED_MEM_BLOCK_CONTENTS(wchar_t, output),
|
|
&wcs_length);
|
|
XMLRPC_FAIL_IF_FAULT(env);
|
|
|
|
/* Make sure we didn't overrun our buffer. */
|
|
XMLRPC_ASSERT(wcs_length <= utf8_len);
|
|
|
|
/* Correct the length of the memory block. */
|
|
XMLRPC_TYPED_MEM_BLOCK_RESIZE(wchar_t, env, output, wcs_length);
|
|
XMLRPC_FAIL_IF_FAULT(env);
|
|
|
|
cleanup:
|
|
if (env->fault_occurred) {
|
|
if (output)
|
|
xmlrpc_mem_block_free(output);
|
|
return NULL;
|
|
}
|
|
return output;
|
|
}
|
|
|
|
|
|
/*=========================================================================
|
|
** xmlrpc_utf8_to_wcs
|
|
**=========================================================================
|
|
** Encode a "wide character string" as UTF-8.
|
|
*/
|
|
|
|
xmlrpc_mem_block *xmlrpc_wcs_to_utf8 (xmlrpc_env *env,
|
|
wchar_t *wcs_data,
|
|
size_t wcs_len)
|
|
{
|
|
size_t estimate, bytes_used, i;
|
|
xmlrpc_mem_block *output;
|
|
unsigned char *buffer;
|
|
wchar_t wc;
|
|
|
|
XMLRPC_ASSERT_ENV_OK(env);
|
|
XMLRPC_ASSERT_PTR_OK(wcs_data);
|
|
|
|
/* Allocate a memory block large enough to hold any possible output.
|
|
** We assume that every wchar might encode to the maximum length. */
|
|
estimate = wcs_len * MAX_ENCODED_BYTES;
|
|
output = XMLRPC_TYPED_MEM_BLOCK_NEW(char, env, estimate);
|
|
XMLRPC_FAIL_IF_FAULT(env);
|
|
|
|
/* Output our characters. */
|
|
buffer = (unsigned char*) XMLRPC_TYPED_MEM_BLOCK_CONTENTS(char, output);
|
|
bytes_used = 0;
|
|
for (i = 0; i < wcs_len; i++) {
|
|
wc = wcs_data[i];
|
|
if (wc <= 0x007F) {
|
|
buffer[bytes_used++] = wc & 0x7F;
|
|
} else if (wc <= 0x07FF) {
|
|
/* 110xxxxx 10xxxxxx */
|
|
buffer[bytes_used++] = 0xC0 | (wc >> 6);
|
|
buffer[bytes_used++] = 0x80 | (wc & 0x3F);
|
|
} else if (wc <= 0xFFFF) {
|
|
/* 1110xxxx 10xxxxxx 10xxxxxx */
|
|
buffer[bytes_used++] = 0xE0 | (wc >> 12);
|
|
buffer[bytes_used++] = 0x80 | ((wc >> 6) & 0x3F);
|
|
buffer[bytes_used++] = 0x80 | (wc & 0x3F);
|
|
} else {
|
|
XMLRPC_FAIL(env, XMLRPC_INTERNAL_ERROR,
|
|
"Don't know how to encode UCS-4 characters yet");
|
|
}
|
|
}
|
|
|
|
/* Make sure we didn't overrun our buffer. */
|
|
XMLRPC_ASSERT(bytes_used <= estimate);
|
|
|
|
/* Correct the length of the memory block. */
|
|
XMLRPC_TYPED_MEM_BLOCK_RESIZE(char, env, output, bytes_used);
|
|
XMLRPC_FAIL_IF_FAULT(env);
|
|
|
|
cleanup:
|
|
if (env->fault_occurred) {
|
|
if (output)
|
|
xmlrpc_mem_block_free(output);
|
|
return NULL;
|
|
}
|
|
return output;
|
|
}
|
|
|
|
#endif /* HAVE_UNICODE_WCHAR */
|