glibc/libidn/idna.c
2004-03-14 09:17:48 +00:00

798 lines
21 KiB
C

/* idna.c Convert to or from IDN strings.
* Copyright (C) 2002, 2003, 2004 Simon Josefsson
*
* This file is part of GNU Libidn.
*
* GNU Libidn is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* GNU Libidn is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with GNU Libidn; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
#if HAVE_CONFIG_H
# include "config.h"
#endif
#include <stdlib.h>
#include <string.h>
#include <stringprep.h>
#include <punycode.h>
#include "idna.h"
#define DOTP(c) ((c) == 0x002E || (c) == 0x3002 || \
(c) == 0xFF0E || (c) == 0xFF61)
/* Core functions */
/**
* idna_to_ascii_4i
* @in: input array with unicode code points.
* @inlen: length of input array with unicode code points.
* @out: output zero terminated string that must have room for at
* least 63 characters plus the terminating zero.
* @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
*
* The ToASCII operation takes a sequence of Unicode code points that make
* up one label and transforms it into a sequence of code points in the
* ASCII range (0..7F). If ToASCII succeeds, the original sequence and the
* resulting sequence are equivalent labels.
*
* It is important to note that the ToASCII operation can fail. ToASCII
* fails if any step of it fails. If any step of the ToASCII operation
* fails on any label in a domain name, that domain name MUST NOT be used
* as an internationalized domain name. The method for deadling with this
* failure is application-specific.
*
* The inputs to ToASCII are a sequence of code points, the AllowUnassigned
* flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
* sequence of ASCII code points or a failure condition.
*
* ToASCII never alters a sequence of code points that are all in the ASCII
* range to begin with (although it could fail). Applying the ToASCII
* operation multiple times has exactly the same effect as applying it just
* once.
*
* Return value: Returns 0 on success, or an error code.
*/
int
idna_to_ascii_4i (const uint32_t * in, size_t inlen, char *out, int flags)
{
size_t len, outlen;
uint32_t *src; /* XXX don't need to copy data? */
int rc;
/*
* ToASCII consists of the following steps:
*
* 1. If all code points in the sequence are in the ASCII range (0..7F)
* then skip to step 3.
*/
{
size_t i;
int inasciirange;
inasciirange = 1;
for (i = 0; i < inlen; i++)
if (in[i] > 0x7F)
inasciirange = 0;
if (inasciirange)
{
src = malloc (sizeof (in[0]) * (inlen + 1));
if (src == NULL)
return IDNA_MALLOC_ERROR;
memcpy (src, in, sizeof (in[0]) * inlen);
src[inlen] = 0;
goto step3;
}
}
/*
* 2. Perform the steps specified in [NAMEPREP] and fail if there is
* an error. The AllowUnassigned flag is used in [NAMEPREP].
*/
{
char *p;
p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
if (p == NULL)
return IDNA_MALLOC_ERROR;
len = strlen (p);
do
{
len = 2 * len + 10; /* XXX better guess? */
p = realloc (p, len);
if (p == NULL)
return IDNA_MALLOC_ERROR;
if (flags & IDNA_ALLOW_UNASSIGNED)
rc = stringprep_nameprep (p, len);
else
rc = stringprep_nameprep_no_unassigned (p, len);
}
while (rc == STRINGPREP_TOO_SMALL_BUFFER);
if (rc != STRINGPREP_OK)
{
free (p);
return IDNA_STRINGPREP_ERROR;
}
src = stringprep_utf8_to_ucs4 (p, -1, NULL);
free (p);
}
step3:
/*
* 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
*
* (a) Verify the absence of non-LDH ASCII code points; that is,
* the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
*
* (b) Verify the absence of leading and trailing hyphen-minus;
* that is, the absence of U+002D at the beginning and end of
* the sequence.
*/
if (flags & IDNA_USE_STD3_ASCII_RULES)
{
size_t i;
for (i = 0; src[i]; i++)
if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
(src[i] >= 0x3A && src[i] <= 0x40) ||
(src[i] >= 0x5B && src[i] <= 0x60) ||
(src[i] >= 0x7B && src[i] <= 0x7F))
{
free (src);
return IDNA_CONTAINS_NON_LDH;
}
if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
{
free (src);
return IDNA_CONTAINS_MINUS;
}
}
/*
* 4. If all code points in the sequence are in the ASCII range
* (0..7F), then skip to step 8.
*/
{
size_t i;
int inasciirange;
inasciirange = 1;
for (i = 0; src[i]; i++)
{
if (src[i] > 0x7F)
inasciirange = 0;
/* copy string to output buffer if we are about to skip to step8 */
if (i < 64)
out[i] = src[i];
}
if (i < 64)
out[i] = '\0';
if (inasciirange)
goto step8;
}
/*
* 5. Verify that the sequence does NOT begin with the ACE prefix.
*
*/
{
size_t i;
int match;
match = 1;
for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++)
if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
match = 0;
if (match)
{
free (src);
return IDNA_CONTAINS_ACE_PREFIX;
}
}
/*
* 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
* and fail if there is an error.
*/
for (len = 0; src[len]; len++)
;
src[len] = '\0';
outlen = 63 - strlen (IDNA_ACE_PREFIX);
rc = punycode_encode (len, src, NULL,
&outlen, &out[strlen (IDNA_ACE_PREFIX)]);
if (rc != PUNYCODE_SUCCESS)
{
free (src);
return IDNA_PUNYCODE_ERROR;
}
out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
/*
* 7. Prepend the ACE prefix.
*/
memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
/*
* 8. Verify that the number of code points is in the range 1 to 63
* inclusive (0 is excluded).
*/
step8:
free (src);
if (strlen (out) < 1 || strlen (out) > 63)
return IDNA_INVALID_LENGTH;
return IDNA_SUCCESS;
}
/* ToUnicode(). May realloc() utf8in. */
static int
idna_to_unicode_internal (char *utf8in,
uint32_t * out, size_t * outlen, int flags)
{
int rc;
char tmpout[64];
size_t utf8len = strlen (utf8in) + 1;
size_t addlen = 0;
/*
* ToUnicode consists of the following steps:
*
* 1. If the sequence contains any code points outside the ASCII range
* (0..7F) then proceed to step 2, otherwise skip to step 3.
*/
{
size_t i;
int inasciirange;
inasciirange = 1;
for (i = 0; utf8in[i]; i++)
if (utf8in[i] & ~0x7F)
inasciirange = 0;
if (inasciirange)
goto step3;
}
/*
* 2. Perform the steps specified in [NAMEPREP] and fail if there is an
* error. (If step 3 of ToASCII is also performed here, it will not
* affect the overall behavior of ToUnicode, but it is not
* necessary.) The AllowUnassigned flag is used in [NAMEPREP].
*/
do
{
utf8in = realloc (utf8in, utf8len + addlen);
if (!utf8in)
return IDNA_MALLOC_ERROR;
if (flags & IDNA_ALLOW_UNASSIGNED)
rc = stringprep_nameprep (utf8in, utf8len + addlen);
else
rc = stringprep_nameprep_no_unassigned (utf8in, utf8len + addlen);
addlen += 1;
}
while (rc == STRINGPREP_TOO_SMALL_BUFFER);
if (rc != STRINGPREP_OK)
return IDNA_STRINGPREP_ERROR;
/* 3. Verify that the sequence begins with the ACE prefix, and save a
* copy of the sequence.
*/
step3:
if (memcmp (IDNA_ACE_PREFIX, utf8in, strlen (IDNA_ACE_PREFIX)) != 0)
return IDNA_NO_ACE_PREFIX;
/* 4. Remove the ACE prefix.
*/
memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
/* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
* and fail if there is an error. Save a copy of the result of
* this step.
*/
(*outlen)--; /* reserve one for the zero */
rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL);
if (rc != PUNYCODE_SUCCESS)
return IDNA_PUNYCODE_ERROR;
out[*outlen] = 0; /* add zero */
/* 6. Apply ToASCII.
*/
rc = idna_to_ascii_4i (out, *outlen, tmpout, flags);
if (rc != IDNA_SUCCESS)
return rc;
/* 7. Verify that the result of step 6 matches the saved copy from
* step 3, using a case-insensitive ASCII comparison.
*/
if (strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0)
return IDNA_ROUNDTRIP_VERIFY_ERROR;
/* 8. Return the saved copy from step 5.
*/
return IDNA_SUCCESS;
}
/**
* idna_to_unicode_44i
* @in: input array with unicode code points.
* @inlen: length of input array with unicode code points.
* @out: output array with unicode code points.
* @outlen: on input, maximum size of output array with unicode code points,
* on exit, actual size of output array with unicode code points.
* @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
*
* The ToUnicode operation takes a sequence of Unicode code points
* that make up one label and returns a sequence of Unicode code
* points. If the input sequence is a label in ACE form, then the
* result is an equivalent internationalized label that is not in ACE
* form, otherwise the original sequence is returned unaltered.
*
* ToUnicode never fails. If any step fails, then the original input
* sequence is returned immediately in that step.
*
* The Punycode decoder can never output more code points than it
* inputs, but Nameprep can, and therefore ToUnicode can. Note that
* the number of octets needed to represent a sequence of code points
* depends on the particular character encoding used.
*
* The inputs to ToUnicode are a sequence of code points, the
* AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of
* ToUnicode is always a sequence of Unicode code points.
*
* Return value: Returns error condition, but it must only be used for
* debugging purposes. The output buffer is always
* guaranteed to contain the correct data according to
* the specification (sans malloc induced errors). NB!
* This means that you normally ignore the return code
* from this function, as checking it means breaking the
* standard.
*/
int
idna_to_unicode_44i (const uint32_t * in, size_t inlen,
uint32_t * out, size_t * outlen, int flags)
{
int rc;
size_t outlensave = *outlen;
char *p;
p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL);
if (p == NULL)
return IDNA_MALLOC_ERROR;
rc = idna_to_unicode_internal (p, out, outlen, flags);
if (rc != IDNA_SUCCESS)
{
memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ?
inlen : outlensave));
*outlen = inlen;
}
free (p);
return rc;
}
/* Wrappers that handle several labels */
/**
* idna_to_ascii_4z:
* @input: zero terminated input Unicode string.
* @output: pointer to newly allocated output string.
* @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
*
* Convert UCS-4 domain name to ASCII string. The domain name may
* contain several labels, separated by dots. The output buffer must
* be deallocated by the caller.
*
* Return value: Returns IDNA_SUCCESS on success, or error code.
**/
int
idna_to_ascii_4z (const uint32_t * input, char **output, int flags)
{
const uint32_t *start = input;
const uint32_t *end = input;
char buf[64];
char *out = NULL;
int rc;
/* 1) Whenever dots are used as label separators, the following
characters MUST be recognized as dots: U+002E (full stop),
U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
U+FF61 (halfwidth ideographic full stop). */
if (input[0] == 0)
{
/* Handle implicit zero-length root label. */
*output = malloc (1);
if (!*output)
return IDNA_MALLOC_ERROR;
strcpy (*output, "");
return IDNA_SUCCESS;
}
if (DOTP (input[0]) && input[1] == 0)
{
/* Handle explicit zero-length root label. */
*output = malloc (2);
if (!*output)
return IDNA_MALLOC_ERROR;
strcpy (*output, ".");
return IDNA_SUCCESS;
}
*output = NULL;
do
{
end = start;
for (; *end && !DOTP (*end); end++)
;
if (*end == '\0' && start == end)
{
/* Handle explicit zero-length root label. */
buf[0] = '\0';
}
else
{
rc = idna_to_ascii_4i (start, end - start, buf, flags);
if (rc != IDNA_SUCCESS)
return rc;
}
if (out)
{
out = realloc (out, strlen (out) + 1 + strlen (buf) + 1);
if (!out)
return IDNA_MALLOC_ERROR;
strcat (out, ".");
strcat (out, buf);
}
else
{
out = (char *) malloc (strlen (buf) + 1);
if (!out)
return IDNA_MALLOC_ERROR;
strcpy (out, buf);
}
start = end + 1;
}
while (*end);
*output = out;
return IDNA_SUCCESS;
}
/**
* idna_to_ascii_8z:
* @input: zero terminated input UTF-8 string.
* @output: pointer to newly allocated output string.
* @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
*
* Convert UTF-8 domain name to ASCII string. The domain name may
* contain several labels, separated by dots. The output buffer must
* be deallocated by the caller.
*
* Return value: Returns IDNA_SUCCESS on success, or error code.
**/
int
idna_to_ascii_8z (const char *input, char **output, int flags)
{
uint32_t *ucs4;
size_t ucs4len;
int rc;
ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
if (!ucs4)
return IDNA_ICONV_ERROR;
rc = idna_to_ascii_4z (ucs4, output, flags);
free (ucs4);
return rc;
}
/**
* idna_to_ascii_lz:
* @input: zero terminated input UTF-8 string.
* @output: pointer to newly allocated output string.
* @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
*
* Convert domain name in the locale's encoding to ASCII string. The
* domain name may contain several labels, separated by dots. The
* output buffer must be deallocated by the caller.
*
* Return value: Returns IDNA_SUCCESS on success, or error code.
**/
int
idna_to_ascii_lz (const char *input, char **output, int flags)
{
char *utf8;
int rc;
utf8 = stringprep_locale_to_utf8 (input);
if (!utf8)
return IDNA_ICONV_ERROR;
rc = idna_to_ascii_8z (utf8, output, flags);
free (utf8);
return rc;
}
/**
* idna_to_unicode_4z4z:
* @input: zero-terminated Unicode string.
* @output: pointer to newly allocated output Unicode string.
* @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
*
* Convert possibly ACE encoded domain name in UCS-4 format into a
* UCS-4 string. The domain name may contain several labels,
* separated by dots. The output buffer must be deallocated by the
* caller.
*
* Return value: Returns IDNA_SUCCESS on success, or error code.
**/
int
idna_to_unicode_4z4z (const uint32_t * input, uint32_t ** output, int flags)
{
const uint32_t *start = input;
const uint32_t *end = input;
uint32_t *buf;
size_t buflen;
uint32_t *out = NULL;
size_t outlen = 0;
int rc;
*output = NULL;
do
{
end = start;
for (; *end && !DOTP (*end); end++)
;
buflen = end - start;
buf = malloc (sizeof (buf[0]) * (buflen + 1));
if (!buf)
return IDNA_MALLOC_ERROR;
rc = idna_to_unicode_44i (start, end - start, buf, &buflen, flags);
/* don't check rc as per specification! */
if (out)
{
out = realloc (out, sizeof (out[0]) * (outlen + 1 + buflen + 1));
if (!out)
return IDNA_MALLOC_ERROR;
out[outlen++] = 0x002E; /* '.' (full stop) */
memcpy (out + outlen, buf, sizeof (buf[0]) * buflen);
outlen += buflen;
out[outlen] = 0x0;
free (buf);
}
else
{
out = buf;
outlen = buflen;
out[outlen] = 0x0;
}
start = end + 1;
}
while (*end);
*output = out;
return IDNA_SUCCESS;
}
/**
* idna_to_unicode_8z4z:
* @input: zero-terminated UTF-8 string.
* @output: pointer to newly allocated output Unicode string.
* @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
*
* Convert possibly ACE encoded domain name in UTF-8 format into a
* UCS-4 string. The domain name may contain several labels,
* separated by dots. The output buffer must be deallocated by the
* caller.
*
* Return value: Returns IDNA_SUCCESS on success, or error code.
**/
int
idna_to_unicode_8z4z (const char *input, uint32_t ** output, int flags)
{
uint32_t *ucs4;
size_t ucs4len;
int rc;
ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
if (!ucs4)
return IDNA_ICONV_ERROR;
rc = idna_to_unicode_4z4z (ucs4, output, flags);
free (ucs4);
return rc;
}
/**
* idna_to_unicode_8z8z:
* @input: zero-terminated UTF-8 string.
* @output: pointer to newly allocated output UTF-8 string.
* @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
*
* Convert possibly ACE encoded domain name in UTF-8 format into a
* UTF-8 string. The domain name may contain several labels,
* separated by dots. The output buffer must be deallocated by the
* caller.
*
* Return value: Returns IDNA_SUCCESS on success, or error code.
**/
int
idna_to_unicode_8z8z (const char *input, char **output, int flags)
{
uint32_t *ucs4;
int rc;
rc = idna_to_unicode_8z4z (input, &ucs4, flags);
*output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL);
free (ucs4);
if (!*output)
return IDNA_ICONV_ERROR;
return rc;
}
/**
* idna_to_unicode_8zlz:
* @input: zero-terminated UTF-8 string.
* @output: pointer to newly allocated output string encoded in the
* current locale's character set.
* @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
*
* Convert possibly ACE encoded domain name in UTF-8 format into a
* string encoded in the current locale's character set. The domain
* name may contain several labels, separated by dots. The output
* buffer must be deallocated by the caller.
*
* Return value: Returns IDNA_SUCCESS on success, or error code.
**/
int
idna_to_unicode_8zlz (const char *input, char **output, int flags)
{
char *utf8;
int rc;
rc = idna_to_unicode_8z8z (input, &utf8, flags);
*output = stringprep_utf8_to_locale (utf8);
free (utf8);
if (!*output)
return IDNA_ICONV_ERROR;
return rc;
}
/**
* idna_to_unicode_lzlz:
* @input: zero-terminated string encoded in the current locale's
* character set.
* @output: pointer to newly allocated output string encoded in the
* current locale's character set.
* @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES.
*
* Convert possibly ACE encoded domain name in the locale's character
* set into a string encoded in the current locale's character set.
* The domain name may contain several labels, separated by dots. The
* output buffer must be deallocated by the caller.
*
* Return value: Returns IDNA_SUCCESS on success, or error code.
**/
int
idna_to_unicode_lzlz (const char *input, char **output, int flags)
{
char *utf8;
int rc;
utf8 = stringprep_locale_to_utf8 (input);
if (!utf8)
return IDNA_ICONV_ERROR;
rc = idna_to_unicode_8zlz (utf8, output, flags);
free (utf8);
return rc;
}
/**
* IDNA_ACE_PREFIX
*
* The IANA allocated prefix to use for IDNA. "xn--"
*/
/**
* Idna_rc:
* @IDNA_SUCCESS: Successful operation. This value is guaranteed to
* always be zero, the remaining ones are only guaranteed to hold
* non-zero values, for logical comparison purposes.
* @IDNA_STRINGPREP_ERROR: Error during string preparation.
* @IDNA_PUNYCODE_ERROR: Error during punycode operation.
* @IDNA_CONTAINS_NON_LDH: For IDNA_USE_STD3_ASCII_RULES, indicate that
* the string contains non-LDH ASCII characters.
* @IDNA_CONTAINS_MINUS: For IDNA_USE_STD3_ASCII_RULES, indicate that
* the string contains a leading or trailing hyphen-minus (U+002D).
* @IDNA_INVALID_LENGTH: The final output string is not within the
* (inclusive) range 1 to 63 characters.
* @IDNA_NO_ACE_PREFIX: The string does not contain the ACE prefix
* (for ToUnicode).
* @IDNA_ROUNDTRIP_VERIFY_ERROR: The ToASCII operation on output
* string does not equal the input.
* @IDNA_CONTAINS_ACE_PREFIX: The input contains the ACE prefix (for
* ToASCII).
* @IDNA_ICONV_ERROR: Could not convert string in locale encoding.
* @IDNA_MALLOC_ERROR: Could not allocate buffer (this is typically a
* fatal error).
*
* Enumerated return codes of idna_to_ascii_4i(),
* idna_to_unicode_44i() functions (and functions derived from those
* functions). The value 0 is guaranteed to always correspond to
* success.
*/
/**
* Idna_flags:
* @IDNA_ALLOW_UNASSIGNED: Don't reject strings containing unassigned
* Unicode code points.
* @IDNA_USE_STD3_ASCII_RULES: Validate strings according to STD3
* rules (i.e., normal host name rules).
*
* Flags to pass to idna_to_ascii_4i(), idna_to_unicode_44i() etc.
*/