diff --git a/gcc/ChangeLog b/gcc/ChangeLog index d7afc1937c2..8a3d1fdfaca 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,19 @@ +2004-01-16 Eric Christopher + Chandrakala Chavva + + * cppcharset.c (one_iso88591_to_utf8): New function. + (convert_iso88591_utf8): Ditto. Use. + (conversion_tab): Use. + (_cpp_input_to_utf8): New function. + (_cpp_init_iconv_buffer): Ditto. + (_cpp_close_iconv_buffer): Ditto. + * cpphash.h: Prototype new functions. + (cpp_buffer): Add input_cset_desc. + * cppinit.c: Add input_charset default. + * cpplib.c (cpp_push_buffer): Support init and + close of iconv. + * cpplib.h (cpp_options): Add input_charset. + 2004-01-16 Kazu Hirata * system.h (ASM_OUTPUT_SECTION_NAME): Poison. @@ -14,23 +30,23 @@ * fixinc/tests/base/sys/stat.h: Adapt for new hackname. * fixinc/inclhack.def (alpha___extern_prefix, - alpha___extern_prefix_standards): New hacks to obey + alpha___extern_prefix_standards): New hacks to obey __PRAGMA_EXTERN_PREFIX. * fixinc/tests/base/testing.h [ALPHA___EXTERN_PREFIX_CHECK]: New test. * fixinc/tests/base/standards.h: Likewise. - + * fixincl/inclhack.def (alpha_pthread): Tweak to match more variations. New testcase. * fixinc/tests/base/pthread.h: Handle it. - + * fixincl/inclhack.def (bad_lval): Sort file list. Add many missing files up to Tru64 UNIX V5.1B. * gcc/fixinc/tests/base/libgen.h: Renamed to ... * gcc/fixinc/tests/base/dirent.h: ... this to match new file list order. - + * fixinc/fixincl.x: Regenerate. 
2004-01-16 Mark Mitchell diff --git a/gcc/cppcharset.c b/gcc/cppcharset.c index 1b2d0b2a091..5070366e3a8 100644 --- a/gcc/cppcharset.c +++ b/gcc/cppcharset.c @@ -170,7 +170,7 @@ one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp, { static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 }; static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; - + cppchar_t c; const uchar *inbuf = *inbufp; size_t nbytes, i; @@ -274,7 +274,7 @@ one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp) The return value is either 0 for success, or an errno value for failure, which may be E2BIG (need more space), EILSEQ (ill-formed input sequence), ir EINVAL (incomplete input sequence). */ - + static inline int one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp, uchar **outbufp, size_t *outbytesleftp) @@ -446,6 +446,31 @@ one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp, return 0; } +/* The first 256 code points of ISO 8859.1 have the same numeric + values as the first 256 code points of Unicode, therefore the + incoming ISO 8859.1 character can be passed directly to + one_cppchar_to_utf8 (which expects a Unicode value). */ + +static int +one_iso88591_to_utf8 (iconv_t bigend ATTRIBUTE_UNUSED, const uchar **inbufp, + size_t *inbytesleftp, uchar **outbufp, size_t *outbytesleftp) +{ + const uchar *inbuf = *inbufp; + int rval; + + if (*inbytesleftp < 1) /* Need one input byte. */ + return EINVAL; + + rval = one_cppchar_to_utf8 ((cppchar_t)*inbuf, outbufp, outbytesleftp); + if (rval) + return rval; + + *inbufp += 1; + *inbytesleftp -= 1; + + return 0; +} + /* Helper routine for the next few functions. The 'const' on one_conversion means that we promise not to modify what function is pointed to, which lets the inliner see through it. 
*/ @@ -489,7 +514,7 @@ conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *, outbuf = to->text + to->asize - outbytesleft; } } - + /* These functions convert entire strings between character sets. They all have the signature @@ -529,6 +554,14 @@ convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen, return conversion_loop (one_utf32_to_utf8, cd, from, flen, to); } +static bool +convert_iso88591_utf8 (iconv_t cd, const uchar *from, size_t flen, + struct _cpp_strbuf *to) +{ + return conversion_loop (one_iso88591_to_utf8, cd, from, flen, to); +} + + /* Identity conversion, used when we have no alternative. */ static bool convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED, @@ -606,6 +639,7 @@ static const struct conversion conversion_tab[] = { { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 }, { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 }, { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 }, + { "ISO-8859-1/UTF-8", convert_iso88591_utf8, (iconv_t)0 }, }; /* Subroutine of cpp_init_iconv: initialize and return a @@ -619,7 +653,7 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from) struct cset_converter ret; char *pair; size_t i; - + if (!strcasecmp (to, from)) { ret.func = convert_no_conversion; @@ -649,7 +683,7 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from) if (ret.cd == (iconv_t) -1) { if (errno == EINVAL) - cpp_error (pfile, CPP_DL_ERROR, /* XXX should be DL_SORRY */ + cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */ "conversion from %s to %s not supported by iconv", from, to); else @@ -660,7 +694,7 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from) } else { - cpp_error (pfile, CPP_DL_ERROR, /* XXX should be DL_SORRY */ + cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */ "no iconv implementation, cannot convert from %s to %s", from, to); ret.func = convert_no_conversion; @@ -1270,7 +1304,7 @@ narrow_str_to_charconst (cpp_reader 
*pfile, cpp_string str, *unsignedp = unsigned_p; return result; } - + /* Subroutine of cpp_interpret_charconst which performs the conversion to a number, for wide strings. STR is the string structure returned by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for @@ -1352,3 +1386,59 @@ cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token, return result; } + +/* Convert LENGTH bytes at INPUT from the input character set of the + current buffer to the source character set. Returns a malloced + buffer that ends with a newline, or NULL after reporting an error + if the conversion fails. */ +uchar * +_cpp_input_to_utf8 (cpp_reader *pfile, const uchar *input, cppchar_t length) +{ + struct _cpp_strbuf tbuf; + struct cset_converter cvt = pfile->buffer->input_cset_desc; + + tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, length); + tbuf.text = xmalloc (tbuf.asize); + tbuf.len = 0; + + if (!APPLY_CONVERSION (cvt, input, length, &tbuf)) + { + cpp_error (pfile, CPP_DL_ERROR, "converting input to source character set."); + /* Do not leak the output buffer on failure. */ + free (tbuf.text); + return NULL; + } + + /* The conversion may have filled the buffer exactly; make sure + there is room for the terminating newline. */ + if (tbuf.len >= tbuf.asize) + tbuf.text = xrealloc (tbuf.text, tbuf.len + 1); + + if (length) + tbuf.text[tbuf.len] = '\n'; + else + tbuf.text[0] = '\n'; + + return tbuf.text; +} + + /* Check the input file format. At present assuming the input file + is in iso-8859-1 format. Convert this input character set to + source character set format (UTF-8). */ + +void +_cpp_init_iconv_buffer (cpp_reader *pfile, const char *from) +{ + pfile->buffer->input_cset_desc = init_iconv_desc (pfile, SOURCE_CHARSET, + from); +} + +/* Close the iconv descriptor of the current buffer when its converter + is convert_using_iconv. */ +void +_cpp_close_iconv_buffer (cpp_reader *pfile) +{ + if (HAVE_ICONV + && pfile->buffer->input_cset_desc.func == convert_using_iconv) + iconv_close (pfile->buffer->input_cset_desc.cd); +} diff --git a/gcc/cpphash.h b/gcc/cpphash.h index 80cb04c5f52..6c13ea1c0b1 100644 --- a/gcc/cpphash.h +++ b/gcc/cpphash.h @@ -270,7 +270,7 @@ struct cpp_buffer const uchar *cur; /* Current location. */ const uchar *line_base; /* Start of current physical line. */ const uchar *next_line; /* Start of to-be-cleaned logical line. */ - + const uchar *buf; /* Entire character buffer. */ const uchar *rlimit; /* Writable byte at end of file. */ @@ -313,6 +313,10 @@ struct cpp_buffer /* Used for buffer overlays by cpptrad.c. 
*/ const uchar *saved_cur, *saved_rlimit; + + /* Descriptor for converting from the input character set to the + source character set. */ + struct cset_converter input_cset_desc; }; /* A cpp_reader encapsulates the "state" of a pre-processor run. @@ -557,6 +561,9 @@ extern void _cpp_init_internal_pragmas (cpp_reader *); extern void _cpp_do_file_change (cpp_reader *, enum lc_reason, const char *, unsigned int, unsigned int); extern void _cpp_pop_buffer (cpp_reader *); +extern uchar *_cpp_input_to_utf8 (cpp_reader *, const unsigned char *, cppchar_t); +extern void _cpp_init_iconv_buffer (cpp_reader *, const char *); +extern void _cpp_close_iconv_buffer (cpp_reader *); /* In cpptrad.c. */ extern bool _cpp_scan_out_logical_line (cpp_reader *, cpp_macro *); diff --git a/gcc/cppinit.c b/gcc/cppinit.c index 13326886778..629da2734a7 100644 --- a/gcc/cppinit.c +++ b/gcc/cppinit.c @@ -161,6 +161,9 @@ cpp_create_reader (enum c_lang lang, hash_table *table) CPP_OPTION (pfile, narrow_charset) = 0; CPP_OPTION (pfile, wide_charset) = 0; + /* Default the input character set to iso-8859-1 for now. */ + CPP_OPTION (pfile, input_charset) = "ISO-8859-1"; + /* A fake empty "directory" used as the starting point for files looked up without a search path. Name cannot be '/' because we don't want to prepend anything at all to filenames using it. All diff --git a/gcc/cpplib.c b/gcc/cpplib.c index 2b213cb461a..feb8717745b 100644 --- a/gcc/cpplib.c +++ b/gcc/cpplib.c @@ -549,14 +549,14 @@ do_undef (cpp_reader *pfile) /* Undefine a single macro/assertion/whatever. 
*/ static int -undefine_macros (cpp_reader *pfile, cpp_hashnode *h, +undefine_macros (cpp_reader *pfile, cpp_hashnode *h, void *data_p ATTRIBUTE_UNUSED) { switch (h->type) { case NT_VOID: break; - + case NT_MACRO: if (pfile->cb.undef) (*pfile->cb.undef) (pfile, pfile->directive_line, h); @@ -855,7 +855,7 @@ do_linemarker (cpp_reader *pfile) cpp_string s = { 0, 0 }; if (_cpp_interpret_string_notranslate (pfile, &token->val.str, &s)) new_file = (const char *)s.text; - + new_sysp = 0; flag = read_flag (pfile, 0); if (flag == 1) @@ -1159,7 +1159,7 @@ do_pragma (cpp_reader *pfile) (*p->u.handler) (pfile); if (pfile->cb.line_change) (*pfile->cb.line_change) (pfile, pfile->cur_token, false); - + } else if (pfile->cb.def_pragma) { @@ -1925,6 +1925,7 @@ cpp_push_buffer (cpp_reader *pfile, const uchar *buffer, size_t len, int from_stage3) { cpp_buffer *new = xobnew (&pfile->buffer_ob, cpp_buffer); + const char *input = CPP_OPTION (pfile, input_charset); /* Clears, amongst other things, if_stack and mi_cmacro. */ memset (new, 0, sizeof (cpp_buffer)); @@ -1936,6 +1937,8 @@ cpp_push_buffer (cpp_reader *pfile, const uchar *buffer, size_t len, new->need_line = true; pfile->buffer = new; + _cpp_init_iconv_buffer (pfile, input); + return new; } @@ -1957,6 +1960,8 @@ _cpp_pop_buffer (cpp_reader *pfile) /* In case of a missing #endif. */ pfile->state.skipping = 0; + _cpp_close_iconv_buffer (pfile); + /* _cpp_do_file_change expects pfile->buffer to be the new one. */ pfile->buffer = buffer->prev; diff --git a/gcc/cpplib.h b/gcc/cpplib.h index 5f189245eb5..f7e12d200b7 100644 --- a/gcc/cpplib.h +++ b/gcc/cpplib.h @@ -332,6 +332,9 @@ struct cpp_options /* Holds the name of the target wide character set. */ const char *wide_charset; + /* Holds the name of the input character set. */ + const char *input_charset; + /* True to warn about precompiled header files we couldn't use. 
*/ bool warn_invalid_pch; @@ -417,7 +420,7 @@ struct cpp_dir /* Mapping of file names for this directory for MS-DOS and related platforms. A NULL-terminated array of (from, to) pairs. */ const char **name_map; - + /* The C front end uses these to recognize duplicated directories in the search path. */ ino_t ino; @@ -481,7 +484,7 @@ struct cpp_hashnode GTY(()) { struct ht_identifier ident; unsigned int is_directive : 1; - unsigned int directive_index : 7; /* If is_directive, + unsigned int directive_index : 7; /* If is_directive, then index into directive table. Otherwise, a NODE_OPERATOR. */ unsigned char rid_code; /* Rid code - for front ends. */