re PR preprocessor/33415 (Can't compile .cpp file with UTF-8 BOM.)

libcpp PR libcpp/33415: * charset.c (_cpp_convert_input): Add buffer_start argument. Ignore UTF-8 BOM if seen. * internal.h (_cpp_convert_input): Add argument. * files.c (struct _cpp_file) <buffer_start>: New field. (destroy_cpp_file): Free buffer_start, not buffer. (_cpp_pop_file_buffer): Likewise. (read_file_guts): Update. gcc/testsuite PR libcpp/33415: * gcc.dg/cpp/pr33415.c: New file. From-SVN: r134507
author: Tom Tromey <tromey@redhat.com> 2008-04-21 14:02:00 +0000
committer: Tom Tromey <tromey@gcc.gnu.org> 2008-04-21 14:02:00 +0000
commit: 688e7a53446776e4d7b49472b06fec29ea69ff17 (patch)
tree: 95c70d6534e91125519d9a8f2e7c2cf84de0c82b /libcpp/charset.c
parent: 009890be6c6e3f34630be0f086303d42c6aa867b (diff)
1 files changed, 31 insertions, 9 deletions
diff --git a/libcpp/charset.c b/libcpp/charset.c
index 225cdb4915e..d70d05cc020 100644
--- a/libcpp/charset.c
+++ b/libcpp/charset.c
@@ -1,5 +1,5 @@
 /* CPP Library - charsets
-   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006
+   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, 2008
    Free Software Foundation, Inc.
 
    Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
@@ -1637,18 +1637,24 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
    source file) from INPUT_CHARSET to the source character set.  INPUT
    points to the input buffer, SIZE is its allocated size, and LEN is
    the length of the meaningful data within the buffer.  The
-   translated buffer is returned, and *ST_SIZE is set to the length of
-   the meaningful data within the translated buffer.
-
-   INPUT is expected to have been allocated with xmalloc.  This function
-   will either return INPUT, or free it and return a pointer to another
-   xmalloc-allocated block of memory.  */
+   translated buffer is returned, *ST_SIZE is set to the length of
+   the meaningful data within the translated buffer, and *BUFFER_START
+   is set to the start of the returned buffer.  *BUFFER_START may
+   differ from the return value in the case of a BOM or other ignored
+   marker information.
+
+   INPUT is expected to have been allocated with xmalloc.  This
+   function will either set *BUFFER_START to INPUT, or free it and set
+   *BUFFER_START to a pointer to another xmalloc-allocated block of
+   memory.  */
 uchar * 
 _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
-		    uchar *input, size_t size, size_t len, off_t *st_size)
+		    uchar *input, size_t size, size_t len,
+		    const unsigned char **buffer_start, off_t *st_size)
 {
   struct cset_converter input_cset;
   struct _cpp_strbuf to;
+  unsigned char *buffer;
 
   input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
   if (input_cset.func == convert_no_conversion)
@@ -1689,8 +1695,24 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
   else
     to.text[to.len] = '\n';
 
+  buffer = to.text;
   *st_size = to.len;
-  return to.text;
+#if HOST_CHARSET == HOST_CHARSET_ASCII
+  /* The HOST_CHARSET test just above ensures that the source charset
+     is UTF-8.  So, ignore a UTF-8 BOM if we see one.  Note that
+     glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
+     BOM -- however, even if it did, we would still need this code due
+     to the 'convert_no_conversion' case.  */
+  if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb
+      && to.text[2] == 0xbf)
+    {
+      *st_size -= 3;
+      buffer += 3;
+    }
+#endif
+
+  *buffer_start = to.text;
+  return buffer;
 }
 
 /* Decide on the default encoding to assume for input files.  */
author	Tom Tromey <tromey@redhat.com>	2008-04-21 14:02:00 +0000
committer	Tom Tromey <tromey@gcc.gnu.org>	2008-04-21 14:02:00 +0000
commit	688e7a53446776e4d7b49472b06fec29ea69ff17 (patch)
tree	95c70d6534e91125519d9a8f2e7c2cf84de0c82b /libcpp/charset.c
parent	009890be6c6e3f34630be0f086303d42c6aa867b (diff)