summary | refs | log | tree | commit | diff
path: root/libcpp
diff options
context:
space:
mode:
author: Lewis Hyatt <lhyatt@gmail.com> 2019-09-19 19:56:11 +0000
committer: Joseph Myers <jsm28@gcc.gnu.org> 2019-09-19 20:56:11 +0100
commit: 7d112d6670a0e0e662f8a7e64c33686e475832c8 (patch)
tree: 983eb23217b2572ff4fe5a7f7fe0e5c0c0b9a48d /libcpp
parent: e0710fcf7dc70054a9a20ab1b8d77f4fef26ef2c (diff)
Support extended characters in C/C++ identifiers (PR c/67224)

libcpp/ChangeLog

2019-09-19  Lewis Hyatt  <lhyatt@gmail.com>

    PR c/67224
    * charset.c (_cpp_valid_utf8): New function to help lex UTF-8 tokens.
    * internal.h (_cpp_valid_utf8): Declare.
    * lex.c (forms_identifier_p): Use it to recognize UTF-8 identifiers.
    (_cpp_lex_direct): Handle UTF-8 in identifiers and CPP_OTHER tokens.
    Do all work in "default" case to avoid slowing down typical code paths.
    Also handle $ and UCN in the default case for consistency.

gcc/ChangeLog

2019-09-19  Lewis Hyatt  <lhyatt@gmail.com>

    PR c/67224
    * doc/cpp.texi: Document support for extended characters in identifiers.
    * doc/cppopts.texi: Likewise.

gcc/testsuite/ChangeLog

2019-09-19  Lewis Hyatt  <lhyatt@gmail.com>

    PR c/67224
    * c-c++-common/cpp/ucnid-2011-1-utf8.c: New test.
    * g++.dg/cpp/ucnid-1-utf8.C: New test.
    * g++.dg/cpp/ucnid-2-utf8.C: New test.
    * g++.dg/cpp/ucnid-3-utf8.C: New test.
    * g++.dg/cpp/ucnid-4-utf8.C: New test.
    * g++.dg/other/ucnid-1-utf8.C: New test.
    * gcc.dg/cpp/ucnid-1-utf8.c: New test.
    * gcc.dg/cpp/ucnid-10-utf8.c: New test.
    * gcc.dg/cpp/ucnid-11-utf8.c: New test.
    * gcc.dg/cpp/ucnid-12-utf8.c: New test.
    * gcc.dg/cpp/ucnid-13-utf8.c: New test.
    * gcc.dg/cpp/ucnid-14-utf8.c: New test.
    * gcc.dg/cpp/ucnid-15-utf8.c: New test.
    * gcc.dg/cpp/ucnid-2-utf8.c: New test.
    * gcc.dg/cpp/ucnid-3-utf8.c: New test.
    * gcc.dg/cpp/ucnid-4-utf8.c: New test.
    * gcc.dg/cpp/ucnid-6-utf8.c: New test.
    * gcc.dg/cpp/ucnid-7-utf8.c: New test.
    * gcc.dg/cpp/ucnid-9-utf8.c: New test.
    * gcc.dg/ucnid-1-utf8.c: New test.
    * gcc.dg/ucnid-10-utf8.c: New test.
    * gcc.dg/ucnid-11-utf8.c: New test.
    * gcc.dg/ucnid-12-utf8.c: New test.
    * gcc.dg/ucnid-13-utf8.c: New test.
    * gcc.dg/ucnid-14-utf8.c: New test.
    * gcc.dg/ucnid-15-utf8.c: New test.
    * gcc.dg/ucnid-16-utf8.c: New test.
    * gcc.dg/ucnid-2-utf8.c: New test.
    * gcc.dg/ucnid-3-utf8.c: New test.
    * gcc.dg/ucnid-4-utf8.c: New test.
    * gcc.dg/ucnid-5-utf8.c: New test.
    * gcc.dg/ucnid-6-utf8.c: New test.
    * gcc.dg/ucnid-7-utf8.c: New test.
    * gcc.dg/ucnid-8-utf8.c: New test.
    * gcc.dg/ucnid-9-utf8.c: New test.

From-SVN: r275979
Diffstat (limited to 'libcpp')
-rw-r--r--  libcpp/ChangeLog   10
-rw-r--r--  libcpp/charset.c   83
-rw-r--r--  libcpp/internal.h   8
-rw-r--r--  libcpp/lex.c       55
4 files changed, 136 insertions, 20 deletions
diff --git a/libcpp/ChangeLog b/libcpp/ChangeLog
index 1ec8541a54c..0c851952b55 100644
--- a/libcpp/ChangeLog
+++ b/libcpp/ChangeLog
@@ -1,3 +1,13 @@
+2019-09-19 Lewis Hyatt <lhyatt@gmail.com>
+
+ PR c/67224
+ * charset.c (_cpp_valid_utf8): New function to help lex UTF-8 tokens.
+ * internal.h (_cpp_valid_utf8): Declare.
+ * lex.c (forms_identifier_p): Use it to recognize UTF-8 identifiers.
+ (_cpp_lex_direct): Handle UTF-8 in identifiers and CPP_OTHER tokens.
+ Do all work in "default" case to avoid slowing down typical code paths.
+ Also handle $ and UCN in the default case for consistency.
+
2019-08-30 Nathan Sidwell <nathan@acm.org>
New # semantics for popping to "" name.
diff --git a/libcpp/charset.c b/libcpp/charset.c
index 8a0e5cbb29b..10286219bd6 100644
--- a/libcpp/charset.c
+++ b/libcpp/charset.c
@@ -1198,6 +1198,84 @@ convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
return from;
}
+/* Performs a similar task as _cpp_valid_ucn, but parses UTF-8-encoded
+ extended characters rather than UCNs. If the return value is TRUE, then a
+ character was successfully decoded and stored in *CP; *PSTR has been
+ updated to point one past the valid UTF-8 sequence. Diagnostics may have
+ been emitted if the character parsed is not allowed in the current context.
+ If the return value is FALSE, then *PSTR has not been modified and *CP may
+ equal 0, to indicate that *PSTR does not form a valid UTF-8 sequence, or it
+ may, when processing an identifier in C mode, equal a codepoint that was
+ validly encoded but is not allowed to appear in an identifier. In either
+ case, no diagnostic is emitted, and the return value of FALSE should cause
+ a new token to be formed.
+
+ Unlike _cpp_valid_ucn, this will never be called when lexing a string; only
+ a potential identifier, or a CPP_OTHER token. NST is unused in the latter
+ case.
+
+ As in _cpp_valid_ucn, IDENTIFIER_POS is 0 when not in an identifier, 1 for
+ the start of an identifier, or 2 otherwise. */
+
+extern bool
+_cpp_valid_utf8 (cpp_reader *pfile,
+ const uchar **pstr,
+ const uchar *limit,
+ int identifier_pos,
+ struct normalize_state *nst,
+ cppchar_t *cp)
+{
+ const uchar *base = *pstr;
+ size_t inbytesleft = limit - base;
+ if (one_utf8_to_cppchar (pstr, &inbytesleft, cp))
+ {
+ /* No diagnostic here as this byte will rather become a
+ new token. */
+ *cp = 0;
+ return false;
+ }
+
+ if (identifier_pos)
+ {
+ switch (ucn_valid_in_identifier (pfile, *cp, nst))
+ {
+
+ case 0:
+ /* In C++, this is an error for invalid character in an identifier
+ because logically, the UTF-8 was converted to a UCN during
+ translation phase 1 (even though we don't physically do it that
+ way). In C, this byte rather becomes grammatically a separate
+ token. */
+
+ if (CPP_OPTION (pfile, cplusplus))
+ cpp_error (pfile, CPP_DL_ERROR,
+ "extended character %.*s is not valid in an identifier",
+ (int) (*pstr - base), base);
+ else
+ {
+ *pstr = base;
+ return false;
+ }
+
+ break;
+
+ case 2:
+ if (identifier_pos == 1)
+ {
+ /* This is treated the same way in C++ or C99 -- lexed as an
+ identifier which is then invalid because an identifier is
+ not allowed to start with this character. */
+ cpp_error (pfile, CPP_DL_ERROR,
+ "extended character %.*s is not valid at the start of an identifier",
+ (int) (*pstr - base), base);
+ }
+ break;
+ }
+ }
+
+ return true;
+}
+
/* Subroutine of convert_hex and convert_oct. N is the representation
in the execution character set of a numeric escape; write it into the
string buffer TBUF and update the end-of-string pointer therein. WIDE
@@ -1956,8 +2034,9 @@ cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
}
/* Convert an identifier denoted by ID and LEN, which might contain
- UCN escapes, to the source character set, either UTF-8 or
- UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */
+ UCN escapes or UTF-8 multibyte chars, to the source character set,
+ either UTF-8 or UTF-EBCDIC. Assumes that the identifier is actually
+ a valid identifier. */
cpp_hashnode *
_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
{
diff --git a/libcpp/internal.h b/libcpp/internal.h
index f9bcd37c571..90263bde47d 100644
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@@ -791,6 +791,14 @@ extern bool _cpp_valid_ucn (cpp_reader *, const unsigned char **,
cppchar_t *,
source_range *char_range,
cpp_string_location_reader *loc_reader);
+
+extern bool _cpp_valid_utf8 (cpp_reader *pfile,
+ const uchar **pstr,
+ const uchar *limit,
+ int identifier_pos,
+ struct normalize_state *nst,
+ cppchar_t *cp);
+
extern void _cpp_destroy_iconv (cpp_reader *);
extern unsigned char *_cpp_convert_input (cpp_reader *, const char *,
unsigned char *, size_t, size_t,
diff --git a/libcpp/lex.c b/libcpp/lex.c
index 52e5bceb3ff..0e8de3807b3 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -1313,7 +1313,9 @@ warn_about_normalization (cpp_reader *pfile,
}
}
-/* Returns TRUE if the sequence starting at buffer->cur is invalid in
+static const cppchar_t utf8_signifier = 0xC0;
+
+/* Returns TRUE if the sequence starting at buffer->cur is valid in
an identifier. FIRST is TRUE if this starts an identifier. */
static bool
forms_identifier_p (cpp_reader *pfile, int first,
@@ -1336,17 +1338,25 @@ forms_identifier_p (cpp_reader *pfile, int first,
return true;
}
- /* Is this a syntactically valid UCN? */
- if (CPP_OPTION (pfile, extended_identifiers)
- && *buffer->cur == '\\'
- && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
+ /* Is this a syntactically valid UCN or a valid UTF-8 char? */
+ if (CPP_OPTION (pfile, extended_identifiers))
{
cppchar_t s;
- buffer->cur += 2;
- if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
- state, &s, NULL, NULL))
- return true;
- buffer->cur -= 2;
+ if (*buffer->cur >= utf8_signifier)
+ {
+ if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
+ state, &s))
+ return true;
+ }
+ else if (*buffer->cur == '\\'
+ && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
+ {
+ buffer->cur += 2;
+ if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
+ state, &s, NULL, NULL))
+ return true;
+ buffer->cur -= 2;
+ }
}
return false;
@@ -1464,7 +1474,8 @@ lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
pfile->buffer->cur = cur;
if (starts_ucn || forms_identifier_p (pfile, false, nst))
{
- /* Slower version for identifiers containing UCNs (or $). */
+ /* Slower version for identifiers containing UCNs
+ or extended chars (including $). */
do {
while (ISIDNUM (*pfile->buffer->cur))
{
@@ -3123,12 +3134,12 @@ _cpp_lex_direct (cpp_reader *pfile)
/* @ is a punctuator in Objective-C. */
case '@': result->type = CPP_ATSIGN; break;
- case '$':
- case '\\':
+ default:
{
const uchar *base = --buffer->cur;
- struct normalize_state nst = INITIAL_NORMALIZE_STATE;
+ /* Check for an extended identifier ($ or UCN or UTF-8). */
+ struct normalize_state nst = INITIAL_NORMALIZE_STATE;
if (forms_identifier_p (pfile, true, &nst))
{
result->type = CPP_NAME;
@@ -3137,13 +3148,21 @@ _cpp_lex_direct (cpp_reader *pfile)
warn_about_normalization (pfile, result, &nst);
break;
}
+
+ /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
+ single token. */
buffer->cur++;
+ if (c >= utf8_signifier)
+ {
+ const uchar *pstr = base;
+ cppchar_t s;
+ if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
+ buffer->cur = pstr;
+ }
+ create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
+ break;
}
- /* FALLTHRU */
- default:
- create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
- break;
}
/* Potentially convert the location of the token to a range. */