summaryrefslogtreecommitdiff
path: root/contrib
diff options
context:
space:
mode:
authorRoland Illig <roland.illig@gmx.de>2019-04-30 16:14:40 +0000
committerJeff Law <law@gcc.gnu.org>2019-04-30 10:14:40 -0600
commitaaae53ce0201febb4f9545c7d0e8068aa9fe6090 (patch)
tree4f83aba1b018dd5036e5ff36c0680affc0f90889 /contrib
parent7df942516727f9043403090ad0e3319afbc11adc (diff)
* check-internal-format-escaping.py: New version using polib.
From-SVN: r270704
Diffstat (limited to 'contrib')
-rw-r--r--contrib/ChangeLog4
-rwxr-xr-xcontrib/check-internal-format-escaping.py292
2 files changed, 249 insertions, 47 deletions
diff --git a/contrib/ChangeLog b/contrib/ChangeLog
index db3eb2caa82..835c5c4c618 100644
--- a/contrib/ChangeLog
+++ b/contrib/ChangeLog
@@ -1,3 +1,7 @@
+2019-04-30 Roland Illig <roland.illig@gmx.de>
+
+ * check-internal-format-escaping.py: New version using polib.
+
2019-04-19 Christophe Lyon <christophe.lyon@linaro.org>
PR translation/90118
diff --git a/contrib/check-internal-format-escaping.py b/contrib/check-internal-format-escaping.py
index 9c625868012..e06752666b8 100755
--- a/contrib/check-internal-format-escaping.py
+++ b/contrib/check-internal-format-escaping.py
@@ -1,7 +1,8 @@
#!/usr/bin/env python3
#
-# Check gcc.pot file for gcc-internal-format and print all strings
-# that contain an option that is not wrapped by %<-option_name%>.
+# Check gcc.pot file for stylistic issues as described in
+# https://gcc.gnu.org/onlinedocs/gccint/Guidelines-for-Diagnostics.html,
+# especially in gcc-internal-format messages.
#
# This file is part of GCC.
#
@@ -17,52 +18,249 @@
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3. If not see
-# <http://www.gnu.org/licenses/>. */
-#
-#
-#
+# <http://www.gnu.org/licenses/>.
import argparse
import re
+from collections import Counter
+from typing import Dict, Match
+
+import polib
+
+seen_warnings = Counter()
+
+
+def location(msg: polib.POEntry):
+ if msg.occurrences:
+ occ = msg.occurrences[0]
+ return f'{occ[0]}:{occ[1]}'
+ return '<unknown location>'
+
+
+def warn(msg: polib.POEntry,
+ diagnostic_id: str, diagnostic: str, include_msgid=True):
+ """
+ To suppress a warning for a particular message,
+ add a line "#, gcclint:ignore:{diagnostic_id}" to the message.
+ """
+
+ if f'gcclint:ignore:{diagnostic_id}' in msg.flags:
+ return
+
+ seen_warnings[diagnostic] += 1
+
+ if include_msgid:
+ print(f'{location(msg)}: {diagnostic} in {repr(msg.msgid)}')
+ else:
+ print(f'{location(msg)}: {diagnostic}')
+
+
+def lint_gcc_internal_format(msg: polib.POEntry):
+ """
+ Checks a single message that has the gcc-internal-format. These
+ messages use a variety of placeholders like %qs, %<quotes%> and
+ %q#E.
+ """
+
+ msgid: str = msg.msgid
+
+ def outside_quotes(m: Match[str]):
+ before = msgid[:m.start(0)]
+ return before.count("%<") == before.count("%>")
+
+ def lint_matching_placeholders():
+ """
+ Warns when literal values in placeholders are not exactly equal
+ in the translation. This can happen when doing copy-and-paste
+ translations of similar messages.
+
+ To avoid these mismatches in the first place,
+ structurally equal messages are found by
+ lint_diagnostics_differing_only_in_placeholders.
+
+ This check only applies when checking a finished translation
+ such as de.po, not gcc.pot.
+ """
+
+ if not msg.translated():
+ return
+
+ in_msgid = re.findall('%<[^%]+%>', msgid)
+ in_msgstr = re.findall('%<[^%]+%>', msg.msgstr)
+
+ if set(in_msgid) != set(in_msgstr):
+ warn(msg,
+ 'placeholder-mismatch',
+ f'placeholder mismatch: msgid has {in_msgid}, '
+ f'msgstr has {in_msgstr}',
+ include_msgid=False)
+
+ def lint_option_outside_quotes():
+ for match in re.finditer(r'\S+', msgid):
+ part = match.group()
+ if not outside_quotes(match):
+ continue
+
+ if part.startswith('-'):
+ if len(part) >= 2 and part[1].isalpha():
+ if part == '-INF':
+ continue
+
+ warn(msg,
+ 'option-outside-quotes',
+ 'command line option outside %<quotes%>')
+
+ if part.startswith('__builtin_'):
+ warn(msg,
+ 'builtin-outside-quotes',
+ 'builtin function outside %<quotes%>')
+
+ def lint_plain_apostrophe():
+ for match in re.finditer("[^%]'", msgid):
+ if outside_quotes(match):
+ warn(msg, 'apostrophe', 'apostrophe without leading %')
+
+ def lint_space_before_quote():
+ """
+ A space before %< is often the result of string literals that
+ are joined by the C compiler and neither literal has a space
+ to separate the words.
+ """
+
+ for match in re.finditer("(.?[a-zA-Z0-9])%<", msgid):
+ if match.group(1) != '%s':
+ warn(msg,
+ 'no-space-before-quote',
+ '%< directly following a letter or digit')
+
+ def lint_underscore_outside_quotes():
+ """
+ An underscore outside of quotes is used in several contexts,
+ and many of them violate the GCC Guidelines for Diagnostics:
+
+ * names of GCC-internal compiler functions
+ * names of GCC-internal data structures
+ * static_cast and the like (which are legitimate)
+ """
+
+ for match in re.finditer("_", msgid):
+ if outside_quotes(match):
+ warn(msg,
+ 'underscore-outside-quotes',
+ 'underscore outside of %<quotes%>')
+ return
+
+ def lint_may_not():
+ """
+ The term "may not" may either mean "it could be the case"
+ or "should not". These two different meanings are sometimes
+ hard to tell apart.
+ """
+
+ if re.search(r'\bmay not\b', msgid):
+ warn(msg,
+ 'ambiguous-may-not',
+ 'the term "may not" is ambiguous')
+
+ def lint_unbalanced_quotes():
+ if msgid.count("%<") != msgid.count("%>"):
+ warn(msg,
+ 'unbalanced-quotes',
+ 'unbalanced %< and %> quotes')
+
+ if msg.translated():
+ if msg.msgstr.count("%<") != msg.msgstr.count("%>"):
+ warn(msg,
+ 'unbalanced-quotes',
+ 'unbalanced %< and %> quotes')
+
+ def lint_single_space_after_sentence():
+ """
+ After a sentence there should be two spaces.
+ """
+
+ if re.search(r'[.] [A-Z]', msgid):
+ warn(msg,
+ 'single-space-after-sentence',
+ 'single space after sentence')
+
+ def lint_non_canonical_quotes():
+ """
+ Catches %<%s%>, which can be written in the shorter form %qs.
+ """
+ match = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid)
+ if match:
+ warn(msg,
+ 'non-canonical-quotes',
+ f'placeholder {match.group()} should be written as %qs')
+
+ lint_option_outside_quotes()
+ lint_plain_apostrophe()
+ lint_space_before_quote()
+ lint_underscore_outside_quotes()
+ lint_may_not()
+ lint_unbalanced_quotes()
+ lint_matching_placeholders()
+ lint_single_space_after_sentence()
+ lint_non_canonical_quotes()
+
+
+def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile):
+ """
+ Detects messages that are structurally the same, except that they
+ use different plain strings inside %<quotes%>. These messages can
+ be merged in order to prevent copy-and-paste mistakes by the
+ translators.
+
+ See bug 90119.
+ """
+
+ seen: Dict[str, polib.POEntry] = {}
+
+ for msg in po:
+ msg: polib.POEntry
+ msgid = msg.msgid
+
+ normalized = re.sub('%<[^%]+%>', '%qs', msgid)
+ if normalized not in seen:
+ seen[normalized] = msg
+ seen[msgid] = msg
+ continue
+
+ prev = seen[normalized]
+ warn(msg,
+ 'same-pattern',
+ f'same pattern for {repr(msgid)} and '
+ f'{repr(prev.msgid)} in {location(prev)}',
+ include_msgid=False)
+
+
+def lint_file(po: polib.POFile):
+ for msg in po:
+ msg: polib.POEntry
+
+ if not msg.obsolete and not msg.fuzzy:
+ if 'gcc-internal-format' in msg.flags:
+ lint_gcc_internal_format(msg)
+
+ lint_diagnostics_differing_only_in_placeholders(po)
+
+
+def main():
+ parser = argparse.ArgumentParser(description='')
+ parser.add_argument('file', help='pot file')
+
+ args = parser.parse_args()
+
+ po = polib.pofile(args.file)
+ lint_file(po)
+
+ print()
+ print('summary:')
+ for entry in seen_warnings.most_common():
+ if entry[1] > 1:
+ print(f'{entry[1]}\t{entry[0]}')
+
-parser = argparse.ArgumentParser(description='')
-parser.add_argument('file', help = 'pot file')
-
-args = parser.parse_args()
-
-origin = None
-internal = False
-
-lines = open(args.file).readlines()
-for i, l in enumerate(lines):
- l = l.strip()
- s = 'msgid '
- if l.startswith('#: '):
- origin = l
- elif '#, gcc-internal-format' in l:
- internal = True
- if l.startswith(s) and origin and internal:
- j = 0
- while not lines[i + j].startswith('msgstr'):
- l = lines[i + j]
- if l.startswith(s):
- l = l[len(s):]
- text = l.strip('"').strip()
- if text:
- parts = text.split(' ')
- for p in parts:
- if p.startswith('-'):
- if len(p) >= 2 and (p[1].isalpha() and p != '-INF'):
- print('%s: %s' % (origin, text))
- elif p.startswith('__builtin_'):
- print('%s: %s' % (origin, text))
- if re.search("[^%]'", p):
- print('%s: %s' % (origin, text))
- # %< should not be preceded by a non-punctuation
- # %character.
- if re.search("[a-zA-Z0-9]%<", p):
- print('%s: %s' % (origin, text))
- j += 1
-
- origin = None
- internal = False
+if __name__ == '__main__':
+ main()