* check-internal-format-escaping.py: New version using polib.

From-SVN: r270704
author: Roland Illig <roland.illig@gmx.de> 2019-04-30 16:14:40 +0000
committer: Jeff Law <law@gcc.gnu.org> 2019-04-30 10:14:40 -0600
commit: aaae53ce0201febb4f9545c7d0e8068aa9fe6090 (patch)
tree: 4f83aba1b018dd5036e5ff36c0680affc0f90889 /contrib
parent: 7df942516727f9043403090ad0e3319afbc11adc (diff)
2 files changed, 249 insertions, 47 deletions
diff --git a/contrib/ChangeLog b/contrib/ChangeLog
index db3eb2caa82..835c5c4c618 100644
--- a/contrib/ChangeLog
+++ b/contrib/ChangeLog
@@ -1,3 +1,7 @@
+2019-04-30  Roland Illig  <roland.illig@gmx.de>
+
+	* check-internal-format-escaping.py: New version using polib.
+
 2019-04-19  Christophe Lyon  <christophe.lyon@linaro.org>
 
 	PR translation/90118
diff --git a/contrib/check-internal-format-escaping.py b/contrib/check-internal-format-escaping.py
index 9c625868012..e06752666b8 100755
--- a/contrib/check-internal-format-escaping.py
+++ b/contrib/check-internal-format-escaping.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python3
 #
-# Check gcc.pot file for gcc-internal-format and print all strings
-# that contain an option that is not wrapped by %<-option_name%>.
+# Check gcc.pot file for stylistic issues as described in
+# https://gcc.gnu.org/onlinedocs/gccint/Guidelines-for-Diagnostics.html,
+# especially in gcc-internal-format messages.
 #
 # This file is part of GCC.
 #
@@ -17,52 +18,249 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with GCC; see the file COPYING3.  If not see
-# <http://www.gnu.org/licenses/>.  */
-#
-#
-#
+# <http://www.gnu.org/licenses/>.
 
 import argparse
 import re
+from collections import Counter
+from typing import Dict, Match
+
+import polib
+
+seen_warnings = Counter()
+
+
+def location(msg: polib.POEntry) -> str:  # where msg comes from, for output
+    if msg.occurrences:
+        occ = msg.occurrences[0]  # first entry only; presumably (file, line)
+        return f'{occ[0]}:{occ[1]}'
+    return '<unknown location>'
+
+
+def warn(msg: polib.POEntry,
+         diagnostic_id: str, diagnostic: str, include_msgid=True):
+    """
+    Prints a diagnostic for msg and counts it in seen_warnings. To suppress
+    it for a message, add a line "#, gcclint:ignore:{diagnostic_id}" to it.
+    """
+
+    if f'gcclint:ignore:{diagnostic_id}' in msg.flags:
+        return
+
+    seen_warnings[diagnostic] += 1  # keyed by text, not by diagnostic_id
+
+    if include_msgid:
+        print(f'{location(msg)}: {diagnostic} in {repr(msg.msgid)}')
+    else:
+        print(f'{location(msg)}: {diagnostic}')
+
+
+def lint_gcc_internal_format(msg: polib.POEntry):
+    """
+    Checks a single message that has the gcc-internal-format. These
+    messages use a variety of placeholders like %qs, %<quotes%> and
+    %q#E.
+    """
+
+    msgid: str = msg.msgid
+
+    def outside_quotes(m: Match[str]) -> bool:
+        before = msgid[:m.start(0)]  # text up to where the match begins
+        return before.count("%<") == before.count("%>")  # every %< closed
+
+    def lint_matching_placeholders():
+        """
+        Warns when literal values in placeholders are not exactly equal
+        in the translation. This can happen when doing copy-and-paste
+        translations of similar messages.
+
+        To avoid these mismatches in the first place,
+        structurally equal messages are found by
+        lint_diagnostics_differing_only_in_placeholders.
+
+        This check only applies when checking a finished translation
+        such as de.po, not gcc.pot.
+        """
+
+        if not msg.translated():
+            return
+
+        in_msgid = re.findall('%<[^%]+%>', msgid)  # quoted text without any %
+        in_msgstr = re.findall('%<[^%]+%>', msg.msgstr)
+
+        if set(in_msgid) != set(in_msgstr):  # order and repeats are ignored
+            warn(msg,
+                 'placeholder-mismatch',
+                 f'placeholder mismatch: msgid has {in_msgid}, '
+                 f'msgstr has {in_msgstr}',
+                 include_msgid=False)
+
+    def lint_option_outside_quotes():  # -option / __builtin_* outside %<...%>
+        for match in re.finditer(r'\S+', msgid):  # whitespace-separated words
+            part = match.group()
+            if not outside_quotes(match):
+                continue
+
+            if part.startswith('-'):
+                if len(part) >= 2 and part[1].isalpha():
+                    if part == '-INF':  # exempted; presumably means infinity
+                        continue
+
+                    warn(msg,
+                         'option-outside-quotes',
+                         'command line option outside %<quotes%>')
+
+            if part.startswith('__builtin_'):
+                warn(msg,
+                     'builtin-outside-quotes',
+                     'builtin function outside %<quotes%>')
+
+    def lint_plain_apostrophe():  # an apostrophe at index 0 is never matched
+        for match in re.finditer("[^%]'", msgid):  # ' not preceded by %
+            if outside_quotes(match):
+                warn(msg, 'apostrophe', 'apostrophe without leading %')
+
+    def lint_space_before_quote():
+        """
+        A missing space before %< is often the result of string literals
+        that are joined by the C compiler when neither literal has a space
+        to separate the words.
+        """
+
+        for match in re.finditer("(.?[a-zA-Z0-9])%<", msgid):
+            if match.group(1) != '%s':  # %s%< is a placeholder, not a word
+                warn(msg,
+                     'no-space-before-quote',
+                     '%< directly following a letter or digit')
+
+    def lint_underscore_outside_quotes():
+        """
+        An underscore outside of quotes is used in several contexts,
+        and many of them violate the GCC Guidelines for Diagnostics:
+
+        * names of GCC-internal compiler functions
+        * names of GCC-internal data structures
+        * static_cast and the like (which are legitimate)
+        """
+
+        for match in re.finditer("_", msgid):
+            if outside_quotes(match):
+                warn(msg,
+                     'underscore-outside-quotes',
+                     'underscore outside of %<quotes%>')
+                return  # at most one warning per message
+
+    def lint_may_not():
+        """
+        The term "may not" may either mean "it could be the case"
+        or "should not". These two different meanings are sometimes
+        hard to tell apart.
+        """
+
+        if re.search(r'\bmay not\b', msgid):  # checks the msgid only
+            warn(msg,
+                 'ambiguous-may-not',
+                 'the term "may not" is ambiguous')
+
+    def lint_unbalanced_quotes():  # compares counts only, not nesting order
+        if msgid.count("%<") != msgid.count("%>"):
+            warn(msg,
+                 'unbalanced-quotes',
+                 'unbalanced %< and %> quotes')
+
+        if msg.translated():  # same check, same text, for the translation
+            if msg.msgstr.count("%<") != msg.msgstr.count("%>"):
+                warn(msg,
+                     'unbalanced-quotes',
+                     'unbalanced %< and %> quotes')
+
+    def lint_single_space_after_sentence():
+        """
+        After a sentence there should be two spaces.
+        """
+
+        if re.search(r'[.] [A-Z]', msgid):  # period, one space, capital
+            warn(msg,
+                 'single-space-after-sentence',
+                 'single space after sentence')
+
+    def lint_non_canonical_quotes():
+        """
+        Catches %<%s%>, '%s', "%s" and `%s', all better written as %qs.
+        """
+        match = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid)
+        if match:  # only the first hit in the message is reported
+            warn(msg,
+                 'non-canonical-quotes',
+                 f'placeholder {match.group()} should be written as %qs')
+
+    lint_option_outside_quotes()
+    lint_plain_apostrophe()
+    lint_space_before_quote()
+    lint_underscore_outside_quotes()
+    lint_may_not()
+    lint_unbalanced_quotes()
+    lint_matching_placeholders()
+    lint_single_space_after_sentence()
+    lint_non_canonical_quotes()
+
+
+def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile):
+    """
+    Detects messages that are structurally the same, except that they
+    use different plain strings inside %<quotes%>. These messages can
+    be merged in order to prevent copy-and-paste mistakes by the
+    translators.
+
+    See bug 90119.
+    """
+
+    seen: Dict[str, polib.POEntry] = {}  # pattern -> first entry using it
+
+    for msg in po:  # every entry; no obsolete/fuzzy/flag filter here
+        msg: polib.POEntry
+        msgid = msg.msgid
+
+        normalized = re.sub('%<[^%]+%>', '%qs', msgid)  # literals become %qs
+        if normalized not in seen:
+            seen[normalized] = msg
+            seen[msgid] = msg  # the unmodified text is registered as well
+            continue
+
+        prev = seen[normalized]
+        warn(msg,
+             'same-pattern',
+             f'same pattern for {repr(msgid)} and '
+             f'{repr(prev.msgid)} in {location(prev)}',
+             include_msgid=False)
+
+
+def lint_file(po: polib.POFile) -> None:  # runs every check on one PO file
+    for msg in po:
+        msg: polib.POEntry
+
+        if not msg.obsolete and not msg.fuzzy:  # skip stale/unsure entries
+            if 'gcc-internal-format' in msg.flags:
+                lint_gcc_internal_format(msg)
+
+    lint_diagnostics_differing_only_in_placeholders(po)  # whole-file check
+
+
+def main():  # lints the file named on the command line, then summarizes
+    parser = argparse.ArgumentParser(description='')
+    parser.add_argument('file', help='pot file')
+
+    args = parser.parse_args()
+
+    po = polib.pofile(args.file)
+    lint_file(po)
+
+    print()
+    print('summary:')
+    for entry in seen_warnings.most_common():  # (text, count), highest first
+        if entry[1] > 1:  # diagnostics seen only once are not listed
+            print(f'{entry[1]}\t{entry[0]}')
+
 
-parser = argparse.ArgumentParser(description='')
-parser.add_argument('file', help = 'pot file')
-
-args = parser.parse_args()
-
-origin = None
-internal = False
-
-lines = open(args.file).readlines()
-for i, l in enumerate(lines):
-    l = l.strip()
-    s = 'msgid '
-    if l.startswith('#: '):
-        origin = l
-    elif '#, gcc-internal-format' in l:
-        internal = True
-    if l.startswith(s) and origin and internal:
-        j = 0
-        while not lines[i + j].startswith('msgstr'):
-            l = lines[i + j]
-            if l.startswith(s):
-                l = l[len(s):]
-            text = l.strip('"').strip()
-            if text:
-                parts = text.split(' ')
-                for p in parts:
-                    if p.startswith('-'):
-                        if len(p) >= 2 and (p[1].isalpha() and p != '-INF'):
-                            print('%s: %s' % (origin, text))
-                    elif p.startswith('__builtin_'):
-                        print('%s: %s' % (origin, text))
-                    if re.search("[^%]'", p):
-                        print('%s: %s' % (origin, text))
-                    # %< should not be preceded by a non-punctuation
-                    # %character.
-                    if re.search("[a-zA-Z0-9]%<", p):
-                        print('%s: %s' % (origin, text))
-            j += 1
-
-        origin = None
-        internal = False
+if __name__ == '__main__':
+    main()
author	Roland Illig <roland.illig@gmx.de>	2019-04-30 16:14:40 +0000
committer	Jeff Law <law@gcc.gnu.org>	2019-04-30 10:14:40 -0600
commit	aaae53ce0201febb4f9545c7d0e8068aa9fe6090 (patch)
tree	4f83aba1b018dd5036e5ff36c0680affc0f90889 /contrib
parent	7df942516727f9043403090ad0e3319afbc11adc (diff)