mirror of
git://gcc.gnu.org/git/gcc.git
synced 2025-03-18 20:41:17 +08:00
* check-internal-format-escaping.py: New version using polib.
From-SVN: r270704
This commit is contained in:
parent
7df9425167
commit
aaae53ce02
@ -1,3 +1,7 @@
|
||||
2019-04-30 Roland Illig <roland.illig@gmx.de>
|
||||
|
||||
* check-internal-format-escaping.py: New version using polib.
|
||||
|
||||
2019-04-19 Christophe Lyon <christophe.lyon@linaro.org>
|
||||
|
||||
PR translation/90118
|
||||
|
@ -1,7 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
#
|
||||
# Check gcc.pot file for gcc-internal-format and print all strings
|
||||
# that contain an option that is not wrapped by %<-option_name%>.
|
||||
# Check gcc.pot file for stylistic issues as described in
|
||||
# https://gcc.gnu.org/onlinedocs/gccint/Guidelines-for-Diagnostics.html,
|
||||
# especially in gcc-internal-format messages.
|
||||
#
|
||||
# This file is part of GCC.
|
||||
#
|
||||
@ -17,52 +18,249 @@
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with GCC; see the file COPYING3. If not see
|
||||
# <http://www.gnu.org/licenses/>. */
|
||||
#
|
||||
#
|
||||
#
|
||||
# <http://www.gnu.org/licenses/>.
|
||||
|
||||
import argparse
|
||||
import re
|
||||
from collections import Counter
|
||||
from typing import Dict, Match
|
||||
|
||||
parser = argparse.ArgumentParser(description='')
|
||||
parser.add_argument('file', help = 'pot file')
|
||||
import polib
|
||||
|
||||
args = parser.parse_args()
|
||||
seen_warnings = Counter()
|
||||
|
||||
origin = None
|
||||
internal = False
|
||||
|
||||
# Scan the pot file line by line.  The whole file is read up front
# because a msgid may continue on the following lines, which are looked
# up by index below.  The "with" statement closes the file as soon as
# it has been read.
with open(args.file) as pot_file:
    lines = pot_file.readlines()

for i, l in enumerate(lines):
    l = l.strip()
    s = 'msgid '
    if l.startswith('#: '):
        # Remember the source reference for the messages that follow.
        origin = l
    elif '#, gcc-internal-format' in l:
        internal = True
    if l.startswith(s) and origin and internal:
        j = 0
        # Walk over the msgid line and its continuation lines up to the
        # msgstr line.  The bounds check keeps a file that ends in the
        # middle of a message from raising an IndexError.
        while (i + j < len(lines)
               and not lines[i + j].startswith('msgstr')):
            l = lines[i + j]
            if l.startswith(s):
                l = l[len(s):]
            text = l.strip('"').strip()
            if text:
                parts = text.split(' ')
                for p in parts:
                    if p.startswith('-'):
                        # Looks like a command line option; -INF is the
                        # one exception that is let through.
                        if len(p) >= 2 and (p[1].isalpha() and p != '-INF'):
                            print('%s: %s' % (origin, text))
                    elif p.startswith('__builtin_'):
                        print('%s: %s' % (origin, text))
                    # An apostrophe that is not preceded by a percent
                    # sign.
                    if re.search("[^%]'", p):
                        print('%s: %s' % (origin, text))
                    # %< should not be preceded by a non-punctuation
                    # character.
                    if re.search("[a-zA-Z0-9]%<", p):
                        print('%s: %s' % (origin, text))
            j += 1
|
||||
def location(msg: 'polib.POEntry'):
    """
    Returns the source location of the message, for use as the prefix
    of a diagnostic.

    Only the first entry of msg.occurrences is used.  It is formatted
    as "file:line", or as just "file" when the line part is empty.
    When the message has no occurrences at all, the placeholder
    '<unknown location>' is returned.
    """

    if not msg.occurrences:
        return '<unknown location>'

    occ = msg.occurrences[0]
    filename, lineno = occ[0], occ[1]

    # A reference without a line number (presumably stored with an
    # empty line part -- verify against polib) would otherwise be
    # printed with a dangling colon, as in "file.c:".
    if lineno is None or lineno == '':
        return f'{filename}'

    return f'{filename}:{lineno}'
|
||||
|
||||
origin = None
|
||||
internal = False
|
||||
|
||||
def warn(msg: polib.POEntry,
         diagnostic_id: str, diagnostic: str, include_msgid=True):
    """
    Counts one occurrence of the diagnostic in seen_warnings and
    prints it, prefixed with the location of the message.  The msgid
    is appended to the printed line unless include_msgid is false.

    To suppress a warning for a particular message,
    add a line "#, gcclint:ignore:{diagnostic_id}" to the message.
    A suppressed warning is neither counted nor printed.
    """

    suppressed = f'gcclint:ignore:{diagnostic_id}' in msg.flags
    if not suppressed:
        seen_warnings[diagnostic] += 1

        text = f'{location(msg)}: {diagnostic}'
        if include_msgid:
            text += f' in {repr(msg.msgid)}'
        print(text)
|
||||
|
||||
|
||||
def lint_gcc_internal_format(msg: polib.POEntry):
    """
    Checks a single message that has the gcc-internal-format.  These
    messages use a variety of placeholders like %qs, %<quotes%> and
    %q#E.

    Each check is a nested function that inspects the msgid (and, for
    some checks, the msgstr) and reports its findings through warn().
    All checks are run, in a fixed order, at the end of this function.
    """

    msgid: str = msg.msgid

    def outside_quotes(m: Match[str]):
        """
        Tells whether the match starts outside of %<quotes%>, which is
        assumed when the text before the match contains as many %< as
        %>.
        """

        before = msgid[:m.start(0)]
        return before.count("%<") == before.count("%>")

    def lint_matching_placeholders():
        """
        Warns when literal values in placeholders are not exactly equal
        in the translation.  This can happen when doing copy-and-paste
        translations of similar messages.

        To avoid these mismatches in the first place,
        structurally equal messages are found by
        lint_diagnostics_differing_only_in_placeholders.

        This check only applies when checking a finished translation
        such as de.po, not gcc.pot.
        """

        if not msg.translated():
            return

        in_msgid = re.findall('%<[^%]+%>', msgid)
        in_msgstr = re.findall('%<[^%]+%>', msg.msgstr)

        # Sets are compared, so neither the order nor the number of
        # repetitions of a placeholder matters.
        if set(in_msgid) != set(in_msgstr):
            warn(msg,
                 'placeholder-mismatch',
                 f'placeholder mismatch: msgid has {in_msgid}, '
                 f'msgstr has {in_msgstr}',
                 include_msgid=False)

    def lint_option_outside_quotes():
        """
        Warns about whitespace-separated words outside of %<quotes%>
        that look like a command line option (a dash followed by a
        letter, except for -INF) or that start with __builtin_.
        """

        for match in re.finditer(r'\S+', msgid):
            part = match.group()
            if not outside_quotes(match):
                continue

            if part.startswith('-'):
                if len(part) >= 2 and part[1].isalpha():
                    if part == '-INF':
                        continue

                    warn(msg,
                         'option-outside-quotes',
                         'command line option outside %<quotes%>')

            if part.startswith('__builtin_'):
                warn(msg,
                     'builtin-outside-quotes',
                     'builtin function outside %<quotes%>')

    def lint_plain_apostrophe():
        """
        Warns about each apostrophe outside of %<quotes%> that is not
        directly preceded by a percent sign.
        """

        # The lookbehind makes the match start at the apostrophe
        # itself.  Consuming the preceding character instead would
        # miss an apostrophe at the very beginning of the msgid, and
        # it would cut a directly preceding %< or %> in half so that
        # outside_quotes would not count it.
        for match in re.finditer(r"(?<!%)'", msgid):
            if outside_quotes(match):
                warn(msg, 'apostrophe', 'apostrophe without leading %')

    def lint_space_before_quote():
        """
        A missing space before %< is often the result of string
        literals that are joined by the C compiler and neither literal
        has a space to separate the words.

        A %< that directly follows the placeholder %s is not reported.
        """

        for match in re.finditer("(.?[a-zA-Z0-9])%<", msgid):
            if match.group(1) != '%s':
                warn(msg,
                     'no-space-before-quote',
                     '%< directly following a letter or digit')

    def lint_underscore_outside_quotes():
        """
        An underscore outside of quotes is used in several contexts,
        and many of them violate the GCC Guidelines for Diagnostics:

        * names of GCC-internal compiler functions
        * names of GCC-internal data structures
        * static_cast and the like (which are legitimate)
        """

        for match in re.finditer("_", msgid):
            if outside_quotes(match):
                warn(msg,
                     'underscore-outside-quotes',
                     'underscore outside of %<quotes%>')
                # One warning per message is enough.
                return

    def lint_may_not():
        """
        The term "may not" may either mean "it could be the case"
        or "should not".  These two different meanings are sometimes
        hard to tell apart.
        """

        if re.search(r'\bmay not\b', msgid):
            warn(msg,
                 'ambiguous-may-not',
                 'the term "may not" is ambiguous')

    def lint_unbalanced_quotes():
        """
        Warns when the number of %< differs from the number of %>,
        in the msgid and, for translated messages, in the msgstr.
        """

        if msgid.count("%<") != msgid.count("%>"):
            warn(msg,
                 'unbalanced-quotes',
                 'unbalanced %< and %> quotes')

        if msg.translated():
            if msg.msgstr.count("%<") != msg.msgstr.count("%>"):
                warn(msg,
                     'unbalanced-quotes',
                     'unbalanced %< and %> quotes')

    def lint_single_space_after_sentence():
        """
        After a sentence there should be two spaces.
        """

        if re.search(r'[.] [A-Z]', msgid):
            warn(msg,
                 'single-space-after-sentence',
                 'single space after sentence')

    def lint_non_canonical_quotes():
        """
        Catches %<%s%>, which can be written in the shorter form %qs.
        The same goes for %s in plain single, double or `' quotes.
        """

        match = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid)
        if match:
            warn(msg,
                 'non-canonical-quotes',
                 f'placeholder {match.group()} should be written as %qs')

    lint_option_outside_quotes()
    lint_plain_apostrophe()
    lint_space_before_quote()
    lint_underscore_outside_quotes()
    lint_may_not()
    lint_unbalanced_quotes()
    lint_matching_placeholders()
    lint_single_space_after_sentence()
    lint_non_canonical_quotes()
|
||||
|
||||
|
||||
def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile):
    """
    Detects messages that are structurally the same, except that they
    use different plain strings inside %<quotes%>.  These messages can
    be merged in order to prevent copy-and-paste mistakes by the
    translators.

    Every %<...%> in a msgid is replaced with %qs, and a warning is
    issued for each message whose normalized form has been seen
    before, either as the normalized form or as the literal msgid of
    an earlier message.

    Obsolete entries are left out of the comparison.

    See bug 90119.
    """

    seen: Dict[str, polib.POEntry] = {}

    for msg in po:
        msg: polib.POEntry

        # The per-message checks skip obsolete entries as well.  A
        # match against an entry that is no longer in use gives the
        # reader nothing to merge.
        if msg.obsolete:
            continue

        msgid = msg.msgid

        normalized = re.sub('%<[^%]+%>', '%qs', msgid)
        if normalized not in seen:
            # Register both forms so that a later message matches no
            # matter whether it spells out %qs or uses %<...%>.
            seen[normalized] = msg
            seen[msgid] = msg
            continue

        prev = seen[normalized]
        warn(msg,
             'same-pattern',
             f'same pattern for {repr(msgid)} and '
             f'{repr(prev.msgid)} in {location(prev)}',
             include_msgid=False)
|
||||
|
||||
|
||||
def lint_file(po: polib.POFile):
    """
    Runs all checks on the given file: first the per-message checks on
    every gcc-internal-format message that is neither obsolete nor
    fuzzy, then the file-wide search for messages that differ only in
    their placeholders.
    """

    for msg in po:
        msg: polib.POEntry

        if msg.obsolete or msg.fuzzy:
            continue
        if 'gcc-internal-format' not in msg.flags:
            continue

        lint_gcc_internal_format(msg)

    lint_diagnostics_differing_only_in_placeholders(po)
|
||||
|
||||
|
||||
def main():
    """
    Command line entry point: lints the file named on the command
    line, then prints a summary of every diagnostic that was reported
    more than once, in the order given by Counter.most_common.
    """

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('file', help='pot file')
    args = parser.parse_args()

    lint_file(polib.pofile(args.file))

    print()
    print('summary:')
    for diagnostic, count in seen_warnings.most_common():
        if count > 1:
            print(f'{count}\t{diagnostic}')


if __name__ == '__main__':
    main()
|
||||
|
Loading…
x
Reference in New Issue
Block a user