Add deeper XML checking to update-translation script
- Catch problems such as mismatched formatting characters. Remove messages that can cause problems at runtime.
- Also remove unfinished/untranslated messages; they just take up space in the .ts files and waste parsing time.

Fixes #4774.
This commit is contained in:
parent
93f97aab62
commit
da59f28335
1 changed file with 134 additions and 14 deletions
|
@ -14,13 +14,14 @@ It will do the following automatically:
|
||||||
|
|
||||||
TODO:
|
TODO:
|
||||||
- auto-add new translations to the build system according to the translation process
|
- auto-add new translations to the build system according to the translation process
|
||||||
- remove 'unfinished' translation items
|
|
||||||
'''
|
'''
|
||||||
from __future__ import division, print_function
|
from __future__ import division, print_function
|
||||||
import subprocess
|
import subprocess
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
import io
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
# Name of transifex tool
|
# Name of transifex tool
|
||||||
TX = 'tx'
|
TX = 'tx'
|
||||||
|
@ -40,24 +41,143 @@ def fetch_all_translations():
|
||||||
print('Error while fetching translations', file=sys.stderr)
|
print('Error while fetching translations', file=sys.stderr)
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
def postprocess_translations():
|
def find_format_specifiers(s):
    '''Find all format specifiers in a string.

    Returns a list with the single character that follows each '%' in `s`,
    in order of appearance. The character after a '%' is consumed together
    with it, so '%%' yields one '%' entry rather than two.

    Raises IndexError when `s` ends with a lone '%' (there is no character
    to read after it); check_format_specifiers() catches this to report a
    parse error in a translation.
    '''
    pos = 0
    specifiers = []
    while True:
        percent = s.find('%', pos)
        if percent < 0:
            break
        # Deliberately unguarded: a trailing '%' must raise IndexError.
        specifiers.append(s[percent + 1])
        # Skip both the '%' and the specifier character.
        pos = percent + 2
    return specifiers
|
||||||
|
|
||||||
|
def split_format_specifiers(specifiers):
    '''Split format specifiers between numeric (Qt) and others (strprintf).

    `specifiers` is an iterable of single-character specifiers as produced
    by find_format_specifiers().

    Returns a tuple (numeric, other):
      - numeric: set of the specifiers '1'..'9' that were present
      - other: list of all remaining specifiers, in their original order
    '''
    qt_digits = frozenset('123456789')
    numeric = []
    other = []
    for s in specifiers:
        if s in qt_digits:
            numeric.append(s)
        else:
            other.append(s)

    # numeric (Qt) can be present in any order, others (strprintf) must be in specified order
    return set(numeric), other
|
||||||
|
|
||||||
|
def sanitize_string(s):
    '''Sanitize string for printing.

    Replaces every newline with a space so the string fits on one line of
    diagnostic output.
    '''
    return s.replace('\n', ' ')
|
||||||
|
|
||||||
|
def check_format_specifiers(source, translation, errors):
    '''Check that `translation` uses the same format specifiers as `source`.

    Numeric (Qt) specifiers are compared as a set, all other (strprintf)
    specifiers as an ordered list.

    A human-readable message is appended to the `errors` list for every
    problem found. Returns True when the translation is acceptable, False
    otherwise.

    Raises AssertionError if the source message mixes Qt and strprintf
    specifiers. A source string ending in a lone '%' is not guarded here,
    so the IndexError from find_format_specifiers() propagates.
    '''
    source_f = split_format_specifiers(find_format_specifiers(source))
    # assert that no source messages contain both Qt and strprintf format specifiers
    # if this fails, go change the source as this is hacky and confusing!
    assert not (source_f[0] and source_f[1]), \
        "Source message mixes Qt and strprintf format specifiers: '%s'" % sanitize_string(source)
    try:
        translation_f = split_format_specifiers(find_format_specifiers(translation))
    except IndexError:
        # The translation ends with a lone '%'.
        errors.append("Parse error in translation '%s'" % sanitize_string(translation))
        return False
    else:
        if source_f != translation_f:
            errors.append("Mismatch between '%s' and '%s'" % (sanitize_string(source), sanitize_string(translation)))
            return False
    return True
|
||||||
|
|
||||||
|
def all_ts_files(suffix=''):
    '''Yield (filename, filepath) for every translation file in LOCALE_DIR.

    Only entries whose name ends in '.ts' + `suffix` are considered, and the
    source language file (SOURCE_LANG + `suffix`) is skipped. When `suffix`
    is given it is stripped again, so the yielded filename and filepath
    always refer to the plain '.ts' name.

    The directory listing is taken (and sorted) before the first item is
    yielded, so callers may rename files while iterating and files are
    processed in a deterministic order.
    '''
    for filename in sorted(os.listdir(LOCALE_DIR)):
        # process only language files, and do not process source language
        if not filename.endswith('.ts' + suffix) or filename == SOURCE_LANG + suffix:
            continue
        if suffix:  # remove provided suffix
            filename = filename[0:-len(suffix)]
        filepath = os.path.join(LOCALE_DIR, filename)
        yield (filename, filepath)
|
||||||
|
|
||||||
|
# Control bytes 0x00-0x09, 0x0b, 0x0c and 0x0e-0x1f; line feed (0x0a) and
# carriage return (0x0d) are kept.
# NOTE(review): the class also covers 0x09 (tab) - confirm that stripping
# tabs is intended.
FIX_RE = re.compile(b'[\x00-\x09\x0b\x0c\x0e-\x1f]')


def remove_invalid_characters(s):
    '''Remove invalid characters from translation string.

    `s` is a bytes object; every byte matched by FIX_RE is dropped and the
    remaining bytes are returned.
    '''
    return FIX_RE.sub(b'', s)
|
||||||
|
|
||||||
|
# Override cdata escape function to make our output match Qt's (optional, just for cleaner diffs for
|
||||||
|
# comparison, disable by default)
|
||||||
|
# Holds the original ElementTree cdata escape function once it has been
# overridden; must be set before escape_cdata() is called.
_orig_escape_cdata = None


def escape_cdata(text):
    '''Escape character data the way Qt does.

    Applies the saved original escape function first, then additionally
    replaces single and double quotes with their XML entities.
    '''
    text = _orig_escape_cdata(text)
    # The quotes are escaped after the original pass so the '&' of the
    # entities added here is not escaped again.
    text = text.replace("'", '&apos;')
    text = text.replace('"', '&quot;')
    return text
|
||||||
|
|
||||||
|
def postprocess_translations(reduce_diff_hacks=False):
    '''Check and clean up every fetched translation file.

    Each translation file is renamed to '<name>.orig', parsed as XML, and
    written back under its original name with:
      - control characters removed,
      - translations whose format specifiers do not match the source
        cleared and marked 'unfinished',
      - <location> tags removed,
      - messages whose translation is 'unfinished' removed entirely.

    The '.orig' files are left in place by this function.

    If `reduce_diff_hacks` is true, the ElementTree cdata escape function is
    overridden and self-closing tags are rewritten so the output resembles
    Qt's own formatting more closely (smaller diffs).

    Returns True if any translation failed the format specifier check.
    '''
    global _orig_escape_cdata

    print('Checking and postprocessing...')

    # Install the override only once: patching again would save
    # escape_cdata as its own "original" and make it recurse forever.
    if reduce_diff_hacks and ET._escape_cdata is not escape_cdata:
        _orig_escape_cdata = ET._escape_cdata
        ET._escape_cdata = escape_cdata

    for (filename, filepath) in all_ts_files():
        os.rename(filepath, filepath + '.orig')

    have_errors = False
    for (filename, filepath) in all_ts_files('.orig'):
        # pre-fixups to cope with transifex output
        parser = ET.XMLParser(encoding='utf-8')  # need to override encoding because 'utf8' is not understood only 'utf-8'
        with open(filepath + '.orig', 'rb') as f:
            data = f.read()
        # remove control characters; this must be done over the entire file otherwise the XML parser will fail
        data = remove_invalid_characters(data)
        tree = ET.parse(io.BytesIO(data), parser=parser)

        # iterate over all messages in file
        root = tree.getroot()
        for context in root.findall('context'):
            # findall() returns a list, so removing messages below is safe.
            for message in context.findall('message'):
                numerus = message.get('numerus') == 'yes'
                # An empty <source/> element has text None; treat it as an
                # empty string so the specifier scan does not crash.
                source = message.find('source').text or ''
                translation_node = message.find('translation')
                # pick all numerusforms
                if numerus:
                    translations = [i.text for i in translation_node.findall('numerusform')]
                else:
                    translations = [translation_node.text]

                for translation in translations:
                    if translation is None:
                        continue
                    errors = []
                    valid = check_format_specifiers(source, translation, errors)

                    for error in errors:
                        print('%s: %s' % (filename, error))

                    if not valid:  # set type to unfinished and clear string if invalid
                        translation_node.clear()
                        translation_node.set('type', 'unfinished')
                        have_errors = True

                # Remove location tags
                for location in message.findall('location'):
                    message.remove(location)

                # Remove entire message if it is an unfinished translation
                if translation_node.get('type') == 'unfinished':
                    context.remove(message)

        # write fixed-up tree
        # if diff reduction requested, replace some XML to 'sanitize' to qt formatting
        if reduce_diff_hacks:
            out = io.BytesIO()
            tree.write(out, encoding='utf-8')
            out = out.getvalue()
            out = out.replace(b' />', b'/>')
            with open(filepath, 'wb') as f:
                f.write(out)
        else:
            tree.write(filepath, encoding='utf-8')
    return have_errors
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
check_at_repository_root()
|
check_at_repository_root()
|
||||||
|
|
Loading…
Reference in a new issue