lbrycrd/contrib/devtools/update-translations.py

#!/usr/bin/env python3
# Copyright (c) 2014 Wladimir J. van der Laan
# Distributed under the MIT software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
'''
Run this script from the root of the repository to update all translations from
transifex.
It will do the following automatically:

- fetch all translations using the tx tool
- post-process them into valid and committable format
  - remove invalid control characters
  - remove location tags (makes diffs less noisy)

TODO:
- auto-add new translations to the build system according to the translation process
'''
import subprocess
import re
import sys
import os
import io
import xml.etree.ElementTree as ET

# Name of the transifex command-line tool (run through subprocess to pull translations)
TX = 'tx'
# File name of the source language file; it is skipped when iterating over locale files
SOURCE_LANG = 'bitcoin_en.ts'
# Directory with locale files, relative to the working directory
# (the script is expected to be run from the repository root)
LOCALE_DIR = 'src/qt/locale'
# Minimum number of messages for a translation to be considered at all;
# files that end up with fewer messages are not written back
MIN_NUM_MESSAGES = 10
# Regexp to check for Bitcoin addresses: a leading '1', '3' or 'bc1' followed by
# at least 30 ASCII letters/digits. The pattern is not anchored, so with search()
# it matches such a run anywhere inside a string.
ADDRESS_REGEXP = re.compile('([13]|bc1)[a-zA-Z0-9]{30,}')

def check_at_repository_root():
    '''
    Abort unless the current working directory looks like the repository root.

    The check is the existence of a '.git' entry in the current directory.
    When it is missing, both diagnostic lines are written to stderr (so they
    do not pollute stdout) and the process exits with status 1.
    '''
    if not os.path.exists('.git'):
        print('No .git directory found', file=sys.stderr)
        print('Execute this script at the root of the repository', file=sys.stderr)
        sys.exit(1)

def fetch_all_translations():
    '''
    Pull all translations with the transifex tool ("tx pull -f -a").

    If the tool returns a non-zero exit status, an error is printed to
    stderr and the process exits with status 1.
    '''
    exit_status = subprocess.call([TX, 'pull', '-f', '-a'])
    if exit_status == 0:
        return
    print('Error while fetching translations', file=sys.stderr)
    sys.exit(1)

def find_format_specifiers(s):
    '''
    Return the character following each '%' in s, in order of appearance.

    The character after a '%' is consumed together with it, so '%%' yields a
    single '%' entry. A '%' that is the very last character of s raises
    IndexError.
    '''
    found = []
    idx = s.find('%')
    while idx >= 0:
        # Indexing (rather than slicing) is deliberate: a trailing '%' must raise.
        found.append(s[idx + 1])
        # Resume the search after the specifier character just recorded.
        idx = s.find('%', idx + 2)
    return found

def split_format_specifiers(specifiers):
    '''
    Partition format specifiers into numeric (Qt) and other (strprintf) ones.

    Returns a tuple (numeric, other): numeric is a set, because Qt placeholders
    may appear in any order; other is a list, because strprintf specifiers must
    keep their order.
    '''
    qt_digits = frozenset('123456789')
    items = tuple(specifiers)
    numeric = {spec for spec in items if spec in qt_digits}

    # If both numeric format specifiers and "others" are present, treat the message
    # as Qt-formatted (see https://doc.qt.io/qt-5/qstring.html#arg): only numeric
    # placeholders are substituted there, so "(percentage: %1%)" is valid without
    # the escaping strprintf would need, and '%)' must not be reported as a
    # printf format specifier.
    if numeric:
        other = []
    else:
        other = [spec for spec in items if spec not in qt_digits]

    return numeric, other

def sanitize_string(s):
    '''Return s with every newline replaced by a space, for single-line printing.'''
    pieces = s.split('\n')
    return ' '.join(pieces)

def check_format_specifiers(source, translation, errors, numerus):
    '''
    Check that a translation uses the same format specifiers as its source.

    source      -- source message text
    translation -- translated text to validate
    errors      -- list that receives a human-readable message on failure
    numerus     -- True when the message is a numerus (plural) form

    Returns True when the translation is acceptable, False otherwise.
    '''
    source_f = split_format_specifiers(find_format_specifiers(source))
    # A source message must not contain both Qt and strprintf format specifiers;
    # if this fails, go change the source as this is hacky and confusing!
    assert not (source_f[0] and source_f[1])

    try:
        translation_f = split_format_specifiers(find_format_specifiers(translation))
    except IndexError:
        # find_format_specifiers() raises when the translation ends in a bare '%'
        errors.append("Parse error in translation for '%s': '%s'" % (sanitize_string(source), sanitize_string(translation)))
        return False

    if source_f == translation_f:
        return True

    # Allow numerus translations to omit the %n specifier (usually when it only
    # has one possible value), as long as no other '%' sneaks in.
    omits_only_n = (
        numerus
        and source_f == (set(), ['n'])
        and translation_f == (set(), [])
        and '%' not in translation
    )
    if omits_only_n:
        return True

    errors.append("Mismatch between '%s' and '%s'" % (sanitize_string(source), sanitize_string(translation)))
    return False

def all_ts_files(suffix=''):
    '''
    Yield (filename, filepath) for every translation file in LOCALE_DIR.

    Only entries ending in '.ts' + suffix are considered, and the source
    language file is skipped. When a suffix is given it is stripped from the
    yielded name, so filepath never includes the suffix.
    '''
    wanted_ending = '.ts' + suffix
    source_name = SOURCE_LANG + suffix
    for entry in os.listdir(LOCALE_DIR):
        # process only language files, and never the source language
        if entry == source_name or not entry.endswith(wanted_ending):
            continue
        name = entry[:-len(suffix)] if suffix else entry
        yield (name, os.path.join(LOCALE_DIR, name))

# Matches the control bytes 0x00-0x09, 0x0b, 0x0c and 0x0e-0x1f, i.e. every byte
# below 0x20 except line feed (0x0a) and carriage return (0x0d). Note that tab
# (0x09) is part of the matched range.
FIX_RE = re.compile(b'[\x00-\x09\x0b\x0c\x0e-\x1f]')
def remove_invalid_characters(s):
    '''Return the bytes object s with every byte matched by FIX_RE deleted.'''
    cleaned = FIX_RE.sub(b'', s)
    return cleaned

# Replacement for ElementTree's cdata escape function that makes our output match
# Qt's (optional, only for cleaner diffs when comparing; disabled by default).
# _orig_escape_cdata holds the function being wrapped and must be set before
# escape_cdata() is called.
_orig_escape_cdata = None
def escape_cdata(text):
    '''Escape text with the wrapped function, then also escape both quote characters.'''
    return _orig_escape_cdata(text).replace("'", '&apos;').replace('"', '&quot;')

def contains_bitcoin_addr(text, errors):
    '''
    Report whether text contains something matching ADDRESS_REGEXP.

    On a match, a message is appended to the errors list and True is returned.
    A text of None, or one without a match, returns False and leaves errors
    untouched.
    '''
    if text is None or ADDRESS_REGEXP.search(text) is None:
        return False
    errors.append('Translation "%s" contains a bitcoin address. This will be removed.' % (text))
    return True

def postprocess_translations(reduce_diff_hacks=False):
    '''
    Validate and clean up every translation file in LOCALE_DIR.

    Each translation file is first renamed to '<name>.orig'. The '.orig' copy is
    then read, stripped of invalid control characters, parsed, cleaned and
    written back under the original name:

    - translations whose format specifiers do not match the source, or that
      contain something looking like a bitcoin address, are cleared and marked
      'unfinished'
    - location tags are removed
    - messages whose translation is 'unfinished' are removed entirely
    - files left with fewer than MIN_NUM_MESSAGES messages are not written
      back at all (only their '.orig' copy remains)

    The '.orig' copies are not deleted by this function.

    reduce_diff_hacks -- when True, tweak the XML output (quote escaping and
                         '/>' spacing) to match Qt's formatting more closely

    Returns True if at least one translation was found invalid.
    '''
    global _orig_escape_cdata
    print('Checking and postprocessing...')

    # Install the escape hook only once. Re-installing it would store
    # escape_cdata itself as the "original" function and make escape_cdata
    # call itself without end.
    if reduce_diff_hacks and ET._escape_cdata is not escape_cdata:
        _orig_escape_cdata = ET._escape_cdata
        ET._escape_cdata = escape_cdata

    for (filename,filepath) in all_ts_files():
        os.rename(filepath, filepath+'.orig')

    have_errors = False
    # all_ts_files('.orig') yields paths with the suffix already stripped, so
    # filepath is the final output name and filepath + '.orig' is the input.
    for (filename,filepath) in all_ts_files('.orig'):
        # pre-fixups to cope with transifex output
        parser = ET.XMLParser(encoding='utf-8') # need to override encoding because 'utf8' is not understood only 'utf-8'
        with open(filepath + '.orig', 'rb') as f:
            data = f.read()
        # remove control characters; this must be done over the entire file otherwise the XML parser will fail
        data = remove_invalid_characters(data)
        tree = ET.parse(io.BytesIO(data), parser=parser)

        # iterate over all messages in file
        root = tree.getroot()
        for context in root.findall('context'):
            # findall() returns a list, so removing messages below is safe
            for message in context.findall('message'):
                numerus = message.get('numerus') == 'yes'
                source = message.find('source').text
                translation_node = message.find('translation')
                # pick all numerusforms
                if numerus:
                    translations = [i.text for i in translation_node.findall('numerusform')]
                else:
                    translations = [translation_node.text]

                for translation in translations:
                    if translation is None:
                        continue
                    errors = []
                    valid = check_format_specifiers(source, translation, errors, numerus) and not contains_bitcoin_addr(translation, errors)

                    for error in errors:
                        print('%s: %s' % (filename, error))

                    if not valid: # set type to unfinished and clear string if invalid
                        translation_node.clear()
                        translation_node.set('type', 'unfinished')
                        have_errors = True

                # Remove location tags
                for location in message.findall('location'):
                    message.remove(location)

                # Remove entire message if it is an unfinished translation
                if translation_node.get('type') == 'unfinished':
                    context.remove(message)

        # check if document is (virtually) empty, and skip writing it if so
        num_messages = sum(len(context.findall('message')) for context in root.findall('context'))
        if num_messages < MIN_NUM_MESSAGES:
            print('Removing %s, as it contains only %i messages' % (filepath, num_messages))
            continue

        # write fixed-up tree
        # if diff reduction requested, replace some XML to 'sanitize' to qt formatting
        if reduce_diff_hacks:
            out = io.BytesIO()
            tree.write(out, encoding='utf-8')
            out = out.getvalue()
            out = out.replace(b' />', b'/>')
            with open(filepath, 'wb') as f:
                f.write(out)
        else:
            tree.write(filepath, encoding='utf-8')
    return have_errors

if __name__ == '__main__':
    # The order matters: verify the working directory first, then pull the
    # translations, and only then post-process the pulled files.
    check_at_repository_root()
    fetch_all_translations()
    # NOTE: the value returned by postprocess_translations() is not used here,
    # so it does not influence the exit status of the script.
    postprocess_translations()
Change all python files to use Python3 2017-12-12 20:47:24 +01:00			`#!/usr/bin/env python3`
devtools: add a script to fetch and postprocess translations Run this script from the root of the repository to update all translations from transifex. It will do the following automatically: - create a transifex configuration file - fetch all translations - post-process them into valid and committable format 2014-04-30 09:46:04 +02:00			`# Copyright (c) 2014 Wladimir J. van der Laan`
Remove references to X11 licence 2014-12-13 05:09:33 +01:00			`# Distributed under the MIT software license, see the accompanying`
devtools: add a script to fetch and postprocess translations Run this script from the root of the repository to update all translations from transifex. It will do the following automatically: - create a transifex configuration file - fetch all translations - post-process them into valid and committable format 2014-04-30 09:46:04 +02:00			`# file COPYING or http://www.opensource.org/licenses/mit-license.php.`
			`'''`
			`Run this script from the root of the repository to update all translations from`
			`transifex.`
			`It will do the following automatically:`

			`- fetch all translations using the tx tool`
			`- post-process them into valid and committable format`
			`- remove invalid control characters`
			`- remove location tags (makes diffs less noisy)`

			`TODO:`
			`- auto-add new translations to the build system according to the translation process`
			`'''`
			`import subprocess`
			`import re`
			`import sys`
			`import os`
Add deeper XML checking to update-translation script - Catch problems such as mismatched formatting characters. Remove messages that can give problems at runtime. - Also remove unfinished/untranslated messages, they just take up space in the ts and waste parsing time. Fixes #4774. 2014-08-28 13:09:19 +02:00			`import io`
			`import xml.etree.ElementTree as ET`
devtools: add a script to fetch and postprocess translations Run this script from the root of the repository to update all translations from transifex. It will do the following automatically: - create a transifex configuration file - fetch all translations - post-process them into valid and committable format 2014-04-30 09:46:04 +02:00
			`# Name of transifex tool`
			`TX = 'tx'`
			`# Name of source language file`
			`SOURCE_LANG = 'bitcoin_en.ts'`
			`# Directory with locale files`
			`LOCALE_DIR = 'src/qt/locale'`
qt: Final translation update before 0.12 fork - Add new translations (finally, after a long time) - update-translation script was not considering new translations - oops - fixed this, also remove (nearly) empty translations - Update translation process, it was still describing the old repository structure 2015-12-02 14:28:35 +01:00			`# Minimum number of messages for translation to be considered at all`
			`MIN_NUM_MESSAGES = 10`
utils: checking for bitcoin addresses in translations Checking for and removing any bitcoin addresses in translations 2018-06-02 19:43:46 +02:00			`# Regexp to check for Bitcoin addresses`
			`ADDRESS_REGEXP = re.compile('([13]\|bc1)[a-zA-Z0-9]{30,}')`
devtools: add a script to fetch and postprocess translations Run this script from the root of the repository to update all translations from transifex. It will do the following automatically: - create a transifex configuration file - fetch all translations - post-process them into valid and committable format 2014-04-30 09:46:04 +02:00
			`def check_at_repository_root():`
			`if not os.path.exists('.git'):`
			`print('No .git directory found')`
			`print('Execute this script at the root of the repository', file=sys.stderr)`
Use sys.exit(...) instead of exit(...): exit(...) should not be used in programs 2017-07-09 22:57:02 +02:00			`sys.exit(1)`
devtools: add a script to fetch and postprocess translations Run this script from the root of the repository to update all translations from transifex. It will do the following automatically: - create a transifex configuration file - fetch all translations - post-process them into valid and committable format 2014-04-30 09:46:04 +02:00
			`def fetch_all_translations():`
qt: Final translation update before 0.12 fork - Add new translations (finally, after a long time) - update-translation script was not considering new translations - oops - fixed this, also remove (nearly) empty translations - Update translation process, it was still describing the old repository structure 2015-12-02 14:28:35 +01:00			`if subprocess.call([TX, 'pull', '-f', '-a']):`
devtools: add a script to fetch and postprocess translations Run this script from the root of the repository to update all translations from transifex. It will do the following automatically: - create a transifex configuration file - fetch all translations - post-process them into valid and committable format 2014-04-30 09:46:04 +02:00			`print('Error while fetching translations', file=sys.stderr)`
Use sys.exit(...) instead of exit(...): exit(...) should not be used in programs 2017-07-09 22:57:02 +02:00			`sys.exit(1)`
devtools: add a script to fetch and postprocess translations Run this script from the root of the repository to update all translations from transifex. It will do the following automatically: - create a transifex configuration file - fetch all translations - post-process them into valid and committable format 2014-04-30 09:46:04 +02:00
Add deeper XML checking to update-translation script - Catch problems such as mismatched formatting characters. Remove messages that can give problems at runtime. - Also remove unfinished/untranslated messages, they just take up space in the ts and waste parsing time. Fixes #4774. 2014-08-28 13:09:19 +02:00			`def find_format_specifiers(s):`
			`'''Find all format specifiers in a string.'''`
			`pos = 0`
			`specifiers = []`
			`while True:`
			`percent = s.find('%', pos)`
			`if percent < 0:`
			`break`
			`specifiers.append(s[percent+1])`
			`pos = percent+2`
			`return specifiers`

			`def split_format_specifiers(specifiers):`
			`'''Split format specifiers between numeric (Qt) and others (strprintf)'''`
			`numeric = []`
			`other = []`
			`for s in specifiers:`
			`if s in {'1','2','3','4','5','6','7','8','9'}:`
			`numeric.append(s)`
			`else:`
			`other.append(s)`

devtools: Handle Qt formatting characters edge-case in update-translations.py If both numeric format specifiers and "others" are used, assume we're dealing with a Qt-formatted message. In the case of Qt formatting (see https://doc.qt.io/qt-5/qstring.html#arg) only numeric formats are replaced at all. This means "(percentage: %1%)" is valid (which was introduced in #9461), without needing any kind of escaping that would be necessary for strprintf. Without this, this function would wrongly detect '%)' as a printf format specifier. 2017-02-10 09:49:04 +01:00			`# If both numeric format specifiers and "others" are used, assume we're dealing`
			`# with a Qt-formatted message. In the case of Qt formatting (see https://doc.qt.io/qt-5/qstring.html#arg)`
			`# only numeric formats are replaced at all. This means "(percentage: %1%)" is valid, without needing`
			`# any kind of escaping that would be necessary for strprintf. Without this, this function`
			`# would wrongly detect '%)' as a printf format specifier.`
			`if numeric:`
			`other = []`

Add deeper XML checking to update-translation script - Catch problems such as mismatched formatting characters. Remove messages that can give problems at runtime. - Also remove unfinished/untranslated messages, they just take up space in the ts and waste parsing time. Fixes #4774. 2014-08-28 13:09:19 +02:00			`# numeric (Qt) can be present in any order, others (strprintf) must be in specified order`
			`return set(numeric),other`

			`def sanitize_string(s):`
			`'''Sanitize string for printing'''`
			`return s.replace('\n',' ')`

Bugfix: update-translations: Allow numerus translations to omit %n specifier (usually when it only has one possible value) 2015-12-25 14:12:37 +01:00			`def check_format_specifiers(source, translation, errors, numerus):`
Add deeper XML checking to update-translation script - Catch problems such as mismatched formatting characters. Remove messages that can give problems at runtime. - Also remove unfinished/untranslated messages, they just take up space in the ts and waste parsing time. Fixes #4774. 2014-08-28 13:09:19 +02:00			`source_f = split_format_specifiers(find_format_specifiers(source))`
			`# assert that no source messages contain both Qt and strprintf format specifiers`
			`# if this fails, go change the source as this is hacky and confusing!`
			`assert(not(source_f[0] and source_f[1]))`
			`try:`
			`translation_f = split_format_specifiers(find_format_specifiers(translation))`
			`except IndexError:`
Bugfix: update-translations: Allow numerus translations to omit %n specifier (usually when it only has one possible value) 2015-12-25 14:12:37 +01:00			`errors.append("Parse error in translation for '%s': '%s'" % (sanitize_string(source), sanitize_string(translation)))`
Add deeper XML checking to update-translation script - Catch problems such as mismatched formatting characters. Remove messages that can give problems at runtime. - Also remove unfinished/untranslated messages, they just take up space in the ts and waste parsing time. Fixes #4774. 2014-08-28 13:09:19 +02:00			`return False`
			`else:`
			`if source_f != translation_f:`
Bugfix: update-translations: Allow numerus translations to omit %n specifier (usually when it only has one possible value) 2015-12-25 14:12:37 +01:00			`if numerus and source_f == (set(), ['n']) and translation_f == (set(), []) and translation.find('%') == -1:`
			`# Allow numerus translations to omit %n specifier (usually when it only has one possible value)`
			`return True`
Add deeper XML checking to update-translation script - Catch problems such as mismatched formatting characters. Remove messages that can give problems at runtime. - Also remove unfinished/untranslated messages, they just take up space in the ts and waste parsing time. Fixes #4774. 2014-08-28 13:09:19 +02:00			`errors.append("Mismatch between '%s' and '%s'" % (sanitize_string(source), sanitize_string(translation)))`
			`return False`
			`return True`

			`def all_ts_files(suffix=''):`
devtools: add a script to fetch and postprocess translations Run this script from the root of the repository to update all translations from transifex. It will do the following automatically: - create a transifex configuration file - fetch all translations - post-process them into valid and committable format 2014-04-30 09:46:04 +02:00			`for filename in os.listdir(LOCALE_DIR):`
			`# process only language files, and do not process source language`
Add deeper XML checking to update-translation script - Catch problems such as mismatched formatting characters. Remove messages that can give problems at runtime. - Also remove unfinished/untranslated messages, they just take up space in the ts and waste parsing time. Fixes #4774. 2014-08-28 13:09:19 +02:00			`if not filename.endswith('.ts'+suffix) or filename == SOURCE_LANG+suffix:`
devtools: add a script to fetch and postprocess translations Run this script from the root of the repository to update all translations from transifex. It will do the following automatically: - create a transifex configuration file - fetch all translations - post-process them into valid and committable format 2014-04-30 09:46:04 +02:00			`continue`
Add deeper XML checking to update-translation script - Catch problems such as mismatched formatting characters. Remove messages that can give problems at runtime. - Also remove unfinished/untranslated messages, they just take up space in the ts and waste parsing time. Fixes #4774. 2014-08-28 13:09:19 +02:00			`if suffix: # remove provided suffix`
			`filename = filename[0:-len(suffix)]`
devtools: add a script to fetch and postprocess translations Run this script from the root of the repository to update all translations from transifex. It will do the following automatically: - create a transifex configuration file - fetch all translations - post-process them into valid and committable format 2014-04-30 09:46:04 +02:00			`filepath = os.path.join(LOCALE_DIR, filename)`
Add deeper XML checking to update-translation script - Catch problems such as mismatched formatting characters. Remove messages that can give problems at runtime. - Also remove unfinished/untranslated messages, they just take up space in the ts and waste parsing time. Fixes #4774. 2014-08-28 13:09:19 +02:00			`yield(filename, filepath)`

			`FIX_RE = re.compile(b'[\x00-\x09\x0b\x0c\x0e-\x1f]')`
			`def remove_invalid_characters(s):`
			`'''Remove invalid characters from translation string'''`
			`return FIX_RE.sub(b'', s)`

			`# Override cdata escape function to make our output match Qt's (optional, just for cleaner diffs for`
			`# comparison, disable by default)`
			`_orig_escape_cdata = None`
			`def escape_cdata(text):`
			`text = _orig_escape_cdata(text)`
			`text = text.replace("'", ''')`
			`text = text.replace('"', '"')`
			`return text`

utils: checking for bitcoin addresses in translations Checking for and removing any bitcoin addresses in translations 2018-06-02 19:43:46 +02:00			`def contains_bitcoin_addr(text, errors):`
			`if text != None and ADDRESS_REGEXP.search(text) != None:`
			`errors.append('Translation "%s" contains a bitcoin address. This will be removed.' % (text))`
			`return True`
			`return False`

Add deeper XML checking to update-translation script - Catch problems such as mismatched formatting characters. Remove messages that can give problems at runtime. - Also remove unfinished/untranslated messages, they just take up space in the ts and waste parsing time. Fixes #4774. 2014-08-28 13:09:19 +02:00			`def postprocess_translations(reduce_diff_hacks=False):`
			`print('Checking and postprocessing...')`

			`if reduce_diff_hacks:`
			`global _orig_escape_cdata`
			`_orig_escape_cdata = ET._escape_cdata`
			`ET._escape_cdata = escape_cdata`

			`for (filename,filepath) in all_ts_files():`
			`os.rename(filepath, filepath+'.orig')`

			`have_errors = False`
			`for (filename,filepath) in all_ts_files('.orig'):`
			`# pre-fixups to cope with transifex output`
			`parser = ET.XMLParser(encoding='utf-8') # need to override encoding because 'utf8' is not understood only 'utf-8'`
			`with open(filepath + '.orig', 'rb') as f:`
devtools: add a script to fetch and postprocess translations Run this script from the root of the repository to update all translations from transifex. It will do the following automatically: - create a transifex configuration file - fetch all translations - post-process them into valid and committable format 2014-04-30 09:46:04 +02:00			`data = f.read()`
Add deeper XML checking to update-translation script - Catch problems such as mismatched formatting characters. Remove messages that can give problems at runtime. - Also remove unfinished/untranslated messages, they just take up space in the ts and waste parsing time. Fixes #4774. 2014-08-28 13:09:19 +02:00			`# remove control characters; this must be done over the entire file otherwise the XML parser will fail`
			`data = remove_invalid_characters(data)`
			`tree = ET.parse(io.BytesIO(data), parser=parser)`

			`# iterate over all messages in file`
			`root = tree.getroot()`
			`for context in root.findall('context'):`
			`for message in context.findall('message'):`
			`numerus = message.get('numerus') == 'yes'`
			`source = message.find('source').text`
			`translation_node = message.find('translation')`
			`# pick all numerusforms`
			`if numerus:`
			`translations = [i.text for i in translation_node.findall('numerusform')]`
			`else:`
			`translations = [translation_node.text]`

			`for translation in translations:`
			`if translation is None:`
			`continue`
			`errors = []`
utils: checking for bitcoin addresses in translations Checking for and removing any bitcoin addresses in translations 2018-06-02 19:43:46 +02:00			`valid = check_format_specifiers(source, translation, errors, numerus) and not contains_bitcoin_addr(translation, errors)`
Add deeper XML checking to update-translation script - Catch problems such as mismatched formatting characters. Remove messages that can give problems at runtime. - Also remove unfinished/untranslated messages, they just take up space in the ts and waste parsing time. Fixes #4774. 2014-08-28 13:09:19 +02:00
			`for error in errors:`
			`print('%s: %s' % (filename, error))`

			`if not valid: # set type to unfinished and clear string if invalid`
			`translation_node.clear()`
			`translation_node.set('type', 'unfinished')`
			`have_errors = True`

			`# Remove location tags`
			`for location in message.findall('location'):`
			`message.remove(location)`

			`# Remove entire message if it is an unfinished translation`
			`if translation_node.get('type') == 'unfinished':`
			`context.remove(message)`

qt: Final translation update before 0.12 fork - Add new translations (finally, after a long time) - update-translation script was not considering new translations - oops - fixed this, also remove (nearly) empty translations - Update translation process, it was still describing the old repository structure 2015-12-02 14:28:35 +01:00			`# check if document is (virtually) empty, and remove it if so`
			`num_messages = 0`
			`for context in root.findall('context'):`
			`for message in context.findall('message'):`
			`num_messages += 1`
			`if num_messages < MIN_NUM_MESSAGES:`
			`print('Removing %s, as it contains only %i messages' % (filepath, num_messages))`
			`continue`

Add deeper XML checking to update-translation script - Catch problems such as mismatched formatting characters. Remove messages that can give problems at runtime. - Also remove unfinished/untranslated messages, they just take up space in the ts and waste parsing time. Fixes #4774. 2014-08-28 13:09:19 +02:00			`# write fixed-up tree`
			`# if diff reduction requested, replace some XML to 'sanitize' to qt formatting`
			`if reduce_diff_hacks:`
			`out = io.BytesIO()`
			`tree.write(out, encoding='utf-8')`
			`out = out.getvalue()`
			`out = out.replace(b' />', b'/>')`
			`with open(filepath, 'wb') as f:`
			`f.write(out)`
			`else:`
			`tree.write(filepath, encoding='utf-8')`
			`return have_errors`
devtools: add a script to fetch and postprocess translations Run this script from the root of the repository to update all translations from transifex. It will do the following automatically: - create a transifex configuration file - fetch all translations - post-process them into valid and committable format 2014-04-30 09:46:04 +02:00
			`if __name__ == '__main__':`
			`check_at_repository_root()`
			`fetch_all_translations()`
			`postprocess_translations()`