changeset 8314:ae9d205f4407

scripts/i18n: add command 'normalize-po-files' The translation files in the Kallithea repository contained references to the location(s) of each string in the repository. This is useful to translators, but is not needed for all other users. The big problem with that information is that it changes very commonly as a result of normal development in Kallithea, causing a lot of unimportant delta in the Kallithea repository, thus causing unnecessary repository growth. In this commit, a basic version of the script is added, only containing the code to normalize the translation files by removing generated and outdated data. This can be used to check or ensure internal consistency between code and translations, by extracting and merging and then removing most of it again with normalize-po-files: ./setup.py extract_messages for po in kallithea/i18n/*/LC_MESSAGES/kallithea.po; do msgmerge --width=76 --backup=none --previous --update $po kallithea/i18n/kallithea.pot ; done scripts/i18n normalize-po-files kallithea/i18n/kallithea.pot kallithea/i18n/*/LC_MESSAGES/kallithea.po Includes contributions from Mads Kiilerich.
author Thomas De Schampheleire <thomas.de_schampheleire@nokia.com>
date Thu, 19 Dec 2019 00:14:27 +0100
parents 4bc712f1ec93
children 93dabafa567e
files scripts/i18n scripts/i18n_utils.py
diffstat 2 files changed, 159 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/scripts/i18n	Sun Mar 29 21:24:14 2020 +0200
+++ b/scripts/i18n	Thu Dec 19 00:14:27 2019 +0100
@@ -21,6 +21,14 @@
 
 """
 Tool for maintenance of .po and .pot files
+
+Normally, the i18n-related files contain for each translatable string a
+reference to all the source code locations where this string is found. This
+metadata is useful for translators to assess how strings are used, but is not
+relevant for normal development nor for running Kallithea. Such metadata, or
+derived data like kallithea.pot, will inherently be outdated, and create
+unnecessary churn and repository growth, making it harder to spot actual and
+important changes.
 """
 
 @click.group()
@@ -30,5 +38,16 @@
         i18n_utils.do_debug = True
     pass
 
+@cli.command()
+@click.argument('po_files', nargs=-1)
+def normalize_po_files(po_files):
+    """Normalize the specified .po and .pot files.
+
+    Only actual translations and essential headers will be preserved.
+    """
+    for po_file in po_files:
+        i18n_utils._normalize_po_file(po_file, strip=True)
+
+
 if __name__ == '__main__':
     cli()
--- a/scripts/i18n_utils.py	Sun Mar 29 21:24:14 2020 +0200
+++ b/scripts/i18n_utils.py	Thu Dec 19 00:14:27 2019 +0100
@@ -13,6 +13,8 @@
 
 from __future__ import print_function
 
+import os
+import re
 import subprocess
 
 
@@ -25,3 +27,141 @@
 def runcmd(cmd, *args, **kwargs):
     debug('... Executing command: %s' % ' '.join(cmd))
     subprocess.check_call(cmd, *args, **kwargs)
+
+header_comment_strip_re = re.compile(r'''
+    ^
+    [#][ ]Translations[ ]template[ ]for[ ]Kallithea[.] \n
+    |
+    ^
+    [#][ ]FIRST[ ]AUTHOR[ ]<EMAIL@ADDRESS>,[ ]\d+[.] \n
+    [#] \n
+    [#],[ ]fuzzy \n
+    ''',
+    re.MULTILINE|re.VERBOSE)
+
+header_normalize_re = re.compile(r'''
+    ^ "
+    (POT-Creation-Date|PO-Revision-Date|Last-Translator|Language-Team|X-Generator|Generated-By|Project-Id-Version):
+    [ ][^\\]*\\n
+    " \n
+    ''',
+    re.MULTILINE|re.IGNORECASE|re.VERBOSE)
+
+def _normalize_po(raw_content):
+    r"""
+    >>> print(_normalize_po(r'''
+    ... # header comment
+    ...
+    ...
+    ... # comment before header
+    ... msgid ""
+    ... msgstr "yada"
+    ... "POT-Creation-Date: 2019-05-04 21:13+0200\n"
+    ... "MIME-Version: "
+    ... "1.0\n"
+    ... "Last-Translator: Jabba"
+    ... "the Hutt\n"
+    ... "X-Generator: Weblate 1.2.3\n"
+    ...
+    ... # comment, but not in header
+    ... msgid "None"
+    ... msgstr "Ingen"
+    ...
+    ...
+    ... line 2
+    ... # third comment
+    ...
+    ... msgid "Special"
+    ... msgstr ""
+    ...
+    ... msgid "Specialist"
+    ... # odd comment
+    ... msgstr ""
+    ... "Expert"
+    ...
+    ... # crazy fuzzy auto translation by msgmerge, using foo for bar
+    ... #, fuzzy
+    ... #| msgid "some foo string"
+    ... msgid "some bar string."
+    ... msgstr "translation of foo string"
+    ...
+    ... msgid "%d minute"
+    ... msgid_plural "%d minutes"
+    ... msgstr[0] "minut"
+    ... msgstr[1] "minutter"
+    ... msgstr[2] ""
+    ...
+    ... msgid "%d year"
+    ... msgid_plural "%d years"
+    ... msgstr[0] ""
+    ... msgstr[1] ""
+    ...
+    ... # last comment
+    ... ''') + '^^^')
+    # header comment
+    <BLANKLINE>
+    <BLANKLINE>
+    # comment before header
+    <BLANKLINE>
+    msgid ""
+    msgstr "yada"
+    "MIME-Version: "
+    "1.0\n"
+    <BLANKLINE>
+    msgid "None"
+    msgstr "Ingen"
+    <BLANKLINE>
+    line 2
+    <BLANKLINE>
+    msgid "Specialist"
+    msgstr ""
+    "Expert"
+    <BLANKLINE>
+    msgid "%d minute"
+    msgid_plural "%d minutes"
+    msgstr[0] "minut"
+    msgstr[1] "minutter"
+    msgstr[2] ""
+    ^^^
+    """
+    header_start = raw_content.find('\nmsgid ""\n') + 1
+    header_end = raw_content.find('\n\n', header_start) + 1 or len(raw_content)
+    chunks = [
+        header_comment_strip_re.sub('', raw_content[0:header_start])
+            .strip(),
+        '',
+        header_normalize_re.sub('', raw_content[header_start:header_end])
+            .strip(),
+        '']  # preserve normalized header
+    # all chunks are separated by empty line
+    for raw_chunk in raw_content[header_end:].split('\n\n'):
+        if '\n#, fuzzy' in raw_chunk:  # might be like "#, fuzzy, python-format"
+            continue  # drop crazy auto translation that is worse than useless
+        # strip all comment lines from chunk
+        chunk_lines = [
+            line
+            for line in raw_chunk.splitlines()
+            if line
+            and not line.startswith('#')
+        ]
+        if not chunk_lines:
+            continue
+        # check lines starting from first msgstr, skip chunk if no translation lines
+        msgstr_i = [i for i, line in enumerate(chunk_lines) if line.startswith('msgstr')]
+        if (
+            chunk_lines[0].startswith('msgid') and
+            msgstr_i and
+            all(line.endswith(' ""') for line in chunk_lines[msgstr_i[0]:])
+        ):  # skip translation chunks that don't have any actual translations
+            continue
+        chunks.append('\n'.join(chunk_lines) + '\n')
+    return '\n'.join(chunks)
+
+def _normalize_po_file(po_file, strip=False):
+    if strip:
+        po_tmp = po_file + '.tmp'
+        with open(po_file, 'r') as src, open(po_tmp, 'w') as dest:
+            raw_content = src.read()
+            normalized_content = _normalize_po(raw_content)
+            dest.write(normalized_content)
+        os.rename(po_tmp, po_file)