Commit 61afc819 by Ned Batchelder

Merge pull request #2081 from edx/ned/i18n-xmodule

Improvements to the i18n workflow tools.
parents 430cfd75 af120fdf
......@@ -405,7 +405,7 @@ class @Problem
formulaequationinput: (element) ->
$(element).find('input').on 'input', ->
$p = $(element).find('p.status')
`// Translators: the word Answer here is about answering a problem the student must solve.`
`// Translators: the word unanswered here is about answering a problem the student must solve.`
$p.text gettext("unanswered")
$p.parent().removeClass().addClass "unanswered"
......@@ -434,7 +434,7 @@ class @Problem
textline: (element) ->
$(element).find('input').on 'input', ->
$p = $(element).find('p.status')
`// Translators: the word Answer here is about answering a problem the student must solve.`
`// Translators: the word unanswered here is about answering a problem the student must solve.`
$p.text gettext("unanswered")
$p.parent().removeClass().addClass "unanswered"
......
......@@ -21,9 +21,9 @@ class Converter(object):
# HTML: <B>, </B>, <BR/>, <textformat leading="10">
# Python: %(date)s, %(name)s
tag_pattern = re.compile(r'''
(<[-\w" .:?=/]*>) | # <tag>
({[^}]*}) | # {tag}
(%\([^)]*\)\w) | # %(tag)s
(<[^>]+>) | # <tag>
({[^}]+}) | # {tag}
(%\([\w]+\)\w) | # %(tag)s
(&\w+;) | # &entity;
(&\#\d+;) | # &#1234;
(&\#x[0-9a-f]+;) # &#xABCD;
......
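Not part of the diff: a minimal sketch of what the tightened pattern matches, assuming it is compiled with re.VERBOSE as the comment layout suggests.

import re

tag_pattern = re.compile(r'''
    (<[^>]+>) |             # <tag>
    ({[^}]+}) |             # {tag}
    (%\([\w]+\)\w) |        # %(tag)s
    (&\w+;) |               # &entity;
    (&\#\d+;) |             # &#1234;
    (&\#x[0-9a-f]+;)        # &#xABCD;
    ''', re.VERBOSE)

sample = 'Hi <a href="x">%(name)s</a>, see {section} &amp; &#xa0;'
print [m.group(0) for m in tag_pattern.finditer(sample)]
# ['<a href="x">', '%(name)s', '</a>', '{section}', '&amp;', '&#xa0;']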
from converter import Converter
# -*- coding: utf-8 -*-
r"""
Creates new localization properties files in a dummy language.
Each property file is derived from the equivalent en_US file, with these
transformations applied:
1. Every vowel is replaced with an equivalent with extra accent marks.
# Creates new localization properties files in a dummy language
# Each property file is derived from the equivalent en_US file, except
# 1. Every vowel is replaced with an equivalent with extra accent marks
# 2. Every string is padded out to +30% length to simulate verbose languages (e.g. German)
# to see if layout and flows work properly
# 3. Every string is terminated with a '#' character to make it easier to detect truncation
2. Every string is padded out to +30% length to simulate verbose languages
(such as German) to see if layout and flows work properly.
3. Every string is terminated with a '#' character to make it easier to detect
truncation.
# --------------------------------
# Example use:
# >>> from dummy import Dummy
# >>> c = Dummy()
# >>> c.convert("hello my name is Bond, James Bond")
# u'h\xe9ll\xf6 my n\xe4m\xe9 \xefs B\xf6nd, J\xe4m\xe9s B\xf6nd Lorem i#'
#
# >>> c.convert('don\'t convert <a href="href">tag ids</a>')
# u'd\xf6n\'t \xe7\xf6nv\xe9rt <a href="href">t\xe4g \xefds</a> Lorem ipsu#'
#
# >>> c.convert('don\'t convert %(name)s tags on %(date)s')
# u"d\xf6n't \xe7\xf6nv\xe9rt %(name)s t\xe4gs \xf6n %(date)s Lorem ips#"
Example use::
>>> from dummy import Dummy
>>> c = Dummy()
>>> c.convert("My name is Bond, James Bond")
u'M\xfd n\xe4m\xe9 \xefs B\xf8nd, J\xe4m\xe9s B\xf8nd \u2360\u03c3\u044f\u0454\u043c \u03b9\u03c1#'
>>> print c.convert("My name is Bond, James Bond")
Mý nämé ïs Bønd, Jämés Bønd Ⱡσяєм ιρ#
>>> print c.convert("don't convert <a href='href'>tag ids</a>")
døn't çønvért <a href='href'>täg ïds</a> Ⱡσяєм ιρѕυ#
>>> print c.convert("don't convert %(name)s tags on %(date)s")
døn't çønvért %(name)s tägs øn %(date)s Ⱡσяєм ιρѕ#
"""
from converter import Converter
# Substitute plain characters with accented lookalikes.
# http://tlt.its.psu.edu/suggestions/international/web/codehtml.html#accent
TABLE = {'A': u'\xC0',
'a': u'\xE4',
'b': u'\xDF',
'C': u'\xc7',
'c': u'\xE7',
'E': u'\xC9',
'e': u'\xE9',
'I': U'\xCC',
'i': u'\xEF',
'O': u'\xD8',
'o': u'\xF8',
'U': u'\xDB',
'u': u'\xFC',
'Y': u'\xDD',
'y': u'\xFD',
}
TABLE = {
'A': u'À',
'a': u'ä',
'b': u'ß',
'C': u'Ç',
'c': u'ç',
'E': u'É',
'e': u'é',
'I': u'Ì',
'i': u'ï',
'O': u'Ø',
'o': u'ø',
'U': u'Û',
'u': u'ü',
'Y': u'Ý',
'y': u'ý',
}
# The print industry's standard dummy text, in use since the 1500s
# see http://www.lipsum.com/
LOREM = ' Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed ' \
'do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad ' \
'minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ' \
'ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate ' \
'velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat ' \
'cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. '
# see http://www.lipsum.com/, then fed through a "fancy-text" converter.
# The string should start with a space.
LOREM = " " + " ".join( # join and split just make the string easier here.
u"""
Ⱡσяєм ιρѕυм ∂σłσя ѕιт αмєт, ¢σηѕє¢тєтυя α∂ιριѕι¢ιηg єłιт, ѕє∂ ∂σ єιυѕмσ∂
тємρσя ιη¢ι∂ι∂υηт υт łαвσяє єт ∂σłσяє мαgηα αłιqυα. υт єηιм α∂ мιηιм
νєηιαм, qυιѕ ησѕтяυ∂ єχєя¢ιтαтιση υłłαм¢σ łαвσяιѕ ηιѕι υт αłιqυιρ єχ єα
¢σммσ∂σ ¢σηѕєqυαт. ∂υιѕ αυтє ιяυяє ∂σłσя ιη яєρяєнєη∂єяιт ιη νσłυρтαтє
νєłιт єѕѕє ¢ιłłυм ∂σłσяє єυ ƒυgιαт ηυłłα ραяιαтυя. єχ¢єρтєυя ѕιηт σ¢¢αє¢αт
¢υρι∂αтαт ηση ρяσι∂єηт, ѕυηт ιη ¢υłρα qυι σƒƒι¢ια ∂єѕєяυηт мσłłιт αηιм ι∂
єѕт łαвσяυм.
""".split()
)
# To simulate more verbose languages (like German), pad the length of a string
# by a multiple of PAD_FACTOR
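Not part of the diff: a rough sketch of the padding idea described above. The real pad() implementation sits outside this hunk, and PAD_FACTOR = 1.3 is an assumption taken from the "+30%" in the module docstring.

def pad(string):
    """Stretch `string` to about PAD_FACTOR times its length with LOREM text, ending in '#'."""
    PAD_FACTOR = 1.3                          # assumed value, matching the "+30%" above
    target = int(len(string) * PAD_FACTOR)
    padded = (string + LOREM)[:target - 1]    # leave room for the trailing '#'
    return padded + '#'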
......@@ -85,20 +99,6 @@ class Dummy(Converter):
"""replaces the final char of string with #"""
return string[:-1] + '#'
def init_msgs(self, msgs):
"""
Make sure the first msg in msgs has a plural property.
msgs is list of instances of polib.POEntry
"""
if not msgs:
return
headers = msgs[0].get_property('msgstr')
has_plural = any(header.startswith('Plural-Forms:') for header in headers)
if not has_plural:
# Apply declaration for English pluralization rules
plural = "Plural-Forms: nplurals=2; plural=(n != 1);\\n"
headers.append(plural)
def convert_msg(self, msg):
"""
Takes one POEntry object and converts it (adds a dummy translation to it)
......@@ -114,8 +114,10 @@ class Dummy(Converter):
# translate singular and plural
foreign_single = self.convert(source)
foreign_plural = self.convert(plural)
plural = {'0': self.final_newline(source, foreign_single),
'1': self.final_newline(plural, foreign_plural)}
plural = {
'0': self.final_newline(source, foreign_single),
'1': self.final_newline(plural, foreign_plural),
}
msg.msgstr_plural = plural
else:
foreign = self.convert(source)
......
......@@ -45,7 +45,7 @@ def main():
remove_file(source_msgs_dir.joinpath(filename))
# Extract strings from mako templates.
babel_mako_cmd = 'pybabel extract -F %s -c "TRANSLATORS:" . -o %s' % (BABEL_CONFIG, BABEL_OUT)
babel_mako_cmd = 'pybabel extract -F %s -c "Translators:" . -o %s' % (BABEL_CONFIG, BABEL_OUT)
# Extract strings from django source files.
make_django_cmd = (
......
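Not part of the diff: pybabel's -c option keeps only comments that begin with the given tag, so the extraction tag now matches the "Translators:" comments used in the source (as in the CoffeeScript hunks above). An illustrative snippet, with the comment mirroring the one above and the identifiers invented:

# Translators: the word unanswered here is about answering a problem
# the student must solve.
status = _("unanswered")

The extracted .po entry then carries the note for translators:

#. Translators: the word unanswered here is about answering a problem
#. the student must solve.
msgid "unanswered"
msgstr ""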
......@@ -60,9 +60,12 @@ def merge(locale, target='django.po', fail_if_missing=True):
def clean_metadata(file):
"""
Clean up redundancies in the metadata caused by merging.
This reads in a PO file and simply saves it back out again.
"""
pofile(file).save()
# Reading in the .po file and saving it again fixes redundancies.
pomsgs = pofile(file)
# The msgcat tool marks the metadata as fuzzy, but it's ok as it is.
pomsgs.metadata_is_fuzzy = False
pomsgs.save()
def validate_files(dir, files_to_merge):
......
......@@ -38,9 +38,15 @@ def main(file, locale):
raise IOError('File does not exist: %s' % file)
pofile = polib.pofile(file)
converter = Dummy()
converter.init_msgs(pofile.translated_entries())
for msg in pofile:
converter.convert_msg(msg)
# If any message has a plural, then the file needs plural information.
# Apply declaration for English pluralization rules so that ngettext will
# do something reasonable.
if any(m.msgid_plural for m in pofile):
pofile.metadata['Plural-Forms'] = 'nplurals=2; plural=(n != 1);'
new_file = new_filename(file, locale)
create_dir_if_necessary(new_file)
pofile.save(new_file)
......
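Not part of the diff: with that metadata entry set, the generated dummy catalog's header carries the same declaration the removed init_msgs() used to append, roughly:

"Plural-Forms: nplurals=2; plural=(n != 1);\n"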
"""Tests of i18n/converter.py"""
import os
from unittest import TestCase
import ddt
import converter
......@@ -11,36 +14,48 @@ class UpcaseConverter(converter.Converter):
return string.upper()
@ddt.ddt
class TestConverter(TestCase):
"""
Tests functionality of i18n/converter.py
"""
def test_converter(self):
@ddt.data(
# no tags
('big bad wolf',
'BIG BAD WOLF'),
# one html tag
('big <strong>bad</strong> wolf',
'BIG <strong>BAD</strong> WOLF'),
# two html tags
('big <b>bad</b> gray <i>wolf</i>',
'BIG <b>BAD</b> GRAY <i>WOLF</i>'),
# html tags with attributes
('<a href="foo">bar</a> baz',
'<a href="foo">BAR</a> BAZ'),
("<a href='foo'>bar</a> baz",
"<a href='foo'>BAR</a> BAZ"),
# one python tag
('big %(adjective)s wolf',
'BIG %(adjective)s WOLF'),
# two python tags
('big %(adjective)s gray %(noun)s',
'BIG %(adjective)s GRAY %(noun)s'),
# both kinds of tags
('<strong>big</strong> %(adjective)s %(noun)s',
'<strong>BIG</strong> %(adjective)s %(noun)s'),
# .format-style tags
('The {0} barn is {1!r}.',
'THE {0} BARN IS {1!r}.'),
# HTML entities
('<b>&copy; 2013 edX, &#xa0;</b>',
'<b>&copy; 2013 EDX, &#xa0;</b>'),
)
def test_converter(self, data):
"""
Tests with a simple converter (converts strings to uppercase).
Assert that embedded HTML and python tags are not converted.
"""
c = UpcaseConverter()
test_cases = [
# no tags
('big bad wolf', 'BIG BAD WOLF'),
# one html tag
('big <strong>bad</strong> wolf', 'BIG <strong>BAD</strong> WOLF'),
# two html tags
('big <b>bad</b> <i>wolf</i>', 'BIG <b>BAD</b> <i>WOLF</i>'),
# one python tag
('big %(adjective)s wolf', 'BIG %(adjective)s WOLF'),
# two python tags
('big %(adjective)s %(noun)s', 'BIG %(adjective)s %(noun)s'),
# both kinds of tags
('<strong>big</strong> %(adjective)s %(noun)s',
'<strong>BIG</strong> %(adjective)s %(noun)s'),
# .format-style tags
('The {0} barn is {1!r}.', 'THE {0} BARN IS {1!r}.'),
# HTML entities
('<b>&copy; 2013 edX, &#xa0;</b>', '<b>&copy; 2013 EDX, &#xa0;</b>'),
]
for source, expected in test_cases:
result = c.convert(source)
self.assertEquals(result, expected)
source, expected = data
result = UpcaseConverter().convert(source)
self.assertEquals(result, expected)
# -*- coding: utf-8 -*-
"""Tests of i18n/dummy.py"""
import os, string, random
from unittest import TestCase
import ddt
from polib import POEntry
import dummy
@ddt.ddt
class TestDummy(TestCase):
"""
Tests functionality of i18n/dummy.py
......@@ -13,39 +19,52 @@ class TestDummy(TestCase):
def setUp(self):
self.converter = dummy.Dummy()
def test_dummy(self):
def assertUnicodeEquals(self, str1, str2):
"""Just like assertEquals, but doesn't put Unicode into the fail message.
Either nose, or rake, or something, deals very badly with unusual
Unicode characters in the assertions, so we use repr here to keep
things safe.
"""
self.assertEquals(
str1, str2,
"Mismatch: %r != %r" % (str1, str2),
)
@ddt.data(
(u"hello my name is Bond, James Bond",
u"héllø mý nämé ïs Bønd, Jämés Bønd Ⱡσяєм ι#"),
(u"don't convert <a href='href'>tag ids</a>",
u"døn't çønvért <a href='href'>täg ïds</a> Ⱡσяєм ιρѕυ#"),
(u"don't convert %(name)s tags on %(date)s",
u"døn't çønvért %(name)s tägs øn %(date)s Ⱡσяєм ιρѕ#"),
)
def test_dummy(self, data):
"""
Tests with a dummy converter (adds spurious accents to strings).
Assert that embedded HTML and python tags are not converted.
"""
test_cases = [
("hello my name is Bond, James Bond",
u'h\xe9ll\xf8 m\xfd n\xe4m\xe9 \xefs B\xf8nd, J\xe4m\xe9s B\xf8nd Lorem i#'),
('don\'t convert <a href="href">tag ids</a>',
u'd\xf8n\'t \xe7\xf8nv\xe9rt <a href="href">t\xe4g \xefds</a> Lorem ipsu#'),
('don\'t convert %(name)s tags on %(date)s',
u"d\xf8n't \xe7\xf8nv\xe9rt %(name)s t\xe4gs \xf8n %(date)s Lorem ips#")
]
for source, expected in test_cases:
result = self.converter.convert(source)
self.assertEquals(result, expected)
source, expected = data
result = self.converter.convert(source)
self.assertUnicodeEquals(result, expected)
def test_singular(self):
entry = POEntry()
entry.msgid = 'A lovely day for a cup of tea.'
expected = u'\xc0 l\xf8v\xe9l\xfd d\xe4\xfd f\xf8r \xe4 \xe7\xfcp \xf8f t\xe9\xe4. Lorem i#'
expected = u'À løvélý däý før ä çüp øf téä. Ⱡσяєм ι#'
self.converter.convert_msg(entry)
self.assertEquals(entry.msgstr, expected)
self.assertUnicodeEquals(entry.msgstr, expected)
def test_plural(self):
entry = POEntry()
entry.msgid = 'A lovely day for a cup of tea.'
entry.msgid_plural = 'A lovely day for some cups of tea.'
expected_s = u'\xc0 l\xf8v\xe9l\xfd d\xe4\xfd f\xf8r \xe4 \xe7\xfcp \xf8f t\xe9\xe4. Lorem i#'
expected_p = u'\xc0 l\xf8v\xe9l\xfd d\xe4\xfd f\xf8r s\xf8m\xe9 \xe7\xfcps \xf8f t\xe9\xe4. Lorem ip#'
expected_s = u'À løvélý däý før ä çüp øf téä. Ⱡσяєм ι#'
expected_p = u'À løvélý däý før sømé çüps øf téä. Ⱡσяєм ιρ#'
self.converter.convert_msg(entry)
result = entry.msgstr_plural
self.assertEquals(result['0'], expected_s)
self.assertEquals(result['1'], expected_p)
self.assertUnicodeEquals(result['0'], expected_s)
self.assertUnicodeEquals(result['1'], expected_p)
import os, sys, logging
from unittest import TestCase
from nose.plugins.skip import SkipTest
"""Tests that validate .po files."""
import codecs
import logging
import os
import sys
import textwrap
import polib
from config import LOCALE_DIR
from execute import call
from converter import Converter
def test_po_files(root=LOCALE_DIR):
"""
......@@ -12,20 +20,120 @@ def test_po_files(root=LOCALE_DIR):
log = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
for (dirpath, dirnames, filenames) in os.walk(root):
for dirpath, __, filenames in os.walk(root):
for name in filenames:
(base, ext) = os.path.splitext(name)
__, ext = os.path.splitext(name)
if ext.lower() == '.po':
yield validate_po_file, os.path.join(dirpath, name), log
filename = os.path.join(dirpath, name)
yield msgfmt_check_po_file, filename, log
yield check_messages, filename
def validate_po_file(filename, log):
def msgfmt_check_po_file(filename, log):
"""
Call GNU msgfmt -c on each .po file to validate its format.
Any errors caught by msgfmt are logged to log.
"""
# Use relative paths to make output less noisy.
rfile = os.path.relpath(filename, LOCALE_DIR)
(out, err) = call(['msgfmt','-c', rfile], working_directory=LOCALE_DIR)
out, err = call(['msgfmt', '-c', rfile], working_directory=LOCALE_DIR)
if err != '':
log.warn('\n'+err)
log.info('\n' + out)
log.warn('\n' + err)
assert not err
def tags_in_string(msg):
"""
Return the set of tags in a message string.
Tags includes HTML tags, data placeholders, etc.
Skips tags that might change due to translations: HTML entities, <abbr>,
and so on.
"""
def is_linguistic_tag(tag):
"""Is this tag one that can change with the language?"""
if tag.startswith("&"):
return True
if any(x in tag for x in ["<abbr>", "<abbr ", "</abbr>"]):
return True
return False
__, tags = Converter().detag_string(msg)
return set(t for t in tags if not is_linguistic_tag(t))
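Not part of the diff: a sketch of the intended behaviour, assuming Converter.detag_string() returns the matched tag strings (the sample message is made up):

print tags_in_string(u'Hi <b>%(name)s</b> &amp; <abbr title="x">ex</abbr>')
# -> a set containing '<b>', '</b>' and '%(name)s'; the &amp; entity and
#    the <abbr> markup are treated as linguistic and skipped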
def astral(msg):
"""Does `msg` have characters outside the Basic Multilingual Plane?"""
return any(ord(c) > 0xFFFF for c in msg)
def check_messages(filename):
"""
Checks messages in various ways:
Translations must have the same slots as the English. The translation
must not be empty. Messages can't have astral characters in them.
"""
# Don't check English files.
if "/locale/en/" in filename:
return
# problems will be a list of tuples. Each is a description, and a msgid,
# and then zero or more translations.
problems = []
pomsgs = polib.pofile(filename)
for msg in pomsgs:
# Check for characters Javascript can't support.
# https://code.djangoproject.com/ticket/21725
if astral(msg.msgstr):
problems.append(("Non-BMP char", msg.msgid, msg.msgstr))
if msg.msgid_plural:
# Plurals: two strings in, N strings out.
source = msg.msgid + " | " + msg.msgid_plural
translation = " | ".join(v for k,v in sorted(msg.msgstr_plural.items()))
empty = any(not t.strip() for t in msg.msgstr_plural.values())
else:
# Singular: just one string in and one string out.
source = msg.msgid
translation = msg.msgstr
empty = not msg.msgstr.strip()
if empty:
problems.append(("Empty translation", source))
else:
id_tags = tags_in_string(source)
tx_tags = tags_in_string(translation)
if id_tags != tx_tags:
id_has = u", ".join(u'"{}"'.format(t) for t in id_tags - tx_tags)
tx_has = u", ".join(u'"{}"'.format(t) for t in tx_tags - id_tags)
if id_has and tx_has:
diff = u"{} vs {}".format(id_has, tx_has)
elif id_has:
diff = u"{} missing".format(id_has)
else:
diff = u"{} added".format(tx_has)
problems.append((
"Different tags in source and translation",
source,
translation,
diff
))
if problems:
problem_file = filename.replace(".po", ".prob")
id_filler = textwrap.TextWrapper(width=79, initial_indent=" msgid: ", subsequent_indent=" " * 9)
tx_filler = textwrap.TextWrapper(width=79, initial_indent=" -----> ", subsequent_indent=" " * 9)
with codecs.open(problem_file, "w", encoding="utf8") as prob_file:
for problem in problems:
desc, msgid = problem[:2]
prob_file.write(u"{}\n{}\n".format(desc, id_filler.fill(msgid)))
for translation in problem[2:]:
prob_file.write(u"{}\n".format(tx_filler.fill(translation)))
prob_file.write(u"\n")
assert not problems, "Found %d problems in %s, details in .prob file" % (len(problems), filename)
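Not part of the diff: with the wrappers above, a tag-mismatch entry in the .prob file comes out roughly like this (strings invented for illustration):

Different tags in source and translation
 msgid: Welcome to <strong>{platform_name}</strong>!
 -----> Bienvenue sur <strong>{platform}</strong> !
 -----> "{platform_name}" vs "{platform}"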
......@@ -15,6 +15,7 @@ def push():
def pull():
for locale in CONFIGURATION.locales:
if locale != CONFIGURATION.source_locale:
print "Pulling %s from transifex..." % locale
execute('tx pull -l %s' % locale)
clean_translated_locales()
......
......@@ -89,7 +89,7 @@
$submitButton.
addClass('is-disabled').
prop('disabled', true).
html(gettext('Processing your account information &hellip;'));
html("${_(u'Processing your account information…')}");
}
}
</script>
......