Commit 61afc819 by Ned Batchelder

Merge pull request #2081 from edx/ned/i18n-xmodule

Improvements to the i18n workflow tools.
parents 430cfd75 af120fdf
......@@ -405,7 +405,7 @@ class @Problem
formulaequationinput: (element) ->
$(element).find('input').on 'input', ->
$p = $(element).find('p.status')
`// Translators: the word Answer here is about answering a problem the student must solve.`
`// Translators: the word unanswered here is about answering a problem the student must solve.`
$p.text gettext("unanswered")
$p.parent().removeClass().addClass "unanswered"
......@@ -434,7 +434,7 @@ class @Problem
textline: (element) ->
$(element).find('input').on 'input', ->
$p = $(element).find('p.status')
`// Translators: the word Answer here is about answering a problem the student must solve.`
`// Translators: the word unanswered here is about answering a problem the student must solve.`
$p.text gettext("unanswered")
$p.parent().removeClass().addClass "unanswered"
......
......@@ -21,9 +21,9 @@ class Converter(object):
# HTML: <B>, </B>, <BR/>, <textformat leading="10">
# Python: %(date)s, %(name)s
tag_pattern = re.compile(r'''
(<[-\w" .:?=/]*>) | # <tag>
({[^}]*}) | # {tag}
(%\([^)]*\)\w) | # %(tag)s
(<[^>]+>) | # <tag>
({[^}]+}) | # {tag}
(%\([\w]+\)\w) | # %(tag)s
(&\w+;) | # &entity;
(&\#\d+;) | # &#1234;
(&\#x[0-9a-f]+;) # &#xABCD;
......
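Not part of the diff: a minimal sketch of what the tightened pattern matches, assuming it is compiled with re.VERBOSE as the comment layout suggests.

import re

tag_pattern = re.compile(r'''
    (<[^>]+>) |             # <tag>
    ({[^}]+}) |             # {tag}
    (%\([\w]+\)\w) |        # %(tag)s
    (&\w+;) |               # &entity;
    (&\#\d+;) |             # &#1234;
    (&\#x[0-9a-f]+;)        # &#xABCD;
    ''', re.VERBOSE)

sample = 'Hi <a href="x">%(name)s</a>, see {section} &amp; &#xa0;'
print [m.group(0) for m in tag_pattern.finditer(sample)]
# ['<a href="x">', '%(name)s', '</a>', '{section}', '&amp;', '&#xa0;']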
from converter import Converter
# -*- coding: utf-8 -*-
r"""
Creates new localization properties files in a dummy language.
Each property file is derived from the equivalent en_US file, with these
transformations applied:
1. Every vowel is replaced with an equivalent with extra accent marks.
# Creates new localization properties files in a dummy language
# Each property file is derived from the equivalent en_US file, except
# 1. Every vowel is replaced with an equivalent with extra accent marks
# 2. Every string is padded out to +30% length to simulate verbose languages (e.g. German)
# to see if layout and flows work properly
# 3. Every string is terminated with a '#' character to make it easier to detect truncation
2. Every string is padded out to +30% length to simulate verbose languages
(such as German) to see if layout and flows work properly.
3. Every string is terminated with a '#' character to make it easier to detect
truncation.
# --------------------------------
# Example use:
# >>> from dummy import Dummy
# >>> c = Dummy()
# >>> c.convert("hello my name is Bond, James Bond")
# u'h\xe9ll\xf6 my n\xe4m\xe9 \xefs B\xf6nd, J\xe4m\xe9s B\xf6nd Lorem i#'
#
# >>> c.convert('don\'t convert <a href="href">tag ids</a>')
# u'd\xf6n\'t \xe7\xf6nv\xe9rt <a href="href">t\xe4g \xefds</a> Lorem ipsu#'
#
# >>> c.convert('don\'t convert %(name)s tags on %(date)s')
# u"d\xf6n't \xe7\xf6nv\xe9rt %(name)s t\xe4gs \xf6n %(date)s Lorem ips#"
Example use::
>>> from dummy import Dummy
>>> c = Dummy()
>>> c.convert("My name is Bond, James Bond")
u'M\xfd n\xe4m\xe9 \xefs B\xf8nd, J\xe4m\xe9s B\xf8nd \u2360\u03c3\u044f\u0454\u043c \u03b9\u03c1#'
>>> print c.convert("My name is Bond, James Bond")
Mý nämé ïs Bønd, Jämés Bønd Ⱡσяєм ιρ#
>>> print c.convert("don't convert <a href='href'>tag ids</a>")
døn't çønvért <a href='href'>täg ïds</a> Ⱡσяєм ιρѕυ#
>>> print c.convert("don't convert %(name)s tags on %(date)s")
døn't çønvért %(name)s tägs øn %(date)s Ⱡσяєм ιρѕ#
"""
from converter import Converter
# Substitute plain characters with accented lookalikes.
# http://tlt.its.psu.edu/suggestions/international/web/codehtml.html#accent
TABLE = {'A': u'\xC0',
'a': u'\xE4',
'b': u'\xDF',
'C': u'\xc7',
'c': u'\xE7',
'E': u'\xC9',
'e': u'\xE9',
'I': U'\xCC',
'i': u'\xEF',
'O': u'\xD8',
'o': u'\xF8',
'U': u'\xDB',
'u': u'\xFC',
'Y': u'\xDD',
'y': u'\xFD',
}
TABLE = {
'A': u'À',
'a': u'ä',
'b': u'ß',
'C': u'Ç',
'c': u'ç',
'E': u'É',
'e': u'é',
'I': u'Ì',
'i': u'ï',
'O': u'Ø',
'o': u'ø',
'U': u'Û',
'u': u'ü',
'Y': u'Ý',
'y': u'ý',
}
# The print industry's standard dummy text, in use since the 1500s
# see http://www.lipsum.com/
LOREM = ' Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed ' \
'do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad ' \
'minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ' \
'ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate ' \
'velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat ' \
'cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. '
# see http://www.lipsum.com/, then fed through a "fancy-text" converter.
# The string should start with a space.
LOREM = " " + " ".join( # join and split just make the string easier here.
u"""
Ⱡσяєм ιρѕυм ∂σłσя ѕιт αмєт, ¢σηѕє¢тєтυя α∂ιριѕι¢ιηg єłιт, ѕє∂ ∂σ єιυѕмσ∂
тємρσя ιη¢ι∂ι∂υηт υт łαвσяє єт ∂σłσяє мαgηα αłιqυα. υт єηιм α∂ мιηιм
νєηιαм, qυιѕ ησѕтяυ∂ єχєя¢ιтαтιση υłłαм¢σ łαвσяιѕ ηιѕι υт αłιqυιρ єχ єα
¢σммσ∂σ ¢σηѕєqυαт. ∂υιѕ αυтє ιяυяє ∂σłσя ιη яєρяєнєη∂єяιт ιη νσłυρтαтє
νєłιт єѕѕє ¢ιłłυм ∂σłσяє єυ ƒυgιαт ηυłłα ραяιαтυя. єχ¢єρтєυя ѕιηт σ¢¢αє¢αт
¢υρι∂αтαт ηση ρяσι∂єηт, ѕυηт ιη ¢υłρα qυι σƒƒι¢ια ∂єѕєяυηт мσłłιт αηιм ι∂
єѕт łαвσяυм.
""".split()
)
# To simulate more verbose languages (like German), pad the length of a string
# by a multiple of PAD_FACTOR
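Not part of the diff: a rough sketch of the padding idea described above. The real pad() implementation sits outside this hunk, and PAD_FACTOR = 1.3 is an assumption taken from the "+30%" in the module docstring.

def pad(string):
    """Stretch `string` to about PAD_FACTOR times its length with LOREM text, ending in '#'."""
    PAD_FACTOR = 1.3                          # assumed value, matching the "+30%" above
    target = int(len(string) * PAD_FACTOR)
    padded = (string + LOREM)[:target - 1]    # leave room for the trailing '#'
    return padded + '#'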
......@@ -85,20 +99,6 @@ class Dummy(Converter):
"""replaces the final char of string with #"""
return string[:-1] + '#'
def init_msgs(self, msgs):
"""
Make sure the first msg in msgs has a plural property.
msgs is list of instances of polib.POEntry
"""
if not msgs:
return
headers = msgs[0].get_property('msgstr')
has_plural = any(header.startswith('Plural-Forms:') for header in headers)
if not has_plural:
# Apply declaration for English pluralization rules
plural = "Plural-Forms: nplurals=2; plural=(n != 1);\\n"
headers.append(plural)
def convert_msg(self, msg):
"""
Takes one POEntry object and converts it (adds a dummy translation to it)
......@@ -114,8 +114,10 @@ class Dummy(Converter):
# translate singular and plural
foreign_single = self.convert(source)
foreign_plural = self.convert(plural)
plural = {'0': self.final_newline(source, foreign_single),
'1': self.final_newline(plural, foreign_plural)}
plural = {
'0': self.final_newline(source, foreign_single),
'1': self.final_newline(plural, foreign_plural),
}
msg.msgstr_plural = plural
else:
foreign = self.convert(source)
......
......@@ -45,7 +45,7 @@ def main():
remove_file(source_msgs_dir.joinpath(filename))
# Extract strings from mako templates.
babel_mako_cmd = 'pybabel extract -F %s -c "TRANSLATORS:" . -o %s' % (BABEL_CONFIG, BABEL_OUT)
babel_mako_cmd = 'pybabel extract -F %s -c "Translators:" . -o %s' % (BABEL_CONFIG, BABEL_OUT)
# Extract strings from django source files.
make_django_cmd = (
......
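Not part of the diff: pybabel's -c option keeps only comments that begin with the given tag, so the extraction tag now matches the "Translators:" comments used in the source (as in the CoffeeScript hunks above). An illustrative snippet, with the comment mirroring the one above and the identifiers invented:

# Translators: the word unanswered here is about answering a problem
# the student must solve.
status = _("unanswered")

The extracted .po entry then carries the note for translators:

#. Translators: the word unanswered here is about answering a problem
#. the student must solve.
msgid "unanswered"
msgstr ""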
......@@ -60,9 +60,12 @@ def merge(locale, target='django.po', fail_if_missing=True):
def clean_metadata(file):
"""
Clean up redundancies in the metadata caused by merging.
This reads in a PO file and simply saves it back out again.
"""
pofile(file).save()
# Reading in the .po file and saving it again fixes redundancies.
pomsgs = pofile(file)
# The msgcat tool marks the metadata as fuzzy, but it's ok as it is.
pomsgs.metadata_is_fuzzy = False
pomsgs.save()
def validate_files(dir, files_to_merge):
......
......@@ -38,9 +38,15 @@ def main(file, locale):
raise IOError('File does not exist: %s' % file)
pofile = polib.pofile(file)
converter = Dummy()
converter.init_msgs(pofile.translated_entries())
for msg in pofile:
converter.convert_msg(msg)
# If any message has a plural, then the file needs plural information.
# Apply declaration for English pluralization rules so that ngettext will
# do something reasonable.
if any(m.msgid_plural for m in pofile):
pofile.metadata['Plural-Forms'] = 'nplurals=2; plural=(n != 1);'
new_file = new_filename(file, locale)
create_dir_if_necessary(new_file)
pofile.save(new_file)
......
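Not part of the diff: with that metadata entry set, the generated dummy catalog's header carries the same declaration the removed init_msgs() used to append, roughly:

"Plural-Forms: nplurals=2; plural=(n != 1);\n"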
"""Tests of i18n/converter.py"""
import os
from unittest import TestCase
import ddt
import converter
......@@ -11,36 +14,48 @@ class UpcaseConverter(converter.Converter):
return string.upper()
@ddt.ddt
class TestConverter(TestCase):
"""
Tests functionality of i18n/converter.py
"""
def test_converter(self):
@ddt.data(
# no tags
('big bad wolf',
'BIG BAD WOLF'),
# one html tag
('big <strong>bad</strong> wolf',
'BIG <strong>BAD</strong> WOLF'),
# two html tags
('big <b>bad</b> gray <i>wolf</i>',
'BIG <b>BAD</b> GRAY <i>WOLF</i>'),
# html tags with attributes
('<a href="foo">bar</a> baz',
'<a href="foo">BAR</a> BAZ'),
("<a href='foo'>bar</a> baz",
"<a href='foo'>BAR</a> BAZ"),
# one python tag
('big %(adjective)s wolf',
'BIG %(adjective)s WOLF'),
# two python tags
('big %(adjective)s gray %(noun)s',
'BIG %(adjective)s GRAY %(noun)s'),
# both kinds of tags
('<strong>big</strong> %(adjective)s %(noun)s',
'<strong>BIG</strong> %(adjective)s %(noun)s'),
# .format-style tags
('The {0} barn is {1!r}.',
'THE {0} BARN IS {1!r}.'),
# HTML entities
('<b>&copy; 2013 edX, &#xa0;</b>',
'<b>&copy; 2013 EDX, &#xa0;</b>'),
)
def test_converter(self, data):
"""
Tests with a simple converter (converts strings to uppercase).
Assert that embedded HTML and python tags are not converted.
"""
c = UpcaseConverter()
test_cases = [
# no tags
('big bad wolf', 'BIG BAD WOLF'),
# one html tag
('big <strong>bad</strong> wolf', 'BIG <strong>BAD</strong> WOLF'),
# two html tags
('big <b>bad</b> <i>wolf</i>', 'BIG <b>BAD</b> <i>WOLF</i>'),
# one python tag
('big %(adjective)s wolf', 'BIG %(adjective)s WOLF'),
# two python tags
('big %(adjective)s %(noun)s', 'BIG %(adjective)s %(noun)s'),
# both kinds of tags
('<strong>big</strong> %(adjective)s %(noun)s',
'<strong>BIG</strong> %(adjective)s %(noun)s'),
# .format-style tags
('The {0} barn is {1!r}.', 'THE {0} BARN IS {1!r}.'),
# HTML entities
('<b>&copy; 2013 edX, &#xa0;</b>', '<b>&copy; 2013 EDX, &#xa0;</b>'),
]
for source, expected in test_cases:
result = c.convert(source)
self.assertEquals(result, expected)
source, expected = data
result = UpcaseConverter().convert(source)
self.assertEquals(result, expected)
# -*- coding: utf-8 -*-
"""Tests of i18n/dummy.py"""
import os, string, random
from unittest import TestCase
import ddt
from polib import POEntry
import dummy
@ddt.ddt
class TestDummy(TestCase):
"""
Tests functionality of i18n/dummy.py
......@@ -13,39 +19,52 @@ class TestDummy(TestCase):
def setUp(self):
self.converter = dummy.Dummy()
def test_dummy(self):
def assertUnicodeEquals(self, str1, str2):
"""Just like assertEquals, but doesn't put Unicode into the fail message.
Either nose, or rake, or something, deals very badly with unusual
Unicode characters in the assertions, so we use repr here to keep
things safe.
"""
self.assertEquals(
str1, str2,
"Mismatch: %r != %r" % (str1, str2),
)
@ddt.data(
(u"hello my name is Bond, James Bond",
u"héllø mý nämé ïs Bønd, Jämés Bønd Ⱡσяєм ι#"),
(u"don't convert <a href='href'>tag ids</a>",
u"døn't çønvért <a href='href'>täg ïds</a> Ⱡσяєм ιρѕυ#"),
(u"don't convert %(name)s tags on %(date)s",
u"døn't çønvért %(name)s tägs øn %(date)s Ⱡσяєм ιρѕ#"),
)
def test_dummy(self, data):
"""
Tests with a dummy converter (adds spurious accents to strings).
Assert that embedded HTML and python tags are not converted.
"""
test_cases = [
("hello my name is Bond, James Bond",
u'h\xe9ll\xf8 m\xfd n\xe4m\xe9 \xefs B\xf8nd, J\xe4m\xe9s B\xf8nd Lorem i#'),
('don\'t convert <a href="href">tag ids</a>',
u'd\xf8n\'t \xe7\xf8nv\xe9rt <a href="href">t\xe4g \xefds</a> Lorem ipsu#'),
('don\'t convert %(name)s tags on %(date)s',
u"d\xf8n't \xe7\xf8nv\xe9rt %(name)s t\xe4gs \xf8n %(date)s Lorem ips#")
]
for source, expected in test_cases:
result = self.converter.convert(source)
self.assertEquals(result, expected)
source, expected = data
result = self.converter.convert(source)
self.assertUnicodeEquals(result, expected)
def test_singular(self):
entry = POEntry()
entry.msgid = 'A lovely day for a cup of tea.'
expected = u'\xc0 l\xf8v\xe9l\xfd d\xe4\xfd f\xf8r \xe4 \xe7\xfcp \xf8f t\xe9\xe4. Lorem i#'
expected = u'À løvélý däý før ä çüp øf téä. Ⱡσяєм ι#'
self.converter.convert_msg(entry)
self.assertEquals(entry.msgstr, expected)
self.assertUnicodeEquals(entry.msgstr, expected)
def test_plural(self):
entry = POEntry()
entry.msgid = 'A lovely day for a cup of tea.'
entry.msgid_plural = 'A lovely day for some cups of tea.'
expected_s = u'\xc0 l\xf8v\xe9l\xfd d\xe4\xfd f\xf8r \xe4 \xe7\xfcp \xf8f t\xe9\xe4. Lorem i#'
expected_p = u'\xc0 l\xf8v\xe9l\xfd d\xe4\xfd f\xf8r s\xf8m\xe9 \xe7\xfcps \xf8f t\xe9\xe4. Lorem ip#'
expected_s = u'À løvélý däý før ä çüp øf téä. Ⱡσяєм ι#'
expected_p = u'À løvélý däý før sømé çüps øf téä. Ⱡσяєм ιρ#'
self.converter.convert_msg(entry)
result = entry.msgstr_plural
self.assertEquals(result['0'], expected_s)
self.assertEquals(result['1'], expected_p)
self.assertUnicodeEquals(result['0'], expected_s)
self.assertUnicodeEquals(result['1'], expected_p)
import os, sys, logging
from unittest import TestCase
from nose.plugins.skip import SkipTest
"""Tests that validate .po files."""
import codecs
import logging
import os
import sys
import textwrap
import polib
from config import LOCALE_DIR
from execute import call
from converter import Converter
def test_po_files(root=LOCALE_DIR):
"""
......@@ -12,20 +20,120 @@ def test_po_files(root=LOCALE_DIR):
log = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
for (dirpath, dirnames, filenames) in os.walk(root):
for dirpath, __, filenames in os.walk(root):
for name in filenames:
(base, ext) = os.path.splitext(name)
__, ext = os.path.splitext(name)
if ext.lower() == '.po':
yield validate_po_file, os.path.join(dirpath, name), log
filename = os.path.join(dirpath, name)
yield msgfmt_check_po_file, filename, log
yield check_messages, filename
def validate_po_file(filename, log):
def msgfmt_check_po_file(filename, log):
"""
Call GNU msgfmt -c on each .po file to validate its format.
Any errors caught by msgfmt are logged to log.
"""
# Use relative paths to make output less noisy.
rfile = os.path.relpath(filename, LOCALE_DIR)
(out, err) = call(['msgfmt','-c', rfile], working_directory=LOCALE_DIR)
out, err = call(['msgfmt', '-c', rfile], working_directory=LOCALE_DIR)
if err != '':
log.warn('\n'+err)
log.info('\n' + out)
log.warn('\n' + err)
assert not err
def tags_in_string(msg):
"""
Return the set of tags in a message string.
Tags includes HTML tags, data placeholders, etc.
Skips tags that might change due to translations: HTML entities, <abbr>,
and so on.
"""
def is_linguistic_tag(tag):
"""Is this tag one that can change with the language?"""
if tag.startswith("&"):
return True
if any(x in tag for x in ["<abbr>", "<abbr ", "</abbr>"]):
return True
return False
__, tags = Converter().detag_string(msg)
return set(t for t in tags if not is_linguistic_tag(t))
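Not part of the diff: a sketch of the intended behaviour, assuming Converter.detag_string() returns the matched tag strings (the sample message is made up):

print tags_in_string(u'Hi <b>%(name)s</b> &amp; <abbr title="x">ex</abbr>')
# -> a set containing '<b>', '</b>' and '%(name)s'; the &amp; entity and
#    the <abbr> markup are treated as linguistic and skipped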
def astral(msg):
"""Does `msg` have characters outside the Basic Multilingual Plane?"""
return any(ord(c) > 0xFFFF for c in msg)
def check_messages(filename):
"""
Checks messages in various ways:
Translations must have the same slots as the English. The translation
must not be empty. Messages can't have astral characters in them.
"""
# Don't check English files.
if "/locale/en/" in filename:
return
# problems will be a list of tuples. Each is a description, and a msgid,
# and then zero or more translations.
problems = []
pomsgs = polib.pofile(filename)
for msg in pomsgs:
# Check for characters Javascript can't support.
# https://code.djangoproject.com/ticket/21725
if astral(msg.msgstr):
problems.append(("Non-BMP char", msg.msgid, msg.msgstr))
if msg.msgid_plural:
# Plurals: two strings in, N strings out.
source = msg.msgid + " | " + msg.msgid_plural
translation = " | ".join(v for k,v in sorted(msg.msgstr_plural.items()))
empty = any(not t.strip() for t in msg.msgstr_plural.values())
else:
# Singular: just one string in and one string out.
source = msg.msgid
translation = msg.msgstr
empty = not msg.msgstr.strip()
if empty:
problems.append(("Empty translation", source))
else:
id_tags = tags_in_string(source)
tx_tags = tags_in_string(translation)
if id_tags != tx_tags:
id_has = u", ".join(u'"{}"'.format(t) for t in id_tags - tx_tags)
tx_has = u", ".join(u'"{}"'.format(t) for t in tx_tags - id_tags)
if id_has and tx_has:
diff = u"{} vs {}".format(id_has, tx_has)
elif id_has:
diff = u"{} missing".format(id_has)
else:
diff = u"{} added".format(tx_has)
problems.append((
"Different tags in source and translation",
source,
translation,
diff
))
if problems:
problem_file = filename.replace(".po", ".prob")
id_filler = textwrap.TextWrapper(width=79, initial_indent=" msgid: ", subsequent_indent=" " * 9)
tx_filler = textwrap.TextWrapper(width=79, initial_indent=" -----> ", subsequent_indent=" " * 9)
with codecs.open(problem_file, "w", encoding="utf8") as prob_file:
for problem in problems:
desc, msgid = problem[:2]
prob_file.write(u"{}\n{}\n".format(desc, id_filler.fill(msgid)))
for translation in problem[2:]:
prob_file.write(u"{}\n".format(tx_filler.fill(translation)))
prob_file.write(u"\n")
assert not problems, "Found %d problems in %s, details in .prob file" % (len(problems), filename)
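Not part of the diff: with the wrappers above, a tag-mismatch entry in the .prob file comes out roughly like this (strings invented for illustration):

Different tags in source and translation
 msgid: Welcome to <strong>{platform_name}</strong>!
 -----> Bienvenue sur <strong>{platform}</strong> !
 -----> "{platform_name}" vs "{platform}"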
......@@ -15,6 +15,7 @@ def push():
def pull():
for locale in CONFIGURATION.locales:
if locale != CONFIGURATION.source_locale:
print "Pulling %s from transifex..." % locale
execute('tx pull -l %s' % locale)
clean_translated_locales()
......
......@@ -89,7 +89,7 @@
$submitButton.
addClass('is-disabled').
prop('disabled', true).
html(gettext('Processing your account information &hellip;'));
html("${_(u'Processing your account information…')}");
}
}
</script>
......