Commit 9491ca11 by Ned Batchelder

Merge pull request #2235 from edx/ned/i18n-tool-improvements

Added a .po segmenting tool
parents 8e707b28 9149d57b
......@@ -35,6 +35,9 @@ conf/locale/en/LC_MESSAGES/*.po
### Remove when we have real Esperanto translations. For now, ignore
### dummy Esperanto files.
conf/locale/eo/*
## Remove when we officially support these languages.
conf/locale/fr
conf/locale/ko_KR
### Testing artifacts
.testids/
......
......@@ -7,18 +7,36 @@ source_file = conf/locale/en/LC_MESSAGES/django-partial.po
source_lang = en
type = PO
[edx-platform.django-studio]
file_filter = conf/locale/<lang>/LC_MESSAGES/django-studio.po
source_file = conf/locale/en/LC_MESSAGES/django-studio.po
source_lang = en
type = PO
[edx-platform.djangojs]
file_filter = conf/locale/<lang>/LC_MESSAGES/djangojs.po
source_file = conf/locale/en/LC_MESSAGES/djangojs.po
source_lang = en
type = PO
[edx-platform.djangojs-studio]
file_filter = conf/locale/<lang>/LC_MESSAGES/djangojs-studio.po
source_file = conf/locale/en/LC_MESSAGES/djangojs-studio.po
source_lang = en
type = PO
[edx-platform.mako]
file_filter = conf/locale/<lang>/LC_MESSAGES/mako.po
source_file = conf/locale/en/LC_MESSAGES/mako.po
source_lang = en
type = PO
[edx-platform.mako-studio]
file_filter = conf/locale/<lang>/LC_MESSAGES/mako-studio.po
source_file = conf/locale/en/LC_MESSAGES/mako-studio.po
source_lang = en
type = PO
[edx-platform.messages]
file_filter = conf/locale/<lang>/LC_MESSAGES/messages.po
source_file = conf/locale/en/LC_MESSAGES/messages.po
......
{
"locales" : ["en"],
"dummy-locale" : "eo"
}
This file is now at config.yaml in the same directory.
# Configuration for i18n workflow.
locales:
- en
- fr
- ko_KR
# More languages we might want someday, these have started on Transifex.
# ru
# es_419
# ja_JP
# pt_BR
# zh_CN
# zh_TW
# ar
# es_ES
# fa_IR
# tr_TR
# de_DE
# id
# hi
# vi
# pt_PT
# lt_LT
# gl
# it_IT
# cs
# et_EE
# nb
# sk
# The locale used for fake-accented English, for testing.
dummy-locale: eo
# Directories we don't search for strings.
ignore_dirs:
- docs
- src
- i18n
- test_root
- common/static/xmodule/modules
- common/static/xmodule/descriptors
# How should .po files be segmented? See i18n/segment.py for details. Strings
# that are only found in a particular segment are segregated into that .po file
# so that translators can focus on separate parts of the product.
#
# We segregate Studio so we can provide new languages for LMS without having to
# also translate the Studio strings. LMS needs the strings from lms/* and
# common/*, so those will stay in the main .po file.
segment:
django-partial.po: # This .po file..
django-studio.po: # produces this .po file..
- cms/* # by segregating strings from these files.
# Anything that doesn't match a pattern stays in the original file.
djangojs.po:
djangojs-studio.po:
- cms/*
mako.po:
mako-studio.po:
- cms/*
# How should the generate step merge files?
generate_merge:
django.po:
- django-partial.po
- django-studio.po
- mako.po
- mako-studio.po
- messages.po
djangojs.po:
- djangojs.po
- djangojs-studio.po
import os
import json
import yaml
from path import path
# BASE_DIR is the working directory to execute django-admin commands from.
......@@ -13,10 +14,15 @@ LOCALE_DIR = BASE_DIR.joinpath('conf', 'locale')
class Configuration(object):
"""
# Reads localization configuration in json format
Reads localization configuration in json format.
"""
_source_locale = 'en'
DEFAULTS = {
'generate_merge': {},
'ignore_dirs': [],
'locales': ['en'],
'segment': {},
'source_locale': 'en',
}
def __init__(self, filename):
self._filename = filename
......@@ -29,24 +35,12 @@ class Configuration(object):
if not os.path.exists(filename):
raise Exception("Configuration file cannot be found: %s" % filename)
with open(filename) as stream:
return json.load(stream)
return yaml.safe_load(stream)
@property
def locales(self):
"""
Returns a list of locales declared in the configuration file,
e.g. ['en', 'fr', 'es']
Each locale is a string.
"""
return self._config['locales']
@property
def source_locale(self):
"""
Returns source language.
Source language is English.
"""
return self._source_locale
def __getattr__(self, name):
if name in self.DEFAULTS:
return self._config.get(name, self.DEFAULTS[name])
raise AttributeError("Configuration has no such setting: {!r}".format(name))
@property
def dummy_locale(self):
......@@ -76,4 +70,4 @@ class Configuration(object):
return self.get_messages_dir(self.source_locale)
CONFIGURATION = Configuration(LOCALE_DIR.joinpath('config').normpath())
CONFIGURATION = Configuration(LOCALE_DIR.joinpath('config.yaml').normpath())
......@@ -11,7 +11,7 @@ def execute(command, working_directory=BASE_DIR):
Output is ignored.
"""
LOG.info(command)
subprocess.check_call(command, cwd=working_directory, stderr=sys.STDOUT, shell=True)
subprocess.check_call(command, cwd=working_directory, stderr=subprocess.STDOUT, shell=True)
def call(command, working_directory=BASE_DIR):
......
......@@ -21,6 +21,7 @@ from polib import pofile
from i18n.config import BASE_DIR, LOCALE_DIR, CONFIGURATION
from i18n.execute import execute, create_dir_if_necessary, remove_file
from i18n.segment import segment_pofiles
# BABEL_CONFIG contains declarations for Babel to extract strings from mako template files
......@@ -31,7 +32,7 @@ BABEL_CONFIG = BASE_DIR.relpathto(LOCALE_DIR.joinpath('babel.cfg'))
# Use relpath to reduce noise in logs
BABEL_OUT = BASE_DIR.relpathto(CONFIGURATION.source_messages_dir.joinpath('mako.po'))
SOURCE_WARN = 'This English source file is machine-generated. Do not check it into github'
SOURCE_WARN = 'This English source file is machine-generated. Do not check it into git.'
LOG = logging.getLogger(__name__)
......@@ -40,15 +41,13 @@ def main():
create_dir_if_necessary(LOCALE_DIR)
source_msgs_dir = CONFIGURATION.source_messages_dir
remove_file(source_msgs_dir.joinpath('django.po'))
generated_files = ('django-partial.po', 'djangojs.po', 'mako.po')
for filename in generated_files:
remove_file(source_msgs_dir.joinpath(filename))
generated_files = ['django-partial.po', 'djangojs.po', 'mako.po']
# Prepare makemessages command.
ignore_dirs = ["docs", "src", "i18n", "test_root"]
ignores = " ".join("--ignore={}/*".format(d) for d in ignore_dirs)
makemessages = 'django-admin.py makemessages -l en ' + ignores
makemessages = "django-admin.py makemessages -l en"
ignores = " ".join('--ignore="{}/*"'.format(d) for d in CONFIGURATION.ignore_dirs)
if ignores:
makemessages += " " + ignores
# Extract strings from mako templates.
babel_mako_cmd = 'pybabel extract -F %s -c "Translators:" . -o %s' % (BABEL_CONFIG, BABEL_OUT)
......@@ -69,6 +68,11 @@ def main():
source_msgs_dir.joinpath('django-partial.po')
)
# Segment the generated files.
segmented_files = segment_pofiles("en")
generated_files.extend(segmented_files)
# Finish each file.
for filename in generated_files:
LOG.info('Cleaning %s' % filename)
po = pofile(source_msgs_dir.joinpath(filename))
......@@ -80,6 +84,7 @@ def main():
strip_key_strings(po)
po.save()
def fix_header(po):
"""
Replace default headers with edX headers
......
......@@ -22,10 +22,10 @@ from i18n.execute import execute
LOG = logging.getLogger(__name__)
def merge(locale, target='django.po', fail_if_missing=True):
def merge(locale, target='django.po', sources=('django-partial.po',), fail_if_missing=True):
"""
For the given locale, merge django-partial.po, messages.po, mako.po -> django.po
target is the resulting filename
For the given locale, merge the `sources` files to become the `target`
file. Note that the target file might also be one of the sources.
If fail_if_missing is true, and the files to be merged are missing,
throw an Exception, otherwise return silently.
......@@ -34,18 +34,17 @@ def merge(locale, target='django.po', fail_if_missing=True):
just return silently.
"""
LOG.info('Merging locale={0}'.format(locale))
LOG.info('Merging {target} for locale {locale}'.format(target=target, locale=locale))
locale_directory = CONFIGURATION.get_messages_dir(locale)
files_to_merge = ('django-partial.po', 'messages.po', 'mako.po')
try:
validate_files(locale_directory, files_to_merge)
validate_files(locale_directory, sources)
except Exception, e:
if not fail_if_missing:
return
raise e
# merged file is merged.po
merge_cmd = 'msgcat -o merged.po ' + ' '.join(files_to_merge)
merge_cmd = 'msgcat -o merged.po ' + ' '.join(sources)
execute(merge_cmd, working_directory=locale_directory)
# clean up redunancies in the metadata
......@@ -53,8 +52,16 @@ def merge(locale, target='django.po', fail_if_missing=True):
clean_metadata(merged_filename)
# rename merged.po -> django.po (default)
django_filename = locale_directory.joinpath(target)
os.rename(merged_filename, django_filename) # can't overwrite file on Windows
target_filename = locale_directory.joinpath(target)
os.rename(merged_filename, target_filename)
def merge_files(locale, fail_if_missing=True):
"""
Merge all the files in `locale`, as specified in config.yaml.
"""
for target, sources in CONFIGURATION.generate_merge.items():
merge(locale, target, sources, fail_if_missing)
def clean_metadata(file):
......@@ -85,9 +92,10 @@ def main():
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
for locale in CONFIGURATION.locales:
merge(locale)
merge_files(locale)
# Dummy text is not required. Don't raise exception if files are missing.
merge(CONFIGURATION.dummy_locale, fail_if_missing=False)
merge_files(CONFIGURATION.dummy_locale, fail_if_missing=False)
compile_cmd = 'django-admin.py compilemessages'
execute(compile_cmd, working_directory=BASE_DIR)
......
#!/usr/bin/env python
"""
Segment a .po file to produce smaller files based on the locations of the
messages.
"""
import copy
import fnmatch
import logging
import sys
import polib
from i18n.config import CONFIGURATION
LOG = logging.getLogger(__name__)
def segment_pofiles(locale):
"""Segment all the pofiles for `locale`.
Returns a set of filenames, all the segment files written.
"""
files_written = set()
for filename, segments in CONFIGURATION.segment.items():
filename = CONFIGURATION.get_messages_dir(locale) / filename
files_written.update(segment_pofile(filename, segments))
return files_written
def segment_pofile(filename, segments):
"""Segment a .po file using patterns in `segments`.
The .po file at `filename` is read, and the occurrence locations of its
messages are examined. `segments` is a dictionary: the keys are segment
.po filenames, the values are lists of patterns::
{
'django-studio.po': [
'cms/*',
'some-other-studio-place/*',
],
'django-weird.po': [
'*/weird_*.*',
],
}
If all a message's occurrences match the patterns for a segment, then that
message is written to the new segmented .po file.
Any message that matches no segments, or more than one, is written back to
the original file.
Arguments:
filename (path.path): a path object referring to the original .po file.
segments (dict): specification of the segments to create.
Returns:
a set of path objects, all the segment files written.
"""
reading_msg = "Reading {num} entries from {file}"
writing_msg = "Writing {num} entries to {file}"
source_po = polib.pofile(filename)
LOG.info(reading_msg.format(file=filename, num=len(source_po)))
# A new pofile just like the source, but with no messages. We'll put
# anything not segmented into this file.
remaining_po = copy.deepcopy(source_po)
remaining_po[:] = []
# Turn the segments dictionary into two structures: segment_patterns is a
# list of (pattern, segmentfile) pairs. segment_po_files is a dict mapping
# segment file names to pofile objects of their contents.
segment_po_files = {filename: remaining_po}
segment_patterns = []
for segmentfile, patterns in segments.items():
segment_po_files[segmentfile] = copy.deepcopy(remaining_po)
segment_patterns.extend((pat, segmentfile) for pat in patterns)
# Examine each message in the source file. If all of its occurrences match
# a pattern for the same segment, it goes in that segment. Otherwise, it
# goes in remaining.
for msg in source_po:
msg_segments = set()
for occ_file, _ in msg.occurrences:
for pat, segment_file in segment_patterns:
if fnmatch.fnmatch(occ_file, pat):
msg_segments.add(segment_file)
break
else:
msg_segments.add(filename)
if len(msg_segments) == 1:
# This message belongs in this segment.
segment_file = msg_segments.pop()
segment_po_files[segment_file].append(msg)
else:
# Either it's in more than one segment, or none, so put it back in
# the main file.
remaining_po.append(msg)
# Write out the results.
files_written = set()
for segment_file, pofile in segment_po_files.items():
out_file = filename.dirname() / segment_file
if len(pofile) == 0:
LOG.error("No messages to write to {file}, did you run segment twice?".format(file=out_file))
else:
LOG.info(writing_msg.format(file=out_file, num=len(pofile)))
pofile.save(out_file)
files_written.add(out_file)
LOG.info(writing_msg.format(file=filename, num=len(remaining_po)))
remaining_po.save(filename)
return files_written
def main(argv):
"""
$ segment.py LOCALE [...]
Segment the .po files in LOCALE(s) based on the segmenting rules in
config.yaml.
Note that segmenting is *not* idempotent: it modifies the input file, so
be careful that you don't run it twice on the same file.
"""
# This is used as a tool only to segment translation files when adding a
# new segment. In the regular workflow, the work is done by the extract
# phase calling the functions above.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
if len(argv) < 2:
sys.exit("Need a locale to segment")
for locale in argv[1:]:
segment_pofiles(locale)
if __name__ == "__main__":
main(sys.argv)
# This is test data.
#
msgid ""
msgstr ""
"Project-Id-Version: 0.1a\n"
"Report-Msgid-Bugs-To: openedx-translation@googlegroups.com\n"
"POT-Creation-Date: 2014-01-22 15:35-0500\n"
"PO-Revision-Date: 2014-01-22 20:35:52.096456\n"
"Last-Translator: \n"
"Language-Team: openedx-translation <openedx-translation@googlegroups.com>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Language: en\n"
#: cms/djangoapps/contentstore/views/tabs.py:39
#: lms/djangoapps/instructor/views/instructor_dashboard.py:111
msgid "Course Info"
msgstr "stuff about the course"
#: common/djangoapps/course_modes/models.py:43
msgid "Honor Code Certificate"
msgstr "your paper"
#: common/djangoapps/course_modes/views.py:81
#: common/djangoapps/student/views.py:478
msgid "Enrollment is closed"
msgstr "no way, dude"
#: common/static/js/vendor/mathjax-MathJax-c9db6ac/docs/source/mjtheme/layout.html:129
#: lms/templates/wiki/plugins/attachments/index.html:40
msgid "Search"
msgstr "find it!"
#: lms/djangoapps/courseware/features/video.py:111
msgid "ERROR: No playable video sources found!"
msgstr "try youtube, dude!"
# This is test data.
#
msgid ""
msgstr ""
"Project-Id-Version: 0.1a\n"
"Report-Msgid-Bugs-To: openedx-translation@googlegroups.com\n"
"POT-Creation-Date: 2014-01-22 15:35-0500\n"
"PO-Revision-Date: 2014-01-22 20:35:52.096456\n"
"Last-Translator: \n"
"Language-Team: openedx-translation <openedx-translation@googlegroups.com>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Language: en\n"
#: cms/djangoapps/contentstore/views/tabs.py:39
#: lms/djangoapps/instructor/views/instructor_dashboard.py:111
msgid "Course Info"
msgstr "stuff about the course"
#: common/djangoapps/course_modes/models.py:43
msgid "Honor Code Certificate"
msgstr "your paper"
#: common/djangoapps/course_modes/views.py:81
#: common/djangoapps/student/views.py:478
msgid "Enrollment is closed"
msgstr "no way, dude"
#: cms/djangoapps/contentstore/views/course.py:237
msgid ""
"There is already a course defined with the same organization, course number,"
" and course run. Please change either organization or course number to be "
"unique."
msgstr "org/course/run, wtf??"
#: cms/djangoapps/contentstore/views/course.py:243
#: cms/djangoapps/contentstore/views/course.py:247
#: other_cms/djangoapps/contentstore/views/course.py:269
#: cms/djangoapps/contentstore/views/course.py:272
msgid ""
"Please change either the organization or course number so that it is unique."
msgstr "pick again!"
#: common/static/js/vendor/mathjax-MathJax-c9db6ac/docs/source/mjtheme/layout.html:129
#: lms/templates/wiki/plugins/attachments/index.html:40
msgid "Search"
msgstr "find it!"
#: lms/djangoapps/courseware/features/video.py:111
msgid "ERROR: No playable video sources found!"
msgstr "try youtube, dude!"
# This is test data.
#
msgid ""
msgstr ""
"Project-Id-Version: 0.1a\n"
"Report-Msgid-Bugs-To: openedx-translation@googlegroups.com\n"
"POT-Creation-Date: 2014-01-22 15:35-0500\n"
"PO-Revision-Date: 2014-01-22 20:35:52.096456\n"
"Last-Translator: \n"
"Language-Team: openedx-translation <openedx-translation@googlegroups.com>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Language: en\n"
#: cms/djangoapps/contentstore/views/course.py:237
msgid ""
"There is already a course defined with the same organization, course number,"
" and course run. Please change either organization or course number to be "
"unique."
msgstr "org/course/run, wtf??"
#: cms/djangoapps/contentstore/views/course.py:243
#: cms/djangoapps/contentstore/views/course.py:247
#: other_cms/djangoapps/contentstore/views/course.py:269
#: cms/djangoapps/contentstore/views/course.py:272
msgid ""
"Please change either the organization or course number so that it is unique."
msgstr "pick again!"
......@@ -9,7 +9,7 @@ class TestConfiguration(TestCase):
"""
def test_config(self):
config_filename = os.path.normpath(os.path.join(LOCALE_DIR, 'config'))
config_filename = os.path.normpath(os.path.join(LOCALE_DIR, 'config.yaml'))
config = Configuration(config_filename)
self.assertEqual(config.source_locale, 'en')
......
......@@ -49,7 +49,10 @@ class TestGenerate(TestCase):
self.assertTrue(exists, msg='Missing file in locale %s: %s' % (locale, mofile))
self.assertTrue(datetime.fromtimestamp(os.path.getmtime(path), UTC) >= self.start_time,
msg='File not recently modified: %s' % path)
self.assert_merge_headers(locale)
# Segmenting means that the merge headers don't work they way they
# used to, so don't make this check for now. I'm not sure if we'll
# get the merge header back eventually, or delete this code eventually.
# self.assert_merge_headers(locale)
def assert_merge_headers(self, locale):
"""
......
"""Test i18n/segment.py"""
import os.path
import shutil
import unittest
from path import path
import polib
from i18n.segment import segment_pofile
HERE = path(__file__).dirname()
TEST_DATA = HERE / "data"
WORK = HERE / "work"
class SegmentTest(unittest.TestCase):
"""Test segment_pofile."""
def setUp(self):
if not os.path.exists(WORK):
os.mkdir(WORK)
self.addCleanup(shutil.rmtree, WORK)
def assert_pofile_same(self, pofile1, pofile2):
"""The paths `p1` and `p2` should be identical pofiles."""
po1 = polib.pofile(pofile1)
po2 = polib.pofile(pofile2)
self.assertEqual(po1, po2)
def test_sample_data(self):
work_file = WORK / "django.po"
shutil.copyfile(TEST_DATA / "django_before.po", work_file)
original_pofile = polib.pofile(work_file)
written = segment_pofile(
work_file,
{
'studio.po': [
'cms/*',
'other_cms/*',
],
}
)
self.assertEqual(written, set([WORK / "django.po", WORK / "studio.po"]))
pofiles = [polib.pofile(f) for f in written]
after_entries = sum(len(pofile) for pofile in pofiles)
self.assertEqual(len(original_pofile), after_entries)
original_ids = set(m.msgid for m in original_pofile)
after_ids = set(m.msgid for pofile in pofiles for m in pofile)
self.assertEqual(original_ids, after_ids)
self.assert_pofile_same(WORK / "django.po", TEST_DATA / "django_after.po")
self.assert_pofile_same(WORK / "studio.po", TEST_DATA / "studio.po")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment