Commit f76a5e17 by Ned Batchelder

I18n segmenting tool.

Messages can be segmented into different .po files based on the locations
of their occurrences.  This lets Studio strings go into separate .po
files, so that we can direct Transifex workers to the LMS strings we
really need translated.
parent aa1ab5c7
......@@ -7,18 +7,36 @@ source_file = conf/locale/en/LC_MESSAGES/django-partial.po
source_lang = en
type = PO
[edx-platform.django-studio]
file_filter = conf/locale/<lang>/LC_MESSAGES/django-studio.po
source_file = conf/locale/en/LC_MESSAGES/django-studio.po
source_lang = en
type = PO
[edx-platform.djangojs]
file_filter = conf/locale/<lang>/LC_MESSAGES/djangojs.po
source_file = conf/locale/en/LC_MESSAGES/djangojs.po
source_lang = en
type = PO
[edx-platform.djangojs-studio]
file_filter = conf/locale/<lang>/LC_MESSAGES/djangojs-studio.po
source_file = conf/locale/en/LC_MESSAGES/djangojs-studio.po
source_lang = en
type = PO
[edx-platform.mako]
file_filter = conf/locale/<lang>/LC_MESSAGES/mako.po
source_file = conf/locale/en/LC_MESSAGES/mako.po
source_lang = en
type = PO
[edx-platform.mako-studio]
file_filter = conf/locale/<lang>/LC_MESSAGES/mako-studio.po
source_file = conf/locale/en/LC_MESSAGES/mako-studio.po
source_lang = en
type = PO
[edx-platform.messages]
file_filter = conf/locale/<lang>/LC_MESSAGES/messages.po
source_file = conf/locale/en/LC_MESSAGES/messages.po
......
......@@ -31,3 +31,22 @@ locales:
# The locale used for fake-accented English, for testing.
dummy-locale: eo
# How should .po files be segmented? See i18n/segment.py for details. Strings
# that are only found in a particular segment are segregated into that .po file
# so that translators can focus on separate parts of the product.
#
# We segregate Studio so we can provide new languages for LMS without having to
# also translate the Studio strings. LMS needs the strings from lms/* and
# common/*, so those will stay in the main .po file.
segment:
django-partial.po: # This .po file..
django-studio.po: # produces this .po file..
- cms/* # by segregating strings from these files.
# Anything that doesn't match a pattern stays in the original file.
djangojs.po:
djangojs-studio.po:
- cms/*
mako.po:
mako-studio.po:
- cms/*
......@@ -31,7 +31,7 @@ BABEL_CONFIG = BASE_DIR.relpathto(LOCALE_DIR.joinpath('babel.cfg'))
# Use relpath to reduce noise in logs
BABEL_OUT = BASE_DIR.relpathto(CONFIGURATION.source_messages_dir.joinpath('mako.po'))
SOURCE_WARN = 'This English source file is machine-generated. Do not check it into github'
SOURCE_WARN = 'This English source file is machine-generated. Do not check it into git.'
LOG = logging.getLogger(__name__)
......
#!/usr/bin/env python
"""
Segment a .po file to produce smaller files based on the locations of the
messages.
"""
import copy
import fnmatch
import logging
import sys
import polib
from i18n.config import CONFIGURATION
LOG = logging.getLogger(__name__)
def segment_pofiles(locale):
    """Segment every configured .po file for `locale`.

    Each entry of `CONFIGURATION.segment` maps a .po file name to its segment
    specification; the file is looked up in the messages directory for
    `locale` and handed to `segment_pofile`.

    Returns the set of all file names reported as written by those calls.
    """
    written = set()
    for po_name, segment_spec in CONFIGURATION.segment.items():
        po_path = CONFIGURATION.get_messages_dir(locale) / po_name
        segment_files = segment_pofile(po_path, segment_spec)
        written.update(segment_files)
    return written
def segment_pofile(filename, segments):
    """Segment a .po file using patterns in `segments`.

    The .po file at `filename` is read, and the occurrence locations of its
    messages are examined.  `segments` is a dictionary: the keys are segment
    .po filenames, the values are lists of patterns::

        {
            'django-studio.po': [
                'cms/*',
                'some-other-studio-place/*',
            ],
            'django-weird.po': [
                '*/weird_*.*',
            ],
        }

    If all a message's occurrences match the patterns for a segment, then that
    message is written to the new segmented .po file.

    Any message that matches no segments, or more than one, is written back to
    the original file.  The original file is always rewritten (exactly once,
    at `filename` itself) so that segmented messages do not remain in it.

    Segment files are created next to the original, in `filename.dirname()`.
    A segment with no messages is not written; an error is logged instead.

    Arguments:
        filename (path.path): a path object referring to the original .po file.
        segments (dict): specification of the segments to create.

    Returns:
        a set of path objects, all the non-empty files written (segment files
        and the original file).

    """
    reading_msg = "Reading {num} entries from {file}"
    writing_msg = "Writing {num} entries to {file}"
    empty_msg = "No messages to write to {file}, did you run segment twice?"

    source_po = polib.pofile(filename)
    LOG.info(reading_msg.format(file=filename, num=len(source_po)))

    # A new pofile just like the source (same header and metadata), but with
    # no messages.  We'll put anything not segmented into this file.
    remaining_po = copy.deepcopy(source_po)
    remaining_po[:] = []

    # Turn the segments dictionary into two structures: segment_patterns is a
    # list of (pattern, segmentfile) pairs.  segment_po_files is a dict mapping
    # file names to the pofile objects collecting their contents; the original
    # file is included under `filename` so unmatched occurrences can be
    # treated like any other segment while classifying messages.
    segment_po_files = {filename: remaining_po}
    segment_patterns = []
    for segmentfile, patterns in segments.items():
        segment_po_files[segmentfile] = copy.deepcopy(remaining_po)
        segment_patterns.extend((pat, segmentfile) for pat in patterns)

    # Examine each message in the source file.  If all of its occurrences match
    # a pattern for the same segment, it goes in that segment.  Otherwise, it
    # goes in remaining.
    for msg in source_po:
        msg_segments = set()
        for occ_file, _ in msg.occurrences:
            for pat, segment_file in segment_patterns:
                if fnmatch.fnmatch(occ_file, pat):
                    # First matching pattern wins for this occurrence.
                    msg_segments.add(segment_file)
                    break
            else:
                # No pattern matched: this occurrence belongs to the main file.
                msg_segments.add(filename)

        if len(msg_segments) == 1:
            # This message belongs in this segment.
            segment_po_files[msg_segments.pop()].append(msg)
        else:
            # Either it's in more than one segment, or none, so put it back in
            # the main file.
            remaining_po.append(msg)

    # Write out the segment files.  Only the configured segment names are
    # joined onto the directory: `filename` already carries its directory, so
    # joining it again would double the directory for relative paths.
    files_written = set()
    for segment_file in segments:
        pofile = segment_po_files[segment_file]
        out_file = filename.dirname() / segment_file
        if len(pofile) == 0:
            LOG.error(empty_msg.format(file=out_file))
        else:
            LOG.info(writing_msg.format(file=out_file, num=len(pofile)))
            pofile.save(out_file)
            files_written.add(out_file)

    # Write the main file exactly once.  It is saved even when empty so that
    # messages moved to segments are removed from it, but (as before) an empty
    # main file is reported as an error and left out of the returned set.
    if len(remaining_po) == 0:
        LOG.error(empty_msg.format(file=filename))
    else:
        files_written.add(filename)
    LOG.info(writing_msg.format(file=filename, num=len(remaining_po)))
    remaining_po.save(filename)

    return files_written
def main(argv):
    """
    $ segment.py LOCALE [...]

    Segment the .po files of each given LOCALE according to the segmenting
    rules in config.yaml.

    Beware that segmenting is *not* idempotent: the input file itself is
    modified, so do not run this twice on the same file.

    """
    # This entry point exists only as a tool for segmenting translation files
    # when a new segment is added.  In the regular workflow the extract phase
    # calls the functions above directly.
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # argv[0] is the program name; everything after it is a locale.
    locales = argv[1:]
    if not locales:
        sys.exit("Need a locale to segment")

    for locale in locales:
        segment_pofiles(locale)


if __name__ == "__main__":
    main(sys.argv)
# This is test data.
#
msgid ""
msgstr ""
"Project-Id-Version: 0.1a\n"
"Report-Msgid-Bugs-To: openedx-translation@googlegroups.com\n"
"POT-Creation-Date: 2014-01-22 15:35-0500\n"
"PO-Revision-Date: 2014-01-22 20:35:52.096456\n"
"Last-Translator: \n"
"Language-Team: openedx-translation <openedx-translation@googlegroups.com>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Language: en\n"
#: cms/djangoapps/contentstore/views/tabs.py:39
#: lms/djangoapps/instructor/views/instructor_dashboard.py:111
msgid "Course Info"
msgstr "stuff about the course"
#: common/djangoapps/course_modes/models.py:43
msgid "Honor Code Certificate"
msgstr "your paper"
#: common/djangoapps/course_modes/views.py:81
#: common/djangoapps/student/views.py:478
msgid "Enrollment is closed"
msgstr "no way, dude"
#: common/static/js/vendor/mathjax-MathJax-c9db6ac/docs/source/mjtheme/layout.html:129
#: lms/templates/wiki/plugins/attachments/index.html:40
msgid "Search"
msgstr "find it!"
#: lms/djangoapps/courseware/features/video.py:111
msgid "ERROR: No playable video sources found!"
msgstr "try youtube, dude!"
# This is test data.
#
msgid ""
msgstr ""
"Project-Id-Version: 0.1a\n"
"Report-Msgid-Bugs-To: openedx-translation@googlegroups.com\n"
"POT-Creation-Date: 2014-01-22 15:35-0500\n"
"PO-Revision-Date: 2014-01-22 20:35:52.096456\n"
"Last-Translator: \n"
"Language-Team: openedx-translation <openedx-translation@googlegroups.com>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Language: en\n"
#: cms/djangoapps/contentstore/views/tabs.py:39
#: lms/djangoapps/instructor/views/instructor_dashboard.py:111
msgid "Course Info"
msgstr "stuff about the course"
#: common/djangoapps/course_modes/models.py:43
msgid "Honor Code Certificate"
msgstr "your paper"
#: common/djangoapps/course_modes/views.py:81
#: common/djangoapps/student/views.py:478
msgid "Enrollment is closed"
msgstr "no way, dude"
#: cms/djangoapps/contentstore/views/course.py:237
msgid ""
"There is already a course defined with the same organization, course number,"
" and course run. Please change either organization or course number to be "
"unique."
msgstr "org/course/run, wtf??"
#: cms/djangoapps/contentstore/views/course.py:243
#: cms/djangoapps/contentstore/views/course.py:247
#: other_cms/djangoapps/contentstore/views/course.py:269
#: cms/djangoapps/contentstore/views/course.py:272
msgid ""
"Please change either the organization or course number so that it is unique."
msgstr "pick again!"
#: common/static/js/vendor/mathjax-MathJax-c9db6ac/docs/source/mjtheme/layout.html:129
#: lms/templates/wiki/plugins/attachments/index.html:40
msgid "Search"
msgstr "find it!"
#: lms/djangoapps/courseware/features/video.py:111
msgid "ERROR: No playable video sources found!"
msgstr "try youtube, dude!"
# This is test data.
#
msgid ""
msgstr ""
"Project-Id-Version: 0.1a\n"
"Report-Msgid-Bugs-To: openedx-translation@googlegroups.com\n"
"POT-Creation-Date: 2014-01-22 15:35-0500\n"
"PO-Revision-Date: 2014-01-22 20:35:52.096456\n"
"Last-Translator: \n"
"Language-Team: openedx-translation <openedx-translation@googlegroups.com>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Language: en\n"
#: cms/djangoapps/contentstore/views/course.py:237
msgid ""
"There is already a course defined with the same organization, course number,"
" and course run. Please change either organization or course number to be "
"unique."
msgstr "org/course/run, wtf??"
#: cms/djangoapps/contentstore/views/course.py:243
#: cms/djangoapps/contentstore/views/course.py:247
#: other_cms/djangoapps/contentstore/views/course.py:269
#: cms/djangoapps/contentstore/views/course.py:272
msgid ""
"Please change either the organization or course number so that it is unique."
msgstr "pick again!"
"""Test i18n/segment.py"""
import os.path
import shutil
import unittest
from path import path
import polib
from i18n.segment import segment_pofile
HERE = path(__file__).dirname()
TEST_DATA = HERE / "data"
WORK = HERE / "work"
class SegmentTest(unittest.TestCase):
    """Exercise segment_pofile against the sample .po files in TEST_DATA."""

    def setUp(self):
        # Make sure the scratch directory exists, and remove it (with whatever
        # the test wrote into it) once the test is over.
        if not os.path.exists(WORK):
            os.mkdir(WORK)
        self.addCleanup(shutil.rmtree, WORK)

    def assert_pofile_same(self, pofile1, pofile2):
        """Assert the files at paths `pofile1` and `pofile2` parse to equal pofiles."""
        first = polib.pofile(pofile1)
        second = polib.pofile(pofile2)
        self.assertEqual(first, second)

    def test_sample_data(self):
        """Segment the sample file and compare the output to the expected files."""
        main_file = WORK / "django.po"
        studio_file = WORK / "studio.po"

        # Work on a copy: segment_pofile rewrites its input file.
        shutil.copyfile(TEST_DATA / "django_before.po", work_file := main_file) if False else shutil.copyfile(TEST_DATA / "django_before.po", main_file)
        before = polib.pofile(main_file)

        segment_spec = {
            'studio.po': [
                'cms/*',
                'other_cms/*',
            ],
        }
        written = segment_pofile(main_file, segment_spec)

        # Both the main file and the single segment must be reported.
        self.assertEqual(written, set([main_file, studio_file]))

        # No message may be lost or duplicated across the output files.
        outputs = [polib.pofile(f) for f in written]
        total_after = sum(len(output) for output in outputs)
        self.assertEqual(len(before), total_after)

        ids_before = set(entry.msgid for entry in before)
        ids_after = set()
        for output in outputs:
            for entry in output:
                ids_after.add(entry.msgid)
        self.assertEqual(ids_before, ids_after)

        # The outputs must match the checked-in expected files.
        self.assert_pofile_same(main_file, TEST_DATA / "django_after.po")
        self.assert_pofile_same(studio_file, TEST_DATA / "studio.po")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment