I18n segmenting tool.

Messages can be segmented into different .po files based on where they occur in the source tree. This lets Studio strings go into their own .po files, so that we can direct Transifex translators to the LMS strings we really need done.

I18n segmenting tool.
Messages can be segmented into different .po files based on where they occur in the source tree. This lets Studio strings go into their own .po files, so that we can direct Transifex translators to the LMS strings we really need done.
f76a5e17 · Ned Batchelder · aa1ab5c7 · f76a5e17 · f76a5e17 · f76a5e17
Commit f76a5e17 authored Jan 20, 2014 by Ned Batchelder
8 changed files
--- a/.tx/config
+++ b/.tx/config
@@ -7,18 +7,36 @@ source_file = conf/locale/en/LC_MESSAGES/django-partial.po
 source_lang = en
 type = PO

+[edx-platform.django-studio]
+file_filter = conf/locale/<lang>/LC_MESSAGES/django-studio.po
+source_file = conf/locale/en/LC_MESSAGES/django-studio.po
+source_lang = en
+type = PO
+
 [edx-platform.djangojs]
 file_filter = conf/locale/<lang>/LC_MESSAGES/djangojs.po
 source_file = conf/locale/en/LC_MESSAGES/djangojs.po
 source_lang = en
 type = PO

+[edx-platform.djangojs-studio]
+file_filter = conf/locale/<lang>/LC_MESSAGES/djangojs-studio.po
+source_file = conf/locale/en/LC_MESSAGES/djangojs-studio.po
+source_lang = en
+type = PO
+
 [edx-platform.mako]
 file_filter = conf/locale/<lang>/LC_MESSAGES/mako.po
 source_file = conf/locale/en/LC_MESSAGES/mako.po
 source_lang = en
 type = PO

+[edx-platform.mako-studio]
+file_filter = conf/locale/<lang>/LC_MESSAGES/mako-studio.po
+source_file = conf/locale/en/LC_MESSAGES/mako-studio.po
+source_lang = en
+type = PO
+
 [edx-platform.messages]
 file_filter = conf/locale/<lang>/LC_MESSAGES/messages.po
 source_file = conf/locale/en/LC_MESSAGES/messages.po

--- a/conf/locale/config.yaml
+++ b/conf/locale/config.yaml
@@ -31,3 +31,22 @@ locales:

 # The locale used for fake-accented English, for testing.
 dummy-locale: eo
+
+# How should .po files be segmented?  See i18n/segment.py for details. Strings
+# that are only found in a particular segment are segregated into that .po file
+# so that translators can focus on separate parts of the product.
+#
+# We segregate Studio so we can provide new languages for LMS without having to
+# also translate the Studio strings. LMS needs the strings from lms/* and
+# common/*, so those will stay in the main .po file.
+segment:
+    django-partial.po:          # This .po file..
+        django-studio.po:       #  produces this .po file..
+            - cms/*             #   by segregating strings from these files.
+        # Anything that doesn't match a pattern stays in the original file.
+    djangojs.po:
+        djangojs-studio.po:
+            - cms/*
+    mako.po:
+        mako-studio.po:
+            - cms/*
--- a/i18n/extract.py
+++ b/i18n/extract.py
@@ -31,7 +31,7 @@ BABEL_CONFIG = BASE_DIR.relpathto(LOCALE_DIR.joinpath('babel.cfg'))
 # Use relpath to reduce noise in logs
 BABEL_OUT = BASE_DIR.relpathto(CONFIGURATION.source_messages_dir.joinpath('mako.po'))

-SOURCE_WARN = 'This English source file is machine-generated. Do not check it into github'
+SOURCE_WARN = 'This English source file is machine-generated. Do not check it into git.'

 LOG = logging.getLogger(__name__)


--- a/i18n/segment.py
+++ b/i18n/segment.py
+#!/usr/bin/env python
+"""
+Segment a .po file to produce smaller files based on the locations of the
+messages.
+"""
+
+import copy
+import fnmatch
+import logging
+import sys
+
+import polib
+
+from i18n.config import CONFIGURATION
+
+# Module-level logger, named after this module; used for the read/write
+# progress messages emitted while segmenting.
+LOG = logging.getLogger(__name__)
+
+
+def segment_pofiles(locale):
+    """Apply every configured segmentation rule to the .po files of `locale`.
+
+    Each key of ``CONFIGURATION.segment`` names a .po file inside the messages
+    directory for `locale`; the corresponding value is the segment
+    specification handed to `segment_pofile`.
+
+    Returns a set of filenames: the union of everything `segment_pofile`
+    reported as written.
+
+    """
+    written = set()
+    for po_name, segment_spec in CONFIGURATION.segment.items():
+        po_path = CONFIGURATION.get_messages_dir(locale) / po_name
+        newly_written = segment_pofile(po_path, segment_spec)
+        written.update(newly_written)
+    return written
+
+
+def segment_pofile(filename, segments):
+    """Segment a .po file using patterns in `segments`.
+
+    The .po file at `filename` is read, and the occurrence locations of its
+    messages are examined.  `segments` is a dictionary: the keys are segment
+    .po filenames, the values are lists of patterns::
+
+        {
+            'django-studio.po': [
+                'cms/*',
+                'some-other-studio-place/*',
+            ],
+            'django-weird.po': [
+                '*/weird_*.*',
+            ],
+        }
+
+    If all a message's occurrences match the patterns for a segment, then that
+    message is written to the new segmented .po file.
+
+    Any message that matches no segments, or more than one, is written back to
+    the original file.  A message with no recorded occurrences also stays in
+    the original file.
+
+    Segment files are written next to `filename`, in the same directory.  A
+    segment that ends up with no messages is not written; an error is logged
+    instead.  The original file is always rewritten, exactly once, at
+    `filename`, even if no messages remain in it.
+
+    Arguments:
+        filename (path.path): a path object referring to the original .po file.
+        segments (dict): specification of the segments to create.
+
+    Returns:
+        a set of path objects: every file written, that is the non-empty
+        segment files plus the rewritten original file.
+
+    """
+    reading_msg = "Reading {num} entries from {file}"
+    writing_msg = "Writing {num} entries to {file}"
+
+    source_po = polib.pofile(filename)
+    LOG.info(reading_msg.format(file=filename, num=len(source_po)))
+
+    # A new pofile just like the source (same header and metadata), but with
+    # no messages. We'll put anything not segmented into this file.
+    remaining_po = copy.deepcopy(source_po)
+    remaining_po[:] = []
+
+    # Turn the segments dictionary into two structures: segment_patterns is a
+    # list of (pattern, segmentfile) pairs.  segment_po_files is a dict mapping
+    # segment file names to pofile objects of their contents.  The original
+    # file is deliberately *not* in this dict: it is tracked separately as
+    # remaining_po so that it is always saved to `filename` itself rather than
+    # to a path re-derived by joining the directory with the full filename.
+    segment_po_files = {}
+    segment_patterns = []
+    for segmentfile, patterns in segments.items():
+        segment_po_files[segmentfile] = copy.deepcopy(remaining_po)
+        segment_patterns.extend((pat, segmentfile) for pat in patterns)
+
+    # Examine each message in the source file. If all of its occurrences match
+    # a pattern for the same segment, it goes in that segment.  Otherwise, it
+    # goes in remaining.
+    for msg in source_po:
+        msg_segments = set()
+        for occ_file, _ in msg.occurrences:
+            for pat, segment_file in segment_patterns:
+                if fnmatch.fnmatch(occ_file, pat):
+                    msg_segments.add(segment_file)
+                    break
+            else:
+                # No pattern matched this occurrence: None stands for the
+                # original file.
+                msg_segments.add(None)
+
+        target_po = remaining_po
+        if len(msg_segments) == 1:
+            (only_segment,) = msg_segments
+            if only_segment is not None:
+                # Every occurrence landed in the same segment.
+                target_po = segment_po_files[only_segment]
+        # Otherwise the message is in more than one segment, or none, so it
+        # goes back in the main file.
+        target_po.append(msg)
+
+    # Write out the segment files.
+    files_written = set()
+    out_dir = filename.dirname()
+    for segment_file, pofile in segment_po_files.items():
+        out_file = out_dir / segment_file
+        if not pofile:
+            LOG.error("No messages to write to {file}, did you run segment twice?".format(file=out_file))
+        else:
+            LOG.info(writing_msg.format(file=out_file, num=len(pofile)))
+            pofile.save(out_file)
+            files_written.add(out_file)
+
+    # Rewrite the original file once, with only the messages that were not
+    # moved into a segment.
+    LOG.info(writing_msg.format(file=filename, num=len(remaining_po)))
+    remaining_po.save(filename)
+    files_written.add(filename)
+
+    return files_written
+
+
+def main(argv):
+    """
+    $ segment.py LOCALE [...]
+
+    Segment the .po files of each LOCALE given on the command line, following
+    the segmenting rules in config.yaml.
+
+    Segmenting is *not* idempotent: the input file itself is rewritten, so
+    take care not to run this twice on the same file.
+
+    """
+    # This entry point exists only as a one-off tool for splitting existing
+    # translation files when a new segment is introduced.  In the regular
+    # workflow, the extract phase calls the functions above directly.
+
+    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+    locales = argv[1:]
+    if not locales:
+        sys.exit("Need a locale to segment")
+    for locale in locales:
+        segment_pofiles(locale)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
--- a/i18n/tests/data/django_after.po
+++ b/i18n/tests/data/django_after.po
+# This is test data.
+# 
+msgid ""
+msgstr ""
+"Project-Id-Version: 0.1a\n"
+"Report-Msgid-Bugs-To: openedx-translation@googlegroups.com\n"
+"POT-Creation-Date: 2014-01-22 15:35-0500\n"
+"PO-Revision-Date: 2014-01-22 20:35:52.096456\n"
+"Last-Translator: \n"
+"Language-Team: openedx-translation <openedx-translation@googlegroups.com>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Language: en\n"
+
+#: cms/djangoapps/contentstore/views/tabs.py:39
+#: lms/djangoapps/instructor/views/instructor_dashboard.py:111
+msgid "Course Info"
+msgstr "stuff about the course"
+
+#: common/djangoapps/course_modes/models.py:43
+msgid "Honor Code Certificate"
+msgstr "your paper"
+
+#: common/djangoapps/course_modes/views.py:81
+#: common/djangoapps/student/views.py:478
+msgid "Enrollment is closed"
+msgstr "no way, dude"
+
+#: common/static/js/vendor/mathjax-MathJax-c9db6ac/docs/source/mjtheme/layout.html:129
+#: lms/templates/wiki/plugins/attachments/index.html:40
+msgid "Search"
+msgstr "find it!"
+
+#: lms/djangoapps/courseware/features/video.py:111
+msgid "ERROR: No playable video sources found!"
+msgstr "try youtube, dude!"
--- a/i18n/tests/data/django_before.po
+++ b/i18n/tests/data/django_before.po
+# This is test data.
+# 
+msgid ""
+msgstr ""
+"Project-Id-Version: 0.1a\n"
+"Report-Msgid-Bugs-To: openedx-translation@googlegroups.com\n"
+"POT-Creation-Date: 2014-01-22 15:35-0500\n"
+"PO-Revision-Date: 2014-01-22 20:35:52.096456\n"
+"Last-Translator: \n"
+"Language-Team: openedx-translation <openedx-translation@googlegroups.com>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Language: en\n"
+
+#: cms/djangoapps/contentstore/views/tabs.py:39
+#: lms/djangoapps/instructor/views/instructor_dashboard.py:111
+msgid "Course Info"
+msgstr "stuff about the course"
+
+#: common/djangoapps/course_modes/models.py:43
+msgid "Honor Code Certificate"
+msgstr "your paper"
+
+#: common/djangoapps/course_modes/views.py:81
+#: common/djangoapps/student/views.py:478
+msgid "Enrollment is closed"
+msgstr "no way, dude"
+
+#: cms/djangoapps/contentstore/views/course.py:237
+msgid ""
+"There is already a course defined with the same organization, course number,"
+" and course run. Please change either organization or course number to be "
+"unique."
+msgstr "org/course/run, wtf??"
+
+#: cms/djangoapps/contentstore/views/course.py:243
+#: cms/djangoapps/contentstore/views/course.py:247
+#: other_cms/djangoapps/contentstore/views/course.py:269
+#: cms/djangoapps/contentstore/views/course.py:272
+msgid ""
+"Please change either the organization or course number so that it is unique."
+msgstr "pick again!"
+
+#: common/static/js/vendor/mathjax-MathJax-c9db6ac/docs/source/mjtheme/layout.html:129
+#: lms/templates/wiki/plugins/attachments/index.html:40
+msgid "Search"
+msgstr "find it!"
+
+#: lms/djangoapps/courseware/features/video.py:111
+msgid "ERROR: No playable video sources found!"
+msgstr "try youtube, dude!"
--- a/i18n/tests/data/studio.po
+++ b/i18n/tests/data/studio.po
+# This is test data.
+# 
+msgid ""
+msgstr ""
+"Project-Id-Version: 0.1a\n"
+"Report-Msgid-Bugs-To: openedx-translation@googlegroups.com\n"
+"POT-Creation-Date: 2014-01-22 15:35-0500\n"
+"PO-Revision-Date: 2014-01-22 20:35:52.096456\n"
+"Last-Translator: \n"
+"Language-Team: openedx-translation <openedx-translation@googlegroups.com>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Language: en\n"
+
+#: cms/djangoapps/contentstore/views/course.py:237
+msgid ""
+"There is already a course defined with the same organization, course number,"
+" and course run. Please change either organization or course number to be "
+"unique."
+msgstr "org/course/run, wtf??"
+
+#: cms/djangoapps/contentstore/views/course.py:243
+#: cms/djangoapps/contentstore/views/course.py:247
+#: other_cms/djangoapps/contentstore/views/course.py:269
+#: cms/djangoapps/contentstore/views/course.py:272
+msgid ""
+"Please change either the organization or course number so that it is unique."
+msgstr "pick again!"
--- a/i18n/tests/test_segment.py
+++ b/i18n/tests/test_segment.py
+"""Test i18n/segment.py"""
+
+import os.path
+import shutil
+import unittest
+
+from path import path
+import polib
+
+from i18n.segment import segment_pofile
+
+
+# Directory containing this test module.
+HERE = path(__file__).dirname()
+# Sample .po files used as test inputs and expected outputs.
+TEST_DATA = HERE / "data"
+# Scratch directory: created in setUp and removed again after each test.
+WORK = HERE / "work"
+
+
+class SegmentTest(unittest.TestCase):
+    """Exercise segment_pofile against the sample .po files in TEST_DATA."""
+
+    def setUp(self):
+        # Give each test a scratch directory, and remove it when the test ends.
+        if not os.path.exists(WORK):
+            os.mkdir(WORK)
+        self.addCleanup(shutil.rmtree, WORK)
+
+    def assert_pofile_same(self, pofile1, pofile2):
+        """The files at paths `pofile1` and `pofile2` should parse to equal pofiles."""
+        self.assertEqual(polib.pofile(pofile1), polib.pofile(pofile2))
+
+    def test_sample_data(self):
+        main_file = WORK / "django.po"
+        studio_file = WORK / "studio.po"
+        shutil.copyfile(TEST_DATA / "django_before.po", main_file)
+        before = polib.pofile(main_file)
+
+        segments = {
+            'studio.po': [
+                'cms/*',
+                'other_cms/*',
+            ],
+        }
+        written = segment_pofile(main_file, segments)
+
+        # Both the rewritten main file and the new segment file are reported.
+        self.assertEqual(written, set([main_file, studio_file]))
+
+        # Segmenting must neither lose nor duplicate entries.
+        results = [polib.pofile(f) for f in written]
+        total_after = sum(len(po) for po in results)
+        self.assertEqual(len(before), total_after)
+
+        # The same msgids must be present before and after.
+        ids_before = set(m.msgid for m in before)
+        ids_after = set(m.msgid for po in results for m in po)
+        self.assertEqual(ids_before, ids_after)
+
+        # Finally, compare each output file with its expected contents.
+        self.assert_pofile_same(main_file, TEST_DATA / "django_after.po")
+        self.assert_pofile_same(studio_file, TEST_DATA / "studio.po")