#!/usr/bin/env python
"""
Segment a .po file to produce smaller files based on the locations of the
messages.
"""

import copy
import fnmatch
import logging
import sys
import argparse
import polib
import textwrap

from i18n.config import CONFIGURATION

LOG = logging.getLogger(__name__)


def segment_pofiles(locale):
    """Split every configured .po file for `locale` into its segments.

    Each item of ``CONFIGURATION.segment`` pairs a .po filename with the
    segment specification for that file.  The filename is resolved against
    the messages directory of `locale` and handed to `segment_pofile`.

    Returns the set of all segment files that were written.

    """
    written = set()
    for po_name, segment_spec in CONFIGURATION.segment.items():
        source_path = CONFIGURATION.get_messages_dir(locale) / po_name
        segment_files = segment_pofile(source_path, segment_spec)
        written.update(segment_files)
    return written


def segment_pofile(filename, segments):
    """Segment a .po file using patterns in `segments`.

    The .po file at `filename` is read, and the occurrence locations of its
    messages are examined.  `segments` is a dictionary: the keys are segment
    .po filenames, the values are lists of patterns::

        {
            'django-studio.po': [
                'cms/*',
                'some-other-studio-place/*',
            ],
            'django-weird.po': [
                '*/weird_*.*',
            ],
        }

    If all a message's occurrences match the patterns for a segment, then that
    message is written to the new segmented .po file.

    Any message that matches no segments, or more than one, is written back to
    the original file.  A message with no occurrences at all matches no
    segment, so it is also written back to the original file.

    Arguments:
        filename (path.path): a path object referring to the original .po file.
        segments (dict): specification of the segments to create.

    Returns:
        a set of path objects, all the segment files written.

    """
    reading_msg = "Reading {num} entries from {file}"
    writing_msg = "Writing {num} entries to {file}"

    source_po = polib.pofile(filename)
    LOG.info(reading_msg.format(file=filename, num=len(source_po)))

    # A new pofile just like the source, but with no messages. We'll put
    # anything not segmented into this file.
    remaining_po = copy.deepcopy(source_po)
    remaining_po[:] = []

    # Turn the segments dictionary into two structures: segment_patterns is a
    # list of (pattern, segmentfile) pairs.  segment_po_files is a dict mapping
    # segment file names to pofile objects of their contents.  The original
    # file is registered under `filename` itself so that "matches no pattern"
    # can be recorded in the same set as real segment names.
    segment_po_files = {filename: remaining_po}
    segment_patterns = []
    for segmentfile, patterns in segments.items():
        segment_po_files[segmentfile] = copy.deepcopy(remaining_po)
        segment_patterns.extend((pat, segmentfile) for pat in patterns)

    # Examine each message in the source file. If all of its occurrences match
    # a pattern for the same segment, it goes in that segment.  Otherwise, it
    # goes in remaining.
    for msg in source_po:
        msg_segments = set()
        for occ_file, _ in msg.occurrences:
            # The first matching pattern wins for this occurrence; an
            # occurrence matching no pattern counts towards the original file.
            for pat, segment_file in segment_patterns:
                if fnmatch.fnmatch(occ_file, pat):
                    msg_segments.add(segment_file)
                    break
            else:
                msg_segments.add(filename)

        if len(msg_segments) == 1:
            # This message belongs in this segment.
            segment_file = msg_segments.pop()
            segment_po_files[segment_file].append(msg)
        else:
            # It's in more than one segment, or it has no occurrences at all
            # (msg_segments is empty), so put it back in the main file.  This
            # used to be guarded by an assert, which aborted the whole run on
            # an entry without occurrences and vanished under `python -O`.
            remaining_po.append(msg)

    # Write out the results.
    files_written = set()
    for segment_file, pofile in segment_po_files.items():
        # NOTE(review): for the original file, `segment_file` is the full
        # `filename`, so this join presumably relies on the path library
        # returning an absolute right-hand operand unchanged -- confirm that
        # `filename` is always absolute.
        out_file = filename.dirname() / segment_file
        if len(pofile) == 0:
            LOG.error("No messages to write to {file}, did you run segment twice?".format(file=out_file))
        else:
            LOG.info(writing_msg.format(file=out_file, num=len(pofile)))
            pofile.save(out_file)
            files_written.add(out_file)

    return files_written


David Baumgold committed
120 121 122 123
def main(locales=None, verbosity=1):  # pylint: disable=unused-argument
    """
    Segment the .po files of every locale in `locales`.

    This entry point exists only as a tool for segmenting translation files
    when a new segment is added.  In the regular workflow, the extract phase
    calls the segmenting functions of this module directly.

    `locales` is an iterable of locale names; None or an empty iterable means
    there is nothing to do.  `verbosity` is accepted but not used here.
    """
    for locale in locales or ():
        segment_pofiles(locale)


if __name__ == "__main__":
    # Send log output (the "Reading"/"Writing" progress messages) to stdout.
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # pylint: disable=invalid-name
    # Dedent first, then strip.  Stripping first removes the indentation of
    # the opening line, which leaves the text with no common margin and turns
    # textwrap.dedent into a no-op.
    description = textwrap.dedent("""
        Segment the .po files in LOCALE(s) based on the segmenting rules in
        config.yaml.

        Note that segmenting is *not* idempotent: it modifies the input file, so
        be careful that you don't run it twice on the same file.
    """).strip()

    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("locale", nargs="+", help="a locale to segment")
    # Each -v increments the count; it is passed through to main() as verbosity.
    parser.add_argument("--verbose", "-v", action="count", default=0)
    args = parser.parse_args()
    main(locales=args.locale, verbosity=args.verbose)