#!/usr/bin/env python

"""
Victor's xml cleanup script.  A big pile of useful hacks.  Do not use
without carefully reading the code and deciding that this is what you want.

In particular, the remove-meta option is only intended to be used after pulling out a policy
using the metadata_to_json management command.
"""

import os, fnmatch, re, sys
from lxml import etree
from collections import defaultdict

INVALID_CHARS = re.compile(r"[^\w.-]")

def clean(value):
    """
    Return value, made into a form legal for locations
    """
    return re.sub('_+', '_', INVALID_CHARS.sub('_', value))


# category -> set of url_names for that category that we've already seen
used_names = defaultdict(set)

def clean_unique(category, name):
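    """
    Return a cleaned form of name that hasn't been used yet as a url_name
    for this category, appending a numeric suffix if needed, and record it
    in used_names.
    """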
    cleaned = clean(name)
    if cleaned not in used_names[category]:
        used_names[category].add(cleaned)
        return cleaned
    x = 1
    while cleaned + str(x) in used_names[category]:
        x += 1

    # Found one!
    cleaned = cleaned + str(x)
    used_names[category].add(cleaned)
    return cleaned

def cleanup(filepath, remove_meta):
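    """
    Clean up a single xml file in place: turn 'name' attributes into a
    display_name plus a unique url_name, collapse redundant pointer tags,
    and, if remove_meta is True, strip attributes that have already been
    exported to the policy file.
    """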
    # Keys that are exported to the policy file, and so
    # can be removed from the xml afterward
    to_remove = ('format', 'display_name',
                 'graceperiod', 'showanswer', 'rerandomize',
                 'start', 'due', 'graded', 'hide_from_toc',
                 'ispublic', 'xqa_key')

    try:
        print "Cleaning {0}".format(filepath)
        with open(filepath) as f:
            parser = etree.XMLParser(remove_comments=False)
            xml = etree.parse(f, parser=parser)
    except Exception:
        print "Error parsing file {0}".format(filepath)
        return

    for node in xml.iter(tag=etree.Element):
        attrs = node.attrib
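        # Remember existing url_names so clean_unique() won't collide with them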
        if 'url_name' in attrs:
            used_names[node.tag].add(attrs['url_name'])
        if 'name' in attrs:
            # Replace name with an identical display_name, and a unique url_name
            name = attrs['name']
            attrs['display_name'] = name
            attrs['url_name'] = clean_unique(node.tag, name)
            del attrs['name']

        if 'url_name' in attrs and 'slug' in attrs:
            print "WARNING: {0} has both slug and url_name".format(node)

        if ('url_name' in attrs and 'filename' in attrs and
            len(attrs) == 2 and attrs['url_name'] == attrs['filename']):
            # This is a pointer tag in disguise.  Get rid of the filename.
            print 'turning {0}.{1} into a pointer tag'.format(node.tag, attrs['url_name'])
            del attrs['filename']

        if remove_meta:
            for attr in to_remove:
                if attr in attrs:
                    del attrs[attr]


    with open(filepath, "w") as f:
        f.write(etree.tostring(xml))


def find_replace(directory, filePattern, remove_meta):
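    """
    Walk directory recursively and run cleanup() on every file whose name
    matches filePattern (e.g. '*.xml').
    """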
    for path, dirs, files in os.walk(os.path.abspath(directory)):
        for filename in fnmatch.filter(files, filePattern):
            filepath = os.path.join(path, filename)
            cleanup(filepath, remove_meta)


def main(args):
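    """
    Expects args to be [dir] or [dir, 'remove-meta'].
    """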
    usage = "Usage: xml_cleanup.py dir [remove-meta]"
    n = len(args)
    if n < 1 or n > 2 or (n == 2 and args[1] != 'remove-meta'):
        print usage
        return

    remove_meta = False
    if n == 2:
        remove_meta = True

    find_replace(args[0], '*.xml', remove_meta)


if __name__ == '__main__':
    main(sys.argv[1:])