Commit 37a4d3ab by Victor Shnayder

hackish cleanup script

parent 395b33dd
#!/usr/bin/env python
"""
Victor's xml cleanup script. A big pile of useful hacks. Do not use
without carefully reading the code and deciding that this is what you want.
In particular, the remove-meta option is only intended to be used after pulling out a policy
using the metadata_to_json management command.
"""
import os, fnmatch, re, sys
from lxml import etree
from collections import defaultdict
# Matches any single character that is not a word character, a dot or a hyphen.
INVALID_CHARS = re.compile(r"[^\w.-]")


def clean(value):
    """
    Return value, made into a form legal for locations.

    Every character matched by INVALID_CHARS is replaced with an underscore,
    and then each run of consecutive underscores (including ones that were
    already in value) is collapsed into a single underscore.
    """
    substituted = INVALID_CHARS.sub('_', value)
    collapsed = re.sub('_+', '_', substituted)
    return collapsed
# category -> set of url_names for that category that we've already seen
used_names = defaultdict(set)


def clean_unique(category, name):
    """
    Return a cleaned form of name that has not yet been used in category.

    The name is first passed through clean(). If the result is already
    recorded in used_names[category], the integers 1, 2, 3, ... are appended
    to the cleaned name in turn until an unused candidate is found. The
    returned value is recorded in used_names[category] before returning.
    """
    taken = used_names[category]
    base = clean(name)

    candidate = base
    suffix = 0
    while candidate in taken:
        # Collision: try the next numeric suffix on the original cleaned name.
        suffix += 1
        candidate = base + str(suffix)

    taken.add(candidate)
    return candidate
def cleanup(filepath, remove_meta):
    """
    Rewrite the xml file at filepath in place with normalized attributes.

    For each element in the parsed tree:
      * an existing url_name is recorded in used_names under the element's
        tag, so that names generated later do not collide with it;
      * a name attribute is replaced by a display_name with the same value
        and a url_name produced by clean_unique();
      * a warning is printed if the element has both slug and url_name;
      * if url_name and filename are the only two attributes and they are
        equal, the filename attribute is dropped;
      * if remove_meta is true, every attribute listed in to_remove is
        deleted.

    If the file cannot be parsed, an error line is printed and the file is
    left untouched. Returns None in all cases.
    """
    # Keys that are exported to the policy file, and so
    # can be removed from the xml afterward
    to_remove = ('format', 'display_name',
                 'graceperiod', 'showanswer', 'rerandomize',
                 'start', 'due', 'graded', 'hide_from_toc',
                 'ispublic', 'xqa_key')

    print("Cleaning {}".format(filepath))
    try:
        # remove_comments=False keeps xml comments in the parsed tree.
        parser = etree.XMLParser(remove_comments=False)
        xml = etree.parse(filepath, parser=parser)
    except Exception:
        # Best effort: report the file and move on. Catching Exception
        # (rather than everything) lets Ctrl-C and sys.exit() propagate.
        print("Error parsing file {}".format(filepath))
        return

    for node in xml.iter(tag=etree.Element):
        attrs = node.attrib

        if 'url_name' in attrs:
            used_names[node.tag].add(attrs['url_name'])

        if 'name' in attrs:
            # Replace name with an identical display_name, and a unique url_name
            name = attrs['name']
            attrs['display_name'] = name
            attrs['url_name'] = clean_unique(node.tag, name)
            del attrs['name']

        if 'url_name' in attrs and 'slug' in attrs:
            # Identify the offending element as tag.url_name in the warning.
            element_id = '{}.{}'.format(node.tag, attrs['url_name'])
            print("WARNING: {} has both slug and url_name".format(element_id))

        if ('url_name' in attrs and 'filename' in attrs and
                len(attrs) == 2 and attrs['url_name'] == attrs['filename']):
            # This is a pointer tag in disguise. Get rid of the filename.
            print('turning {}.{} into a pointer tag'.format(node.tag, attrs['url_name']))
            del attrs['filename']

        if remove_meta:
            for attr in to_remove:
                if attr in attrs:
                    del attrs[attr]

    # etree.tostring() returns an encoded byte string, so write it in binary
    # mode to avoid any newline translation or str/bytes mismatch.
    with open(filepath, "wb") as f:
        f.write(etree.tostring(xml))
def find_replace(directory, filePattern, remove_meta):
    """
    Run cleanup() on every matching file found beneath directory.

    The tree rooted at the absolute form of directory is walked with
    os.walk(); in each visited folder, the file names matching the
    fnmatch-style filePattern are passed (as full paths) to cleanup(),
    together with remove_meta.
    """
    root = os.path.abspath(directory)
    for folder, _subfolders, names in os.walk(root):
        matching = fnmatch.filter(names, filePattern)
        for name in matching:
            cleanup(os.path.join(folder, name), remove_meta)
def main(args):
    """
    Command-line entry point.

    args is the argument list without the program name. Accepted forms are
    a single directory, or a directory followed by the literal word
    'remove-meta'. Anything else prints the usage string and returns without
    doing any work. Otherwise every '*.xml' file under the directory is
    processed via find_replace(), with remove_meta enabled only when the
    second argument was given.
    """
    usage = "xml_cleanup [dir] [remove-meta]"
    count = len(args)

    well_formed = count == 1 or (count == 2 and args[1] == 'remove-meta')
    if not well_formed:
        print(usage)
        return

    # A second argument can only be 'remove-meta' at this point.
    find_replace(args[0], '*.xml', count == 2)
# Run the cleanup only when this file is executed as a script; main() gets
# the command-line arguments without the program name.
if __name__ == '__main__':
    main(sys.argv[1:])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment