Commit 63b3999e by Victor Shnayder

Big refactor of content_parser

* Separate out all xml processing code from django code
* Preparing to move the content parsing parts into common/lib/xmodule (?)
parent 8ca56ed4
......@@ -38,29 +38,6 @@ log = logging.getLogger("mitx.courseware")
def format_url_params(params):
return [ urllib.quote(string.replace(' ','_')) for string in params ]
def xpath(xml, query_string, **args):
''' Safe xpath query into an xml tree:
* xml is the tree.
* query_string is the query
* args are the parameters. Substitute for {params}.
We should remove this with the move to lxml.
We should also use lxml argument passing. '''
doc = etree.fromstring(xml)
#print type(doc)
def escape(x):
# TODO: This should escape the string. For now, we just assume it's made of valid characters.
# Couldn't figure out how to escape for lxml in a few quick Googles
valid_chars="".join(map(chr, range(ord('a'),ord('z')+1)+range(ord('A'),ord('Z')+1)+range(ord('0'), ord('9')+1)))+"_ "
for e in x:
if e not in valid_chars:
raise Exception("Invalid char in xpath expression. TODO: Escape")
return x
args=dict( ((k, escape(args[k])) for k in args) )
#print args
results = doc.xpath(query_string.format(**args))
return results
def xpath_remove(tree, path):
''' Remove all items matching path from lxml tree. Works in
place.'''
......@@ -69,35 +46,34 @@ def xpath_remove(tree, path):
item.getparent().remove(item)
return tree
if __name__=='__main__':
print xpath('<html><problem name="Bob"></problem></html>', '/{search}/problem[@name="{name}"]',
search='html', name="Bob")
def id_tag(course):
''' Tag all course elements with unique IDs '''
default_ids = xmodule.get_default_ids()
# Tag elements with unique IDs
elements = course.xpath("|".join(['//'+c for c in default_ids]))
elements = course.xpath("|".join('//' + c for c in default_ids))
for elem in elements:
if elem.get('id'):
pass
elif elem.get(default_ids[elem.tag]):
new_id = elem.get(default_ids[elem.tag])
new_id = "".join([a for a in new_id if a.isalnum()]) # Convert to alphanumeric
# Without this, a conflict may occur between an hmtl or youtube id
new_id = elem.get(default_ids[elem.tag])
# Convert to alphanumeric
new_id = "".join(a for a in new_id if a.isalnum())
# Without this, a conflict may occur between an html or youtube id
new_id = default_ids[elem.tag] + new_id
elem.set('id', new_id)
else:
elem.set('id', "id"+fasthash(etree.tostring(elem)))
elem.set('id', "id" + fasthash(etree.tostring(elem)))
def propogate_downward_tag(element, attribute_name, parent_attribute = None):
''' This call is to pass down an attribute to all children. If an element
has this attribute, it will be "inherited" by all of its children. If a
child (A) already has that attribute, A will keep the same attribute and
all of A's children will inherit A's attribute. This is a recursive call.'''
if (parent_attribute is None): #This is the entry call. Select all elements with this attribute
if (parent_attribute is None):
#This is the entry call. Select all elements with this attribute
all_attributed_elements = element.xpath("//*[@" + attribute_name +"]")
for attributed_element in all_attributed_elements:
attribute_value = attributed_element.get(attribute_name)
......@@ -118,32 +94,6 @@ def propogate_downward_tag(element, attribute_name, parent_attribute = None):
#to its children later.
return
def user_groups(user):
if not user.is_authenticated():
return []
# TODO: Rewrite in Django
key = 'user_group_names_{user.id}'.format(user=user)
cache_expiration = 60 * 60 # one hour
# Kill caching on dev machines -- we switch groups a lot
group_names = cache.get(key)
if group_names is None:
group_names = [u.name for u in UserTestGroup.objects.filter(users=user)]
cache.set(key, group_names, cache_expiration)
return group_names
# return [u.name for u in UserTestGroup.objects.raw("select * from auth_user, student_usertestgroup, student_usertestgroup_users where auth_user.id = student_usertestgroup_users.user_id and student_usertestgroup_users.usertestgroup_id = student_usertestgroup.id and auth_user.id = %s", [user.id])]
def replace_custom_tags(tree):
tags = os.listdir(settings.DATA_DIR+'/custom_tags')
for tag in tags:
for element in tree.iter(tag):
element.tag = 'customtag'
impl = etree.SubElement(element, 'impl')
impl.text = tag
def course_xml_process(tree):
''' Do basic pre-processing of an XML tree. Assign IDs to all
......@@ -159,130 +109,249 @@ def course_xml_process(tree):
propogate_downward_tag(tree, "rerandomize")
return tree
def course_file(user,coursename=None):
''' Given a user, return course.xml'''
if user.is_authenticated():
filename = UserProfile.objects.get(user=user).courseware # user.profile_cache.courseware
else:
filename = 'guest_course.xml'
def toc_from_xml(dom, active_chapter, active_section):
'''
Create a table of contents from the course xml.
# if a specific course is specified, then use multicourse to get the right path to the course XML directory
if coursename and settings.ENABLE_MULTICOURSE:
xp = multicourse_settings.get_course_xmlpath(coursename)
filename = xp + filename # prefix the filename with the path
Return format:
[ {'name': name, 'sections': SECTIONS, 'active': bool}, ... ]
groups = user_groups(user)
options = {'dev_content':settings.DEV_CONTENT,
'groups' : groups}
where SECTIONS is a list
[ {'name': name, 'format': format, 'due': due, 'active' : bool}, ...]
cache_key = filename + "_processed?dev_content:" + str(options['dev_content']) + "&groups:" + str(sorted(groups))
if "dev" not in settings.DEFAULT_GROUPS:
tree_string = cache.get(cache_key)
else:
tree_string = None
active is set for the section and chapter corresponding to the passed
parameters. Everything else comes from the xml, or defaults to "".
if settings.DEBUG:
log.info('[courseware.content_parser.course_file] filename=%s, cache_key=%s' % (filename,cache_key))
# print '[courseware.content_parser.course_file] tree_string = ',tree_string
chapters with name 'hidden' are skipped.
'''
name = dom.xpath('//course/@name')[0]
if not tree_string:
tree = course_xml_process(etree.XML(render_to_string(filename, options, namespace = 'course')))
tree_string = etree.tostring(tree)
cache.set(cache_key, tree_string, 60)
else:
tree = etree.XML(tree_string)
chapters = dom.xpath('//course[@name=$name]/chapter', name=name)
ch = list()
for c in chapters:
if c.get('name') == 'hidden':
continue
sections = list()
for s in dom.xpath('//course[@name=$name]/chapter[@name=$chname]/section',
name=name, chname=c.get('name')):
format = s.get("subtitle") if s.get("subtitle") else s.get("format") or ""
active = (c.get("name") == active_chapter and
s.get("name") == active_section)
sections.append({'name': s.get("name") or "",
'format': format,
'due': s.get("due") or "",
'active': active})
ch.append({'name': c.get("name"),
'sections': sections,
'active': c.get("name") == active_chapter})
return ch
return tree
def section_file(user, section, coursename=None, dironly=False):
def replace_custom_tags_dir(tree, dir):
'''
Given a user and the name of a section, return that section.
This is done specific to each course.
If dironly=True then return the sections directory.
TODO: This is a bit weird; dironly should be scrapped.
Process tree to replace all custom tags defined in dir.
'''
filename = section+".xml"
tags = os.listdir(dir)
for tag in tags:
for element in tree.iter(tag):
element.tag = 'customtag'
impl = etree.SubElement(element, 'impl')
impl.text = tag
# if a specific course is specified, then use multicourse to get the right path to the course XML directory
xp = ''
if coursename and settings.ENABLE_MULTICOURSE: xp = multicourse_settings.get_course_xmlpath(coursename)
def parse_course_file(filename, options, namespace):
'''
Parse a course file with the given options, and return the resulting
xml tree object.
Options should be a dictionary including keys
'dev_content': bool,
'groups' : [list, of, user, groups]
'''
xml = etree.XML(render_to_string(filename, options, namespace = 'course'))
return course_xml_process(xml)
dirname = settings.DATA_DIR + xp + '/sections/'
if dironly: return dirname
def get_section(section, options, dirname):
'''
Given the name of a section, an options dict containing keys
'dev_content' and 'groups', and a directory to look in,
returns the xml tree for the section, or None if there's no
such section.
'''
filename = section + ".xml"
if filename not in os.listdir(dirname):
log.error(filename+" not in "+str(os.listdir(dirname)))
log.error(filename + " not in " + str(os.listdir(dirname)))
return None
options = {'dev_content':settings.DEV_CONTENT,
'groups' : user_groups(user)}
tree = course_xml_process(etree.XML(render_to_string(filename, options, namespace = 'sections')))
tree = parse_course_file(filename, options, namespace='sections')
return tree
def module_xml(user, module, id_tag, module_id, coursename=None):
''' Get XML for a module based on module and module_id. Assumes
module occurs once in courseware XML file or hidden section. '''
# Sanitize input
def get_module(tree, module, id_tag, module_id, sections_dirname, options):
'''
Given the xml tree of the course, get the xml string for a module
with the specified module type, id_tag, module_id. Looks in
sections_dirname for sections.
'''
# Sanitize input
if not module.isalnum():
raise Exception("Module is not alphanumeric")
if not module_id.isalnum():
raise Exception("Module ID is not alphanumeric")
# Generate search
xpath_search='//{module}[(@{id_tag} = "{id}") or (@id = "{id}")]'.format(module=module,
id_tag=id_tag,
id=module_id)
#result_set=doc.xpathEval(xpath_search)
doc = course_file(user,coursename)
sdirname = section_file(user,'',coursename,True) # get directory where sections information is stored
section_list = (s[:-4] for s in os.listdir(sdirname) if s[-4:]=='.xml')
result_set=doc.xpath(xpath_search)
if len(result_set)<1:
xpath_search='//{module}[(@{id_tag} = "{id}") or (@id = "{id}")]'.format(
module=module,
id_tag=id_tag,
id=module_id)
result_set = tree.xpath(xpath_search)
if len(result_set) < 1:
# Not found in main tree. Let's look in the section files.
section_list = (s[:-4] for s in os.listdir(sections_dirname) if s[-4:]=='.xml')
for section in section_list:
try:
s = section_file(user, section, coursename)
s = get_section(section, options, sections_dirname)
except etree.XMLSyntaxError:
ex= sys.exc_info()
raise ContentException("Malformed XML in " + section+ "("+str(ex[1].msg)+")")
ex = sys.exc_info()
raise ContentException("Malformed XML in " + section +
"(" + str(ex[1].msg) + ")")
result_set = s.xpath(xpath_search)
if len(result_set) != 0:
if len(result_set) != 0:
break
if len(result_set)>1:
if len(result_set) > 1:
log.error("WARNING: Potentially malformed course file", module, module_id)
if len(result_set)==0:
if settings.DEBUG:
log.error('[courseware.content_parser.module_xml] cannot find %s in course.xml tree' % xpath_search)
log.error('tree = %s' % etree.tostring(doc,pretty_print=True))
log.error('[content_parser.get_module] cannot find %s in course.xml tree',
xpath_search)
log.error('tree = %s' % etree.tostring(tree, pretty_print=True))
return None
if settings.DEBUG:
log.info('[courseware.content_parser.module_xml] found %s' % result_set)
# log.debug('[courseware.content_parser.module_xml] found %s' % result_set)
return etree.tostring(result_set[0])
#return result_set[0].serialize()
def toc_from_xml(dom, active_chapter, active_section):
name = dom.xpath('//course/@name')[0]
chapters = dom.xpath('//course[@name=$name]/chapter', name=name)
ch=list()
for c in chapters:
if c.get('name') == 'hidden':
continue
sections=list()
for s in dom.xpath('//course[@name=$name]/chapter[@name=$chname]/section', name=name, chname=c.get('name')):
sections.append({'name':s.get("name") or "",
'format':s.get("subtitle") if s.get("subtitle") else s.get("format") or "",
'due':s.get("due") or "",
'active':(c.get("name")==active_chapter and \
s.get("name")==active_section)})
ch.append({'name':c.get("name"),
'sections':sections,
'active':(c.get("name")==active_chapter)})
return ch
# ==== All Django-specific code below =============================================
def user_groups(user):
if not user.is_authenticated():
return []
# TODO: Rewrite in Django
key = 'user_group_names_{user.id}'.format(user=user)
cache_expiration = 60 * 60 # one hour
# Kill caching on dev machines -- we switch groups a lot
group_names = cache.get(key)
if group_names is None:
group_names = [u.name for u in UserTestGroup.objects.filter(users=user)]
cache.set(key, group_names, cache_expiration)
return group_names
def get_options(user):
return {'dev_content': settings.DEV_CONTENT,
'groups': user_groups(user)}
def replace_custom_tags(tree):
'''Replace custom tags defined in our custom_tags dir'''
replace_custom_tags_dir(tree, settings.DATA_DIR+'/custom_tags')
def course_file(user, coursename=None):
''' Given a user, return an xml tree object for the course file.
Handles getting the right file, and processing it depending on the
groups the user is in. Does caching of the xml strings.
'''
if user.is_authenticated():
# use user.profile_cache.courseware?
filename = UserProfile.objects.get(user=user).courseware
else:
filename = 'guest_course.xml'
# if a specific course is specified, then use multicourse to get
# the right path to the course XML directory
if coursename and settings.ENABLE_MULTICOURSE:
xp = multicourse_settings.get_course_xmlpath(coursename)
filename = xp + filename # prefix the filename with the path
groups = user_groups(user)
options = get_options(user)
# Try the cache...
cache_key = "{0}_processed?dev_content:{1}&groups:{2}".format(
filename,
options['dev_content'],
sorted(groups))
if "dev" in settings.DEFAULT_GROUPS:
tree_string = None
else:
tree_string = cache.get(cache_key)
if tree_string:
tree = etree.XML(tree_string)
else:
tree = parse_course_file(filename, options, namespace='course')
# Cache it
tree_string = etree.tostring(tree)
cache.set(cache_key, tree_string, 60)
return tree
def sections_dir(coursename=None):
''' Get directory where sections information is stored.
'''
# if a specific course is specified, then use multicourse to get the
# right path to the course XML directory
xp = ''
if coursename and settings.ENABLE_MULTICOURSE:
xp = multicourse_settings.get_course_xmlpath(coursename)
return settings.DATA_DIR + xp + '/sections/'
def section_file(user, section, coursename=None):
'''
Given a user and the name of a section, return that section.
This is done specific to each course.
Returns the xml tree for the section, or None if there's no such section.
'''
dirname = sections_dir(coursename)
return get_section(section, options, dirname)
def module_xml(user, module, id_tag, module_id, coursename=None):
''' Get XML for a module based on module and module_id. Assumes
module occurs once in courseware XML file or hidden section.
'''
tree = course_file(user, coursename)
sdirname = sections_dir(coursename)
options = get_options(user)
return get_module(tree, module, id_tag, module_id, sdirname, options)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment