Commit ed35cefa by Victor Shnayder

Fix html file handling.

* html files are now stored as follows:

If the html file is valid xml, store as html/stuff.xml

If it's not, store as html/stuff.xml, which contains
<html meta1="..."  filename="stuff.html">,
and html/stuff.html, which actually contains the contents.
Warn if the contents are not parseable with lxml's html parser,
but don't error.

* for parseable html, strip out the html tag when storing, so that it isn't
  rendered into the middle of a page

* lots of backcompat to deal with paths.  Can go away soon.

* fix output ordering in clean_xml
parent 46775386
from lxml import etree
def check_html(html):
'''
Check whether the passed in html string can be parsed by lxml.
Return bool success.
'''
parser = etree.HTMLParser()
try:
etree.fromstring(html, parser)
return True
except Exception as err:
pass
return False
from itertools import chain
from lxml import etree
def stringify_children(node):
'''
Return all contents of an xml tree, without the outside tags.
e.g. if node is parse of
"<html a="b" foo="bar">Hi <div>there <span>Bruce</span><b>!</b></div><html>"
should return
"Hi <div>there <span>Bruce</span><b>!</b></div>"
fixed from
http://stackoverflow.com/questions/4624062/get-all-text-inside-a-tag-in-lxml
'''
parts = ([node.text] +
list(chain(*([etree.tostring(c), c.tail]
for c in node.getchildren())
)))
# filter removes possible Nones in texts and tails
return ''.join(filter(None, parts))
from nose.tools import assert_equals
from lxml import etree
from stringify import stringify_children
def test_stringify():
html = '''<html a="b" foo="bar">Hi <div x="foo">there <span>Bruce</span><b>!</b></div></html>'''
xml = etree.fromstring(html)
out = stringify_children(xml)
assert_equals(out, '''Hi <div x="foo">there <span>Bruce</span><b>!</b></div>''')
import copy
from fs.errors import ResourceNotFoundError
import logging import logging
import os import os
from lxml import etree from lxml import etree
from xmodule.x_module import XModule from xmodule.x_module import XModule
from xmodule.raw_module import RawDescriptor from xmodule.xml_module import XmlDescriptor
from xmodule.editing_module import EditingDescriptor
from stringify import stringify_children
from html_checker import check_html
log = logging.getLogger("mitx.courseware") log = logging.getLogger("mitx.courseware")
class HtmlModule(XModule): class HtmlModule(XModule):
def get_html(self): def get_html(self):
return self.html return self.html
...@@ -19,33 +23,108 @@ class HtmlModule(XModule): ...@@ -19,33 +23,108 @@ class HtmlModule(XModule):
self.html = self.definition['data'] self.html = self.definition['data']
class HtmlDescriptor(RawDescriptor): class HtmlDescriptor(XmlDescriptor, EditingDescriptor):
""" """
Module for putting raw html in a course Module for putting raw html in a course
""" """
mako_template = "widgets/html-edit.html" mako_template = "widgets/html-edit.html"
module_class = HtmlModule module_class = HtmlModule
filename_extension = "html" filename_extension = "xml"
# TODO (cpennington): Delete this method once all fall 2012 course are being # VS[compat] TODO (cpennington): Delete this method once all fall 2012 course
# edited in the cms # are being edited in the cms
@classmethod @classmethod
def backcompat_paths(cls, path): def backcompat_paths(cls, path):
if path.endswith('.html.html'): origpath = path
path = path[:-5] if path.endswith('.html.xml'):
path = path[:-9] + '.html' #backcompat--look for html instead of xml
candidates = [] candidates = []
while os.sep in path: while os.sep in path:
candidates.append(path) candidates.append(path)
_, _, path = path.partition(os.sep) _, _, path = path.partition(os.sep)
# also look for .html versions instead of .xml
if origpath.endswith('.xml'):
candidates.append(origpath[:-4] + '.html')
return candidates return candidates
# NOTE: html descriptors are special. We do not want to parse and
# export them ourselves, because that can break things (e.g. lxml
# adds body tags when it exports, but they should just be html
# snippets that will be included in the middle of pages.
@classmethod @classmethod
def file_to_xml(cls, file_object): def definition_loader(cls, xml_object, system):
parser = etree.HTMLParser() '''Load a descriptor from the specified xml_object:
return etree.parse(file_object, parser).getroot()
If there is a filename attribute, load it as a string, and
log a warning if it is not parseable by etree.HTMLParser.
If there is not a filename attribute, the definition is the body
of the xml_object, without the root tag (do not want <html> in the
middle of a page)
'''
filename = xml_object.get('filename')
if filename is None:
definition_xml = copy.deepcopy(xml_object)
cls.clean_metadata_from_xml(definition_xml)
return {'data' : stringify_children(definition_xml)}
else:
filepath = cls._format_filepath(xml_object.tag, filename)
# VS[compat]
# TODO (cpennington): If the file doesn't exist at the right path,
# give the class a chance to fix it up. The file will be written out
# again in the correct format. This should go away once the CMS is
# online and has imported all current (fall 2012) courses from xml
if not system.resources_fs.exists(filepath):
candidates = cls.backcompat_paths(filepath)
#log.debug("candidates = {0}".format(candidates))
for candidate in candidates:
if system.resources_fs.exists(candidate):
filepath = candidate
break
try:
with system.resources_fs.open(filepath) as file:
html = file.read()
# Log a warning if we can't parse the file, but don't error
if not check_html(html):
log.warning("Couldn't parse html in {0}.".format(filepath))
return {'data' : html}
except (ResourceNotFoundError) as err:
msg = 'Unable to load file contents at path {0}: {1} '.format(
filepath, err)
log.error(msg)
raise
@classmethod @classmethod
def split_to_file(cls, xml_object): def split_to_file(cls, xml_object):
# never include inline html '''Never include inline html'''
return True return True
# TODO (vshnayder): make export put things in the right places.
def definition_to_xml(self, resource_fs):
'''If the contents are valid xml, write them to filename.xml. Otherwise,
write just the <html filename=""> tag to filename.xml, and the html
string to filename.html.
'''
try:
return etree.fromstring(self.definition['data'])
except etree.XMLSyntaxError:
pass
# Not proper format. Write html to file, return an empty tag
filepath = u'{category}/{name}.html'.format(category=self.category,
name=self.name)
resource_fs.makedir(os.path.dirname(filepath), allow_recreate=True)
with resource_fs.open(filepath, 'w') as file:
file.write(self.definition['data'])
elt = etree.Element('html')
elt.set("filename", self.name)
return elt
...@@ -163,6 +163,52 @@ class XmlDescriptor(XModuleDescriptor): ...@@ -163,6 +163,52 @@ class XmlDescriptor(XModuleDescriptor):
return etree.parse(file_object).getroot() return etree.parse(file_object).getroot()
@classmethod @classmethod
def definition_loader(cls, xml_object, system, location):
'''Load a descriptor definition from the specified xml_object.
Subclasses should not need to override this except in special
cases (e.g. html module)'''
filename = xml_object.get('filename')
if filename is None:
definition_xml = copy.deepcopy(xml_object)
else:
filepath = cls._format_filepath(xml_object.tag, filename)
# VS[compat]
# TODO (cpennington): If the file doesn't exist at the right path,
# give the class a chance to fix it up. The file will be written out again
# in the correct format.
# This should go away once the CMS is online and has imported all current (fall 2012)
# courses from xml
if not system.resources_fs.exists(filepath) and hasattr(cls, 'backcompat_paths'):
candidates = cls.backcompat_paths(filepath)
for candidate in candidates:
if system.resources_fs.exists(candidate):
filepath = candidate
break
try:
with system.resources_fs.open(filepath) as file:
definition_xml = cls.file_to_xml(file)
except (ResourceNotFoundError, etree.XMLSyntaxError):
msg = 'Unable to load file contents at path %s for item %s' % (
filepath, location.url())
log.exception(msg)
system.error_handler(msg)
# if error_handler didn't reraise, work around problem.
error_elem = etree.Element('error')
message_elem = etree.SubElement(error_elem, 'error_message')
message_elem.text = msg
stack_elem = etree.SubElement(error_elem, 'stack_trace')
stack_elem.text = traceback.format_exc()
return {'data': etree.tostring(error_elem)}
cls.clean_metadata_from_xml(definition_xml)
return cls.definition_from_xml(definition_xml, system)
@classmethod
def from_xml(cls, xml_data, system, org=None, course=None): def from_xml(cls, xml_data, system, org=None, course=None):
""" """
Creates an instance of this descriptor from the supplied xml_data. Creates an instance of this descriptor from the supplied xml_data.
...@@ -191,44 +237,8 @@ class XmlDescriptor(XModuleDescriptor): ...@@ -191,44 +237,8 @@ class XmlDescriptor(XModuleDescriptor):
metadata[attr_map.metadata_key] = attr_map.to_metadata(val) metadata[attr_map.metadata_key] = attr_map.to_metadata(val)
return metadata return metadata
def definition_loader(): definition = cls.definition_loader(xml_object, system, location)
filename = xml_object.get('filename') metadata = metadata_loader()
if filename is None:
definition_xml = copy.deepcopy(xml_object)
else:
filepath = cls._format_filepath(xml_object.tag, filename)
# VS[compat]
# TODO (cpennington): If the file doesn't exist at the right path,
# give the class a chance to fix it up. The file will be written out again
# in the correct format.
# This should go away once the CMS is online and has imported all current (fall 2012)
# courses from xml
if not system.resources_fs.exists(filepath) and hasattr(cls, 'backcompat_paths'):
candidates = cls.backcompat_paths(filepath)
for candidate in candidates:
if system.resources_fs.exists(candidate):
filepath = candidate
break
try:
with system.resources_fs.open(filepath) as file:
definition_xml = cls.file_to_xml(file)
except (ResourceNotFoundError, etree.XMLSyntaxError):
msg = 'Unable to load file contents at path %s for item %s' % (filepath, location.url())
log.exception(msg)
system.error_handler(msg)
# if error_handler didn't reraise, work around problem.
error_elem = etree.Element('error')
message_elem = etree.SubElement(error_elem, 'error_message')
message_elem.text = msg
stack_elem = etree.SubElement(error_elem, 'stack_trace')
stack_elem.text = traceback.format_exc()
return {'data': etree.tostring(error_elem)}
cls.clean_metadata_from_xml(definition_xml)
return cls.definition_from_xml(definition_xml, system)
# VS[compat] -- just have the url_name lookup once translation is done # VS[compat] -- just have the url_name lookup once translation is done
slug = xml_object.get('url_name', xml_object.get('slug')) slug = xml_object.get('url_name', xml_object.get('slug'))
return cls( return cls(
...@@ -283,7 +293,7 @@ class XmlDescriptor(XModuleDescriptor): ...@@ -283,7 +293,7 @@ class XmlDescriptor(XModuleDescriptor):
# Write it to a file if necessary # Write it to a file if necessary
if self.split_to_file(xml_object): if self.split_to_file(xml_object):
# Put this object in it's own file # Put this object in its own file
filepath = self.__class__._format_filepath(self.category, self.name) filepath = self.__class__._format_filepath(self.category, self.name)
resource_fs.makedir(os.path.dirname(filepath), allow_recreate=True) resource_fs.makedir(os.path.dirname(filepath), allow_recreate=True)
with resource_fs.open(filepath, 'w') as file: with resource_fs.open(filepath, 'w') as file:
......
...@@ -143,6 +143,7 @@ def check_roundtrip(course_dir): ...@@ -143,6 +143,7 @@ def check_roundtrip(course_dir):
# dircmp doesn't do recursive diffs. # dircmp doesn't do recursive diffs.
# diff = dircmp(course_dir, export_dir, ignore=[], hide=[]) # diff = dircmp(course_dir, export_dir, ignore=[], hide=[])
print "======== Roundtrip diff: =========" print "======== Roundtrip diff: ========="
sys.stdout.flush() # needed to make diff appear in the right place
os.system("diff -r {0} {1}".format(course_dir, export_dir)) os.system("diff -r {0} {1}".format(course_dir, export_dir))
print "======== ideally there is no diff above this =======" print "======== ideally there is no diff above this ======="
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment