Commit 63edfc6e by Marko Jevtic

Applied suggestion from the 1st review

parent 35e7a733
...@@ -11,7 +11,7 @@ import dogstats_wrapper as dog_stats_api ...@@ -11,7 +11,7 @@ import dogstats_wrapper as dog_stats_api
from .capa_base import CapaMixin, CapaFields, ComplexEncoder from .capa_base import CapaMixin, CapaFields, ComplexEncoder
from capa import responsetypes from capa import responsetypes
from .progress import Progress from .progress import Progress
from xmodule.annotator_mixin import html_to_text from util.misc import escape_html_characters
from xmodule.x_module import XModule, module_attr, DEPRECATION_VSCOMPAT_EVENT from xmodule.x_module import XModule, module_attr, DEPRECATION_VSCOMPAT_EVENT
from xmodule.raw_module import RawDescriptor from xmodule.raw_module import RawDescriptor
from xmodule.exceptions import NotFoundError, ProcessingError from xmodule.exceptions import NotFoundError, ProcessingError
...@@ -210,12 +210,7 @@ class CapaDescriptor(CapaFields, RawDescriptor): ...@@ -210,12 +210,7 @@ class CapaDescriptor(CapaFields, RawDescriptor):
"", "",
self.data self.data
) )
# Removing HTML-encoded non-breaking space characters capa_content = escape_html_characters(capa_content)
capa_content = re.sub(r"(\s| |//)+", " ", html_to_text(capa_content))
# Removing HTML CDATA
capa_content = re.sub(r"<!\[CDATA\[.*\]\]>", "", capa_content)
# Removing HTML comments
capa_content = re.sub(r"<!--.*-->", "", capa_content)
capa_body = { capa_body = {
"capa_content": capa_content, "capa_content": capa_content,
"display_name": self.display_name, "display_name": self.display_name,
......
...@@ -10,7 +10,7 @@ from fs.errors import ResourceNotFoundError ...@@ -10,7 +10,7 @@ from fs.errors import ResourceNotFoundError
from pkg_resources import resource_string from pkg_resources import resource_string
import dogstats_wrapper as dog_stats_api import dogstats_wrapper as dog_stats_api
from xmodule.annotator_mixin import html_to_text from util.misc import escape_html_characters
from xmodule.contentstore.content import StaticContent from xmodule.contentstore.content import StaticContent
from xmodule.editing_module import EditingDescriptor from xmodule.editing_module import EditingDescriptor
from xmodule.edxnotes_utils import edxnotes from xmodule.edxnotes_utils import edxnotes
...@@ -287,12 +287,7 @@ class HtmlDescriptor(HtmlFields, XmlDescriptor, EditingDescriptor): # pylint: d ...@@ -287,12 +287,7 @@ class HtmlDescriptor(HtmlFields, XmlDescriptor, EditingDescriptor): # pylint: d
"", "",
self.data self.data
) )
# Removing HTML-encoded non-breaking space characters html_content = escape_html_characters(html_content)
html_content = re.sub(r"(\s|&nbsp;|//)+", " ", html_to_text(html_content))
# Removing HTML CDATA
html_content = re.sub(r"<!\[CDATA\[.*\]\]>", "", html_content)
# Removing HTML comments
html_content = re.sub(r"<!--.*-->", "", html_content)
html_body = { html_body = {
"html_content": html_content, "html_content": html_content,
"display_name": self.display_name, "display_name": self.display_name,
......
"""Tests for methods defined in util/misc.py"""
from xmodule.util.misc import escape_html_characters
from unittest import TestCase
class UtilHtmlEscapeTests(TestCase):
    """
    Exercises escape_html_characters with comment, CDATA and &nbsp; input.

    Every fixture is expected to reduce to the same plain-text paragraph,
    stored in ``final_content``.
    """

    final_content = " This is a paragraph. "

    def _assert_reduces_to_paragraph(self, markup):
        """Assert that escaping ``markup`` yields exactly ``final_content``."""
        escaped = escape_html_characters(markup)
        self.assertEqual(escaped, self.final_content)

    def test_escape_html_comments(self):
        """An HTML comment ahead of the paragraph is dropped."""
        markup = """
<!--This is a comment. Comments are not displayed in the browser-->
This is a paragraph.
"""
        self._assert_reduces_to_paragraph(markup)

    def test_escape_cdata_comments(self):
        """A CDATA section ahead of the paragraph is dropped."""
        markup = """
<![CDATA[
function matchwo(a,b)
{
if (a < b && a < 0) then
{
return 1;
}
else
{
return 0;
}
}
]]>
This is a paragraph.
"""
        self._assert_reduces_to_paragraph(markup)

    def test_escape_non_breaking_space(self):
        """&nbsp; entities around a CDATA section and the paragraph are dropped."""
        markup = """
&nbsp;&nbsp;
&nbsp;
<![CDATA[
function matchwo(a,b)
{
if (a < b && a < 0) then
{
return 1;
}
else
{
return 0;
}
}
]]>
This is a paragraph.&nbsp;
"""
        self._assert_reduces_to_paragraph(markup)
""" """
Miscellaneous utility functions. Miscellaneous utility functions.
""" """
import re
from xmodule.annotator_mixin import html_to_text
def escape_invalid_characters(name, invalid_char_list, replace_with='_'): def escape_invalid_characters(name, invalid_char_list, replace_with='_'):
...@@ -24,3 +27,33 @@ def escape_invalid_characters(name, invalid_char_list, replace_with='_'): ...@@ -24,3 +27,33 @@ def escape_invalid_characters(name, invalid_char_list, replace_with='_'):
if char in name: if char in name:
name = name.replace(char, replace_with) name = name.replace(char, replace_with)
return name return name
def escape_html_characters(content):
    """
    Remove HTML characters that shouldn't be indexed using ElasticSearch indexer

    This method is complementary to html_to_text method found in
    xmodule/annotator_mixin.py: the content is passed through html_to_text
    first, and the leftovers are cleaned up with regular expressions.

    Args:
        content (str): variable to escape html characters from

    Returns:
        content (str): content ready to be indexed by ElasticSearch
    """
    text = html_to_text(content)

    # Collapse every run of whitespace, "&nbsp;" entities and "//" sequences
    # into one space. This also removes newlines, so the "." in the patterns
    # below can span a whole multi-line CDATA section or comment.
    text = re.sub(r"(\s|&nbsp;|//)+", " ", text)

    # Remove HTML CDATA sections. The match is non-greedy so that text sitting
    # between two separate sections is kept instead of being swallowed by a
    # single match running from the first opener to the last closer.
    text = re.sub(r"<!\[CDATA\[.*?\]\]>", "", text)

    # Remove HTML comments (non-greedy for the same reason as above).
    text = re.sub(r"<!--.*?-->", "", text)

    return text
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment