Commit 63edfc6e by Marko Jevtic

Applied suggestion from the 1st review

parent 35e7a733
......@@ -11,7 +11,7 @@ import dogstats_wrapper as dog_stats_api
from .capa_base import CapaMixin, CapaFields, ComplexEncoder
from capa import responsetypes
from .progress import Progress
from xmodule.annotator_mixin import html_to_text
from util.misc import escape_html_characters
from xmodule.x_module import XModule, module_attr, DEPRECATION_VSCOMPAT_EVENT
from xmodule.raw_module import RawDescriptor
from xmodule.exceptions import NotFoundError, ProcessingError
......@@ -210,12 +210,7 @@ class CapaDescriptor(CapaFields, RawDescriptor):
"",
self.data
)
# Removing HTML-encoded non-breaking space characters
capa_content = re.sub(r"(\s| |//)+", " ", html_to_text(capa_content))
# Removing HTML CDATA
capa_content = re.sub(r"<!\[CDATA\[.*\]\]>", "", capa_content)
# Removing HTML comments
capa_content = re.sub(r"<!--.*-->", "", capa_content)
capa_content = escape_html_characters(capa_content)
capa_body = {
"capa_content": capa_content,
"display_name": self.display_name,
......
......@@ -10,7 +10,7 @@ from fs.errors import ResourceNotFoundError
from pkg_resources import resource_string
import dogstats_wrapper as dog_stats_api
from xmodule.annotator_mixin import html_to_text
from util.misc import escape_html_characters
from xmodule.contentstore.content import StaticContent
from xmodule.editing_module import EditingDescriptor
from xmodule.edxnotes_utils import edxnotes
......@@ -287,12 +287,7 @@ class HtmlDescriptor(HtmlFields, XmlDescriptor, EditingDescriptor): # pylint: d
"",
self.data
)
# Removing HTML-encoded non-breaking space characters
html_content = re.sub(r"(\s|&nbsp;|//)+", " ", html_to_text(html_content))
# Removing HTML CDATA
html_content = re.sub(r"<!\[CDATA\[.*\]\]>", "", html_content)
# Removing HTML comments
html_content = re.sub(r"<!--.*-->", "", html_content)
html_content = escape_html_characters(html_content)
html_body = {
"html_content": html_content,
"display_name": self.display_name,
......
"""Tests for methods defined in util/misc.py"""
from xmodule.util.misc import escape_html_characters
from unittest import TestCase
class UtilHtmlEscapeTests(TestCase):
    """
    Exercises escape_html_characters against markup that should all reduce
    to the same plain paragraph text.
    """

    # Every fixture below is expected to collapse to exactly this string.
    final_content = " This is a paragraph. "

    def _check_escaped(self, markup):
        # Shared assertion: escaping `markup` must yield `final_content`.
        self.assertEqual(escape_html_characters(markup), self.final_content)

    def test_escape_html_comments(self):
        # An HTML comment placed ahead of the paragraph.
        markup = """
<!--This is a comment. Comments are not displayed in the browser-->
This is a paragraph.
"""
        self._check_escaped(markup)

    def test_escape_cdata_comments(self):
        # A multi-line CDATA section placed ahead of the paragraph.
        markup = """
<![CDATA[
function matchwo(a,b)
{
if (a < b && a < 0) then
{
return 1;
}
else
{
return 0;
}
}
]]>
This is a paragraph.
"""
        self._check_escaped(markup)

    def test_escape_non_breaking_space(self):
        # "&nbsp;" entities before and after the paragraph, combined with a
        # CDATA section.
        markup = """
&nbsp;&nbsp;
&nbsp;
<![CDATA[
function matchwo(a,b)
{
if (a < b && a < 0) then
{
return 1;
}
else
{
return 0;
}
}
]]>
This is a paragraph.&nbsp;
"""
        self._check_escaped(markup)
"""
Miscellaneous utility functions.
"""
import re
from xmodule.annotator_mixin import html_to_text
def escape_invalid_characters(name, invalid_char_list, replace_with='_'):
......@@ -24,3 +27,33 @@ def escape_invalid_characters(name, invalid_char_list, replace_with='_'):
if char in name:
name = name.replace(char, replace_with)
return name
def escape_html_characters(content):
    """
    Remove HTML characters that shouldn't be indexed using ElasticSearch indexer

    This method is complementary to html_to_text method found in
    xmodule/annotator_mixin.py: the content is first passed through
    html_to_text, then whitespace is normalised and any remaining CDATA
    sections and HTML comments are removed.

    Args:
        content (str): variable to escape html characters from

    Returns:
        content (str): content ready to be indexed by ElasticSearch
    """
    # Collapse every run of whitespace, HTML-encoded non-breaking spaces
    # ("&nbsp;") and "//" into a single space. Newlines become spaces here,
    # so the patterns below never need to match across line breaks.
    text = re.sub(r"(\s|&nbsp;|//)+", " ", html_to_text(content))

    # Removing HTML CDATA. The match is non-greedy so that each section is
    # removed on its own; a greedy ".*" would run from the first "<![CDATA["
    # to the last "]]>" and delete the real content between two sections.
    text = re.sub(r"<!\[CDATA\[.*?\]\]>", "", text)

    # Removing HTML comments, non-greedy for the same reason as above.
    return re.sub(r"<!--.*?-->", "", text)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment