Commit 45832b98 by Marko Jevtić

Merge pull request #9283 from edx/mjevtic/SOL-1040

(SOL-1040) Indexing capa problems
parents 7b346f7d 8faff973
......@@ -2,6 +2,7 @@
import json
import logging
import sys
import re
from lxml import etree
from pkg_resources import resource_string
......@@ -10,6 +11,7 @@ import dogstats_wrapper as dog_stats_api
from .capa_base import CapaMixin, CapaFields, ComplexEncoder
from capa import responsetypes
from .progress import Progress
from xmodule.util.misc import escape_html_characters
from xmodule.x_module import XModule, module_attr, DEPRECATION_VSCOMPAT_EVENT
from xmodule.raw_module import RawDescriptor
from xmodule.exceptions import NotFoundError, ProcessingError
......@@ -193,16 +195,33 @@ class CapaDescriptor(CapaFields, RawDescriptor):
"""
Return dictionary prepared with module content and type for indexing.
"""
result = super(CapaDescriptor, self).index_dictionary()
if not result:
result = {}
index = {
'content_type': self.INDEX_CONTENT_TYPE,
'problem_types': list(self.problem_types),
"display_name": self.display_name
xblock_body = super(CapaDescriptor, self).index_dictionary()
# Removing solutions and hints, as well as script and style
capa_content = re.sub(
re.compile(
r"""
<solution>.*?</solution> |
<script>.*?</script> |
<style>.*?</style> |
<[a-z]*hint.*?>.*?</[a-z]*hint>
""",
re.DOTALL |
re.VERBOSE),
"",
self.data
)
capa_content = escape_html_characters(capa_content)
capa_body = {
"capa_content": capa_content,
"display_name": self.display_name,
}
result.update(index)
return result
if "content" in xblock_body:
xblock_body["content"].update(capa_body)
else:
xblock_body["content"] = capa_body
xblock_body["content_type"] = self.INDEX_CONTENT_TYPE
xblock_body["problem_types"] = list(self.problem_types)
return xblock_body
def has_support(self, view, functionality):
"""
......
......@@ -10,7 +10,7 @@ from fs.errors import ResourceNotFoundError
from pkg_resources import resource_string
import dogstats_wrapper as dog_stats_api
from xmodule.annotator_mixin import html_to_text
from xmodule.util.misc import escape_html_characters
from xmodule.contentstore.content import StaticContent
from xmodule.editing_module import EditingDescriptor
from xmodule.edxnotes_utils import edxnotes
......@@ -275,12 +275,19 @@ class HtmlDescriptor(HtmlFields, XmlDescriptor, EditingDescriptor): # pylint: d
def index_dictionary(self):
xblock_body = super(HtmlDescriptor, self).index_dictionary()
# Removing HTML-encoded non-breaking space characters
html_content = re.sub(r"(\s|&nbsp;|//)+", " ", html_to_text(self.data))
# Removing HTML CDATA
html_content = re.sub(r"<!\[CDATA\[.*\]\]>", "", html_content)
# Removing HTML comments
html_content = re.sub(r"<!--.*-->", "", html_content)
# Removing script and style
html_content = re.sub(
re.compile(
r"""
<script>.*?</script> |
<style>.*?</style>
""",
re.DOTALL |
re.VERBOSE),
"",
self.data
)
html_content = escape_html_characters(html_content)
html_body = {
"html_content": html_content,
"display_name": self.display_name,
......
......@@ -59,7 +59,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
Make sure that HtmlDescriptor can format data for indexing as expected.
"""
def test_index_dictionary(self):
def test_index_dictionary_simple_html_module(self):
sample_xml = '''
<html>
<p>Hello World!</p>
......@@ -71,6 +71,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
"content_type": "Text"
})
def test_index_dictionary_cdata_html_module(self):
sample_xml_cdata = '''
<html>
<p>This has CDATA in it.</p>
......@@ -83,6 +84,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
"content_type": "Text"
})
def test_index_dictionary_multiple_spaces_html_module(self):
sample_xml_tab_spaces = '''
<html>
<p> Text has spaces :) </p>
......@@ -94,6 +96,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
"content_type": "Text"
})
def test_index_dictionary_html_module_with_comment(self):
sample_xml_comment = '''
<html>
<p>This has HTML comment in it.</p>
......@@ -106,6 +109,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
"content_type": "Text"
})
def test_index_dictionary_html_module_with_both_comments_and_cdata(self):
sample_xml_mix_comment_cdata = '''
<html>
<!-- Beginning of the html -->
......@@ -120,3 +124,23 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
"content": {"html_content": " This has HTML comment in it. HTML end. ", "display_name": "Text"},
"content_type": "Text"
})
def test_index_dictionary_html_module_with_script_and_style_tags(self):
sample_xml_style_script_tags = '''
<html>
<style>p {color: green;}</style>
<!-- Beginning of the html -->
<p>This has HTML comment in it.<!-- Commenting Content --></p>
<!-- Here comes CDATA -->
<![CDATA[This is just a CDATA!]]>
<p>HTML end.</p>
<script>
var message = "Hello world!"
</script>
</html>
'''
descriptor = instantiate_descriptor(data=sample_xml_style_script_tags)
self.assertEqual(descriptor.index_dictionary(), {
"content": {"html_content": " This has HTML comment in it. HTML end. ", "display_name": "Text"},
"content_type": "Text"
})
"""Tests for methods defined in util/misc.py"""
from xmodule.util.misc import escape_html_characters
from unittest import TestCase
class UtilHtmlEscapeTests(TestCase):
"""
Tests for methods exposed in util/misc
"""
final_content = " This is a paragraph. "
def test_escape_html_comments(self):
html_content = """
<!--This is a comment. Comments are not displayed in the browser-->
This is a paragraph.
"""
self.assertEqual(escape_html_characters(html_content), self.final_content)
def test_escape_cdata_comments(self):
html_content = """
<![CDATA[
function matchwo(a,b)
{
if (a < b && a < 0) then
{
return 1;
}
else
{
return 0;
}
}
]]>
This is a paragraph.
"""
self.assertEqual(escape_html_characters(html_content), self.final_content)
def test_escape_non_breaking_space(self):
html_content = """
&nbsp;&nbsp;
&nbsp;
<![CDATA[
function matchwo(a,b)
{
if (a < b && a < 0) then
{
return 1;
}
else
{
return 0;
}
}
]]>
This is a paragraph.&nbsp;
"""
self.assertEqual(escape_html_characters(html_content), self.final_content)
"""
Miscellaneous utility functions.
"""
import re
from xmodule.annotator_mixin import html_to_text
def escape_invalid_characters(name, invalid_char_list, replace_with='_'):
......@@ -24,3 +27,34 @@ def escape_invalid_characters(name, invalid_char_list, replace_with='_'):
if char in name:
name = name.replace(char, replace_with)
return name
def escape_html_characters(content):
"""
Remove HTML characters that shouldn't be indexed using ElasticSearch indexer
This method is complementary to html_to_text method found in xmodule/annotator_mixin.py
Args:
content (str): variable to escape html characters from
Returns:
content (str): content ready to be index by ElasticSearch
"""
# Removing HTML comments
return re.sub(
r"<!--.*-->",
"",
# Removing HTML CDATA
re.sub(
r"<!\[CDATA\[.*\]\]>",
"",
# Removing HTML-encoded non-breaking space characters
re.sub(
r"(\s|&nbsp;|//)+",
" ",
html_to_text(content)
)
)
)
......@@ -80,7 +80,7 @@ class DashboardSearchTest(WebAppTest):
course_fix.add_children(
XBlockFixtureDesc('chapter', 'Section 1').add_children(
XBlockFixtureDesc('sequential', 'Subsection 1').add_children(
XBlockFixtureDesc('problem', 'dashboard search')
XBlockFixtureDesc('problem', 'Test Problem')
)
)
).add_children(
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment