Commit 45832b98 by Marko Jevtić

Merge pull request #9283 from edx/mjevtic/SOL-1040

(SOL-1040) Indexing capa problems
parents 7b346f7d 8faff973
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
import json import json
import logging import logging
import sys import sys
import re
from lxml import etree from lxml import etree
from pkg_resources import resource_string from pkg_resources import resource_string
...@@ -10,6 +11,7 @@ import dogstats_wrapper as dog_stats_api ...@@ -10,6 +11,7 @@ import dogstats_wrapper as dog_stats_api
from .capa_base import CapaMixin, CapaFields, ComplexEncoder from .capa_base import CapaMixin, CapaFields, ComplexEncoder
from capa import responsetypes from capa import responsetypes
from .progress import Progress from .progress import Progress
from xmodule.util.misc import escape_html_characters
from xmodule.x_module import XModule, module_attr, DEPRECATION_VSCOMPAT_EVENT from xmodule.x_module import XModule, module_attr, DEPRECATION_VSCOMPAT_EVENT
from xmodule.raw_module import RawDescriptor from xmodule.raw_module import RawDescriptor
from xmodule.exceptions import NotFoundError, ProcessingError from xmodule.exceptions import NotFoundError, ProcessingError
...@@ -193,16 +195,33 @@ class CapaDescriptor(CapaFields, RawDescriptor): ...@@ -193,16 +195,33 @@ class CapaDescriptor(CapaFields, RawDescriptor):
""" """
Return dictionary prepared with module content and type for indexing. Return dictionary prepared with module content and type for indexing.
""" """
result = super(CapaDescriptor, self).index_dictionary() xblock_body = super(CapaDescriptor, self).index_dictionary()
if not result: # Removing solutions and hints, as well as script and style
result = {} capa_content = re.sub(
index = { re.compile(
'content_type': self.INDEX_CONTENT_TYPE, r"""
'problem_types': list(self.problem_types), <solution>.*?</solution> |
"display_name": self.display_name <script>.*?</script> |
<style>.*?</style> |
<[a-z]*hint.*?>.*?</[a-z]*hint>
""",
re.DOTALL |
re.VERBOSE),
"",
self.data
)
capa_content = escape_html_characters(capa_content)
capa_body = {
"capa_content": capa_content,
"display_name": self.display_name,
} }
result.update(index) if "content" in xblock_body:
return result xblock_body["content"].update(capa_body)
else:
xblock_body["content"] = capa_body
xblock_body["content_type"] = self.INDEX_CONTENT_TYPE
xblock_body["problem_types"] = list(self.problem_types)
return xblock_body
def has_support(self, view, functionality): def has_support(self, view, functionality):
""" """
......
...@@ -10,7 +10,7 @@ from fs.errors import ResourceNotFoundError ...@@ -10,7 +10,7 @@ from fs.errors import ResourceNotFoundError
from pkg_resources import resource_string from pkg_resources import resource_string
import dogstats_wrapper as dog_stats_api import dogstats_wrapper as dog_stats_api
from xmodule.annotator_mixin import html_to_text from xmodule.util.misc import escape_html_characters
from xmodule.contentstore.content import StaticContent from xmodule.contentstore.content import StaticContent
from xmodule.editing_module import EditingDescriptor from xmodule.editing_module import EditingDescriptor
from xmodule.edxnotes_utils import edxnotes from xmodule.edxnotes_utils import edxnotes
...@@ -275,12 +275,19 @@ class HtmlDescriptor(HtmlFields, XmlDescriptor, EditingDescriptor): # pylint: d ...@@ -275,12 +275,19 @@ class HtmlDescriptor(HtmlFields, XmlDescriptor, EditingDescriptor): # pylint: d
def index_dictionary(self): def index_dictionary(self):
xblock_body = super(HtmlDescriptor, self).index_dictionary() xblock_body = super(HtmlDescriptor, self).index_dictionary()
# Removing HTML-encoded non-breaking space characters # Removing script and style
html_content = re.sub(r"(\s|&nbsp;|//)+", " ", html_to_text(self.data)) html_content = re.sub(
# Removing HTML CDATA re.compile(
html_content = re.sub(r"<!\[CDATA\[.*\]\]>", "", html_content) r"""
# Removing HTML comments <script>.*?</script> |
html_content = re.sub(r"<!--.*-->", "", html_content) <style>.*?</style>
""",
re.DOTALL |
re.VERBOSE),
"",
self.data
)
html_content = escape_html_characters(html_content)
html_body = { html_body = {
"html_content": html_content, "html_content": html_content,
"display_name": self.display_name, "display_name": self.display_name,
......
...@@ -59,7 +59,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase): ...@@ -59,7 +59,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
Make sure that HtmlDescriptor can format data for indexing as expected. Make sure that HtmlDescriptor can format data for indexing as expected.
""" """
def test_index_dictionary(self): def test_index_dictionary_simple_html_module(self):
sample_xml = ''' sample_xml = '''
<html> <html>
<p>Hello World!</p> <p>Hello World!</p>
...@@ -71,6 +71,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase): ...@@ -71,6 +71,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
"content_type": "Text" "content_type": "Text"
}) })
def test_index_dictionary_cdata_html_module(self):
sample_xml_cdata = ''' sample_xml_cdata = '''
<html> <html>
<p>This has CDATA in it.</p> <p>This has CDATA in it.</p>
...@@ -83,6 +84,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase): ...@@ -83,6 +84,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
"content_type": "Text" "content_type": "Text"
}) })
def test_index_dictionary_multiple_spaces_html_module(self):
sample_xml_tab_spaces = ''' sample_xml_tab_spaces = '''
<html> <html>
<p> Text has spaces :) </p> <p> Text has spaces :) </p>
...@@ -94,6 +96,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase): ...@@ -94,6 +96,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
"content_type": "Text" "content_type": "Text"
}) })
def test_index_dictionary_html_module_with_comment(self):
sample_xml_comment = ''' sample_xml_comment = '''
<html> <html>
<p>This has HTML comment in it.</p> <p>This has HTML comment in it.</p>
...@@ -106,6 +109,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase): ...@@ -106,6 +109,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
"content_type": "Text" "content_type": "Text"
}) })
def test_index_dictionary_html_module_with_both_comments_and_cdata(self):
sample_xml_mix_comment_cdata = ''' sample_xml_mix_comment_cdata = '''
<html> <html>
<!-- Beginning of the html --> <!-- Beginning of the html -->
...@@ -120,3 +124,23 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase): ...@@ -120,3 +124,23 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
"content": {"html_content": " This has HTML comment in it. HTML end. ", "display_name": "Text"}, "content": {"html_content": " This has HTML comment in it. HTML end. ", "display_name": "Text"},
"content_type": "Text" "content_type": "Text"
}) })
def test_index_dictionary_html_module_with_script_and_style_tags(self):
    """
    Style and script elements, HTML comments and CDATA must be absent from the indexed content.
    """
    # Sample mixes a <style> element, comments, a CDATA section and a <script> element
    # around two paragraphs; only the paragraph text is expected in the index.
    html_source = '''
<html>
<style>p {color: green;}</style>
<!-- Beginning of the html -->
<p>This has HTML comment in it.<!-- Commenting Content --></p>
<!-- Here comes CDATA -->
<![CDATA[This is just a CDATA!]]>
<p>HTML end.</p>
<script>
var message = "Hello world!"
</script>
</html>
'''
    expected_index = {
        "content": {"html_content": " This has HTML comment in it. HTML end. ", "display_name": "Text"},
        "content_type": "Text"
    }
    html_descriptor = instantiate_descriptor(data=html_source)
    self.assertEqual(html_descriptor.index_dictionary(), expected_index)
"""Tests for methods defined in util/misc.py"""
from xmodule.util.misc import escape_html_characters
from unittest import TestCase
class UtilHtmlEscapeTests(TestCase):
    """
    Checks escape_html_characters from util/misc against comments, CDATA and &nbsp; entities
    """

    # Every sample below wraps this one paragraph in content that must not be indexed.
    final_content = " This is a paragraph. "

    def test_escape_html_comments(self):
        """An HTML comment ahead of the paragraph is dropped from the result."""
        markup = """
<!--This is a comment. Comments are not displayed in the browser-->
This is a paragraph.
"""
        escaped = escape_html_characters(markup)
        self.assertEqual(escaped, self.final_content)

    def test_escape_cdata_comments(self):
        """A multi-line CDATA section ahead of the paragraph is dropped from the result."""
        markup = """
<![CDATA[
function matchwo(a,b)
{
if (a < b && a < 0) then
{
return 1;
}
else
{
return 0;
}
}
]]>
This is a paragraph.
"""
        escaped = escape_html_characters(markup)
        self.assertEqual(escaped, self.final_content)

    def test_escape_non_breaking_space(self):
        """&nbsp; entities around a CDATA section and the paragraph collapse to single spaces."""
        markup = """
&nbsp;&nbsp;
&nbsp;
<![CDATA[
function matchwo(a,b)
{
if (a < b && a < 0) then
{
return 1;
}
else
{
return 0;
}
}
]]>
This is a paragraph.&nbsp;
"""
        escaped = escape_html_characters(markup)
        self.assertEqual(escaped, self.final_content)
""" """
Miscellaneous utility functions. Miscellaneous utility functions.
""" """
import re
from xmodule.annotator_mixin import html_to_text
def escape_invalid_characters(name, invalid_char_list, replace_with='_'): def escape_invalid_characters(name, invalid_char_list, replace_with='_'):
...@@ -24,3 +27,34 @@ def escape_invalid_characters(name, invalid_char_list, replace_with='_'): ...@@ -24,3 +27,34 @@ def escape_invalid_characters(name, invalid_char_list, replace_with='_'):
if char in name: if char in name:
name = name.replace(char, replace_with) name = name.replace(char, replace_with)
return name return name
def escape_html_characters(content):
    """
    Remove HTML characters that shouldn't be indexed using ElasticSearch indexer
    This method is complementary to html_to_text method found in xmodule/annotator_mixin.py

    Args:
        content (str): variable to escape html characters from

    Returns:
        content (str): content ready to be indexed by ElasticSearch
    """
    # html_to_text runs first; the substitutions below clean up whatever it leaves behind.
    # NOTE(review): presumably html_to_text already strips tags - confirm in annotator_mixin.
    text = html_to_text(content)

    # Collapse every run of whitespace, HTML-encoded non-breaking spaces and "//"
    # into one space. After this step the text contains no newlines.
    text = re.sub(r"(\s|&nbsp;|//)+", " ", text)

    # Remove HTML CDATA sections. The quantifier is non-greedy so that each section
    # is removed on its own; a greedy match would also delete the real text lying
    # between the first "<![CDATA[" and the last "]]>".
    text = re.sub(r"<!\[CDATA\[.*?\]\]>", "", text)

    # Remove HTML comments, non-greedy for the same reason as above.
    return re.sub(r"<!--.*?-->", "", text)
...@@ -80,7 +80,7 @@ class DashboardSearchTest(WebAppTest): ...@@ -80,7 +80,7 @@ class DashboardSearchTest(WebAppTest):
course_fix.add_children( course_fix.add_children(
XBlockFixtureDesc('chapter', 'Section 1').add_children( XBlockFixtureDesc('chapter', 'Section 1').add_children(
XBlockFixtureDesc('sequential', 'Subsection 1').add_children( XBlockFixtureDesc('sequential', 'Subsection 1').add_children(
XBlockFixtureDesc('problem', 'dashboard search') XBlockFixtureDesc('problem', 'Test Problem')
) )
) )
).add_children( ).add_children(
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment