Merge pull request #9283 from edx/mjevtic/SOL-1040

(SOL-1040) Indexing capa problems

Merge pull request #9283 from edx/mjevtic/SOL-1040
(SOL-1040) Indexing capa problems
45832b98 · Marko Jevtić · 7b346f7d · 8faff973 · 45832b98 · 45832b98
Commit 45832b98 authored Aug 24, 2015 by Marko Jevtić
7 changed files
--- a/common/lib/xmodule/xmodule/capa_module.py
+++ b/common/lib/xmodule/xmodule/capa_module.py
@@ -2,6 +2,7 @@
 import json
 import logging
 import sys
+import re
 from lxml import etree

 from pkg_resources import resource_string
@@ -10,6 +11,7 @@ import dogstats_wrapper as dog_stats_api
 from .capa_base import CapaMixin, CapaFields, ComplexEncoder
 from capa import responsetypes
 from .progress import Progress
+from xmodule.util.misc import escape_html_characters
 from xmodule.x_module import XModule, module_attr, DEPRECATION_VSCOMPAT_EVENT
 from xmodule.raw_module import RawDescriptor
 from xmodule.exceptions import NotFoundError, ProcessingError
@@ -193,16 +195,33 @@ class CapaDescriptor(CapaFields, RawDescriptor):
        """
        Return dictionary prepared with module content and type for indexing.
        """
-        result = super(CapaDescriptor, self).index_dictionary()
-        if not result:
-            result = {}
-        index = {
-            'content_type': self.INDEX_CONTENT_TYPE,
-            'problem_types': list(self.problem_types),
-            "display_name": self.display_name
+        xblock_body = super(CapaDescriptor, self).index_dictionary()
+        # Removing solutions and hints, as well as script and style
+        capa_content = re.sub(
+            re.compile(
+                r"""
+                    <solution>.*?</solution> |
+                    <script>.*?</script> |
+                    <style>.*?</style> |
+                    <[a-z]*hint.*?>.*?</[a-z]*hint>
+                """,
+                re.DOTALL |
+                re.VERBOSE),
+            "",
+            self.data
+        )
+        capa_content = escape_html_characters(capa_content)
+        capa_body = {
+            "capa_content": capa_content,
+            "display_name": self.display_name,
        }
-        result.update(index)
-        return result
+        if "content" in xblock_body:
+            xblock_body["content"].update(capa_body)
+        else:
+            xblock_body["content"] = capa_body
+        xblock_body["content_type"] = self.INDEX_CONTENT_TYPE
+        xblock_body["problem_types"] = list(self.problem_types)
+        return xblock_body

    def has_support(self, view, functionality):
        """

--- a/common/lib/xmodule/xmodule/html_module.py
+++ b/common/lib/xmodule/xmodule/html_module.py
@@ -10,7 +10,7 @@ from fs.errors import ResourceNotFoundError
 from pkg_resources import resource_string

 import dogstats_wrapper as dog_stats_api
-from xmodule.annotator_mixin import html_to_text
+from xmodule.util.misc import escape_html_characters
 from xmodule.contentstore.content import StaticContent
 from xmodule.editing_module import EditingDescriptor
 from xmodule.edxnotes_utils import edxnotes
@@ -275,12 +275,19 @@ class HtmlDescriptor(HtmlFields, XmlDescriptor, EditingDescriptor):  # pylint: d

    def index_dictionary(self):
        xblock_body = super(HtmlDescriptor, self).index_dictionary()
-        # Removing HTML-encoded non-breaking space characters
-        html_content = re.sub(r"(\s|&nbsp;|//)+", " ", html_to_text(self.data))
-        # Removing HTML CDATA
-        html_content = re.sub(r"<!\[CDATA\[.*\]\]>", "", html_content)
-        # Removing HTML comments
-        html_content = re.sub(r"<!--.*-->", "", html_content)
+        # Removing script and style
+        html_content = re.sub(
+            re.compile(
+                r"""
+                    <script>.*?</script> |
+                    <style>.*?</style>
+                """,
+                re.DOTALL |
+                re.VERBOSE),
+            "",
+            self.data
+        )
+        html_content = escape_html_characters(html_content)
        html_body = {
            "html_content": html_content,
            "display_name": self.display_name,

--- a/common/lib/xmodule/xmodule/tests/test_capa_module.py
+++ b/common/lib/xmodule/xmodule/tests/test_capa_module.py
--- a/common/lib/xmodule/xmodule/tests/test_html_module.py
+++ b/common/lib/xmodule/xmodule/tests/test_html_module.py
@@ -59,7 +59,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
    Make sure that HtmlDescriptor can format data for indexing as expected.
    """

-    def test_index_dictionary(self):
+    def test_index_dictionary_simple_html_module(self):
        sample_xml = '''
            <html>
                <p>Hello World!</p>
@@ -71,6 +71,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
            "content_type": "Text"
        })

+    def test_index_dictionary_cdata_html_module(self):
        sample_xml_cdata = '''
            <html>
                <p>This has CDATA in it.</p>
@@ -83,6 +84,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
            "content_type": "Text"
        })

+    def test_index_dictionary_multiple_spaces_html_module(self):
        sample_xml_tab_spaces = '''
            <html>
                <p>     Text has spaces :)  </p>
@@ -94,6 +96,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
            "content_type": "Text"
        })

+    def test_index_dictionary_html_module_with_comment(self):
        sample_xml_comment = '''
            <html>
                <p>This has HTML comment in it.</p>
@@ -106,6 +109,7 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
            "content_type": "Text"
        })

+    def test_index_dictionary_html_module_with_both_comments_and_cdata(self):
        sample_xml_mix_comment_cdata = '''
            <html>
                <!-- Beginning of the html -->
@@ -120,3 +124,23 @@ class HtmlDescriptorIndexingTestCase(unittest.TestCase):
            "content": {"html_content": " This has HTML comment in it. HTML end. ", "display_name": "Text"},
            "content_type": "Text"
        })
+
+    def test_index_dictionary_html_module_with_script_and_style_tags(self):
+        sample_xml_style_script_tags = '''
+            <html>
+                <style>p {color: green;}</style>
+                <!-- Beginning of the html -->
+                <p>This has HTML comment in it.<!-- Commenting Content --></p>
+                <!-- Here comes CDATA -->
+                <![CDATA[This is just a CDATA!]]>
+                <p>HTML end.</p>
+                <script>
+                    var message = "Hello world!"
+                </script>
+            </html>
+        '''
+        descriptor = instantiate_descriptor(data=sample_xml_style_script_tags)
+        self.assertEqual(descriptor.index_dictionary(), {
+            "content": {"html_content": " This has HTML comment in it. HTML end. ", "display_name": "Text"},
+            "content_type": "Text"
+        })
--- a/common/lib/xmodule/xmodule/tests/test_utils_escape_html_characters.py
+++ b/common/lib/xmodule/xmodule/tests/test_utils_escape_html_characters.py
+"""Tests for methods defined in util/misc.py"""
+from xmodule.util.misc import escape_html_characters
+from unittest import TestCase
+
+
+class UtilHtmlEscapeTests(TestCase):
+    """
+    Tests for methods exposed in util/misc
+    """
+
+    final_content = " This is a paragraph. "
+
+    def test_escape_html_comments(self):
+        html_content = """
+            <!--This is a comment. Comments are not displayed in the browser-->
+
+            This is a paragraph.
+            """
+        self.assertEqual(escape_html_characters(html_content), self.final_content)
+
+    def test_escape_cdata_comments(self):
+        html_content = """
+            <![CDATA[
+                function matchwo(a,b)
+                {
+                if (a < b && a < 0) then
+                  {
+                  return 1;
+                  }
+                else
+                  {
+                  return 0;
+                  }
+                }
+            ]]>
+
+            This is a paragraph.
+            """
+        self.assertEqual(escape_html_characters(html_content), self.final_content)
+
+    def test_escape_non_breaking_space(self):
+        html_content = """
+            &nbsp;&nbsp;
+            &nbsp;
+            <![CDATA[
+                function matchwo(a,b)
+                {
+                if (a < b && a < 0) then
+                  {
+                  return 1;
+                  }
+                else
+                  {
+                  return 0;
+                  }
+                }
+            ]]>
+            This is a paragraph.&nbsp;
+        """
+        self.assertEqual(escape_html_characters(html_content), self.final_content)
--- a/common/lib/xmodule/xmodule/util/misc.py
+++ b/common/lib/xmodule/xmodule/util/misc.py
 """
 Miscellaneous utility functions.
 """
+import re
+
+from xmodule.annotator_mixin import html_to_text


 def escape_invalid_characters(name, invalid_char_list, replace_with='_'):
@@ -24,3 +27,34 @@ def escape_invalid_characters(name, invalid_char_list, replace_with='_'):
        if char in name:
            name = name.replace(char, replace_with)
    return name
+
+
+def escape_html_characters(content):
+    """
+    Remove HTML characters that shouldn't be indexed by the ElasticSearch indexer.
+    This method is complementary to the html_to_text method found in xmodule/annotator_mixin.py.
+
+    Args:
+        content (str): variable to escape html characters from
+
+    Returns:
+        content (str): content ready to be indexed by ElasticSearch
+
+    """
+
+    # Removing HTML comments
+    return re.sub(
+        r"<!--.*-->",
+        "",
+        # Removing HTML CDATA
+        re.sub(
+            r"<!\[CDATA\[.*\]\]>",
+            "",
+            # Collapsing runs of whitespace, HTML-encoded non-breaking spaces and "//" into a single space
+            re.sub(
+                r"(\s|&nbsp;|//)+",
+                " ",
+                html_to_text(content)
+            )
+        )
+    )
--- a/common/test/acceptance/tests/lms/test_lms_dashboard_search.py
+++ b/common/test/acceptance/tests/lms/test_lms_dashboard_search.py
@@ -80,7 +80,7 @@ class DashboardSearchTest(WebAppTest):
            course_fix.add_children(
                XBlockFixtureDesc('chapter', 'Section 1').add_children(
                    XBlockFixtureDesc('sequential', 'Subsection 1').add_children(
-                        XBlockFixtureDesc('problem', 'dashboard search')
+                        XBlockFixtureDesc('problem', 'Test Problem')
                    )
                )
            ).add_children(