Applied suggestion from the 1st review

63edfc6e · Marko Jevtic · 35e7a733 · 63edfc6e · 63edfc6e · 63edfc6e
Commit 63edfc6e authored Aug 17, 2015 by Marko Jevtic
Showing with 97 additions and 14 deletions

common/lib/xmodule/xmodule/capa_module.py
+2 -7

common/lib/xmodule/xmodule/html_module.py
+2 -7

common/lib/xmodule/xmodule/tests/test_utils_escape_html_characters.py
+60 -0

common/lib/xmodule/xmodule/util/misc.py
+33 -0

No files found.
--- a/common/lib/xmodule/xmodule/capa_module.py
+++ b/common/lib/xmodule/xmodule/capa_module.py
@@ -11,7 +11,7 @@ import dogstats_wrapper as dog_stats_api
 from .capa_base import CapaMixin, CapaFields, ComplexEncoder
 from capa import responsetypes
 from .progress import Progress
-from xmodule.annotator_mixin import html_to_text
+from util.misc import escape_html_characters
 from xmodule.x_module import XModule, module_attr, DEPRECATION_VSCOMPAT_EVENT
 from xmodule.raw_module import RawDescriptor
 from xmodule.exceptions import NotFoundError, ProcessingError
@@ -210,12 +210,7 @@ class CapaDescriptor(CapaFields, RawDescriptor):
            "",
            self.data
        )
-        # Removing HTML-encoded non-breaking space characters
-        capa_content = re.sub(r"(\s|&nbsp;|//)+", " ", html_to_text(capa_content))
-        # Removing HTML CDATA
-        capa_content = re.sub(r"<!\[CDATA\[.*\]\]>", "", capa_content)
-        # Removing HTML comments
-        capa_content = re.sub(r"<!--.*-->", "", capa_content)
+        capa_content = escape_html_characters(capa_content)
        capa_body = {
            "capa_content": capa_content,
            "display_name": self.display_name,

--- a/common/lib/xmodule/xmodule/html_module.py
+++ b/common/lib/xmodule/xmodule/html_module.py
@@ -10,7 +10,7 @@ from fs.errors import ResourceNotFoundError
 from pkg_resources import resource_string

 import dogstats_wrapper as dog_stats_api
-from xmodule.annotator_mixin import html_to_text
+from util.misc import escape_html_characters
 from xmodule.contentstore.content import StaticContent
 from xmodule.editing_module import EditingDescriptor
 from xmodule.edxnotes_utils import edxnotes
@@ -287,12 +287,7 @@ class HtmlDescriptor(HtmlFields, XmlDescriptor, EditingDescriptor):  # pylint: d
            "",
            self.data
        )
-        # Removing HTML-encoded non-breaking space characters
-        html_content = re.sub(r"(\s|&nbsp;|//)+", " ", html_to_text(html_content))
-        # Removing HTML CDATA
-        html_content = re.sub(r"<!\[CDATA\[.*\]\]>", "", html_content)
-        # Removing HTML comments
-        html_content = re.sub(r"<!--.*-->", "", html_content)
+        html_content = escape_html_characters(html_content)
        html_body = {
            "html_content": html_content,
            "display_name": self.display_name,

--- a/common/lib/xmodule/xmodule/tests/test_utils_escape_html_characters.py
+++ b/common/lib/xmodule/xmodule/tests/test_utils_escape_html_characters.py
+"""Tests for methods defined in util/misc.py"""
+from xmodule.util.misc import escape_html_characters
+from unittest import TestCase
+
+
+class UtilHtmlEscapeTests(TestCase):
+    """Checks that escape_html_characters (util/misc) reduces each sample to plain text."""
+    final_content = " This is a paragraph. "
+
+    def test_escape_html_comments(self):
+        """An HTML comment placed before the paragraph is removed."""
+        markup = """
+            <!--This is a comment. Comments are not displayed in the browser-->
+
+            This is a paragraph.
+            """
+        self.assertEqual(escape_html_characters(markup), self.final_content)
+
+    def test_escape_cdata_comments(self):
+        """A multi-line CDATA section placed before the paragraph is removed."""
+        markup = """
+            <![CDATA[
+                function matchwo(a,b)
+                {
+                if (a < b && a < 0) then
+                  {
+                  return 1;
+                  }
+                else
+                  {
+                  return 0;
+                  }
+                }
+            ]]>
+
+            This is a paragraph.
+            """
+        self.assertEqual(escape_html_characters(markup), self.final_content)
+
+    def test_escape_non_breaking_space(self):
+        """&nbsp; entities around a CDATA section and the paragraph collapse to spaces."""
+        markup = """
+            &nbsp;&nbsp;
+            &nbsp;
+            <![CDATA[
+                function matchwo(a,b)
+                {
+                if (a < b && a < 0) then
+                  {
+                  return 1;
+                  }
+                else
+                  {
+                  return 0;
+                  }
+                }
+            ]]>
+            This is a paragraph.&nbsp;
+        """
+        self.assertEqual(escape_html_characters(markup), self.final_content)
--- a/common/lib/xmodule/xmodule/util/misc.py
+++ b/common/lib/xmodule/xmodule/util/misc.py
 """
 Miscellaneous utility functions.
 """
+import re
+
+from xmodule.annotator_mixin import html_to_text


 def escape_invalid_characters(name, invalid_char_list, replace_with='_'):
@@ -24,3 +27,33 @@ def escape_invalid_characters(name, invalid_char_list, replace_with='_'):
        if char in name:
            name = name.replace(char, replace_with)
    return name
+
+
+def escape_html_characters(content):
+    """
+    Remove markup remnants that shouldn't be indexed by the ElasticSearch indexer.
+
+    The content is first passed through html_to_text (xmodule/annotator_mixin.py);
+    whitespace runs, "&nbsp;" entities and "//" sequences are then collapsed to a
+    single space, and any remaining CDATA sections and HTML comments are removed.
+
+    Args:
+        content (str): text to strip HTML remnants from
+
+    Returns:
+        str: content ready to be indexed by ElasticSearch
+
+    """
+
+    # Collapse whitespace, HTML-encoded non-breaking spaces and "//" into one space.
+    # Newlines are whitespace, so they are gone after this step and "." in the
+    # patterns below can match across what used to be several lines.
+    text = re.sub(r"(\s|&nbsp;|//)+", " ", html_to_text(content))
+
+    # Remove HTML CDATA. The match is non-greedy so that text lying between two
+    # CDATA sections is kept instead of being swallowed by a single match.
+    text = re.sub(r"<!\[CDATA\[.*?\]\]>", "", text)
+
+    # Remove HTML comments. Non-greedy for the same reason: a greedy ".*" would
+    # delete everything from the first "<!--" up to the last "-->".
+    return re.sub(r"<!--.*?-->", "", text)