Strip HTML markup from AnswerValues that have corresponding ValueIds.

Also strip leading and trailing whitespace from all values before output, and normalize whitespace when stripping HTML markup. Change-Id: Iac33e2776d22f4459091d7031605cf14952f9688

Strip HTML markup from AnswerValues that have corresponding ValueIds.
Also strip leading and trailing whitespace from all values before output, and normalize whitespace when stripping HTML markup. Change-Id: Iac33e2776d22f4459091d7031605cf14952f9688
e504a9e9 · Brian Wilson · 6a17d980 · e504a9e9 · e504a9e9 · e504a9e9
Commit e504a9e9 authored Mar 17, 2014 by Brian Wilson
Show whitespace changes
Inline Side-by-side

Showing with 135 additions and 38 deletions

edx/analytics/tasks/answer_dist.py
+67 -28

edx/analytics/tasks/tests/test_answer_dist.py
+67 -10

requirements/default.txt
+1 -0

No files found.
--- a/edx/analytics/tasks/answer_dist.py
+++ b/edx/analytics/tasks/answer_dist.py
@@ -2,9 +2,10 @@
 Luigi tasks for extracting problem answer distribution statistics from
 tracking log files.
 """
+import csv
 import hashlib
+import html5lib
 import json
-import csv
 from operator import itemgetter

 import luigi
@@ -310,8 +311,13 @@ class AnswerDistributionPerCourseMixin(object):
                    answer_value = answer.get('answer', answer.get('answer_value_id'))

                # These values may be lists, so convert to output format.
+                # And if we have a value_id, the corresponding answer_value
+                # may contain HTML markup, that should be stripped.
+                # But don't strip markup otherwise, as it may be part of
+                # the answer.
                value_id = self.stringify(value_id)
-                answer_value = self.stringify(answer_value)
+                answer_value_contains_html = (value_id is not None and value_id != '')
+                answer_value = self.stringify(answer_value, contains_html=answer_value_contains_html)

                # If there is a variant, then the question might not be
                # the same for all variants presented to students.  So
@@ -489,7 +495,7 @@ class AnswerDistributionPerCourseMixin(object):
        return u'{value}_{variant}'.format(value=self.stringify(answer_value), variant=variant)

    @staticmethod
-    def stringify(answer_value):
+    def stringify(answer_value, contains_html=False):
        """
        Convert answer value to a canonical string representation.

@@ -498,26 +504,65 @@ class AnswerDistributionPerCourseMixin(object):
        (e.g. "[choice_1|choice_3|choice_4]").

        If answer_value is a string, just returns as-is.
+
+        If contains_html is True, the answer_string is parsed as XML,
+        and the text value of the answer_value is returned.
+
        """
        # If it's a list, convert to a string.  Note that it's not
        # enough to call str() or unicode(), as this will appear as
        # "[u'choice_5']".
+        def normalize(value):
+            """Pull out HTML tags if requested."""
+            return get_text_from_html(value) if contains_html else value.strip()

-        # TODO: also need to strip out XML tags here, that may be
-        # appearing for the answer text displayed for choices.  But
-        # what happens if the answer is not a choice, but is a text
-        # string, and that string happens to have XML-like markup in
-        # it?
        if isinstance(answer_value, basestring):
-            return answer_value
+            return normalize(answer_value)
        elif isinstance(answer_value, list):
-            return u'[{list_val}]'.format(list_val=u'|'.join(answer_value))
+            list_val = u'|'.join(normalize(value) for value in answer_value)
+            return u'[{list_val}]'.format(list_val=list_val)
        else:
            # unexpected type:
            log.error("Unexpected type for an answer_value: %s", answer_value)
            return unicode(answer_value)


+def get_text_from_html(markup):
+    """
+    Convert html markup to plain text.
+
+    Includes stripping excess whitespace, and assuring whitespace
+    exists between elements (e.g. table elements).
+    """
+    try:
+        root = html5lib.parse(markup)
+        text_list = []
+        for val in get_text_from_element(root):
+            text_list.extend(val.split())
+        text = u' '.join(text_list)
+    except Exception as exception:  # pylint: disable=broad-except
+        # TODO: find out what exceptions might actually occur here, if any.
+        # This may be unnecessarily paranoid, given html5lib's fallback behavior.
+        log.error("Unparseable answer value markup: '%s' return exception %s", markup, exception)
+        text = markup.strip()
+
+    return text
+
+
+def get_text_from_element(node):
+    """Traverse ElementTree node recursively to return text values."""
+    tag = node.tag
+    if not isinstance(tag, basestring) and tag is not None:
+        return
+    if node.text:
+        yield node.text
+    for child in node:
+        for text in get_text_from_element(child):
+            yield text
+        if child.tail:
+            yield child.tail
+
+
 ##################################
 # Task requires/output definitions
 ##################################
@@ -543,7 +588,8 @@ class BaseAnswerDistributionTask(MapReduceJobTask):
        # Boto is used for S3 access and cjson for parsing log files.
        import boto
        import cjson
-        return [boto, cjson]
+        import six
+        return [boto, cjson, html5lib, six]


 class LastProblemCheckEvent(LastProblemCheckEventMixin, BaseAnswerDistributionTask):
@@ -629,16 +675,15 @@ class AnswerDistributionOneFilePerCourseTask(MultiOutputMapReduceJobTask):
        """
        Groups inputs by course_id, writes all records with the same course_id to the same output file.

-        Each input line is expected to consist of tab separated columns. The first column is expected to be the
-        course_id and is used to group the entries. The course_id is stripped from the output and the remaining columns
-        are written to the appropriate output file in the same format they were read in (tab separated).
+        Each input line is expected to consist of two tab separated columns. The first column is expected to be the
+        course_id and is used to group the entries. The course_id is stripped from the output and the remaining column
+        is written to the appropriate output file in the same format it was read in (i.e. as an encoded JSON string).
        """
-        split_line = line.split('\t')
        # Ensure that the first column is interpreted as the grouping key by the hadoop streaming API.  Note that since
        # Configuration values can change this behavior, the remaining tab separated columns are encoded in a python
        # structure before returning to hadoop.  They are decoded in the reducer.
-        course_id, content = split_line[0], split_line[1:]
-        yield course_id, tuple(content)
+        course_id, content = line.split('\t')
+        yield course_id, content

    def output_path_for_key(self, course_id):
        """
@@ -664,16 +709,9 @@ class AnswerDistributionOneFilePerCourseTask(MultiOutputMapReduceJobTask):
            (k, k) for k in field_names
        ))

-        # Collect in memory the list of dicts to be output.
-        row_data = []
-        for content_tuple in values:
-            # Restore tabs that were removed in the map task.  Tabs
-            # are special characters to hadoop, so they were removed
-            # to prevent interpretation of them.  Restore them here.
-            tab_separated_content = '\t'.join(content_tuple)
-            row_data.append(json.loads(tab_separated_content))
-
-        # Sort the list of dicts by their field names before encoding.
+        # Collect in memory the list of dicts to be output.  Then sort
+        # the list of dicts by their field names before encoding.
+        row_data = [json.loads(content) for content in values]
        row_data = sorted(row_data, key=itemgetter(*field_names))

        for row_dict in row_data:
@@ -685,7 +723,8 @@ class AnswerDistributionOneFilePerCourseTask(MultiOutputMapReduceJobTask):
    def extra_modules(self):
        import cjson
        import boto
-        return [cjson, boto]
+        import six
+        return [boto, cjson, html5lib, six]


 ################################

--- a/edx/analytics/tasks/tests/test_answer_dist.py
+++ b/edx/analytics/tasks/tests/test_answer_dist.py
@@ -477,15 +477,51 @@ class AnswerDistributionPerCourseReduceTest(unittest.TestCase):
        expected_output = self._get_expected_output(answer_data)
        self._check_output([input_data], (expected_output,))

-    def test_choice_answer(self):
+    def check_choice_answer(self, answer, expected):
+        """Run a choice answer with a provided value, and compare with expected."""
        answer_data = self._get_answer_data(
            answer_value_id='choice_1',
-            answer='First Choice',
+            answer=answer,
        )
        input_data = (self.timestamp, json.dumps(answer_data))
-        expected_output = self._get_expected_output(answer_data, ValueID='choice_1', AnswerValue='First Choice')
+        expected_output = self._get_expected_output(answer_data, ValueID='choice_1', AnswerValue=expected)
        self._check_output([input_data], (expected_output,))

+    def test_choice_answer(self):
+        self.check_choice_answer('First Choice', 'First Choice')
+
+    def test_choice_answer_with_whitespace(self):
+        self.check_choice_answer('First Choice\t', 'First Choice')
+
+    def test_choice_answer_with_empty_string(self):
+        self.check_choice_answer('', '')
+
+    def test_choice_answer_with_empty_markup(self):
+        self.check_choice_answer('<text><span>First Choice</span></text>', 'First Choice')
+
+    def test_choice_answer_with_non_element_markup(self):
+        # This tests a branch of the get_text_from_element logic,
+        # where there is no tag on an element.
+        self.check_choice_answer(
+            '<text><span>First<!-- embedded comment --> Choice</span></text>',
+            'First Choice'
+        )
+
+    def test_choice_answer_with_html_markup(self):
+        self.check_choice_answer('<p>First<br>Choice', 'First Choice')
+
+    def test_choice_answer_with_embedded_whitespace(self):
+        self.check_choice_answer('First  \t\n    Choice  ', 'First Choice')
+
+    def test_choice_answer_with_bad_html_markup(self):
+        self.check_choice_answer('<p First <br>Choice', 'Choice')
+
+    def test_choice_answer_with_bad2_html_markup(self):
+        self.check_choice_answer('First br>Choice', 'First br>Choice')
+
+    def test_choice_answer_with_cdata_html_markup(self):
+        self.check_choice_answer('First <![CDATA[This is to be ignored.]]>  Choice', 'First Choice')
+
    def test_multiple_choice_answer(self):
        answer_data = self._get_answer_data(
            answer_value_id=['choice_1', 'choice_2', 'choice_4'],
@@ -500,6 +536,24 @@ class AnswerDistributionPerCourseReduceTest(unittest.TestCase):
        )
        self._check_output([input_data], (expected_output,))

+    def test_multiple_choice_answer_with_markup(self):
+        answer_data = self._get_answer_data(
+            answer_value_id=['choice_1', 'choice_2', 'choice_4'],
+            answer=[
+                u'<text>First Ch\u014dice</text>',
+                u'Second <sup>Ch\u014dice</sup>',
+                u'Fourth <table><tbody><tr><td>Ch\u014dice</td></tr></tbody></table> goes here.'
+            ],
+            response_type="multiplechoiceresponse",
+        )
+        input_data = (self.timestamp, json.dumps(answer_data))
+        expected_output = self._get_expected_output(
+            answer_data,
+            ValueID='[choice_1|choice_2|choice_4]',
+            AnswerValue=u'[First Ch\u014dice|Second Ch\u014dice|Fourth Ch\u014dice goes here.]'
+        )
+        self._check_output([input_data], (expected_output,))
+
    def test_filtered_response_type(self):
        answer_data = self._get_answer_data(
            response_type="customresponse",
@@ -551,6 +605,14 @@ class AnswerDistributionPerCourseReduceTest(unittest.TestCase):
        expected_output_2 = self._get_expected_output(answer_data_2)
        self._check_output([input_data_1, input_data_2], (expected_output_1, expected_output_2))

+    def test_two_answer_event_different_answer_by_whitespace(self):
+        answer_data_1 = self._get_answer_data(answer="\t\n\nfirst   ")
+        answer_data_2 = self._get_answer_data(answer="first")
+        input_data_1 = (self.earlier_timestamp, json.dumps(answer_data_1))
+        input_data_2 = (self.timestamp, json.dumps(answer_data_2))
+        expected_output = self._get_expected_output(answer_data_2, Count=2)
+        self._check_output([input_data_1, input_data_2], (expected_output,))
+
    def test_two_answer_event_different_old_and_new(self):
        answer_data_1 = self._get_non_submission_answer_data(answer_value_id="first")
        answer_data_2 = self._get_answer_data(problem_display_name=self.problem_display_name)
@@ -685,12 +747,7 @@ class AnswerDistributionOneFilePerCourseTaskTest(unittest.TestCase):
    def test_map_single_value(self):
        key, value = next(self.task.mapper('foo\tbar'))
        self.assertEquals(key, 'foo')
-        self.assertEquals(value, ('bar',))
-
-    def test_map_multiple_values(self):
-        key, value = next(self.task.mapper('foo\tbar\tbaz'))
-        self.assertEquals(key, 'foo')
-        self.assertEquals(value, ('bar', 'baz'))
+        self.assertEquals(value, 'bar')

    def test_reduce_multiple_values(self):
        field_names = AnswerDistributionPerCourseMixin.get_column_order()
@@ -705,7 +762,7 @@ class AnswerDistributionOneFilePerCourseTaskTest(unittest.TestCase):
        sample_input_2 = json.dumps(dict(column_values_2))
        mock_output_file = Mock()

-        self.task.multi_output_reducer('foo', iter([(sample_input_1,), (sample_input_2,)]), mock_output_file)
+        self.task.multi_output_reducer('foo', iter([sample_input_1, sample_input_2]), mock_output_file)

        expected_header_string = ','.join(field_names) + '\r\n'
        self.assertEquals(mock_output_file.write.mock_calls[0], call(expected_header_string))

--- a/requirements/default.txt
+++ b/requirements/default.txt
@@ -8,5 +8,6 @@ tornado==3.1.1
 ansible==1.4.4
 python-cjson==1.0.5
 oursql==0.9.3.1
+html5lib==1.0b3

 -e git+https://github.com/spotify/luigi.git@a33756c781b9bf7e51384f0eb19d6a25050ef136#egg=luigi