Commit b16d3b46 by Brian Wilson Committed by Gerrit Code Review

Merge "Strip HTML markup from AnswerValues that have corresponding ValueIds."

parents c33e25cf e504a9e9
...@@ -2,9 +2,10 @@ ...@@ -2,9 +2,10 @@
Luigi tasks for extracting problem answer distribution statistics from Luigi tasks for extracting problem answer distribution statistics from
tracking log files. tracking log files.
""" """
import csv
import hashlib import hashlib
import html5lib
import json import json
import csv
from operator import itemgetter from operator import itemgetter
import luigi import luigi
...@@ -310,8 +311,13 @@ class AnswerDistributionPerCourseMixin(object): ...@@ -310,8 +311,13 @@ class AnswerDistributionPerCourseMixin(object):
answer_value = answer.get('answer', answer.get('answer_value_id')) answer_value = answer.get('answer', answer.get('answer_value_id'))
# These values may be lists, so convert to output format. # These values may be lists, so convert to output format.
# And if we have a value_id, the corresponding answer_value
# may contain HTML markup that should be stripped.
# But don't strip markup otherwise, as it may be part of
# the answer.
value_id = self.stringify(value_id) value_id = self.stringify(value_id)
answer_value = self.stringify(answer_value) answer_value_contains_html = (value_id is not None and value_id != '')
answer_value = self.stringify(answer_value, contains_html=answer_value_contains_html)
# If there is a variant, then the question might not be # If there is a variant, then the question might not be
# the same for all variants presented to students. So # the same for all variants presented to students. So
...@@ -489,7 +495,7 @@ class AnswerDistributionPerCourseMixin(object): ...@@ -489,7 +495,7 @@ class AnswerDistributionPerCourseMixin(object):
return u'{value}_{variant}'.format(value=self.stringify(answer_value), variant=variant) return u'{value}_{variant}'.format(value=self.stringify(answer_value), variant=variant)
@staticmethod @staticmethod
def stringify(answer_value): def stringify(answer_value, contains_html=False):
""" """
Convert answer value to a canonical string representation. Convert answer value to a canonical string representation.
...@@ -498,26 +504,65 @@ class AnswerDistributionPerCourseMixin(object): ...@@ -498,26 +504,65 @@ class AnswerDistributionPerCourseMixin(object):
(e.g. "[choice_1|choice_3|choice_4]"). (e.g. "[choice_1|choice_3|choice_4]").
If answer_value is a string, just returns as-is. If answer_value is a string, just returns as-is.
If contains_html is True, the answer_string is parsed as XML,
and the text value of the answer_value is returned.
""" """
# If it's a list, convert to a string. Note that it's not # If it's a list, convert to a string. Note that it's not
# enough to call str() or unicode(), as this will appear as # enough to call str() or unicode(), as this will appear as
# "[u'choice_5']". # "[u'choice_5']".
def normalize(value):
"""Pull out HTML tags if requested."""
return get_text_from_html(value) if contains_html else value.strip()
# TODO: also need to strip out XML tags here, that may be
# appearing for the answer text displayed for choices. But
# what happens if the answer is not a choice, but is a text
# string, and that string happens to have XML-like markup in
# it?
if isinstance(answer_value, basestring): if isinstance(answer_value, basestring):
return answer_value return normalize(answer_value)
elif isinstance(answer_value, list): elif isinstance(answer_value, list):
return u'[{list_val}]'.format(list_val=u'|'.join(answer_value)) list_val = u'|'.join(normalize(value) for value in answer_value)
return u'[{list_val}]'.format(list_val=list_val)
else: else:
# unexpected type: # unexpected type:
log.error("Unexpected type for an answer_value: %s", answer_value) log.error("Unexpected type for an answer_value: %s", answer_value)
return unicode(answer_value) return unicode(answer_value)
def get_text_from_html(markup):
    """
    Convert html markup to plain text.

    Includes stripping excess whitespace, and assuring whitespace
    exists between elements (e.g. table elements).
    """
    try:
        root = html5lib.parse(markup)
        text_list = []
        for val in get_text_from_element(root):
            # Split on whitespace and re-join with single spaces, so
            # runs of whitespace collapse and adjacent elements are
            # separated by exactly one space.
            text_list.extend(val.split())
        text = u' '.join(text_list)
    except Exception as exception:  # pylint: disable=broad-except
        # TODO: find out what exceptions might actually occur here, if any.
        # This may be unnecessarily paranoid, given html5lib's fallback behavior.
        log.error("Unparseable answer value markup: '%s' return exception %s", markup, exception)
        # Fall back to the raw markup, trimmed, rather than losing the answer.
        text = markup.strip()
    return text
def get_text_from_element(node):
    """Traverse ElementTree node recursively to return text values."""
    tag = node.tag
    # Comments and processing instructions have a non-string tag (a
    # callable), and carry no displayable text, so skip them entirely.
    if not isinstance(tag, basestring) and tag is not None:
        return
    if node.text:
        yield node.text
    for child in node:
        for text in get_text_from_element(child):
            yield text
        # Text following a child element ("tail") belongs to this node;
        # this check must live inside the loop so every child's tail is
        # emitted (and so `child` is always bound when it is read).
        if child.tail:
            yield child.tail
################################## ##################################
# Task requires/output definitions # Task requires/output definitions
################################## ##################################
...@@ -543,7 +588,8 @@ class BaseAnswerDistributionTask(MapReduceJobTask): ...@@ -543,7 +588,8 @@ class BaseAnswerDistributionTask(MapReduceJobTask):
# Boto is used for S3 access and cjson for parsing log files. # Boto is used for S3 access and cjson for parsing log files.
import boto import boto
import cjson import cjson
return [boto, cjson] import six
return [boto, cjson, html5lib, six]
class LastProblemCheckEvent(LastProblemCheckEventMixin, BaseAnswerDistributionTask): class LastProblemCheckEvent(LastProblemCheckEventMixin, BaseAnswerDistributionTask):
...@@ -629,16 +675,15 @@ class AnswerDistributionOneFilePerCourseTask(MultiOutputMapReduceJobTask): ...@@ -629,16 +675,15 @@ class AnswerDistributionOneFilePerCourseTask(MultiOutputMapReduceJobTask):
""" """
Groups inputs by course_id, writes all records with the same course_id to the same output file. Groups inputs by course_id, writes all records with the same course_id to the same output file.
Each input line is expected to consist of tab separated columns. The first column is expected to be the Each input line is expected to consist of two tab separated columns. The first column is expected to be the
course_id and is used to group the entries. The course_id is stripped from the output and the remaining columns course_id and is used to group the entries. The course_id is stripped from the output and the remaining column
are written to the appropriate output file in the same format they were read in (tab separated). is written to the appropriate output file in the same format it was read in (i.e. as an encoded JSON string).
""" """
split_line = line.split('\t')
# Ensure that the first column is interpreted as the grouping key by the hadoop streaming API. Note that since # Ensure that the first column is interpreted as the grouping key by the hadoop streaming API. Note that since
# Configuration values can change this behavior, the remaining tab separated columns are encoded in a python # Configuration values can change this behavior, the remaining tab separated columns are encoded in a python
# structure before returning to hadoop. They are decoded in the reducer. # structure before returning to hadoop. They are decoded in the reducer.
course_id, content = split_line[0], split_line[1:] course_id, content = line.split('\t')
yield course_id, tuple(content) yield course_id, content
def output_path_for_key(self, course_id): def output_path_for_key(self, course_id):
""" """
...@@ -664,16 +709,9 @@ class AnswerDistributionOneFilePerCourseTask(MultiOutputMapReduceJobTask): ...@@ -664,16 +709,9 @@ class AnswerDistributionOneFilePerCourseTask(MultiOutputMapReduceJobTask):
(k, k) for k in field_names (k, k) for k in field_names
)) ))
# Collect in memory the list of dicts to be output. # Collect in memory the list of dicts to be output. Then sort
row_data = [] # the list of dicts by their field names before encoding.
for content_tuple in values: row_data = [json.loads(content) for content in values]
# Restore tabs that were removed in the map task. Tabs
# are special characters to hadoop, so they were removed
# to prevent interpretation of them. Restore them here.
tab_separated_content = '\t'.join(content_tuple)
row_data.append(json.loads(tab_separated_content))
# Sort the list of dicts by their field names before encoding.
row_data = sorted(row_data, key=itemgetter(*field_names)) row_data = sorted(row_data, key=itemgetter(*field_names))
for row_dict in row_data: for row_dict in row_data:
...@@ -685,7 +723,8 @@ class AnswerDistributionOneFilePerCourseTask(MultiOutputMapReduceJobTask): ...@@ -685,7 +723,8 @@ class AnswerDistributionOneFilePerCourseTask(MultiOutputMapReduceJobTask):
def extra_modules(self): def extra_modules(self):
import cjson import cjson
import boto import boto
return [cjson, boto] import six
return [boto, cjson, html5lib, six]
################################ ################################
......
...@@ -477,15 +477,51 @@ class AnswerDistributionPerCourseReduceTest(unittest.TestCase): ...@@ -477,15 +477,51 @@ class AnswerDistributionPerCourseReduceTest(unittest.TestCase):
expected_output = self._get_expected_output(answer_data) expected_output = self._get_expected_output(answer_data)
self._check_output([input_data], (expected_output,)) self._check_output([input_data], (expected_output,))
def test_choice_answer(self): def check_choice_answer(self, answer, expected):
"""Run a choice answer with a provided value, and compare with expected."""
answer_data = self._get_answer_data( answer_data = self._get_answer_data(
answer_value_id='choice_1', answer_value_id='choice_1',
answer='First Choice', answer=answer,
) )
input_data = (self.timestamp, json.dumps(answer_data)) input_data = (self.timestamp, json.dumps(answer_data))
expected_output = self._get_expected_output(answer_data, ValueID='choice_1', AnswerValue='First Choice') expected_output = self._get_expected_output(answer_data, ValueID='choice_1', AnswerValue=expected)
self._check_output([input_data], (expected_output,)) self._check_output([input_data], (expected_output,))
def test_choice_answer(self):
self.check_choice_answer('First Choice', 'First Choice')
def test_choice_answer_with_whitespace(self):
self.check_choice_answer('First Choice\t', 'First Choice')
def test_choice_answer_with_empty_string(self):
self.check_choice_answer('', '')
def test_choice_answer_with_empty_markup(self):
self.check_choice_answer('<text><span>First Choice</span></text>', 'First Choice')
def test_choice_answer_with_non_element_markup(self):
# This tests a branch of the get_text_from_element logic,
# where there is no tag on an element.
self.check_choice_answer(
'<text><span>First<!-- embedded comment --> Choice</span></text>',
'First Choice'
)
def test_choice_answer_with_html_markup(self):
self.check_choice_answer('<p>First<br>Choice', 'First Choice')
def test_choice_answer_with_embedded_whitespace(self):
self.check_choice_answer('First \t\n Choice ', 'First Choice')
def test_choice_answer_with_bad_html_markup(self):
self.check_choice_answer('<p First <br>Choice', 'Choice')
def test_choice_answer_with_bad2_html_markup(self):
self.check_choice_answer('First br>Choice', 'First br>Choice')
def test_choice_answer_with_cdata_html_markup(self):
self.check_choice_answer('First <![CDATA[This is to be ignored.]]> Choice', 'First Choice')
def test_multiple_choice_answer(self): def test_multiple_choice_answer(self):
answer_data = self._get_answer_data( answer_data = self._get_answer_data(
answer_value_id=['choice_1', 'choice_2', 'choice_4'], answer_value_id=['choice_1', 'choice_2', 'choice_4'],
...@@ -500,6 +536,24 @@ class AnswerDistributionPerCourseReduceTest(unittest.TestCase): ...@@ -500,6 +536,24 @@ class AnswerDistributionPerCourseReduceTest(unittest.TestCase):
) )
self._check_output([input_data], (expected_output,)) self._check_output([input_data], (expected_output,))
def test_multiple_choice_answer_with_markup(self):
answer_data = self._get_answer_data(
answer_value_id=['choice_1', 'choice_2', 'choice_4'],
answer=[
u'<text>First Ch\u014dice</text>',
u'Second <sup>Ch\u014dice</sup>',
u'Fourth <table><tbody><tr><td>Ch\u014dice</td></tr></tbody></table> goes here.'
],
response_type="multiplechoiceresponse",
)
input_data = (self.timestamp, json.dumps(answer_data))
expected_output = self._get_expected_output(
answer_data,
ValueID='[choice_1|choice_2|choice_4]',
AnswerValue=u'[First Ch\u014dice|Second Ch\u014dice|Fourth Ch\u014dice goes here.]'
)
self._check_output([input_data], (expected_output,))
def test_filtered_response_type(self): def test_filtered_response_type(self):
answer_data = self._get_answer_data( answer_data = self._get_answer_data(
response_type="customresponse", response_type="customresponse",
...@@ -551,6 +605,14 @@ class AnswerDistributionPerCourseReduceTest(unittest.TestCase): ...@@ -551,6 +605,14 @@ class AnswerDistributionPerCourseReduceTest(unittest.TestCase):
expected_output_2 = self._get_expected_output(answer_data_2) expected_output_2 = self._get_expected_output(answer_data_2)
self._check_output([input_data_1, input_data_2], (expected_output_1, expected_output_2)) self._check_output([input_data_1, input_data_2], (expected_output_1, expected_output_2))
def test_two_answer_event_different_answer_by_whitespace(self):
answer_data_1 = self._get_answer_data(answer="\t\n\nfirst ")
answer_data_2 = self._get_answer_data(answer="first")
input_data_1 = (self.earlier_timestamp, json.dumps(answer_data_1))
input_data_2 = (self.timestamp, json.dumps(answer_data_2))
expected_output = self._get_expected_output(answer_data_2, Count=2)
self._check_output([input_data_1, input_data_2], (expected_output,))
def test_two_answer_event_different_old_and_new(self): def test_two_answer_event_different_old_and_new(self):
answer_data_1 = self._get_non_submission_answer_data(answer_value_id="first") answer_data_1 = self._get_non_submission_answer_data(answer_value_id="first")
answer_data_2 = self._get_answer_data(problem_display_name=self.problem_display_name) answer_data_2 = self._get_answer_data(problem_display_name=self.problem_display_name)
...@@ -685,12 +747,7 @@ class AnswerDistributionOneFilePerCourseTaskTest(unittest.TestCase): ...@@ -685,12 +747,7 @@ class AnswerDistributionOneFilePerCourseTaskTest(unittest.TestCase):
def test_map_single_value(self): def test_map_single_value(self):
key, value = next(self.task.mapper('foo\tbar')) key, value = next(self.task.mapper('foo\tbar'))
self.assertEquals(key, 'foo') self.assertEquals(key, 'foo')
self.assertEquals(value, ('bar',)) self.assertEquals(value, 'bar')
def test_map_multiple_values(self):
key, value = next(self.task.mapper('foo\tbar\tbaz'))
self.assertEquals(key, 'foo')
self.assertEquals(value, ('bar', 'baz'))
def test_reduce_multiple_values(self): def test_reduce_multiple_values(self):
field_names = AnswerDistributionPerCourseMixin.get_column_order() field_names = AnswerDistributionPerCourseMixin.get_column_order()
...@@ -705,7 +762,7 @@ class AnswerDistributionOneFilePerCourseTaskTest(unittest.TestCase): ...@@ -705,7 +762,7 @@ class AnswerDistributionOneFilePerCourseTaskTest(unittest.TestCase):
sample_input_2 = json.dumps(dict(column_values_2)) sample_input_2 = json.dumps(dict(column_values_2))
mock_output_file = Mock() mock_output_file = Mock()
self.task.multi_output_reducer('foo', iter([(sample_input_1,), (sample_input_2,)]), mock_output_file) self.task.multi_output_reducer('foo', iter([sample_input_1, sample_input_2]), mock_output_file)
expected_header_string = ','.join(field_names) + '\r\n' expected_header_string = ','.join(field_names) + '\r\n'
self.assertEquals(mock_output_file.write.mock_calls[0], call(expected_header_string)) self.assertEquals(mock_output_file.write.mock_calls[0], call(expected_header_string))
......
...@@ -8,5 +8,6 @@ tornado==3.1.1 ...@@ -8,5 +8,6 @@ tornado==3.1.1
ansible==1.4.4 ansible==1.4.4
python-cjson==1.0.5 python-cjson==1.0.5
oursql==0.9.3.1 oursql==0.9.3.1
html5lib==1.0b3
-e git+https://github.com/spotify/luigi.git@a33756c781b9bf7e51384f0eb19d6a25050ef136#egg=luigi -e git+https://github.com/spotify/luigi.git@a33756c781b9bf7e51384f0eb19d6a25050ef136#egg=luigi
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment