Commit e504a9e9 by Brian Wilson

Strip HTML markup from AnswerValues that have corresponding ValueIds.

Also strip leading and trailing whitespace from all values before output,
and normalize whitespace when stripping HTML markup.

Change-Id: Iac33e2776d22f4459091d7031605cf14952f9688
parent 6a17d980
......@@ -2,9 +2,10 @@
Luigi tasks for extracting problem answer distribution statistics from
tracking log files.
"""
import csv
import hashlib
import html5lib
import json
import csv
from operator import itemgetter
import luigi
......@@ -310,8 +311,13 @@ class AnswerDistributionPerCourseMixin(object):
answer_value = answer.get('answer', answer.get('answer_value_id'))
# These values may be lists, so convert to output format.
# And if we have a value_id, the corresponding answer_value
# may contain HTML markup, that should be stripped.
# But don't strip markup otherwise, as it may be part of
# the answer.
value_id = self.stringify(value_id)
answer_value = self.stringify(answer_value)
answer_value_contains_html = (value_id is not None and value_id != '')
answer_value = self.stringify(answer_value, contains_html=answer_value_contains_html)
# If there is a variant, then the question might not be
# the same for all variants presented to students. So
......@@ -489,7 +495,7 @@ class AnswerDistributionPerCourseMixin(object):
return u'{value}_{variant}'.format(value=self.stringify(answer_value), variant=variant)
@staticmethod
def stringify(answer_value):
def stringify(answer_value, contains_html=False):
"""
Convert answer value to a canonical string representation.
......@@ -498,26 +504,65 @@ class AnswerDistributionPerCourseMixin(object):
(e.g. "[choice_1|choice_3|choice_4]").
If answer_value is a string, just returns as-is.
If contains_html is True, the answer_string is parsed as XML,
and the text value of the answer_value is returned.
"""
# If it's a list, convert to a string. Note that it's not
# enough to call str() or unicode(), as this will appear as
# "[u'choice_5']".
def normalize(value):
"""Pull out HTML tags if requested."""
return get_text_from_html(value) if contains_html else value.strip()
# TODO: also need to strip out XML tags here, that may be
# appearing for the answer text displayed for choices. But
# what happens if the answer is not a choice, but is a text
# string, and that string happens to have XML-like markup in
# it?
if isinstance(answer_value, basestring):
return answer_value
return normalize(answer_value)
elif isinstance(answer_value, list):
return u'[{list_val}]'.format(list_val=u'|'.join(answer_value))
list_val = u'|'.join(normalize(value) for value in answer_value)
return u'[{list_val}]'.format(list_val=list_val)
else:
# unexpected type:
log.error("Unexpected type for an answer_value: %s", answer_value)
return unicode(answer_value)
def get_text_from_html(markup):
"""
Convert html markup to plain text.
Includes stripping excess whitespace, and assuring whitespace
exists between elements (e.g. table elements).
"""
try:
root = html5lib.parse(markup)
text_list = []
for val in get_text_from_element(root):
text_list.extend(val.split())
text = u' '.join(text_list)
except Exception as exception: # pylint: disable=broad-except
# TODO: find out what exceptions might actually occur here, if any.
# This may be unnecessarily paranoid, given html5lib's fallback behavior.
log.error("Unparseable answer value markup: '%s' return exception %s", markup, exception)
text = markup.strip()
return text
def get_text_from_element(node):
"""Traverse ElementTree node recursively to return text values."""
tag = node.tag
if not isinstance(tag, basestring) and tag is not None:
return
if node.text:
yield node.text
for child in node:
for text in get_text_from_element(child):
yield text
if child.tail:
yield child.tail
##################################
# Task requires/output definitions
##################################
......@@ -543,7 +588,8 @@ class BaseAnswerDistributionTask(MapReduceJobTask):
# Boto is used for S3 access and cjson for parsing log files.
import boto
import cjson
return [boto, cjson]
import six
return [boto, cjson, html5lib, six]
class LastProblemCheckEvent(LastProblemCheckEventMixin, BaseAnswerDistributionTask):
......@@ -629,16 +675,15 @@ class AnswerDistributionOneFilePerCourseTask(MultiOutputMapReduceJobTask):
"""
Groups inputs by course_id, writes all records with the same course_id to the same output file.
Each input line is expected to consist of tab separated columns. The first column is expected to be the
course_id and is used to group the entries. The course_id is stripped from the output and the remaining columns
are written to the appropriate output file in the same format they were read in (tab separated).
Each input line is expected to consist of two tab separated columns. The first column is expected to be the
course_id and is used to group the entries. The course_id is stripped from the output and the remaining column
is written to the appropriate output file in the same format it was read in (i.e. as an encoded JSON string).
"""
split_line = line.split('\t')
# Ensure that the first column is interpreted as the grouping key by the hadoop streaming API. Note that since
# Configuration values can change this behavior, the remaining tab separated columns are encoded in a python
# structure before returning to hadoop. They are decoded in the reducer.
course_id, content = split_line[0], split_line[1:]
yield course_id, tuple(content)
course_id, content = line.split('\t')
yield course_id, content
def output_path_for_key(self, course_id):
"""
......@@ -664,16 +709,9 @@ class AnswerDistributionOneFilePerCourseTask(MultiOutputMapReduceJobTask):
(k, k) for k in field_names
))
# Collect in memory the list of dicts to be output.
row_data = []
for content_tuple in values:
# Restore tabs that were removed in the map task. Tabs
# are special characters to hadoop, so they were removed
# to prevent interpretation of them. Restore them here.
tab_separated_content = '\t'.join(content_tuple)
row_data.append(json.loads(tab_separated_content))
# Sort the list of dicts by their field names before encoding.
# Collect in memory the list of dicts to be output. Then sort
# the list of dicts by their field names before encoding.
row_data = [json.loads(content) for content in values]
row_data = sorted(row_data, key=itemgetter(*field_names))
for row_dict in row_data:
......@@ -685,7 +723,8 @@ class AnswerDistributionOneFilePerCourseTask(MultiOutputMapReduceJobTask):
def extra_modules(self):
import cjson
import boto
return [cjson, boto]
import six
return [boto, cjson, html5lib, six]
################################
......
......@@ -477,15 +477,51 @@ class AnswerDistributionPerCourseReduceTest(unittest.TestCase):
expected_output = self._get_expected_output(answer_data)
self._check_output([input_data], (expected_output,))
def test_choice_answer(self):
def check_choice_answer(self, answer, expected):
"""Run a choice answer with a provided value, and compare with expected."""
answer_data = self._get_answer_data(
answer_value_id='choice_1',
answer='First Choice',
answer=answer,
)
input_data = (self.timestamp, json.dumps(answer_data))
expected_output = self._get_expected_output(answer_data, ValueID='choice_1', AnswerValue='First Choice')
expected_output = self._get_expected_output(answer_data, ValueID='choice_1', AnswerValue=expected)
self._check_output([input_data], (expected_output,))
def test_choice_answer(self):
self.check_choice_answer('First Choice', 'First Choice')
def test_choice_answer_with_whitespace(self):
self.check_choice_answer('First Choice\t', 'First Choice')
def test_choice_answer_with_empty_string(self):
self.check_choice_answer('', '')
def test_choice_answer_with_empty_markup(self):
self.check_choice_answer('<text><span>First Choice</span></text>', 'First Choice')
def test_choice_answer_with_non_element_markup(self):
# This tests a branch of the get_text_from_element logic,
# where there is no tag on an element.
self.check_choice_answer(
'<text><span>First<!-- embedded comment --> Choice</span></text>',
'First Choice'
)
def test_choice_answer_with_html_markup(self):
self.check_choice_answer('<p>First<br>Choice', 'First Choice')
def test_choice_answer_with_embedded_whitespace(self):
self.check_choice_answer('First \t\n Choice ', 'First Choice')
def test_choice_answer_with_bad_html_markup(self):
self.check_choice_answer('<p First <br>Choice', 'Choice')
def test_choice_answer_with_bad2_html_markup(self):
self.check_choice_answer('First br>Choice', 'First br>Choice')
def test_choice_answer_with_cdata_html_markup(self):
self.check_choice_answer('First <![CDATA[This is to be ignored.]]> Choice', 'First Choice')
def test_multiple_choice_answer(self):
answer_data = self._get_answer_data(
answer_value_id=['choice_1', 'choice_2', 'choice_4'],
......@@ -500,6 +536,24 @@ class AnswerDistributionPerCourseReduceTest(unittest.TestCase):
)
self._check_output([input_data], (expected_output,))
def test_multiple_choice_answer_with_markup(self):
answer_data = self._get_answer_data(
answer_value_id=['choice_1', 'choice_2', 'choice_4'],
answer=[
u'<text>First Ch\u014dice</text>',
u'Second <sup>Ch\u014dice</sup>',
u'Fourth <table><tbody><tr><td>Ch\u014dice</td></tr></tbody></table> goes here.'
],
response_type="multiplechoiceresponse",
)
input_data = (self.timestamp, json.dumps(answer_data))
expected_output = self._get_expected_output(
answer_data,
ValueID='[choice_1|choice_2|choice_4]',
AnswerValue=u'[First Ch\u014dice|Second Ch\u014dice|Fourth Ch\u014dice goes here.]'
)
self._check_output([input_data], (expected_output,))
def test_filtered_response_type(self):
answer_data = self._get_answer_data(
response_type="customresponse",
......@@ -551,6 +605,14 @@ class AnswerDistributionPerCourseReduceTest(unittest.TestCase):
expected_output_2 = self._get_expected_output(answer_data_2)
self._check_output([input_data_1, input_data_2], (expected_output_1, expected_output_2))
def test_two_answer_event_different_answer_by_whitespace(self):
answer_data_1 = self._get_answer_data(answer="\t\n\nfirst ")
answer_data_2 = self._get_answer_data(answer="first")
input_data_1 = (self.earlier_timestamp, json.dumps(answer_data_1))
input_data_2 = (self.timestamp, json.dumps(answer_data_2))
expected_output = self._get_expected_output(answer_data_2, Count=2)
self._check_output([input_data_1, input_data_2], (expected_output,))
def test_two_answer_event_different_old_and_new(self):
answer_data_1 = self._get_non_submission_answer_data(answer_value_id="first")
answer_data_2 = self._get_answer_data(problem_display_name=self.problem_display_name)
......@@ -685,12 +747,7 @@ class AnswerDistributionOneFilePerCourseTaskTest(unittest.TestCase):
def test_map_single_value(self):
key, value = next(self.task.mapper('foo\tbar'))
self.assertEquals(key, 'foo')
self.assertEquals(value, ('bar',))
def test_map_multiple_values(self):
key, value = next(self.task.mapper('foo\tbar\tbaz'))
self.assertEquals(key, 'foo')
self.assertEquals(value, ('bar', 'baz'))
self.assertEquals(value, 'bar')
def test_reduce_multiple_values(self):
field_names = AnswerDistributionPerCourseMixin.get_column_order()
......@@ -705,7 +762,7 @@ class AnswerDistributionOneFilePerCourseTaskTest(unittest.TestCase):
sample_input_2 = json.dumps(dict(column_values_2))
mock_output_file = Mock()
self.task.multi_output_reducer('foo', iter([(sample_input_1,), (sample_input_2,)]), mock_output_file)
self.task.multi_output_reducer('foo', iter([sample_input_1, sample_input_2]), mock_output_file)
expected_header_string = ','.join(field_names) + '\r\n'
self.assertEquals(mock_output_file.write.mock_calls[0], call(expected_header_string))
......
......@@ -8,5 +8,6 @@ tornado==3.1.1
ansible==1.4.4
python-cjson==1.0.5
oursql==0.9.3.1
html5lib==1.0b3
-e git+https://github.com/spotify/luigi.git@a33756c781b9bf7e51384f0eb19d6a25050ef136#egg=luigi
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment