Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
E
edx-analytics-data-api
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
edx-analytics-data-api
Commits
b16d3b46
Commit
b16d3b46
authored
Mar 26, 2014
by
Brian Wilson
Committed by
Gerrit Code Review
Mar 26, 2014
Browse files
Options
Browse Files
Download
Plain Diff
Merge "Strip HTML markup from AnswerValues that have corresponding ValueIds."
parents
c33e25cf
e504a9e9
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
135 additions
and
38 deletions
+135
-38
edx/analytics/tasks/answer_dist.py
+67
-28
edx/analytics/tasks/tests/test_answer_dist.py
+67
-10
requirements/default.txt
+1
-0
No files found.
edx/analytics/tasks/answer_dist.py
View file @
b16d3b46
...
...
@@ -2,9 +2,10 @@
Luigi tasks for extracting problem answer distribution statistics from
tracking log files.
"""
import
csv
import
hashlib
import
html5lib
import
json
import
csv
from
operator
import
itemgetter
import
luigi
...
...
@@ -310,8 +311,13 @@ class AnswerDistributionPerCourseMixin(object):
answer_value
=
answer
.
get
(
'answer'
,
answer
.
get
(
'answer_value_id'
))
# These values may be lists, so convert to output format.
# And if we have a value_id, the corresponding answer_value
# may contain HTML markup, that should be stripped.
# But don't strip markup otherwise, as it may be part of
# the answer.
value_id
=
self
.
stringify
(
value_id
)
answer_value
=
self
.
stringify
(
answer_value
)
answer_value_contains_html
=
(
value_id
is
not
None
and
value_id
!=
''
)
answer_value
=
self
.
stringify
(
answer_value
,
contains_html
=
answer_value_contains_html
)
# If there is a variant, then the question might not be
# the same for all variants presented to students. So
...
...
@@ -489,7 +495,7 @@ class AnswerDistributionPerCourseMixin(object):
return
u'{value}_{variant}'
.
format
(
value
=
self
.
stringify
(
answer_value
),
variant
=
variant
)
@staticmethod
def
stringify
(
answer_value
):
def
stringify
(
answer_value
,
contains_html
=
False
):
"""
Convert answer value to a canonical string representation.
...
...
@@ -498,26 +504,65 @@ class AnswerDistributionPerCourseMixin(object):
(e.g. "[choice_1|choice_3|choice_4]").
If answer_value is a string, just returns as-is.
If contains_html is True, the answer_string is parsed as XML,
and the text value of the answer_value is returned.
"""
# If it's a list, convert to a string. Note that it's not
# enough to call str() or unicode(), as this will appear as
# "[u'choice_5']".
def
normalize
(
value
):
"""Pull out HTML tags if requested."""
return
get_text_from_html
(
value
)
if
contains_html
else
value
.
strip
()
# TODO: also need to strip out XML tags here, that may be
# appearing for the answer text displayed for choices. But
# what happens if the answer is not a choice, but is a text
# string, and that string happens to have XML-like markup in
# it?
if
isinstance
(
answer_value
,
basestring
):
return
answer_value
return
normalize
(
answer_value
)
elif
isinstance
(
answer_value
,
list
):
return
u'[{list_val}]'
.
format
(
list_val
=
u'|'
.
join
(
answer_value
))
list_val
=
u'|'
.
join
(
normalize
(
value
)
for
value
in
answer_value
)
return
u'[{list_val}]'
.
format
(
list_val
=
list_val
)
else
:
# unexpected type:
log
.
error
(
"Unexpected type for an answer_value:
%
s"
,
answer_value
)
return
unicode
(
answer_value
)
def
get_text_from_html
(
markup
):
"""
Convert html markup to plain text.
Includes stripping excess whitespace, and assuring whitespace
exists between elements (e.g. table elements).
"""
try
:
root
=
html5lib
.
parse
(
markup
)
text_list
=
[]
for
val
in
get_text_from_element
(
root
):
text_list
.
extend
(
val
.
split
())
text
=
u' '
.
join
(
text_list
)
except
Exception
as
exception
:
# pylint: disable=broad-except
# TODO: find out what exceptions might actually occur here, if any.
# This may be unnecessarily paranoid, given html5lib's fallback behavior.
log
.
error
(
"Unparseable answer value markup: '
%
s' return exception
%
s"
,
markup
,
exception
)
text
=
markup
.
strip
()
return
text
def
get_text_from_element
(
node
):
"""Traverse ElementTree node recursively to return text values."""
tag
=
node
.
tag
if
not
isinstance
(
tag
,
basestring
)
and
tag
is
not
None
:
return
if
node
.
text
:
yield
node
.
text
for
child
in
node
:
for
text
in
get_text_from_element
(
child
):
yield
text
if
child
.
tail
:
yield
child
.
tail
##################################
# Task requires/output definitions
##################################
...
...
@@ -543,7 +588,8 @@ class BaseAnswerDistributionTask(MapReduceJobTask):
# Boto is used for S3 access and cjson for parsing log files.
import
boto
import
cjson
return
[
boto
,
cjson
]
import
six
return
[
boto
,
cjson
,
html5lib
,
six
]
class
LastProblemCheckEvent
(
LastProblemCheckEventMixin
,
BaseAnswerDistributionTask
):
...
...
@@ -629,16 +675,15 @@ class AnswerDistributionOneFilePerCourseTask(MultiOutputMapReduceJobTask):
"""
Groups inputs by course_id, writes all records with the same course_id to the same output file.
Each input line is expected to consist of tab separated columns. The first column is expected to be the
course_id and is used to group the entries. The course_id is stripped from the output and the remaining column
s
are written to the appropriate output file in the same format they were read in (tab separated
).
Each input line is expected to consist of t
wo t
ab separated columns. The first column is expected to be the
course_id and is used to group the entries. The course_id is stripped from the output and the remaining column
is written to the appropriate output file in the same format it was read in (i.e. as an encoded JSON string
).
"""
split_line
=
line
.
split
(
'
\t
'
)
# Ensure that the first column is interpreted as the grouping key by the hadoop streaming API. Note that since
# Configuration values can change this behavior, the remaining tab separated columns are encoded in a python
# structure before returning to hadoop. They are decoded in the reducer.
course_id
,
content
=
split_line
[
0
],
split_line
[
1
:]
yield
course_id
,
tuple
(
content
)
course_id
,
content
=
line
.
split
(
'
\t
'
)
yield
course_id
,
content
def
output_path_for_key
(
self
,
course_id
):
"""
...
...
@@ -664,16 +709,9 @@ class AnswerDistributionOneFilePerCourseTask(MultiOutputMapReduceJobTask):
(
k
,
k
)
for
k
in
field_names
))
# Collect in memory the list of dicts to be output.
row_data
=
[]
for
content_tuple
in
values
:
# Restore tabs that were removed in the map task. Tabs
# are special characters to hadoop, so they were removed
# to prevent interpretation of them. Restore them here.
tab_separated_content
=
'
\t
'
.
join
(
content_tuple
)
row_data
.
append
(
json
.
loads
(
tab_separated_content
))
# Sort the list of dicts by their field names before encoding.
# Collect in memory the list of dicts to be output. Then sort
# the list of dicts by their field names before encoding.
row_data
=
[
json
.
loads
(
content
)
for
content
in
values
]
row_data
=
sorted
(
row_data
,
key
=
itemgetter
(
*
field_names
))
for
row_dict
in
row_data
:
...
...
@@ -685,7 +723,8 @@ class AnswerDistributionOneFilePerCourseTask(MultiOutputMapReduceJobTask):
def
extra_modules
(
self
):
import
cjson
import
boto
return
[
cjson
,
boto
]
import
six
return
[
boto
,
cjson
,
html5lib
,
six
]
################################
...
...
edx/analytics/tasks/tests/test_answer_dist.py
View file @
b16d3b46
...
...
@@ -477,15 +477,51 @@ class AnswerDistributionPerCourseReduceTest(unittest.TestCase):
expected_output
=
self
.
_get_expected_output
(
answer_data
)
self
.
_check_output
([
input_data
],
(
expected_output
,))
def
test_choice_answer
(
self
):
def
check_choice_answer
(
self
,
answer
,
expected
):
"""Run a choice answer with a provided value, and compare with expected."""
answer_data
=
self
.
_get_answer_data
(
answer_value_id
=
'choice_1'
,
answer
=
'First Choice'
,
answer
=
answer
,
)
input_data
=
(
self
.
timestamp
,
json
.
dumps
(
answer_data
))
expected_output
=
self
.
_get_expected_output
(
answer_data
,
ValueID
=
'choice_1'
,
AnswerValue
=
'First Choice'
)
expected_output
=
self
.
_get_expected_output
(
answer_data
,
ValueID
=
'choice_1'
,
AnswerValue
=
expected
)
self
.
_check_output
([
input_data
],
(
expected_output
,))
def
test_choice_answer
(
self
):
self
.
check_choice_answer
(
'First Choice'
,
'First Choice'
)
def
test_choice_answer_with_whitespace
(
self
):
self
.
check_choice_answer
(
'First Choice
\t
'
,
'First Choice'
)
def
test_choice_answer_with_empty_string
(
self
):
self
.
check_choice_answer
(
''
,
''
)
def
test_choice_answer_with_empty_markup
(
self
):
self
.
check_choice_answer
(
'<text><span>First Choice</span></text>'
,
'First Choice'
)
def
test_choice_answer_with_non_element_markup
(
self
):
# This tests a branch of the get_text_from_element logic,
# where there is no tag on an element.
self
.
check_choice_answer
(
'<text><span>First<!-- embedded comment --> Choice</span></text>'
,
'First Choice'
)
def
test_choice_answer_with_html_markup
(
self
):
self
.
check_choice_answer
(
'<p>First<br>Choice'
,
'First Choice'
)
def
test_choice_answer_with_embedded_whitespace
(
self
):
self
.
check_choice_answer
(
'First
\t\n
Choice '
,
'First Choice'
)
def
test_choice_answer_with_bad_html_markup
(
self
):
self
.
check_choice_answer
(
'<p First <br>Choice'
,
'Choice'
)
def
test_choice_answer_with_bad2_html_markup
(
self
):
self
.
check_choice_answer
(
'First br>Choice'
,
'First br>Choice'
)
def
test_choice_answer_with_cdata_html_markup
(
self
):
self
.
check_choice_answer
(
'First <![CDATA[This is to be ignored.]]> Choice'
,
'First Choice'
)
def
test_multiple_choice_answer
(
self
):
answer_data
=
self
.
_get_answer_data
(
answer_value_id
=
[
'choice_1'
,
'choice_2'
,
'choice_4'
],
...
...
@@ -500,6 +536,24 @@ class AnswerDistributionPerCourseReduceTest(unittest.TestCase):
)
self
.
_check_output
([
input_data
],
(
expected_output
,))
def
test_multiple_choice_answer_with_markup
(
self
):
answer_data
=
self
.
_get_answer_data
(
answer_value_id
=
[
'choice_1'
,
'choice_2'
,
'choice_4'
],
answer
=
[
u'<text>First Ch
\u014d
ice</text>'
,
u'Second <sup>Ch
\u014d
ice</sup>'
,
u'Fourth <table><tbody><tr><td>Ch
\u014d
ice</td></tr></tbody></table> goes here.'
],
response_type
=
"multiplechoiceresponse"
,
)
input_data
=
(
self
.
timestamp
,
json
.
dumps
(
answer_data
))
expected_output
=
self
.
_get_expected_output
(
answer_data
,
ValueID
=
'[choice_1|choice_2|choice_4]'
,
AnswerValue
=
u'[First Ch
\u014d
ice|Second Ch
\u014d
ice|Fourth Ch
\u014d
ice goes here.]'
)
self
.
_check_output
([
input_data
],
(
expected_output
,))
def
test_filtered_response_type
(
self
):
answer_data
=
self
.
_get_answer_data
(
response_type
=
"customresponse"
,
...
...
@@ -551,6 +605,14 @@ class AnswerDistributionPerCourseReduceTest(unittest.TestCase):
expected_output_2
=
self
.
_get_expected_output
(
answer_data_2
)
self
.
_check_output
([
input_data_1
,
input_data_2
],
(
expected_output_1
,
expected_output_2
))
def
test_two_answer_event_different_answer_by_whitespace
(
self
):
answer_data_1
=
self
.
_get_answer_data
(
answer
=
"
\t\n\n
first "
)
answer_data_2
=
self
.
_get_answer_data
(
answer
=
"first"
)
input_data_1
=
(
self
.
earlier_timestamp
,
json
.
dumps
(
answer_data_1
))
input_data_2
=
(
self
.
timestamp
,
json
.
dumps
(
answer_data_2
))
expected_output
=
self
.
_get_expected_output
(
answer_data_2
,
Count
=
2
)
self
.
_check_output
([
input_data_1
,
input_data_2
],
(
expected_output
,))
def
test_two_answer_event_different_old_and_new
(
self
):
answer_data_1
=
self
.
_get_non_submission_answer_data
(
answer_value_id
=
"first"
)
answer_data_2
=
self
.
_get_answer_data
(
problem_display_name
=
self
.
problem_display_name
)
...
...
@@ -685,12 +747,7 @@ class AnswerDistributionOneFilePerCourseTaskTest(unittest.TestCase):
def
test_map_single_value
(
self
):
key
,
value
=
next
(
self
.
task
.
mapper
(
'foo
\t
bar'
))
self
.
assertEquals
(
key
,
'foo'
)
self
.
assertEquals
(
value
,
(
'bar'
,))
def
test_map_multiple_values
(
self
):
key
,
value
=
next
(
self
.
task
.
mapper
(
'foo
\t
bar
\t
baz'
))
self
.
assertEquals
(
key
,
'foo'
)
self
.
assertEquals
(
value
,
(
'bar'
,
'baz'
))
self
.
assertEquals
(
value
,
'bar'
)
def
test_reduce_multiple_values
(
self
):
field_names
=
AnswerDistributionPerCourseMixin
.
get_column_order
()
...
...
@@ -705,7 +762,7 @@ class AnswerDistributionOneFilePerCourseTaskTest(unittest.TestCase):
sample_input_2
=
json
.
dumps
(
dict
(
column_values_2
))
mock_output_file
=
Mock
()
self
.
task
.
multi_output_reducer
(
'foo'
,
iter
([
(
sample_input_1
,),
(
sample_input_2
,)
]),
mock_output_file
)
self
.
task
.
multi_output_reducer
(
'foo'
,
iter
([
sample_input_1
,
sample_input_2
]),
mock_output_file
)
expected_header_string
=
','
.
join
(
field_names
)
+
'
\r\n
'
self
.
assertEquals
(
mock_output_file
.
write
.
mock_calls
[
0
],
call
(
expected_header_string
))
...
...
requirements/default.txt
View file @
b16d3b46
...
...
@@ -8,5 +8,6 @@ tornado==3.1.1
ansible==1.4.4
python-cjson==1.0.5
oursql==0.9.3.1
html5lib==1.0b3
-e git+https://github.com/spotify/luigi.git@a33756c781b9bf7e51384f0eb19d6a25050ef136#egg=luigi
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment