Commit f6c8db32 by Brian Wilson

Refactor course enrollment calculations.

Omit days with no change in user status.  Add tests for reduce.  Rename time-related functions.
parent c1164ce5
...@@ -72,7 +72,7 @@ class PathSetTask(luigi.Task): ...@@ -72,7 +72,7 @@ class PathSetTask(luigi.Task):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super(PathSetTask, self).__init__(*args, **kwargs) super(PathSetTask, self).__init__(*args, **kwargs)
self.s3 = boto.connect_s3() self.s3 = None
def requires(self): def requires(self):
if self.src.startswith('s3'): if self.src.startswith('s3'):
...@@ -103,6 +103,10 @@ class PathSetTask(luigi.Task): ...@@ -103,6 +103,10 @@ class PathSetTask(luigi.Task):
def _generate_sources(self): def _generate_sources(self):
bucket_name, root = get_s3_bucket_key_names(self.src) bucket_name, root = get_s3_bucket_key_names(self.src)
# connect lazily, only if necessary:
if self.s3 is None:
self.s3 = boto.connect_s3()
bucket = self.s3.get_bucket(bucket_name) bucket = self.s3.get_bucket(bucket_name)
keys = (s.key for s in bucket.list(root) if s.size > 0) keys = (s.key for s in bucket.list(root) if s.size > 0)
......
"""
Tests for tasks that collect enrollment events.
"""
import unittest
from edx.analytics.tasks.course_enroll import (
BaseCourseEnrollmentEventsPerDay,
BaseCourseEnrollmentChangesPerDay,
BaseCourseEnrollmentTotalsPerDay,
)
from datetime import datetime
class CourseEnrollEventReduceTest(unittest.TestCase):
    """
    Verify that BaseCourseEnrollmentEventsPerDay.reducer() works correctly.

    Each test passes the reducer a list of (timestamp, change) pairs for the
    single hardcoded (course, user) key, and checks the list of
    ((course, datestamp), change) pairs that the reducer yields.  Per the
    test names, a change of 1 is an enrollment and -1 an unenrollment.
    """

    def setUp(self):
        self.task = BaseCourseEnrollmentEventsPerDay()
        self.key = ('course', 'user')

    def _get_reducer_output(self, values):
        """Run the reducer with the provided values and the hardcoded key."""
        # The reducer's result is materialized so it can be compared to a list.
        return list(self.task.reducer(self.key, values))

    def test_no_events(self):
        self.assertEqual(self._get_reducer_output([]), [])

    def test_single_enrollment(self):
        self.assertEqual(
            self._get_reducer_output(
                [
                    ('2013-01-01T00:00:01', 1),
                ]
            ),
            [
                (('course', '2013-01-01'), 1),
            ]
        )

    def test_single_unenrollment(self):
        self.assertEqual(
            self._get_reducer_output(
                [
                    ('2013-01-01T00:00:01', -1),
                ]
            ),
            [
                (('course', '2013-01-01'), -1),
            ]
        )

    def test_multiple_events_on_same_day(self):
        # run first with no output expected:
        self.assertEqual(
            self._get_reducer_output(
                [
                    ('2013-01-01T00:00:01', 1),
                    ('2013-01-01T00:00:02', -1),
                    ('2013-01-01T00:00:03', 1),
                    ('2013-01-01T00:00:04', -1),
                ]
            ),
            []
        )

        # then run with output expected:
        self.assertEqual(
            self._get_reducer_output(
                [
                    ('2013-01-01T00:00:01', 1),
                    ('2013-01-01T00:00:02', -1),
                    ('2013-01-01T00:00:03', -1),
                    ('2013-01-01T00:00:04', 1),
                ]
            ),
            [
                (('course', '2013-01-01'), 1),
            ]
        )

    def test_multiple_events_out_of_order(self):
        # Make sure that events are sorted by the reducer: these are the
        # same four events as the no-output case above, but shuffled.
        self.assertEqual(
            self._get_reducer_output(
                [
                    ('2013-01-01T00:00:04', -1),
                    ('2013-01-01T00:00:03', 1),
                    ('2013-01-01T00:00:01', 1),
                    ('2013-01-01T00:00:02', -1),
                ]
            ),
            []
        )

    def test_multiple_enroll_events_on_same_day(self):
        # Repeated enrollments on one day are expected to yield one change.
        self.assertEqual(
            self._get_reducer_output(
                [
                    ('2013-01-01T00:00:01', 1),
                    ('2013-01-01T00:00:02', 1),
                    ('2013-01-01T00:00:03', 1),
                    ('2013-01-01T00:00:04', 1),
                ]
            ),
            [
                (('course', '2013-01-01'), 1),
            ]
        )

    def test_multiple_unenroll_events_on_same_day(self):
        # Repeated unenrollments on one day are expected to yield one change.
        self.assertEqual(
            self._get_reducer_output(
                [
                    ('2013-01-01T00:00:01', -1),
                    ('2013-01-01T00:00:02', -1),
                    ('2013-01-01T00:00:03', -1),
                    ('2013-01-01T00:00:04', -1),
                ]
            ),
            [
                (('course', '2013-01-01'), -1),
            ]
        )

    def test_multiple_enroll_events_on_many_days(self):
        # Only the first day is expected in the output: the later days
        # repeat the enrollment without changing the user's status.
        self.assertEqual(
            self._get_reducer_output(
                [
                    ('2013-01-01T00:00:01', 1),
                    ('2013-01-01T00:00:02', 1),
                    ('2013-01-02T00:00:03', 1),
                    ('2013-01-02T00:00:04', 1),
                    ('2013-01-04T00:00:05', 1),
                ]
            ),
            [
                (('course', '2013-01-01'), 1),
            ]
        )

    def test_multiple_events_on_many_days(self):
        # Run with an arbitrary list of events.  Days whose events leave the
        # user's status unchanged are expected to be omitted from the output.
        self.assertEqual(
            self._get_reducer_output(
                [
                    ('2013-01-01T1', 1),
                    ('2013-01-01T2', -1),
                    ('2013-01-01T3', 1),
                    ('2013-01-01T4', -1),
                    ('2013-01-02', 1),
                    ('2013-01-03', 1),
                    ('2013-01-04T1', 1),
                    ('2013-01-04T2', -1),
                    ('2013-01-05', -1),
                    ('2013-01-06', -1),
                    ('2013-01-07', 1),
                    ('2013-01-08T1', 1),
                    ('2013-01-08T2', 1),
                    ('2013-01-09T1', -1),
                    ('2013-01-09T2', -1),
                ]
            ),
            [
                (('course', '2013-01-02'), 1),
                (('course', '2013-01-04'), -1),
                (('course', '2013-01-07'), 1),
                (('course', '2013-01-09'), -1),
            ]
        )
class CourseEnrollChangesReduceTest(unittest.TestCase):
    """
    Verify that BaseCourseEnrollmentChangesPerDay.reducer() works correctly.

    Each test passes the reducer a list of integer changes for the single
    hardcoded (course, datestamp) key, and checks that it yields one
    (key, total) pair.
    """

    def setUp(self):
        self.task = BaseCourseEnrollmentChangesPerDay()
        self.key = ('course', '2013-01-01')

    def _get_reducer_output(self, values):
        """Run the reducer with the provided values and the hardcoded key."""
        # The reducer's result is materialized so it can be compared to a list.
        return list(self.task.reducer(self.key, values))

    def test_no_user_counts(self):
        # With no input values, a zero total is still expected for the key.
        self.assertEqual(self._get_reducer_output([]), [(self.key, 0)])

    def test_single_user_count(self):
        self.assertEqual(self._get_reducer_output([1]), [(self.key, 1)])

    def test_multiple_user_count(self):
        # The expected total is the sum of the inputs: 1 + 1 + 1 - 1 + 1 = 3.
        inputs = [1, 1, 1, -1, 1]
        self.assertEqual(self._get_reducer_output(inputs), [(self.key, 3)])
class CourseEnrollTotalsReduceTest(unittest.TestCase):
    """
    Verify that BaseCourseEnrollmentTotalsPerDay.reducer() works correctly.

    Each test passes the reducer a list of (datestamp, change) pairs for the
    single hardcoded course key, and checks the (course, datestamp, total)
    tuples that it yields.
    """

    def setUp(self):
        self.task = BaseCourseEnrollmentTotalsPerDay()
        self.key = 'course'

    def _get_reducer_output(self, values):
        """Run the reducer with the provided values and the hardcoded key."""
        # The reducer's result is materialized so it can be compared to a list.
        return list(self.task.reducer(self.key, values))

    def test_no_user_counts(self):
        self.assertEqual(self._get_reducer_output([]), [])

    def test_single_user_count(self):
        self.assertEqual(
            self._get_reducer_output(
                [
                    ('2013-01-01', 5),
                ]
            ),
            [
                (self.key, '2013-01-01', 5),
            ]
        )

    def test_multiple_user_count(self):
        # The expected totals are cumulative over the input changes:
        # 5, 5 + 8 = 13, 13 + 4 = 17, 17 + 9 = 26.
        self.assertEqual(
            self._get_reducer_output(
                [
                    ('2013-01-01', 5),
                    ('2013-01-02', 8),
                    ('2013-01-03', 4),
                    ('2013-01-04', 9),
                ]
            ),
            [
                (self.key, '2013-01-01', 5),
                (self.key, '2013-01-02', 13),
                (self.key, '2013-01-03', 17),
                (self.key, '2013-01-04', 26),
            ]
        )
...@@ -8,26 +8,44 @@ import re ...@@ -8,26 +8,44 @@ import re
PATTERN_JSON = re.compile(r'^.*?(\{.*\})\s*$') PATTERN_JSON = re.compile(r'^.*?(\{.*\})\s*$')
# borrowed from modulestore/parsers.py:
ALLOWED_ID_CHARS = r'[a-zA-Z0-9_\-~.:]'
PATTERN_COURSEID = re.compile(r'^' + ALLOWED_ID_CHARS + r'+$')
def get_datetime_string(timestamp): def is_valid_course_id(course_id):
return timestamp.strftime('%Y-%m-%dT%H:%M:%S') """
Determines if a course_id from an event log is possibly legitimate.
Applies two tests:
def get_date_string(timestamp): * Course Id can be split into org/coursename/runname using '/' as delimiter.
return timestamp.strftime('%Y-%m-%d') * Components of id contain only "allowed" characters as defined in modulestore/parsers.py.
Note this will need to be updated as split-mongo changes are rolled out
def get_date_from_datetime(datetime_string): that permit a broader set of id values.
return datetime_string.split('T')[0] """
components = course_id.split('/')
if len(components) != 3:
return False
return all(PATTERN_COURSEID.match(component) for component in components)
def json_decode(line): def json_decode(line):
"""Wrapper to decode JSON string in implementation-independent way.""" """Wrapper to decode JSON string in an implementation-independent way."""
return cjson.decode(line) return cjson.decode(line)
def parse_eventlog_item(line, nested=False): def parse_eventlog_item(line, nested=False):
""" Parse a tracking log input line as JSON to create a dict representation.""" """
Parse a tracking log input line as JSON to create a dict representation.
Arguments:
* line: the eventlog text
* nested: boolean flag permitting this to be called recursively.
Apparently some eventlog entries are pure JSON, while others are
JSON that are prepended by a timestamp.
"""
try: try:
parsed = json_decode(line) parsed = json_decode(line)
except: except:
...@@ -48,11 +66,32 @@ def parse_eventlog_item(line, nested=False): ...@@ -48,11 +66,32 @@ def parse_eventlog_item(line, nested=False):
def log_item(msg, item, level='ERROR'): def log_item(msg, item, level='ERROR'):
"""Writes a message about an eventlog item."""
sys.stderr.write("{level}: {msg}: {item}\n".format(msg=msg, item=item, level=level)) sys.stderr.write("{level}: {msg}: {item}\n".format(msg=msg, item=item, level=level))
def get_timestamp(item): # Time-related terminology:
# * datetime: a datetime object.
# * timestamp: a string, with date and time (to second), in ISO format.
# * datestamp: a string with only date information, in ISO format.
def get_timestamp(datetime):
"""Returns a string with the datetime value of the provided datetime object."""
return datetime.strftime('%Y-%m-%dT%H:%M:%S')
def get_datestamp(datetime):
"""Returns a string with the date value of the provided datetime object."""
return datetime.strftime('%Y-%m-%d')
def get_datestamp_from_timestamp(timestamp):
"""Returns a string with the date value of the provided ISO datetime string."""
return timestamp.split('T')[0]
def get_datetime(item):
"""Returns a datetime object from an event item, if present."""
try: try:
timestamp = item['time'] timestamp = item['time']
removed_ms = timestamp.split('.')[0] removed_ms = timestamp.split('.')[0]
...@@ -62,6 +101,11 @@ def get_timestamp(item): ...@@ -62,6 +101,11 @@ def get_timestamp(item):
def get_event_data(item): def get_event_data(item):
"""
Returns event data from an event log item as a dict object.
Returns None if not found.
"""
event_value = item.get('event') event_value = item.get('event')
if event_value is None: if event_value is None:
...@@ -69,7 +113,7 @@ def get_event_data(item): ...@@ -69,7 +113,7 @@ def get_event_data(item):
return None return None
if isinstance(event_value, basestring): if isinstance(event_value, basestring):
# If the value is a string, try to parse as JSON into a dict:. # If the value is a string, try to parse as JSON into a dict.
try: try:
event_value = json_decode(event_value) event_value = json_decode(event_value)
except: except:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment