Commit 8ad6b877 by Gabe Mulley

Exclude blacklisted courses from aggregates

Some courses were created accidentally, or their students were moved to another course_id.  We want to be able to filter out these courses and exclude them from our various aggregate counts.

Change-Id: I0828d9bb0dcb90a5d9e14b5a5c496c73ed0c3dac
Fixes: AN-492
parent 5a50d4e4
...@@ -43,6 +43,7 @@ class WeeklyIncrementalUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountM ...@@ -43,6 +43,7 @@ class WeeklyIncrementalUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountM
destination = luigi.Parameter() destination = luigi.Parameter()
date = luigi.DateParameter() date = luigi.DateParameter()
weeks = luigi.IntParameter(default=10) weeks = luigi.IntParameter(default=10)
blacklist = luigi.Parameter(default=None)
ROW_LABELS = { ROW_LABELS = {
'header': 'name', 'header': 'name',
...@@ -57,6 +58,8 @@ class WeeklyIncrementalUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountM ...@@ -57,6 +58,8 @@ class WeeklyIncrementalUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountM
'enrollments': ExternalURL(self.enrollments), 'enrollments': ExternalURL(self.enrollments),
'registrations': ExternalURL(self.registrations), 'registrations': ExternalURL(self.registrations),
} }
if self.blacklist:
results.update({'blacklist': ExternalURL(self.blacklist)})
return results return results
def output(self): def output(self):
...@@ -70,9 +73,8 @@ class WeeklyIncrementalUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountM ...@@ -70,9 +73,8 @@ class WeeklyIncrementalUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountM
# Load the explicit enrollment data into a pandas dataframe. # Load the explicit enrollment data into a pandas dataframe.
daily_enrollment_changes = self.read_enrollments() daily_enrollment_changes = self.read_enrollments()
# Remove (or merge or whatever) data for courses that course_blacklist = self.read_course_blacklist()
# would otherwise result in duplicate counts. self.filter_out_courses(daily_enrollment_changes, course_blacklist)
self.filter_duplicate_courses(daily_enrollment_changes)
# Sum per-course counts to create a single series # Sum per-course counts to create a single series
# of total enrollment counts per day. # of total enrollment counts per day.
......
...@@ -20,7 +20,7 @@ class TestWeeklyIncrementalUsersAndEnrollments(unittest.TestCase): ...@@ -20,7 +20,7 @@ class TestWeeklyIncrementalUsersAndEnrollments(unittest.TestCase):
"""Returns label value for reference row, given its internal row name.""" """Returns label value for reference row, given its internal row name."""
return WeeklyIncrementalUsersAndEnrollments.ROW_LABELS[row_name] return WeeklyIncrementalUsersAndEnrollments.ROW_LABELS[row_name]
def run_task(self, registrations, enrollments, date, weeks): def run_task(self, registrations, enrollments, date, weeks, blacklist=None):
""" """
Run task with fake targets. Run task with fake targets.
...@@ -35,7 +35,8 @@ class TestWeeklyIncrementalUsersAndEnrollments(unittest.TestCase): ...@@ -35,7 +35,8 @@ class TestWeeklyIncrementalUsersAndEnrollments(unittest.TestCase):
enrollments='fake_enrollments', enrollments='fake_enrollments',
destination='fake_destination', destination='fake_destination',
date=parsed_date, date=parsed_date,
weeks=weeks weeks=weeks,
blacklist=blacklist
) )
# Default missing inputs # Default missing inputs
...@@ -60,6 +61,10 @@ class TestWeeklyIncrementalUsersAndEnrollments(unittest.TestCase): ...@@ -60,6 +61,10 @@ class TestWeeklyIncrementalUsersAndEnrollments(unittest.TestCase):
'registrations': FakeTarget(reformat(registrations)), 'registrations': FakeTarget(reformat(registrations)),
} }
# Mock blacklist only if specified.
if blacklist:
input_targets.update({'blacklist': FakeTarget(reformat(blacklist))})
task.input = MagicMock(return_value=input_targets) task.input = MagicMock(return_value=input_targets)
output_target = FakeTarget() output_target = FakeTarget()
...@@ -253,3 +258,19 @@ class TestWeeklyIncrementalUsersAndEnrollments(unittest.TestCase): ...@@ -253,3 +258,19 @@ class TestWeeklyIncrementalUsersAndEnrollments(unittest.TestCase):
res = self.run_task(None, enrollments.encode('utf-8'), '2013-04-02', 2) res = self.run_task(None, enrollments.encode('utf-8'), '2013-04-02', 2)
self.assertEqual(res.loc[self.row_label('enrollment_change')]['2013-04-02'], 5) self.assertEqual(res.loc[self.row_label('enrollment_change')]['2013-04-02'], 5)
def test_blacklist(self):
enrollments = """
course_1 2013-01-02 1
course_2 2013-01-02 2
course_3 2013-01-02 4
course_2 2013-01-09 1
course_3 2013-01-15 2
"""
blacklist = """
course_1
course_2
"""
res = self.run_task(None, enrollments, '2013-01-15', 2, blacklist=blacklist)
self.assertEqual(res.loc[self.row_label('enrollment_change')]['2013-01-08'], 4)
self.assertEqual(res.loc[self.row_label('enrollment_change')]['2013-01-15'], 2)
...@@ -23,7 +23,7 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase): ...@@ -23,7 +23,7 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
self.enrollment_label = WeeklyAllUsersAndEnrollments.ROW_LABELS['enrollments'] self.enrollment_label = WeeklyAllUsersAndEnrollments.ROW_LABELS['enrollments']
self.registrations_label = WeeklyAllUsersAndEnrollments.ROW_LABELS['registrations'] self.registrations_label = WeeklyAllUsersAndEnrollments.ROW_LABELS['registrations']
def run_task(self, registrations, enrollments, date, weeks, offset=None, history=None): def run_task(self, registrations, enrollments, date, weeks, offset=None, history=None, blacklist=None):
""" """
Run task with fake targets. Run task with fake targets.
...@@ -41,7 +41,8 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase): ...@@ -41,7 +41,8 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
destination='fake_destination', destination='fake_destination',
date=parsed_date, date=parsed_date,
weeks=weeks, weeks=weeks,
credentials=None credentials=None,
blacklist=blacklist
) )
# Mock the input and output targets # Mock the input and output targets
...@@ -76,6 +77,10 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase): ...@@ -76,6 +77,10 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
if history: if history:
input_targets.update({'history': FakeTarget(reformat(history))}) input_targets.update({'history': FakeTarget(reformat(history))})
# Mock blacklist only if specified.
if blacklist:
input_targets.update({'blacklist': FakeTarget(reformat(blacklist))})
task.input = MagicMock(return_value=input_targets) task.input = MagicMock(return_value=input_targets)
output_target = FakeTarget() output_target = FakeTarget()
...@@ -216,6 +221,22 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase): ...@@ -216,6 +221,22 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
self.assertEqual(total_enrollment['2013-03-21'], 22) self.assertEqual(total_enrollment['2013-03-21'], 22)
self.assertEqual(total_enrollment['2013-03-28'], 22) self.assertEqual(total_enrollment['2013-03-28'], 22)
def test_blacklist(self):
enrollments = """
course_1 2013-01-02 1
course_2 2013-01-02 2
course_3 2013-01-02 4
course_2 2013-01-09 1
course_3 2013-01-15 2
"""
blacklist = """
course_1
course_2
"""
res = self.run_task('', enrollments, '2013-01-15', 2, blacklist=blacklist)
self.assertEqual(res.loc[self.enrollment_label]['2013-01-08'], 4)
self.assertEqual(res.loc[self.enrollment_label]['2013-01-15'], 6)
def test_unicode(self): def test_unicode(self):
course_id = u'course_\u2603' course_id = u'course_\u2603'
......
...@@ -68,18 +68,38 @@ class AllCourseEnrollmentCountMixin(CourseEnrollmentCountMixin): ...@@ -68,18 +68,38 @@ class AllCourseEnrollmentCountMixin(CourseEnrollmentCountMixin):
""" """
return self.read_date_count_tsv(input_file).interpolate(method='time') return self.read_date_count_tsv(input_file).interpolate(method='time')
def filter_duplicate_courses(self, daily_enrollment_totals): def read_course_blacklist(self):
# TODO: implement this for real. (This is just a placeholder.) """
# At this point we should remove data for courses that are Reads a set of course_ids from the blacklist input file if one was
# represented by other courses, because the students have been specified, otherwise returns an empty set.
# moved to the new course. Perhaps this should actually
# perform a merge of the two courses, since we would want the Expected input file format is a single course ID per line.
# history of one before the move date, and the history of the
# second after that date. Returns:
A set of course_ids that should not be included in aggregates.
# Note that this is not the same filtering that would be applied """
# to the EnrollmentsByWeek report. if self.input().get('blacklist'):
pass with self.input()['blacklist'].open('r') as blacklist_file:
data = read_tsv(blacklist_file, ['course_id'])
return set(data['course_id'])
else:
return set()
def filter_out_courses(self, course_data, course_blacklist):
"""
Removes data for courses that should be excluded from aggregates.
Args:
course_data (pandas.DataFrame): A DataFrame containing a single
column for each course.
course_blacklist (iterable): A collection of course IDs to
remove from the data table.
Returns:
None, the `course_data` is modified in place.
"""
# Drop from axis 1 because we are dropping columns, not rows.
course_data.drop(course_blacklist, axis=1, inplace=True)
def save_output(self, results, output_file): def save_output(self, results, output_file):
""" """
...@@ -146,6 +166,7 @@ class WeeklyAllUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountMixin): ...@@ -146,6 +166,7 @@ class WeeklyAllUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountMixin):
date = luigi.DateParameter() date = luigi.DateParameter()
weeks = luigi.IntParameter(default=52) weeks = luigi.IntParameter(default=52)
credentials = luigi.Parameter() credentials = luigi.Parameter()
blacklist = luigi.Parameter(default=None)
ROW_LABELS = { ROW_LABELS = {
'header': 'name', 'header': 'name',
...@@ -183,6 +204,8 @@ class WeeklyAllUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountMixin): ...@@ -183,6 +204,8 @@ class WeeklyAllUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountMixin):
results.update({'offsets': ExternalURL(self.offsets)}) results.update({'offsets': ExternalURL(self.offsets)})
if self.history: if self.history:
results.update({'history': ExternalURL(self.history)}) results.update({'history': ExternalURL(self.history)})
if self.blacklist:
results.update({'blacklist': ExternalURL(self.blacklist)})
return results return results
...@@ -203,9 +226,8 @@ class WeeklyAllUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountMixin): ...@@ -203,9 +226,8 @@ class WeeklyAllUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountMixin):
offsets = self.read_offsets() offsets = self.read_offsets()
daily_enrollment_totals = self.calculate_total_enrollment(daily_enrollment_changes, offsets) daily_enrollment_totals = self.calculate_total_enrollment(daily_enrollment_changes, offsets)
# Remove (or merge or whatever) data for courses that course_blacklist = self.read_course_blacklist()
# would otherwise result in duplicate counts. self.filter_out_courses(daily_enrollment_totals, course_blacklist)
self.filter_duplicate_courses(daily_enrollment_totals)
# Sum per-course counts to create a single series # Sum per-course counts to create a single series
# of total enrollment counts per day. # of total enrollment counts per day.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment