Commit 8ad6b877 by Gabe Mulley

Exclude blacklisted courses from aggregates

Some courses were created accidentally, or their students were moved to another course_id.  We want to be able to filter out these courses and exclude them from our various aggregate counts.

Change-Id: I0828d9bb0dcb90a5d9e14b5a5c496c73ed0c3dac
Fixes: AN-492
parent 5a50d4e4
......@@ -43,6 +43,7 @@ class WeeklyIncrementalUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountM
destination = luigi.Parameter()
date = luigi.DateParameter()
weeks = luigi.IntParameter(default=10)
blacklist = luigi.Parameter(default=None)
ROW_LABELS = {
'header': 'name',
......@@ -57,6 +58,8 @@ class WeeklyIncrementalUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountM
'enrollments': ExternalURL(self.enrollments),
'registrations': ExternalURL(self.registrations),
}
if self.blacklist:
results.update({'blacklist': ExternalURL(self.blacklist)})
return results
def output(self):
......@@ -70,9 +73,8 @@ class WeeklyIncrementalUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountM
# Load the explicit enrollment data into a pandas dataframe.
daily_enrollment_changes = self.read_enrollments()
# Remove (or merge or whatever) data for courses that
# would otherwise result in duplicate counts.
self.filter_duplicate_courses(daily_enrollment_changes)
course_blacklist = self.read_course_blacklist()
self.filter_out_courses(daily_enrollment_changes, course_blacklist)
# Sum per-course counts to create a single series
# of total enrollment counts per day.
......
......@@ -20,7 +20,7 @@ class TestWeeklyIncrementalUsersAndEnrollments(unittest.TestCase):
"""Returns label value for reference row, given its internal row name."""
return WeeklyIncrementalUsersAndEnrollments.ROW_LABELS[row_name]
def run_task(self, registrations, enrollments, date, weeks):
def run_task(self, registrations, enrollments, date, weeks, blacklist=None):
"""
Run task with fake targets.
......@@ -35,7 +35,8 @@ class TestWeeklyIncrementalUsersAndEnrollments(unittest.TestCase):
enrollments='fake_enrollments',
destination='fake_destination',
date=parsed_date,
weeks=weeks
weeks=weeks,
blacklist=blacklist
)
# Default missing inputs
......@@ -60,6 +61,10 @@ class TestWeeklyIncrementalUsersAndEnrollments(unittest.TestCase):
'registrations': FakeTarget(reformat(registrations)),
}
# Mock blacklist only if specified.
if blacklist:
input_targets.update({'blacklist': FakeTarget(reformat(blacklist))})
task.input = MagicMock(return_value=input_targets)
output_target = FakeTarget()
......@@ -253,3 +258,19 @@ class TestWeeklyIncrementalUsersAndEnrollments(unittest.TestCase):
res = self.run_task(None, enrollments.encode('utf-8'), '2013-04-02', 2)
self.assertEqual(res.loc[self.row_label('enrollment_change')]['2013-04-02'], 5)
def test_blacklist(self):
enrollments = """
course_1 2013-01-02 1
course_2 2013-01-02 2
course_3 2013-01-02 4
course_2 2013-01-09 1
course_3 2013-01-15 2
"""
blacklist = """
course_1
course_2
"""
res = self.run_task(None, enrollments, '2013-01-15', 2, blacklist=blacklist)
self.assertEqual(res.loc[self.row_label('enrollment_change')]['2013-01-08'], 4)
self.assertEqual(res.loc[self.row_label('enrollment_change')]['2013-01-15'], 2)
......@@ -23,7 +23,7 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
self.enrollment_label = WeeklyAllUsersAndEnrollments.ROW_LABELS['enrollments']
self.registrations_label = WeeklyAllUsersAndEnrollments.ROW_LABELS['registrations']
def run_task(self, registrations, enrollments, date, weeks, offset=None, history=None):
def run_task(self, registrations, enrollments, date, weeks, offset=None, history=None, blacklist=None):
"""
Run task with fake targets.
......@@ -41,7 +41,8 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
destination='fake_destination',
date=parsed_date,
weeks=weeks,
credentials=None
credentials=None,
blacklist=blacklist
)
# Mock the input and output targets
......@@ -76,6 +77,10 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
if history:
input_targets.update({'history': FakeTarget(reformat(history))})
# Mock blacklist only if specified.
if blacklist:
input_targets.update({'blacklist': FakeTarget(reformat(blacklist))})
task.input = MagicMock(return_value=input_targets)
output_target = FakeTarget()
......@@ -216,6 +221,22 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
self.assertEqual(total_enrollment['2013-03-21'], 22)
self.assertEqual(total_enrollment['2013-03-28'], 22)
def test_blacklist(self):
enrollments = """
course_1 2013-01-02 1
course_2 2013-01-02 2
course_3 2013-01-02 4
course_2 2013-01-09 1
course_3 2013-01-15 2
"""
blacklist = """
course_1
course_2
"""
res = self.run_task('', enrollments, '2013-01-15', 2, blacklist=blacklist)
self.assertEqual(res.loc[self.enrollment_label]['2013-01-08'], 4)
self.assertEqual(res.loc[self.enrollment_label]['2013-01-15'], 6)
def test_unicode(self):
course_id = u'course_\u2603'
......
......@@ -68,18 +68,38 @@ class AllCourseEnrollmentCountMixin(CourseEnrollmentCountMixin):
"""
return self.read_date_count_tsv(input_file).interpolate(method='time')
def filter_duplicate_courses(self, daily_enrollment_totals):
# TODO: implement this for real. (This is just a placeholder.)
# At this point we should remove data for courses that are
# represented by other courses, because the students have been
# moved to the new course. Perhaps this should actually
# perform a merge of the two courses, since we would want the
# history of one before the move date, and the history of the
# second after that date.
# Note that this is not the same filtering that would be applied
# to the EnrollmentsByWeek report.
pass
def read_course_blacklist(self):
"""
Reads a set of course_ids from the blacklist input file if one was
specified, otherwise returns an empty set.
Expected input file format is a single course ID per line.
Returns:
A set of course_ids that should not be included in aggregates.
"""
if self.input().get('blacklist'):
with self.input()['blacklist'].open('r') as blacklist_file:
data = read_tsv(blacklist_file, ['course_id'])
return set(data['course_id'])
else:
return set()
def filter_out_courses(self, course_data, course_blacklist):
"""
Removes data for courses that should be excluded from aggregates.
Args:
course_data (pandas.DataFrame): A DataFrame containing a single
column for each course.
course_blacklist (iterable): A collection of course IDs to
remove from the data table.
Returns:
None, the `course_data` is modified in place.
"""
# Drop from axis 1 because we are dropping columns, not rows.
course_data.drop(course_blacklist, axis=1, inplace=True)
def save_output(self, results, output_file):
"""
......@@ -146,6 +166,7 @@ class WeeklyAllUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountMixin):
date = luigi.DateParameter()
weeks = luigi.IntParameter(default=52)
credentials = luigi.Parameter()
blacklist = luigi.Parameter(default=None)
ROW_LABELS = {
'header': 'name',
......@@ -183,6 +204,8 @@ class WeeklyAllUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountMixin):
results.update({'offsets': ExternalURL(self.offsets)})
if self.history:
results.update({'history': ExternalURL(self.history)})
if self.blacklist:
results.update({'blacklist': ExternalURL(self.blacklist)})
return results
......@@ -203,9 +226,8 @@ class WeeklyAllUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountMixin):
offsets = self.read_offsets()
daily_enrollment_totals = self.calculate_total_enrollment(daily_enrollment_changes, offsets)
# Remove (or merge or whatever) data for courses that
# would otherwise result in duplicate counts.
self.filter_duplicate_courses(daily_enrollment_totals)
course_blacklist = self.read_course_blacklist()
self.filter_out_courses(daily_enrollment_totals, course_blacklist)
# Sum per-course counts to create a single series
# of total enrollment counts per day.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment