Exclude blacklisted courses from aggregates

Some courses were created accidentally, or their students were moved to another course_id. We want to be able to filter out these courses and exclude them from our various aggregate counts. Change-Id: I0828d9bb0dcb90a5d9e14b5a5c496c73ed0c3dac Fixes: AN-492

Exclude blacklisted courses from aggregates
Some courses were created accidentally, or their students were moved to another course_id. We want to be able to filter out these courses and exclude them from our various aggregate counts. Change-Id: I0828d9bb0dcb90a5d9e14b5a5c496c73ed0c3dac Fixes: AN-492
8ad6b877 · Gabe Mulley · 5a50d4e4 · 8ad6b877 · 8ad6b877 · 8ad6b877
Commit 8ad6b877 authored Feb 20, 2014 by Gabe Mulley
4 changed files
--- a/edx/analytics/tasks/reports/incremental_enrollments.py
+++ b/edx/analytics/tasks/reports/incremental_enrollments.py
@@ -43,6 +43,7 @@ class WeeklyIncrementalUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountM
    destination = luigi.Parameter()
    date = luigi.DateParameter()
    weeks = luigi.IntParameter(default=10)
+    blacklist = luigi.Parameter(default=None)

    ROW_LABELS = {
        'header': 'name',
@@ -57,6 +58,8 @@ class WeeklyIncrementalUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountM
            'enrollments': ExternalURL(self.enrollments),
            'registrations': ExternalURL(self.registrations),
        }
+        if self.blacklist:
+            results.update({'blacklist': ExternalURL(self.blacklist)})
        return results

    def output(self):
@@ -70,9 +73,8 @@ class WeeklyIncrementalUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountM
        # Load the explicit enrollment data into a pandas dataframe.
        daily_enrollment_changes = self.read_enrollments()

-        # Remove (or merge or whatever) data for courses that
-        # would otherwise result in duplicate counts.
-        self.filter_duplicate_courses(daily_enrollment_changes)
+        course_blacklist = self.read_course_blacklist()
+        self.filter_out_courses(daily_enrollment_changes, course_blacklist)

        # Sum per-course counts to create a single series
        # of total enrollment counts per day.

--- a/edx/analytics/tasks/reports/tests/test_incremental_enrollments.py
+++ b/edx/analytics/tasks/reports/tests/test_incremental_enrollments.py
@@ -20,7 +20,7 @@ class TestWeeklyIncrementalUsersAndEnrollments(unittest.TestCase):
        """Returns label value for reference row, given its internal row name."""
        return WeeklyIncrementalUsersAndEnrollments.ROW_LABELS[row_name]

-    def run_task(self, registrations, enrollments, date, weeks):
+    def run_task(self, registrations, enrollments, date, weeks, blacklist=None):
        """
        Run task with fake targets.

@@ -35,7 +35,8 @@ class TestWeeklyIncrementalUsersAndEnrollments(unittest.TestCase):
            enrollments='fake_enrollments',
            destination='fake_destination',
            date=parsed_date,
-            weeks=weeks
+            weeks=weeks,
+            blacklist=blacklist
        )

        # Default missing inputs
@@ -60,6 +61,10 @@ class TestWeeklyIncrementalUsersAndEnrollments(unittest.TestCase):
            'registrations': FakeTarget(reformat(registrations)),
        }

+        # Mock blacklist only if specified.
+        if blacklist:
+            input_targets.update({'blacklist': FakeTarget(reformat(blacklist))})
+
        task.input = MagicMock(return_value=input_targets)

        output_target = FakeTarget()
@@ -253,3 +258,19 @@ class TestWeeklyIncrementalUsersAndEnrollments(unittest.TestCase):
        res = self.run_task(None, enrollments.encode('utf-8'), '2013-04-02', 2)

        self.assertEqual(res.loc[self.row_label('enrollment_change')]['2013-04-02'], 5)
+
+    def test_blacklist(self):
+        enrollments = """
+        course_1 2013-01-02 1
+        course_2 2013-01-02 2
+        course_3 2013-01-02 4
+        course_2 2013-01-09 1
+        course_3 2013-01-15 2
+        """
+        blacklist = """
+        course_1
+        course_2
+        """
+        res = self.run_task(None, enrollments, '2013-01-15', 2, blacklist=blacklist)
+        self.assertEqual(res.loc[self.row_label('enrollment_change')]['2013-01-08'], 4)
+        self.assertEqual(res.loc[self.row_label('enrollment_change')]['2013-01-15'], 2)
--- a/edx/analytics/tasks/reports/tests/test_total_enrollments.py
+++ b/edx/analytics/tasks/reports/tests/test_total_enrollments.py
@@ -23,7 +23,7 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
        self.enrollment_label = WeeklyAllUsersAndEnrollments.ROW_LABELS['enrollments']
        self.registrations_label = WeeklyAllUsersAndEnrollments.ROW_LABELS['registrations']

-    def run_task(self, registrations, enrollments, date, weeks, offset=None, history=None):
+    def run_task(self, registrations, enrollments, date, weeks, offset=None, history=None, blacklist=None):
        """
        Run task with fake targets.

@@ -41,7 +41,8 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
            destination='fake_destination',
            date=parsed_date,
            weeks=weeks,
-            credentials=None
+            credentials=None,
+            blacklist=blacklist
        )

        # Mock the input and output targets
@@ -76,6 +77,10 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
        if history:
            input_targets.update({'history': FakeTarget(reformat(history))})

+        # Mock blacklist only if specified.
+        if blacklist:
+            input_targets.update({'blacklist': FakeTarget(reformat(blacklist))})
+
        task.input = MagicMock(return_value=input_targets)

        output_target = FakeTarget()
@@ -216,6 +221,22 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
        self.assertEqual(total_enrollment['2013-03-21'], 22)
        self.assertEqual(total_enrollment['2013-03-28'], 22)

+    def test_blacklist(self):
+        enrollments = """
+        course_1 2013-01-02 1
+        course_2 2013-01-02 2
+        course_3 2013-01-02 4
+        course_2 2013-01-09 1
+        course_3 2013-01-15 2
+        """
+        blacklist = """
+        course_1
+        course_2
+        """
+        res = self.run_task('', enrollments, '2013-01-15', 2, blacklist=blacklist)
+        self.assertEqual(res.loc[self.enrollment_label]['2013-01-08'], 4)
+        self.assertEqual(res.loc[self.enrollment_label]['2013-01-15'], 6)
+
    def test_unicode(self):
        course_id = u'course_\u2603'


--- a/edx/analytics/tasks/reports/total_enrollments.py
+++ b/edx/analytics/tasks/reports/total_enrollments.py
@@ -68,18 +68,38 @@ class AllCourseEnrollmentCountMixin(CourseEnrollmentCountMixin):
        """
        return self.read_date_count_tsv(input_file).interpolate(method='time')

-    def filter_duplicate_courses(self, daily_enrollment_totals):
-        # TODO: implement this for real.  (This is just a placeholder.)
-        # At this point we should remove data for courses that are
-        # represented by other courses, because the students have been
-        # moved to the new course.  Perhaps this should actually
-        # perform a merge of the two courses, since we would want the
-        # history of one before the move date, and the history of the
-        # second after that date.
-
-        # Note that this is not the same filtering that would be applied
-        # to the EnrollmentsByWeek report.
-        pass
+    def read_course_blacklist(self):
+        """
+        Reads a set of course_ids from the blacklist input file if one was
+        specified, otherwise returns an empty set.
+
+        Expected input file format is a single course ID per line.
+
+        Returns:
+            A set of course_ids that should not be included in aggregates.
+        """
+        if self.input().get('blacklist'):
+            with self.input()['blacklist'].open('r') as blacklist_file:
+                data = read_tsv(blacklist_file, ['course_id'])
+            return set(data['course_id'])
+        else:
+            return set()
+
+    def filter_out_courses(self, course_data, course_blacklist):
+        """
+        Removes data for courses that should be excluded from aggregates.
+
+        Args:
+            course_data (pandas.DataFrame): A DataFrame containing a single
+                column for each course.
+            course_blacklist (iterable): A collection of course IDs to
+                remove from the data table.
+
+        Returns:
+            None, the `course_data` is modified in place.
+        """
+        # Drop from axis 1 because we are dropping columns, not rows.
+        course_data.drop(course_blacklist, axis=1, inplace=True)

    def save_output(self, results, output_file):
        """
@@ -146,6 +166,7 @@ class WeeklyAllUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountMixin):
    date = luigi.DateParameter()
    weeks = luigi.IntParameter(default=52)
    credentials = luigi.Parameter()
+    blacklist = luigi.Parameter(default=None)

    ROW_LABELS = {
        'header': 'name',
@@ -183,6 +204,8 @@ class WeeklyAllUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountMixin):
            results.update({'offsets': ExternalURL(self.offsets)})
        if self.history:
            results.update({'history': ExternalURL(self.history)})
+        if self.blacklist:
+            results.update({'blacklist': ExternalURL(self.blacklist)})

        return results

@@ -203,9 +226,8 @@ class WeeklyAllUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountMixin):
        offsets = self.read_offsets()
        daily_enrollment_totals = self.calculate_total_enrollment(daily_enrollment_changes, offsets)

-        # Remove (or merge or whatever) data for courses that
-        # would otherwise result in duplicate counts.
-        self.filter_duplicate_courses(daily_enrollment_totals)
+        course_blacklist = self.read_course_blacklist()
+        self.filter_out_courses(daily_enrollment_totals, course_blacklist)

        # Sum per-course counts to create a single series
        # of total enrollment counts per day.