Commit d0307a38 by Brian Wilson, committed by Gerrit Code Review

Merge "Implement skeleton version of Total Enrollment and Users report."

parents 8b555648 2365e97e
"""Tests for Total Users and Enrollment report."""
from contextlib import contextmanager
import datetime
import textwrap
from StringIO import StringIO
from unittest import TestCase
import luigi
import luigi.hdfs
from mock import MagicMock
from numpy import isnan
import pandas
from edx.analytics.tasks.reports.total_enrollments import TotalUsersAndEnrollmentsByWeek, TOTAL_ENROLLMENT_ROWNAME
class FakeTarget(object):
    """
    Fake luigi-like target that saves data in memory, using a
    StringIO buffer.
    """
    def __init__(self, value=''):
        self.buffer = StringIO(value)
        # Rewind the buffer head so the value can be read.
        self.buffer.seek(0)

    @contextmanager
    def open(self, *args, **kwargs):
        yield self.buffer
        # Rewind the head for easy reading.
        self.buffer.seek(0)
class TestTotalUsersAndEnrollmentsByWeek(TestCase):
    """Tests for TotalUsersAndEnrollmentsByWeek class."""

    def run_task(self, source, date, weeks, offset=None, history=None):
        """
        Run task with fake targets.

        Returns:
            The task output as a pandas dataframe.
        """
        parsed_date = datetime.datetime.strptime(date, '%Y-%m-%d').date()

        # Pass placeholder URLs for offsets and history only if they were
        # specified; otherwise leave those parameters as None.
        task = TotalUsersAndEnrollmentsByWeek(
            source='fake_source',
            offsets='fake_offsets' if offset else None,
            history='fake_history' if history else None,
            destination='fake_destination',
            date=parsed_date,
            weeks=weeks
        )

        # Mock the input and output targets.
        def reformat(string):
            """Reformat a string to make it look like a hadoop TSV."""
            return textwrap.dedent(string).strip().replace(' ', '\t')

        input_targets = {
            'source': FakeTarget(reformat(source)),
        }

        # Mock offsets only if specified.
        if offset:
            input_targets.update({'offsets': FakeTarget(reformat(offset))})

        # Mock history only if specified.
        if history:
            input_targets.update({'history': FakeTarget(reformat(history))})

        task.input = MagicMock(return_value=input_targets)

        output_target = FakeTarget()
        task.output = MagicMock(return_value=output_target)

        # Run the task and parse the output into a pandas dataframe.
        task.run()
        data = output_target.buffer.read()
        result = pandas.read_csv(StringIO(data),
                                 na_values=['-'],
                                 index_col='name')

        return result
    def test_parse_source(self):
        source = """
            course_1 2013-01-01 10
            course_1 2013-01-02 10
            course_1 2013-01-03 10
            course_1 2013-01-09 10
            course_1 2013-01-17 10
            course_2 2013-01-01 10
            course_3 2013-01-01 10
        """
        res = self.run_task(source, '2013-01-17', 3)

        # self.assertEqual(set(['name']), set(res.index))
        self.assertEqual(set(['2013-01-03', '2013-01-10', '2013-01-17']),
                         set(res.columns))
        self.assertEqual(res.loc[TOTAL_ENROLLMENT_ROWNAME]['2013-01-03'], 50)
        self.assertEqual(res.loc[TOTAL_ENROLLMENT_ROWNAME]['2013-01-10'], 60)
        self.assertEqual(res.loc[TOTAL_ENROLLMENT_ROWNAME]['2013-01-17'], 70)

    def test_week_grouping(self):
        source = """
            course_1 2013-01-06 10
            course_1 2013-01-14 10
        """
        res = self.run_task(source, '2013-01-21', 4)

        weeks = set(['2012-12-31', '2013-01-07', '2013-01-14', '2013-01-21'])
        self.assertEqual(weeks, set(str(w) for w in res.columns))

        total_enrollment = res.loc[TOTAL_ENROLLMENT_ROWNAME]
        self.assertTrue(isnan(total_enrollment['2012-12-31']))  # no data
        self.assertEqual(total_enrollment['2013-01-07'], 10)
        self.assertEqual(total_enrollment['2013-01-14'], 20)
        self.assertTrue(isnan(total_enrollment['2013-01-21']))  # no data

    def test_cumulative(self):
        source = """
            course_1 2013-02-01 4
            course_1 2013-02-04 4
            course_1 2013-02-08 5
            course_1 2013-02-12 -4
            course_1 2013-02-16 6
            course_1 2013-02-18 6
            course_2 2013-02-12 2
            course_2 2013-02-14 3
            course_2 2013-02-15 -2
        """
        res = self.run_task(source, '2013-02-18', 2)

        total_enrollment = res.loc[TOTAL_ENROLLMENT_ROWNAME]
        self.assertEqual(total_enrollment['2013-02-11'], 13)
        self.assertEqual(total_enrollment['2013-02-18'], 24)

    def test_offsets(self):
        source = """
            course_1 2013-03-01 1
            course_1 2013-03-30 2
            course_2 2013-03-07 1
            course_2 2013-03-08 1
            course_2 2013-03-10 1
            course_2 2013-03-13 1
            course_3 2013-03-15 1
            course_3 2013-03-18 1
            course_3 2013-03-19 1
        """
        offset = """
            course_2 2013-03-07 8
            course_3 2013-03-15 6
        """
        res = self.run_task(source, '2013-03-28', 4, offset=offset)

        total_enrollment = res.loc[TOTAL_ENROLLMENT_ROWNAME]
        self.assertEqual(total_enrollment['2013-03-07'], 10)
        self.assertEqual(total_enrollment['2013-03-14'], 13)
        self.assertEqual(total_enrollment['2013-03-21'], 22)
        self.assertEqual(total_enrollment['2013-03-28'], 22)

    def test_unicode(self):
        course_id = u'course_\u2603'
        source = u"""
            {course_id} 2013-04-01 1
            {course_id} 2013-04-02 1
        """.format(course_id=course_id)
        res = self.run_task(source.encode('utf-8'), '2013-04-02', 1)

        self.assertEqual(res.loc[TOTAL_ENROLLMENT_ROWNAME]['2013-04-02'], 2)

    def test_task_urls(self):
        date = datetime.date(2013, 1, 20)
        task = TotalUsersAndEnrollmentsByWeek(source='s3://bucket/path/',
                                              offsets='s3://bucket/file.txt',
                                              destination='file://path/file.txt',
                                              date=date)

        requires = task.requires()

        source = requires['source'].output()
        self.assertIsInstance(source, luigi.hdfs.HdfsTarget)
        self.assertEqual(source.format, luigi.hdfs.PlainDir)

        offsets = requires['offsets'].output()
        self.assertIsInstance(offsets, luigi.hdfs.HdfsTarget)
        self.assertEqual(offsets.format, luigi.hdfs.Plain)

        destination = task.output()
        self.assertIsInstance(destination, luigi.File)
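
# The tests above are plain unittest.TestCase tests, so any standard runner
# should work. A minimal sketch, assuming the test module is discoverable
# from the current directory:
#
#   python -m unittest discover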
"""Total Enrollment related reports"""
import csv
import luigi
import luigi.hdfs
import numpy
import pandas
from edx.analytics.tasks.url import ExternalURL, get_target_from_url
from edx.analytics.tasks.reports.enrollments import CourseEnrollmentCountMixin
ROWNAME_HEADER = 'name'
TOTAL_ENROLLMENT_ROWNAME = 'Total Enrollment'
class TotalUsersAndEnrollmentsByWeek(luigi.Task, CourseEnrollmentCountMixin):
    """
    Calculates total users and enrollments across all (known) courses per week.

    Parameters:
        source: Location of daily enrollments per date. The format is a
            TSV file, with fields course_id, date and count.
        destination: Location of the resulting report. The output format
            is an Excel-compatible CSV file.
        history: Location of historical values for total course enrollment.
            The format is a TSV file, with fields "date" and "enrollments".
        offsets: Location of seed values for each course. The format is a
            Hadoop TSV file, with fields "course_id", "date" and "offset".
        date: End date of the last week requested.
        weeks: Number of weeks (ending with the end date) to request.

    Output:
        Excel-compatible CSV file with a header row and two non-header
        rows. The first column is a title for the row, and subsequent
        columns are the total counts for each week requested. The
        first non-header row contains the total users at the end of
        each week; the second contains the total course enrollments
        at the end of each week.
    """
    # TODO: add the first (total users) row later, when we have access to
    # total user counts (e.g. queried or reconstructed from a production
    # database).

    source = luigi.Parameter()
    destination = luigi.Parameter()
    offsets = luigi.Parameter(default=None)
    history = luigi.Parameter(default=None)
    date = luigi.DateParameter()
    weeks = luigi.IntParameter(default=52)

    def requires(self):
        results = {'source': ExternalURL(self.source)}
        if self.offsets:
            results.update({'offsets': ExternalURL(self.offsets)})
        if self.history:
            results.update({'history': ExternalURL(self.history)})
        return results

    def output(self):
        return get_target_from_url(self.destination)

    def run(self):
        # Load the explicit enrollment data into a pandas dataframe.
        daily_enrollment_changes = self.read_source()

        # Add enrollment offsets so that totals can be calculated
        # from the explicit enrollment changes.
        offsets = self.read_offsets()
        daily_enrollment_totals = self.calculate_total_enrollment(daily_enrollment_changes, offsets)

        # Remove (or merge) data for courses that would otherwise
        # result in duplicate counts.
        self.filter_duplicate_courses(daily_enrollment_totals)

        # Sum per-course counts to create a single series
        # of total enrollment counts per day.
        daily_overall_enrollment = daily_enrollment_totals.sum(axis=1)
        daily_overall_enrollment.name = TOTAL_ENROLLMENT_ROWNAME

        # Prepend total enrollment history.
        overall_enrollment_history = self.read_history()
        if overall_enrollment_history is not None:
            self.prepend_history(daily_overall_enrollment, overall_enrollment_history)

        # TODO: get user counts, as another series.
        # TODO: combine the two series into a single DataFrame, indexed by date.

        # For now, put the single series into a data frame, so that
        # it can be sampled and output in a consistent way.
        total_counts_by_day = pandas.DataFrame(daily_overall_enrollment)

        # Select values from the DataFrame to display per week.
        total_counts_by_week = self.select_weekly_values(
            total_counts_by_day,
            self.date,
            self.weeks,
        )

        with self.output().open('w') as output_file:
            self.save_output(total_counts_by_week, output_file)
    def read_source(self):
        """
        Read the source into a pandas DataFrame.

        Returns:
            Pandas dataframe with one column per course_id, indexed
            for the time interval available in the source data.
        """
        with self.input()['source'].open('r') as input_file:
            course_date_count_data = self.read_course_date_count_tsv(input_file)
            data = self.initialize_daily_count(course_date_count_data)
        return data

    def read_offsets(self):
        """
        Read offsets into a pandas DataFrame.

        Returns:
            Pandas dataframe with one row per course_id and
            columns for the date and count of the offset.

            Returns None if no offsets were specified.
        """
        data = None
        if self.input().get('offsets'):
            with self.input()['offsets'].open('r') as offset_file:
                data = self.read_course_date_count_tsv(offset_file)
        return data

    def read_history(self):
        """
        Read total course enrollment history into a pandas DataFrame.

        Returns:
            Pandas Series, indexed by date, containing total
            enrollment counts by date.

            Returns None if no history was specified.
        """
        # TODO: implement this for real. (This is just a placeholder.)
        data = None
        if self.input().get('history'):
            with self.input()['history'].open('r') as history_file:
                # TODO: read the input file and convert it to a Series.
                pass
        return data

    def prepend_history(self, count_by_day, history):
        """
        Add history to a series in-place.

        Args:
            count_by_day: pandas Series of counts indexed by date.
            history: pandas Series, also of counts indexed by date.
        """
        # TODO: implement this for real. (This is just a placeholder.)
        # Check that an entry doesn't already exist in count_by_day
        # before adding a value from history.
        # For gaps in history, values should be extrapolated.
        # May also need to reindex, since new dates are being added.
        pass

    def filter_duplicate_courses(self, daily_enrollment_totals):
        """Remove (or merge) courses that would otherwise be double-counted."""
        # TODO: implement this for real. (This is just a placeholder.)
        # At this point we should remove data for courses that are
        # represented by other courses, because the students have been
        # moved to the new course. Perhaps this should actually
        # perform a merge of the two courses, since we would want the
        # history of one before the move date, and the history of the
        # second after that date.
        # Note that this is not the same filtering that would be applied
        # to the EnrollmentsByWeek report.
        pass

    def save_output(self, results, output_file):
        """
        Write output to a CSV file.

        Args:
            results: a pandas DataFrame object containing series data
                per row to be output.
            output_file: a file-like object to which the CSV is written.
        """
        # Transpose the dataframe so that weeks are columns, and output:
        results = results.transpose()

        # List of fieldnames for the report.
        fieldnames = [ROWNAME_HEADER] + list(results.columns)
        writer = csv.DictWriter(output_file, fieldnames)
        writer.writerow(dict((k, k) for k in fieldnames))  # Write header.

        def format_counts(counts_dict):
            """Replace NaN counts with a dash, and convert the rest to int."""
            for key, value in counts_dict.iteritems():
                yield key, '-' if numpy.isnan(value) else int(value)

        for series_name, series in results.iterrows():
            values = {ROWNAME_HEADER: series_name}
            by_week_values = format_counts(series.to_dict())
            values.update(by_week_values)
            writer.writerow(values)
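
# A minimal usage sketch (the paths below are hypothetical, and in practice
# the task would typically be launched via luigi's command-line machinery
# rather than built directly in code):
if __name__ == '__main__':
    import datetime

    example_task = TotalUsersAndEnrollmentsByWeek(
        source='s3://bucket/enrollments/',   # daily per-course TSV counts
        offsets=None,                        # no seed values in this sketch
        history=None,                        # no enrollment history either
        destination='file:///tmp/total_enrollment.csv',
        date=datetime.date(2013, 1, 20),
        weeks=4,
    )
    luigi.build([example_task], local_scheduler=True)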
"""Helpers for reading TSV files."""
import csv
import pandas
def read_tsv(input_file, names):
"""
Reads a tab-separated file into a DataFrame.
Args:
input_file (str): Path to the input file.
names (list): The names of the columns in the input file.
Returns:
A pandas DataFrame read from the file contents of the file.
"""
return pandas.read_csv(
input_file,
names=names,
quoting=csv.QUOTE_NONE,
encoding=None,
delimiter='\t'
)
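
# A minimal usage sketch for the helper above (the file name and column
# names are illustrative only):
if __name__ == '__main__':
    frame = read_tsv('example_counts.tsv', ['course_id', 'date', 'count'])
    print frame.head()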