Commit c1164ce5 by Brian Wilson

Add tasks to calculate course enrollment.

parent 80b4b413
# .coveragerc for analytics-tasks
[run]
data_file = .coverage
source = edx/analytics
[report]
ignore_errors = True
[html]
title = Analytics-Tasks Python Test Coverage Report
directory = report
[xml]
output = coverage.xml
[pep8]
ignore=E501
\ No newline at end of file
......@@ -11,3 +11,20 @@ requirements:
test-requirements: requirements
pip install -r requirements/test.txt
test: test-requirements
rm -rf .coverage
python -m coverage run --rcfile=./.coveragerc `which nosetests`
coverage: test
coverage html
coverage xml -o coverage.xml
diff-cover coverage.xml --html-report diff_cover.html
# Compute quality
diff-quality --violations=pep8 --html-report diff_quality_pep8.html
diff-quality --violations=pylint --html-report diff_quality_pylint.html
# Compute style violations
pep8 > pep8.report || echo "Not pep8 clean"
pylint -f parseable edx > pylint.report || echo "Not pylint clean"
"""
Luigi tasks for extracting course enrollment statistics from tracking log files.
Example command lines:
(local)
python course_enroll.py --local-scheduler CourseEnrollmentTotalsPerDay
--name mytest --src input --include 'tracking*' --include '2012*'
--dest output7
(local using s3)
python course_enroll.py --local-scheduler CourseEnrollmentTotalsPerDay
--name mytest --src s3://edx-analytics-test-data/data --include 'tracking*'
--dest s3://edx-analytics-scratch/output
"""
import luigi
import luigi.hadoop
import luigi.s3
import luigi.hdfs
import edx.analytics.util.eventlog as eventlog
from edx.analytics.tasks.pathutil import get_target_for_url, PathSetTask
################################
# Task Map-Reduce definitions
################################
class BaseCourseEnrollmentEventsPerDay(luigi.hadoop.JobTask):
    """Calculates daily change in enrollment for a user in a course, given raw event log input."""

    def get_implicit_enrollment_output(self, item):
        """
        Generates output values for implicit enrollment events.

        Args:
            item: event dict parsed from a tracking log line.

        Output format: (course_id, username), (datetime, action_value)
            where action_value = 1 (enrolled) or -1 (unenrolled)

        Returns None if the enrollment event on the line is not valid.
        """
        event_data = eventlog.get_event_data(item)
        if event_data is None:
            # Assume it's already logged (and with more specifics).
            return None

        # The args are part of the POST request.
        # Guard against a missing POST dict rather than crashing with KeyError.
        if 'POST' not in event_data:
            eventlog.log_item("encountered implicit enrollment event with no POST args", item)
            return None
        post_args = event_data['POST']

        # The course_id is stored in a list, so just take the first value:
        if 'course_id' not in post_args:
            eventlog.log_item("encountered event with no course_id in post args", item)
            return None
        course_id = post_args['course_id'][0]
        if len(course_id) == 0:
            eventlog.log_item("encountered event with zero-length course_id in post args", item)
            return None

        # This is a hack, due to a bug in luigi/hadoop.py:
        # In JobTask.writer(), it calls "\t".join(map(str, flatten(output)))
        # which returns a UnicodeEncodeError when output contains non-ascii characters.
        # For now, just log and skip such course_ids.  Create a separate story in future
        # to make sure that Luigi handles non-ascii characters in general.
        try:
            str(course_id)
        except UnicodeEncodeError:
            eventlog.log_item("encountered event with non-ascii course_id in post args", item)
            return None

        # The value of action is expected to be 'enroll' or 'unenroll', but is
        # stored in a list.  We just take the first value (but log if there are more).
        if 'enrollment_action' not in post_args:
            eventlog.log_item("encountered event with no enrollment_action in post args", item)
            return None
        actions = post_args['enrollment_action']
        if len(actions) != 1:
            eventlog.log_item("encountered event with multiple enrollment_actions in post args", item, "WARNING")
        action = actions[0]
        if action == 'enroll':
            action_value = 1
        elif action == 'unenroll':
            action_value = -1
        else:
            eventlog.log_item("encountered event with unrecognized value for enrollment_action in post args", item, "WARNING")
            return None

        # get additional data: timestamp and username:
        timestamp = eventlog.get_timestamp(item)
        if timestamp is None:
            # bad format?
            eventlog.log_item("encountered event with bad timestamp", item)
            return None
        if 'username' not in item:
            # bad format?
            eventlog.log_item("encountered implicit enrollment event with no username", item, "WARNING")
            return None
        username = item['username']
        return (course_id, username), (eventlog.get_datetime_string(timestamp), action_value)

    def get_explicit_enrollment_output(self, item, event_type):
        """
        Generates output values for explicit enrollment events.

        Args:
            item: event dict parsed from a tracking log line.
            event_type: either 'edx.course.enrollment.activated' or
                'edx.course.enrollment.deactivated'.

        Output format: (course_id, username), (datetime, action_value)
            where action_value = 1 (enrolled) or -1 (unenrolled)

        Returns None if the enrollment event on the line is not valid.
        """
        # convert the type to a value:
        if event_type == 'edx.course.enrollment.activated':
            action_value = 1
        elif event_type == 'edx.course.enrollment.deactivated':
            action_value = -1
        else:
            # Defensive: without this branch an unexpected event_type would
            # leave action_value unbound and raise UnboundLocalError below.
            eventlog.log_item("encountered unexpected event_type for explicit enrollment", item)
            return None

        # Data is stored in the context, but it's also in the data.
        # Pick one.
        event_data = eventlog.get_event_data(item)
        if event_data is None:
            # Assume it's already logged (and with more specifics).
            return None
        # Guard against a missing course_id rather than crashing with KeyError.
        if 'course_id' not in event_data:
            eventlog.log_item("encountered explicit enrollment event with no course_id", item)
            return None
        course_id = event_data['course_id']
        # for now, ignore the enrollment 'mode' (e.g. 'honor')

        # get additional data:
        timestamp = eventlog.get_timestamp(item)
        if timestamp is None:
            # bad format?
            eventlog.log_item("encountered event with bad timestamp", item)
            return None

        # there is also a user_id in the event_data, but who knows if
        # it's the same as the username?  But for old events, we don't have
        # such a user_id, and I don't think we're planning on loading such a mapping.
        if 'username' not in item:
            # bad format?
            eventlog.log_item("encountered explicit enrollment event with no username", item)
            return None
        username = item['username']
        return (course_id, username), (eventlog.get_datetime_string(timestamp), action_value)

    def get_enrollment_event(self, line):
        """
        Generates output values for enrollment events (implicit or explicit).

        Output format: (course_id, username), (datetime, action_value)
            where action_value = 1 (enrolled) or -1 (unenrolled)

        Returns None if there is no enrollment event on the line.
        """
        # Before parsing, check that the line contains something that
        # suggests it's an enrollment event.
        if 'edx.course.enrollment' not in line and '/change_enrollment' not in line:
            return None

        # try to parse the line into a dict:
        item = eventlog.parse_eventlog_item(line)
        if item is None:
            # The line didn't parse.  For this specific purpose,
            # we can assume that all enrollment-related lines would parse,
            # and these non-parsing lines would get skipped anyway.
            return None

        # get event type, and check that it exists:
        event_type = item.get('event_type')
        if event_type is None:
            eventlog.log_item("encountered event with no event_type", item)
            return None

        # check if it is an 'explicit' enrollment event:
        if (event_type == 'edx.course.enrollment.activated' or
                event_type == 'edx.course.enrollment.deactivated'):
            return self.get_explicit_enrollment_output(item, event_type)

        # check if it is an 'implicit' enrollment event:
        if event_type == '/change_enrollment':
            return self.get_implicit_enrollment_output(item)

        # Not an enrollment event...
        return None

    def mapper(self, line):
        """
        Emit one record per valid enrollment event found on the line.

        Output format: (course_id, username), (datetime, action_value)
            where action_value = 1 (enrolled) or -1 (unenrolled)

        Example:
            edX/DemoX/Demo_Course dummyuser 2013-09-10 1
        """
        parsed_tuple = self.get_enrollment_event(line)
        if parsed_tuple is not None:
            yield parsed_tuple

    def reducer(self, key, values):
        """
        Calculate status for each user on the end of each day where they changed their status.

        Output key: (course_id, date)
        Output value: net enrollment change on that date for an individual user.
            Expected values are -1, 0 (no change), 1

        Note that we don't bother to actually output the username,
        since it's not needed downstream.

        If the user were already enrolled (or attempted enrollment),
        the net change from a subsequent enrollment is zero.  Same to
        unenroll after an unenroll.  This is true whether they occur
        on the same day or on widely disparate days.  For implicit
        enrollment events, we don't know when they succeed, so we
        assume they succeed the first time, and ignore subsequent
        attempts.  Likewise for implicit enrollment events followed by
        explicit enrollment events.

        An unenroll following an enroll on the same day will also
        result in zero change.
        """
        course_id, username = key
        # Sort by datetime string (ISO format sorts chronologically).
        sorted_values = sorted(values)
        prev_date = None
        prev_change = 0
        net_change = 0
        for (datetime, change_value) in sorted_values:
            # get the day's date from the event timestamp:
            this_date = eventlog.get_date_from_datetime(datetime)
            # if the date is different, then output the previous date:
            if this_date != prev_date and prev_date is not None:
                yield (course_id, prev_date), net_change
                net_change = 0
            # accumulate the new numbers:
            prev_date = this_date
            # A repeat of the same action (enroll after enroll, unenroll
            # after unenroll) contributes no additional net change.
            if change_value != prev_change:
                net_change += change_value
                prev_change = change_value
        if prev_date is not None:
            yield (course_id, prev_date), net_change
class BaseCourseEnrollmentChangesPerDay(luigi.hadoop.JobTask):
    """Calculates daily changes in enrollment, given per-user net changes by date."""

    def mapper(self, line):
        """
        Re-key tab-separated input for summing per (course, date).

        Output key: (course_id, date)
        Output value: net enrollment change on that date for an individual user.
            Expected values are -1, 0 (no change), 1

        Lines that do not have exactly three fields are skipped.
        """
        fields = line.split('\t')
        if len(fields) == 3:
            course_id, date, change = fields
            yield (course_id, date), change

    def reducer(self, key, values):
        """
        Sum enrollment changes for a given course on a particular date.

        Inputs are enrollment changes on a day due to a specific user.
        Outputs are enrollment changes on a day summed across all users.

        Output key: (course_id, date)
        Output value: sum(changes)
        """
        yield key, sum(int(value) for value in values)
class BaseCourseEnrollmentTotalsPerDay(luigi.hadoop.JobTask):
    """Calculates cumulative changes in enrollment, given net changes by date."""

    def mapper(self, line):
        """
        Re-key tab-separated input for accumulation per course.

        Output key: course_id
        Output value: (date, net enrollment change on that date)

        Lines that do not have exactly three fields are skipped.
        """
        fields = line.split('\t')
        if len(fields) == 3:
            course_id, date, change = fields
            yield course_id, (date, change)

    def reducer(self, key, values):
        """
        Accumulate enrollments for a given course through each date.

        Output key: course_id
        Output value: date, accum(changes)
        """
        running_total = 0
        for date, count in sorted(values):
            running_total += int(count)
            yield key, date, running_total
##################################
# Task requires/output definitions
##################################
class CourseEnrollmentEventsPerDay(BaseCourseEnrollmentEventsPerDay):
    """Binds the per-user enrollment-event job to concrete inputs and outputs."""
    name = luigi.Parameter()
    src = luigi.Parameter()
    dest = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))
    run_locally = luigi.BooleanParameter()

    def requires(self):
        """Consume raw event logs selected from `src` by the `include` patterns."""
        return PathSetTask(self.src, self.include, self.run_locally)

    def output(self):
        """Write a single output file under `dest`, tagged with this job's `name`."""
        return get_target_for_url(
            self.dest,
            'course_enrollment_events_per_day_%s' % self.name,
            self.run_locally)

    def extra_modules(self):
        """Non-system modules that must be shipped with the job to Hadoop workers."""
        import cjson
        import boto
        import edx.analytics.util
        return [boto, edx.analytics.util, cjson]
class CourseEnrollmentChangesPerDay(BaseCourseEnrollmentChangesPerDay):
    """Binds the daily-enrollment-change job to concrete inputs and outputs."""
    name = luigi.Parameter()
    src = luigi.Parameter()
    dest = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))
    run_locally = luigi.BooleanParameter()

    def requires(self):
        """Consume the per-user output of the upstream enrollment-events job."""
        return CourseEnrollmentEventsPerDay(self.name, self.src, self.dest, self.include, self.run_locally)

    def output(self):
        """Write a single output file under `dest`, tagged with this job's `name`."""
        return get_target_for_url(
            self.dest,
            'course_enrollment_changes_per_day_%s' % self.name,
            self.run_locally)

    def extra_modules(self):
        """Non-system modules that must be shipped with the job to Hadoop workers."""
        import cjson
        import boto
        import edx.analytics.util
        return [boto, edx.analytics.util, cjson]
class CourseEnrollmentTotalsPerDay(BaseCourseEnrollmentTotalsPerDay):
    """Binds the cumulative-enrollment job to concrete inputs and outputs."""
    name = luigi.Parameter()
    src = luigi.Parameter()
    dest = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))
    run_locally = luigi.BooleanParameter()

    def requires(self):
        """Consume the per-day change output of the upstream job."""
        return CourseEnrollmentChangesPerDay(self.name, self.src, self.dest, self.include, self.run_locally)

    def output(self):
        """Write a single output file under `dest`, tagged with this job's `name`."""
        return get_target_for_url(
            self.dest,
            'course_enrollment_totals_per_day_%s' % self.name,
            self.run_locally)

    def extra_modules(self):
        """Non-system modules that must be shipped with the job to Hadoop workers."""
        import cjson
        import boto
        import edx.analytics.util
        return [boto, edx.analytics.util, cjson]
class FirstCourseEnrollmentEventsPerDay(BaseCourseEnrollmentEventsPerDay):
    """Variant of the enrollment-events job that keeps only each user's first enrollment."""
    name = luigi.Parameter()
    src = luigi.Parameter()
    dest = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))
    run_locally = luigi.BooleanParameter()

    def requires(self):
        """Consume raw event logs selected from `src` by the `include` patterns."""
        return PathSetTask(self.src, self.include, self.run_locally)

    def output(self):
        """Write a single output file under `dest`, tagged with this job's `name`."""
        return get_target_for_url(
            self.dest,
            'first_course_enrollment_events_per_day_%s' % self.name,
            self.run_locally)

    def extra_modules(self):
        """Non-system modules that must be shipped with the job to Hadoop workers."""
        import cjson
        import boto
        import edx.analytics.util
        return [boto, edx.analytics.util, cjson]

    def reducer(self, key, values):
        """
        Calculate first time each user enrolls in a course.

        Output key: (course_id, date)
        Output value: 1 on the first date the user enrolls.

        Note that we don't bother to actually output the username,
        since it's not needed downstream.
        """
        course_id, _username = key
        # Walk events in chronological order; emit only the first positive
        # change (an enroll) and stop.
        for datetime, change_value in sorted(values):
            if change_value > 0:
                yield (course_id, eventlog.get_date_from_datetime(datetime)), change_value
                return
class FirstCourseEnrollmentChangesPerDay(BaseCourseEnrollmentChangesPerDay):
    """Sums first-enrollment counts per course per day."""
    name = luigi.Parameter()
    src = luigi.Parameter()
    dest = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))
    run_locally = luigi.BooleanParameter()

    def requires(self):
        """Consume the per-user output of the first-enrollment events job."""
        return FirstCourseEnrollmentEventsPerDay(self.name, self.src, self.dest, self.include, self.run_locally)

    def output(self):
        """Write a single output file under `dest`, tagged with this job's `name`."""
        return get_target_for_url(
            self.dest,
            'first_course_enrollment_changes_per_day_%s' % self.name,
            self.run_locally)

    def extra_modules(self):
        """Non-system modules that must be shipped with the job to Hadoop workers."""
        import cjson
        import boto
        import edx.analytics.util
        return [boto, edx.analytics.util, cjson]
class FirstCourseEnrollmentTotalsPerDay(BaseCourseEnrollmentTotalsPerDay):
    """Accumulates first-enrollment counts per course across days."""
    name = luigi.Parameter()
    src = luigi.Parameter()
    dest = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))
    run_locally = luigi.BooleanParameter()

    def requires(self):
        """Consume the per-day output of the first-enrollment changes job."""
        return FirstCourseEnrollmentChangesPerDay(self.name, self.src, self.dest, self.include, self.run_locally)

    def output(self):
        """Write a single output file under `dest`, tagged with this job's `name`."""
        return get_target_for_url(
            self.dest,
            'first_course_enrollment_totals_per_day_%s' % self.name,
            self.run_locally)

    def extra_modules(self):
        """Non-system modules that must be shipped with the job to Hadoop workers."""
        import cjson
        import boto
        import edx.analytics.util
        return [boto, edx.analytics.util, cjson]
################################
# Running tasks
################################
def main():
    """Command-line entry point: hand control to the luigi scheduler/runner."""
    import argparse
    import boto
    import cjson
    # Attach third-party modules so they travel with the job to remote
    # Hadoop workers (presumably required because workers won't have them
    # installed -- TODO confirm against deployment setup).
    luigi.hadoop.attach(boto, argparse, cjson)
    luigi.run()


if __name__ == '__main__':
    main()
"""
Helper classes to specify file dependencies for input and output.
Supports inputs from S3 and local FS.
Supports outputs to HDFS, S3, and local FS.
"""
import os
import boto
import glob
from urlparse import urlparse
from fnmatch import fnmatch
import luigi
import luigi.s3
import luigi.hdfs
import luigi.format
def get_s3_bucket_key_names(url):
    """Extract (bucket_name, root_key) from an S3 URL, with surrounding slashes removed."""
    parsed = urlparse(url)
    bucket_name = parsed.netloc.strip('/')
    root = parsed.path.strip('/')
    return (bucket_name, root)
def join_as_s3_url(bucket, root, path):
    """Construct a URL for accessing S3, given its bucket, root key, and relative path."""
    return 's3://' + bucket + '/' + root + '/' + path
class LocalPathTask(luigi.ExternalTask):
    """
    An external task requiring the existence of a path on the local file system.

    Files whose names end with .gz are treated as Gzip files.
    """
    path = luigi.Parameter()

    def output(self):
        if not self.path.endswith('.gz'):
            yield luigi.LocalTarget(self.path)
        else:
            yield luigi.LocalTarget(self.path, format=luigi.format.Gzip)
class HdfsPathTask(luigi.ExternalTask):
    """An external task requiring the existence of a path in HDFS."""
    path = luigi.Parameter()

    def output(self):
        target = luigi.hdfs.HdfsTarget(self.path)
        return target
class PathSetTask(luigi.Task):
    """
    A task to select a subset of files in an S3 bucket or local FS.

    Parameters:
      src: a URL pointing to a folder in s3:// or local FS.
      include: a list of patterns to use to select.  Multiple patterns are OR'd.
      run_locally: if True, use S3PathTask instead of HdfsPathTask, to permit
          reading S3 data when running in local mode.
    """
    src = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))
    run_locally = luigi.BooleanParameter()

    def __init__(self, *args, **kwargs):
        super(PathSetTask, self).__init__(*args, **kwargs)
        self.s3 = boto.connect_s3()

    def requires(self):
        """Yield one external task per matching source file (S3 or local)."""
        if self.src.startswith('s3'):
            for bucket, root, path in self._generate_sources():
                source = join_as_s3_url(bucket, root, path)
                if self.run_locally:
                    yield luigi.s3.S3PathTask(source)
                else:
                    yield HdfsPathTask(source)
        else:
            filelist = []
            for include_val in self.include:
                glob_pattern = "{src}/{include}".format(src=self.src, include=include_val)
                filelist.extend(glob.glob(glob_pattern))
            for filepath in filelist:
                yield LocalPathTask(filepath)

    def complete(self):
        # An optimization: just declare that the task is always
        # complete, by definition, because it is whatever files were
        # requested that match the filter, not a set of files whose
        # existence needs to be checked or generated again.
        return True

    def output(self):
        return [task.output() for task in self.requires()]

    def _generate_sources(self):
        """Yield (bucket_name, root, relative_path) for each non-empty matching S3 key."""
        bucket_name, root = get_s3_bucket_key_names(self.src)
        bucket = self.s3.get_bucket(bucket_name)
        keys = (s.key for s in bucket.list(root) if s.size > 0)
        # Remove the root prefix from each key to get the path relative to root.
        # NOTE: the previous k.lstrip(root) was a bug -- lstrip() strips any
        # leading characters drawn from `root` as a character *set*, not the
        # prefix string, so it could mangle key names (e.g. root='a/b',
        # key='a/b/bar' -> 'r').  Slice the prefix off instead.
        paths = (k[len(root):].strip('/') for k in keys if k.startswith(root))
        paths = self._filter_matches(paths)
        return ((bucket.name, root, path) for path in paths)

    def _filter_matches(self, names):
        """Return only those names matching at least one of the include patterns."""
        patterns = self.include

        def matches(name):
            return any(fnmatch(name, pattern) for pattern in patterns)

        return (name for name in names if matches(name))
def get_target_for_url(dest, output_name, run_locally=False):
    """
    Generate an appropriate target for a given path, depending on protocol.

    Parameters:
      dest: a URL pointing to a folder in s3:// or hdfs:// or local FS.
      output_name: name of file to be output.
      run_locally: if True, use S3Target instead of HdfsTarget, to permit
          writing S3 data when running in local mode.
    """
    output_url = os.path.join(dest, output_name)
    if output_url.startswith('s3://'):
        if run_locally:
            return luigi.s3.S3Target(output_url)
        return luigi.hdfs.HdfsTarget(output_url)
    if output_url.startswith('hdfs://'):
        return luigi.hdfs.HdfsTarget(output_url)
    return luigi.LocalTarget(output_url)
"""Support for reading tracking event logs."""
import sys
import cjson
import datetime
import re
PATTERN_JSON = re.compile(r'^.*?(\{.*\})\s*$')
def get_datetime_string(timestamp):
    """Format a datetime as an ISO-8601-style string with seconds precision."""
    return '{0:%Y-%m-%dT%H:%M:%S}'.format(timestamp)
def get_date_string(timestamp):
    """Format the date portion of a datetime as YYYY-MM-DD."""
    return '{0:%Y-%m-%d}'.format(timestamp)
def get_date_from_datetime(datetime_string):
    """Return the date part (before the 'T') of an ISO-style datetime string."""
    return datetime_string.partition('T')[0]
def json_decode(line):
    """Wrapper to decode JSON string in implementation-independent way."""
    # Single point of indirection so callers don't depend on the JSON library
    # directly (cjson was presumably chosen for speed -- TODO confirm).
    # The exception type raised on bad input is implementation-defined.
    return cjson.decode(line)
def parse_eventlog_item(line, nested=False):
    """
    Parse a tracking log input line as JSON to create a dict representation.

    If the whole line fails to parse and this is the top-level call, retry on
    the trailing {...} portion of the line, to tolerate leading cruft (e.g.
    syslog prefixes).  Returns None if no valid JSON can be extracted.
    """
    try:
        parsed = json_decode(line)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed; the decoder's exception type is
        # implementation-defined, so Exception is the tightest safe catch.
        if not nested:
            json_match = PATTERN_JSON.match(line)
            if json_match:
                return parse_eventlog_item(json_match.group(1), nested=True)

        # Seem to be truncated in input data at 10000 for some log files, 2043 for others...
        # First filter out common ones:
        # if 'save_problem_check' not in line:
        #     sys.stderr.write("ERROR: encountered event with bad json: length = {len} start={start}\n".format(len=len(line), start=line[:40]))
        # Even that leaves too many to log.
        # TODO: Might be good going forward to collect stats on the length of truncation and the counts for
        # different event "names" (normalized event_type values).
        return None
    return parsed
def log_item(msg, item, level='ERROR'):
    """Write a leveled diagnostic message about an event item to stderr."""
    sys.stderr.write("%s: %s: %s\n" % (level, msg, item))
def get_timestamp(item):
    """
    Extract the event's 'time' field as a datetime, dropping fractional seconds.

    Returns None if the field is missing, not a string, or unparseable.
    """
    try:
        timestamp = item['time']
        # Strip fractional seconds (anything after the first '.') before parsing.
        removed_ms = timestamp.split('.')[0]
        return datetime.datetime.strptime(removed_ms, '%Y-%m-%dT%H:%M:%S')
    except (KeyError, TypeError, AttributeError, ValueError):
        # Narrowed from a bare except (which would also have swallowed
        # KeyboardInterrupt/SystemExit): missing key, non-dict item,
        # non-string time value, or a malformed timestamp.
        return None
def get_event_data(item):
    """
    Return the 'event' payload of an event item as a dict.

    If the payload is a string, it is decoded as JSON first.
    Returns None (after logging) if the payload is missing, unparsable,
    or not a dict.
    """
    event_value = item.get('event')
    if event_value is None:
        log_item("encountered event with missing event value", item)
        return None

    if isinstance(event_value, basestring):
        # If the value is a string, try to parse as JSON into a dict:
        try:
            event_value = json_decode(event_value)
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # are not swallowed; the decoder's exception type is
            # implementation-defined.
            log_item("encountered event with unparsable event value", item)
            return None

    if isinstance(event_value, dict):
        # It's fine, just return.
        return event_value
    else:
        log_item("encountered event data with unrecognized type", item)
        return None
"""
Tests for utilities that parse event logs.
"""
import unittest
import edx.analytics.util.eventlog as eventlog
class EventLogTest(unittest.TestCase):
    """
    Tests to verify that event log parsing works correctly.
    """
    def test_parse_valid_eventlog_item(self):
        parsed = eventlog.parse_eventlog_item('{"username": "successful"}')
        self.assertIsInstance(parsed, dict)

    def test_parse_eventlog_item_truncated(self):
        parsed = eventlog.parse_eventlog_item('{"username": "unsuccessful')
        self.assertIsNone(parsed)

    def test_parse_eventlog_item_with_cruft(self):
        parsed = eventlog.parse_eventlog_item('leading cruft here {"username": "successful"} ')
        self.assertIsInstance(parsed, dict)

    def test_parse_eventlog_item_with_nonascii(self):
        parsed = eventlog.parse_eventlog_item('{"username": "b\ufffdb"}')
        self.assertIsInstance(parsed, dict)
        self.assertEqual(parsed['username'], u'b\ufffdb')
[MASTER]
# Specify a configuration file.
#rcfile=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Profiled execution.
profile=no
# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=CVS, migrations
# Pickle collected data for later comparisons.
persistent=yes
# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=
[MESSAGES CONTROL]
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time.
#enable=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
# it should appear only once).
disable=
# Never going to use these
# I0011: Locally disabling W0232
# W0141: Used builtin function 'map'
# W0142: Used * or ** magic
# R0921: Abstract class not referenced
# R0922: Abstract class is only referenced 1 times
I0011,W0141,W0142,R0921,R0922,
# Django makes classes that trigger these
# W0232: Class has no __init__ method
W0232,
# Might use these when the code is in better shape
# C0302: Too many lines in module
# R0201: Method could be a function
# R0901: Too many ancestors
# R0902: Too many instance attributes
# R0903: Too few public methods (1/2)
# R0904: Too many public methods
# R0911: Too many return statements
# R0912: Too many branches
# R0913: Too many arguments
# R0914: Too many local variables
C0302,R0201,R0901,R0902,R0903,R0904,R0911,R0912,R0913,R0914
[REPORTS]
# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html
output-format=text
# Include message's id in output
include-ids=yes
# Put messages in a separate file for each module / package specified on the
# command line instead of printing them on stdout. Reports (if any) will be
# written in a file name "pylint_global.[txt|html]".
files-output=no
# Tells whether to display a full report or only the messages
reports=no
# Python expression which should return a note less than 10 (10 is the highest
# note). You have access to the variables errors warning, statement which
# respectively contain the number of errors / warnings messages and the total
# number of statements analyzed. This is used by the global evaluation report
# (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Add a comment according to your evaluation note. This is used by the global
# evaluation report (RP0004).
comment=no
[TYPECHECK]
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# List of classes names for which member attributes should not be checked
# (useful for classes with attributes dynamically set).
ignored-classes=SQLObject
# When zope mode is activated, add a predefined set of Zope acquired attributes
# to generated-members.
zope=no
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E0201 when accessed. Python regular
# expressions are accepted.
generated-members=
REQUEST,
acl_users,
aq_parent,
objects,
DoesNotExist,
can_read,
can_write,
get_url,
size,
content,
status_code,
# For factory_boy factories
create
[BASIC]
# Required attributes for module, separated by a comma
required-attributes=
# List of builtins function names that should not be used, separated by a comma
bad-functions=map,filter,apply,input
# Regular expression which should only match correct module names
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Regular expression which should only match correct module level names
const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__)|log|urlpatterns)$
# Regular expression which should only match correct class names
class-rgx=[A-Z_][a-zA-Z0-9]+$
# Regular expression which should only match correct function names
function-rgx=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match correct method names
method-rgx=([a-z_][a-z0-9_]{2,60}|setUp|set[Uu]pClass|tearDown|tear[Dd]ownClass|assert[A-Z]\w*)$
# Regular expression which should only match correct instance attribute names
attr-rgx=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match correct argument names
argument-rgx=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match correct variable names
variable-rgx=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match correct list comprehension /
# generator expression variable names
inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
# Good variable names which should always be accepted, separated by a comma
good-names=i,j,k,ex,Run,_
# Bad variable names which should always be refused, separated by a comma
bad-names=foo,bar,baz,toto,tutu,tata
# Regular expression which should only match functions or classes name which do
# not require a docstring
no-docstring-rgx=__.*__|test_.*|setUp|tearDown
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,XXX,TODO
[FORMAT]
# Maximum number of characters on a single line.
max-line-length=120
# Maximum number of lines in a module
max-module-lines=1000
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
# tab).
indent-string=' '
[SIMILARITIES]
# Minimum lines number of a similarity.
min-similarity-lines=4
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
[VARIABLES]
# Tells whether we should check for unused import in __init__ files.
init-import=no
# A regular expression matching the beginning of the name of dummy variables
# (i.e. not used).
dummy-variables-rgx=_|dummy|unused|.*_unused
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid to define new builtins when possible.
additional-builtins=
[IMPORTS]
# Deprecated modules which should not be used, separated by a comma
deprecated-modules=regsub,string,TERMIOS,Bastion,rexec
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=
[DESIGN]
# Maximum number of arguments for function / method
max-args=5
# Argument names that match this expression will be ignored. Default to name
# with leading underscore
ignored-argument-names=_.*
# Maximum number of locals for function / method body
max-locals=15
# Maximum number of return / yield for function / method body
max-returns=6
# Maximum number of branch for function / method body
max-branchs=12
# Maximum number of statements in function / method body
max-statements=50
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Minimum number of public methods for a class (see R0903).
min-public-methods=2
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
[CLASSES]
# List of interface methods to ignore, separated by a comma. This is used for
# instance to not check methods defines in Zope's Interface base class.
ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,__new__,setUp
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=Exception
......@@ -6,5 +6,6 @@ pbr==0.5.23
stevedore==0.13
tornado==3.1.1
ansible==1.4.4
python-cjson==1.0.5
-e git+https://github.com/spotify/luigi.git@a33756c781b9bf7e51384f0eb19d6a25050ef136#egg=luigi
nose
nose-ignore-docstring
coverage==3.7
pep8==1.4.5
pylint==0.28
diff-cover >= 0.2.1
......@@ -19,8 +19,10 @@ data_files =
console_scripts =
launch-task = edx.analytics.tasks.main:main
remote-task = edx.analytics.tasks.remote:main
edx.analytics.tasks =
s3-copy = edx.analytics.tasks.s3:S3Copy
s3-sync = edx.analytics.tasks.s3:S3Sync
sync-events = edx.analytics.tasks.eventlogs:SyncEventLogs
enrollments-report = edx.analytics.reports.enrollments:EnrollmentsByWeek
course-enroll = edx.analytics.tasks.course_enroll:CourseEnrollmentTotalsPerDay
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment