Commit c1164ce5 by Brian Wilson

Add tasks to calculate course enrollment.

parent 80b4b413
# .coveragerc for analytics-tasks
[run]
data_file = .coverage
source = edx/analytics
[report]
ignore_errors = True
[html]
title = Analytics-Tasks Python Test Coverage Report
directory = report
[xml]
output = coverage.xml
[pep8]
ignore=E501
\ No newline at end of file
......@@ -11,3 +11,20 @@ requirements:
test-requirements: requirements
pip install -r requirements/test.txt
test: test-requirements
rm -rf .coverage
python -m coverage run --rcfile=./.coveragerc `which nosetests`
coverage: test
coverage html
coverage xml -o coverage.xml
diff-cover coverage.xml --html-report diff_cover.html
# Compute quality
diff-quality --violations=pep8 --html-report diff_quality_pep8.html
diff-quality --violations=pylint --html-report diff_quality_pylint.html
# Compute style violations
pep8 > pep8.report || echo "Not pep8 clean"
pylint -f parseable edx > pylint.report || echo "Not pylint clean"
"""
Luigi tasks for extracting course enrollment statistics from tracking log files.
Example command lines:
(local)
python course_enroll.py --local-scheduler CourseEnrollmentTotalsPerDay
--name mytest --src input --include 'tracking*' --include '2012*'
--dest output7
(local using s3)
python course_enroll.py --local-scheduler CourseEnrollmentTotalsPerDay
--name mytest --src s3://edx-analytics-test-data/data --include 'tracking*'
--dest s3://edx-analytics-scratch/output
"""
import luigi
import luigi.hadoop
import luigi.s3
import luigi.hdfs
import edx.analytics.util.eventlog as eventlog
from edx.analytics.tasks.pathutil import get_target_for_url, PathSetTask
################################
# Task Map-Reduce definitions
################################
class BaseCourseEnrollmentEventsPerDay(luigi.hadoop.JobTask):
    """Calculates daily change in enrollment for a user in a course, given raw event log input."""

    def get_implicit_enrollment_output(self, item):
        """
        Generates output values for implicit enrollment events.

        Args:
            item: event dict parsed from a tracking log line.

        Output format: (course_id, username), (datetime, action_value)
            where action_value = 1 (enrolled) or -1 (unenrolled)

        Returns None if the enrollment event on the line is not valid.
        """
        event_data = eventlog.get_event_data(item)
        if event_data is None:
            # Assume it's already logged (and with more specifics).
            return None

        # The args are part of the POST request.
        # Guard against a missing POST dict rather than crashing with KeyError.
        if 'POST' not in event_data:
            eventlog.log_item("encountered implicit enrollment event with no POST args", item)
            return None
        post_args = event_data['POST']

        # The course_id is stored in a list, so just take the first value:
        if 'course_id' not in post_args:
            eventlog.log_item("encountered event with no course_id in post args", item)
            return None
        course_id = post_args['course_id'][0]
        if len(course_id) == 0:
            eventlog.log_item("encountered event with zero-length course_id in post args", item)
            return None

        # This is a hack, due to a bug in luigi/hadoop.py:
        # In JobTask.writer(), it calls "\t".join(map(str, flatten(output)))
        # which returns a UnicodeEncodeError when output contains non-ascii characters.
        # For now, just log and skip such course_ids.  Create a separate story in future
        # to make sure that Luigi handles non-ascii characters in general.
        try:
            str(course_id)
        except UnicodeEncodeError:
            eventlog.log_item("encountered event with non-ascii course_id in post args", item)
            return None

        # The value of action is expected to be 'enroll' or 'unenroll', but is
        # stored in a list.  We just take the first value (but log if there are more).
        if 'enrollment_action' not in post_args:
            eventlog.log_item("encountered event with no enrollment_action in post args", item)
            return None
        actions = post_args['enrollment_action']
        if len(actions) != 1:
            eventlog.log_item("encountered event with multiple enrollment_actions in post args", item, "WARNING")
        action = actions[0]
        if action == 'enroll':
            action_value = 1
        elif action == 'unenroll':
            action_value = -1
        else:
            eventlog.log_item("encountered event with unrecognized value for enrollment_action in post args", item, "WARNING")
            return None

        # get additional data: timestamp and username:
        timestamp = eventlog.get_timestamp(item)
        if timestamp is None:
            # bad format?
            eventlog.log_item("encountered event with bad timestamp", item)
            return None
        if 'username' not in item:
            # bad format?
            eventlog.log_item("encountered implicit enrollment event with no username", item, "WARNING")
            return None
        username = item['username']
        return (course_id, username), (eventlog.get_datetime_string(timestamp), action_value)

    def get_explicit_enrollment_output(self, item, event_type):
        """
        Generates output values for explicit enrollment events.

        Args:
            item: event dict parsed from a tracking log line.
            event_type: either 'edx.course.enrollment.activated' or
                'edx.course.enrollment.deactivated'.

        Output format: (course_id, username), (datetime, action_value)
            where action_value = 1 (enrolled) or -1 (unenrolled)

        Returns None if the enrollment event on the line is not valid.
        """
        # convert the type to a value:
        if event_type == 'edx.course.enrollment.activated':
            action_value = 1
        elif event_type == 'edx.course.enrollment.deactivated':
            action_value = -1
        else:
            # Defensive: without this branch an unexpected event_type would
            # leave action_value unbound and raise UnboundLocalError below.
            eventlog.log_item("encountered unexpected event_type for explicit enrollment", item)
            return None

        # Data is stored in the context, but it's also in the data.
        # Pick one.
        event_data = eventlog.get_event_data(item)
        if event_data is None:
            # Assume it's already logged (and with more specifics).
            return None
        # Guard against a missing course_id rather than crashing with KeyError.
        if 'course_id' not in event_data:
            eventlog.log_item("encountered explicit enrollment event with no course_id", item)
            return None
        course_id = event_data['course_id']
        # for now, ignore the enrollment 'mode' (e.g. 'honor')

        # get additional data:
        timestamp = eventlog.get_timestamp(item)
        if timestamp is None:
            # bad format?
            eventlog.log_item("encountered event with bad timestamp", item)
            return None

        # there is also a user_id in the event_data, but who knows if
        # it's the same as the username?  But for old events, we don't have
        # such a user_id, and I don't think we're planning on loading such a mapping.
        if 'username' not in item:
            # bad format?
            eventlog.log_item("encountered explicit enrollment event with no username", item)
            return None
        username = item['username']
        return (course_id, username), (eventlog.get_datetime_string(timestamp), action_value)

    def get_enrollment_event(self, line):
        """
        Generates output values for enrollment events (implicit or explicit).

        Output format: (course_id, username), (datetime, action_value)
            where action_value = 1 (enrolled) or -1 (unenrolled)

        Returns None if there is no enrollment event on the line.
        """
        # Before parsing, check that the line contains something that
        # suggests it's an enrollment event.
        if 'edx.course.enrollment' not in line and '/change_enrollment' not in line:
            return None

        # try to parse the line into a dict:
        item = eventlog.parse_eventlog_item(line)
        if item is None:
            # The line didn't parse.  For this specific purpose,
            # we can assume that all enrollment-related lines would parse,
            # and these non-parsing lines would get skipped anyway.
            return None

        # get event type, and check that it exists:
        event_type = item.get('event_type')
        if event_type is None:
            eventlog.log_item("encountered event with no event_type", item)
            return None

        # check if it is an 'explicit' enrollment event:
        if (event_type == 'edx.course.enrollment.activated' or
                event_type == 'edx.course.enrollment.deactivated'):
            return self.get_explicit_enrollment_output(item, event_type)

        # check if it is an 'implicit' enrollment event:
        if event_type == '/change_enrollment':
            return self.get_implicit_enrollment_output(item)

        # Not an enrollment event...
        return None

    def mapper(self, line):
        """
        Emit one record per valid enrollment event found on the line.

        Output format: (course_id, username), (datetime, action_value)
            where action_value = 1 (enrolled) or -1 (unenrolled)

        Example:
            edX/DemoX/Demo_Course dummyuser 2013-09-10 1
        """
        parsed_tuple = self.get_enrollment_event(line)
        if parsed_tuple is not None:
            yield parsed_tuple

    def reducer(self, key, values):
        """
        Calculate status for each user on the end of each day where they changed their status.

        Output key: (course_id, date)
        Output value: net enrollment change on that date for an individual user.
            Expected values are -1, 0 (no change), 1

        Note that we don't bother to actually output the username,
        since it's not needed downstream.

        If the user were already enrolled (or attempted enrollment),
        the net change from a subsequent enrollment is zero.  Same to
        unenroll after an unenroll.  This is true whether they occur
        on the same day or on widely disparate days.  For implicit
        enrollment events, we don't know when they succeed, so we
        assume they succeed the first time, and ignore subsequent
        attempts.  Likewise for implicit enrollment events followed by
        explicit enrollment events.

        An unenroll following an enroll on the same day will also
        result in zero change.
        """
        course_id, username = key
        # Sort by datetime string (ISO format sorts chronologically).
        sorted_values = sorted(values)
        prev_date = None
        prev_change = 0
        net_change = 0
        for (datetime, change_value) in sorted_values:
            # get the day's date from the event timestamp:
            this_date = eventlog.get_date_from_datetime(datetime)
            # if the date is different, then output the previous date:
            if this_date != prev_date and prev_date is not None:
                yield (course_id, prev_date), net_change
                net_change = 0
            # accumulate the new numbers:
            prev_date = this_date
            # A repeat of the same action (enroll after enroll, unenroll
            # after unenroll) contributes no additional net change.
            if change_value != prev_change:
                net_change += change_value
                prev_change = change_value
        if prev_date is not None:
            yield (course_id, prev_date), net_change
class BaseCourseEnrollmentChangesPerDay(luigi.hadoop.JobTask):
    """Calculates daily changes in enrollment, given per-user net changes by date."""

    def mapper(self, line):
        """
        Re-key tab-separated input for summing per (course, date).

        Output key: (course_id, date)
        Output value: net enrollment change on that date for an individual user.
            Expected values are -1, 0 (no change), 1

        Lines that do not have exactly three fields are skipped.
        """
        fields = line.split('\t')
        if len(fields) == 3:
            course_id, date, change = fields
            yield (course_id, date), change

    def reducer(self, key, values):
        """
        Sum enrollment changes for a given course on a particular date.

        Inputs are enrollment changes on a day due to a specific user.
        Outputs are enrollment changes on a day summed across all users.

        Output key: (course_id, date)
        Output value: sum(changes)
        """
        yield key, sum(int(value) for value in values)
class BaseCourseEnrollmentTotalsPerDay(luigi.hadoop.JobTask):
    """Calculates cumulative changes in enrollment, given net changes by date."""

    def mapper(self, line):
        """
        Re-key tab-separated input for accumulation per course.

        Output key: course_id
        Output value: (date, net enrollment change on that date)

        Lines that do not have exactly three fields are skipped.
        """
        fields = line.split('\t')
        if len(fields) == 3:
            course_id, date, change = fields
            yield course_id, (date, change)

    def reducer(self, key, values):
        """
        Accumulate enrollments for a given course through each date.

        Output key: course_id
        Output value: date, accum(changes)
        """
        running_total = 0
        for date, count in sorted(values):
            running_total += int(count)
            yield key, date, running_total
##################################
# Task requires/output definitions
##################################
class CourseEnrollmentEventsPerDay(BaseCourseEnrollmentEventsPerDay):
    """Binds the per-user enrollment-event job to concrete inputs and outputs."""
    name = luigi.Parameter()
    src = luigi.Parameter()
    dest = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))
    run_locally = luigi.BooleanParameter()

    def requires(self):
        """Consume raw event logs selected from `src` by the `include` patterns."""
        return PathSetTask(self.src, self.include, self.run_locally)

    def output(self):
        """Write a single output file under `dest`, tagged with this job's `name`."""
        return get_target_for_url(
            self.dest,
            'course_enrollment_events_per_day_%s' % self.name,
            self.run_locally)

    def extra_modules(self):
        """Non-system modules that must be shipped with the job to Hadoop workers."""
        import cjson
        import boto
        import edx.analytics.util
        return [boto, edx.analytics.util, cjson]
class CourseEnrollmentChangesPerDay(BaseCourseEnrollmentChangesPerDay):
    """Binds the daily-enrollment-change job to concrete inputs and outputs."""
    name = luigi.Parameter()
    src = luigi.Parameter()
    dest = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))
    run_locally = luigi.BooleanParameter()

    def requires(self):
        """Consume the per-user output of the upstream enrollment-events job."""
        return CourseEnrollmentEventsPerDay(self.name, self.src, self.dest, self.include, self.run_locally)

    def output(self):
        """Write a single output file under `dest`, tagged with this job's `name`."""
        return get_target_for_url(
            self.dest,
            'course_enrollment_changes_per_day_%s' % self.name,
            self.run_locally)

    def extra_modules(self):
        """Non-system modules that must be shipped with the job to Hadoop workers."""
        import cjson
        import boto
        import edx.analytics.util
        return [boto, edx.analytics.util, cjson]
class CourseEnrollmentTotalsPerDay(BaseCourseEnrollmentTotalsPerDay):
    """Binds the cumulative-enrollment job to concrete inputs and outputs."""
    name = luigi.Parameter()
    src = luigi.Parameter()
    dest = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))
    run_locally = luigi.BooleanParameter()

    def requires(self):
        """Consume the per-day change output of the upstream job."""
        return CourseEnrollmentChangesPerDay(self.name, self.src, self.dest, self.include, self.run_locally)

    def output(self):
        """Write a single output file under `dest`, tagged with this job's `name`."""
        return get_target_for_url(
            self.dest,
            'course_enrollment_totals_per_day_%s' % self.name,
            self.run_locally)

    def extra_modules(self):
        """Non-system modules that must be shipped with the job to Hadoop workers."""
        import cjson
        import boto
        import edx.analytics.util
        return [boto, edx.analytics.util, cjson]
class FirstCourseEnrollmentEventsPerDay(BaseCourseEnrollmentEventsPerDay):
    """Variant of the enrollment-events job that keeps only each user's first enrollment."""
    name = luigi.Parameter()
    src = luigi.Parameter()
    dest = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))
    run_locally = luigi.BooleanParameter()

    def requires(self):
        """Consume raw event logs selected from `src` by the `include` patterns."""
        return PathSetTask(self.src, self.include, self.run_locally)

    def output(self):
        """Write a single output file under `dest`, tagged with this job's `name`."""
        return get_target_for_url(
            self.dest,
            'first_course_enrollment_events_per_day_%s' % self.name,
            self.run_locally)

    def extra_modules(self):
        """Non-system modules that must be shipped with the job to Hadoop workers."""
        import cjson
        import boto
        import edx.analytics.util
        return [boto, edx.analytics.util, cjson]

    def reducer(self, key, values):
        """
        Calculate first time each user enrolls in a course.

        Output key: (course_id, date)
        Output value: 1 on the first date the user enrolls.

        Note that we don't bother to actually output the username,
        since it's not needed downstream.
        """
        course_id, _username = key
        # Walk events in chronological order; emit only the first positive
        # change (an enroll) and stop.
        for datetime, change_value in sorted(values):
            if change_value > 0:
                yield (course_id, eventlog.get_date_from_datetime(datetime)), change_value
                return
class FirstCourseEnrollmentChangesPerDay(BaseCourseEnrollmentChangesPerDay):
    """Sums first-enrollment counts per course per day."""
    name = luigi.Parameter()
    src = luigi.Parameter()
    dest = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))
    run_locally = luigi.BooleanParameter()

    def requires(self):
        """Consume the per-user output of the first-enrollment events job."""
        return FirstCourseEnrollmentEventsPerDay(self.name, self.src, self.dest, self.include, self.run_locally)

    def output(self):
        """Write a single output file under `dest`, tagged with this job's `name`."""
        return get_target_for_url(
            self.dest,
            'first_course_enrollment_changes_per_day_%s' % self.name,
            self.run_locally)

    def extra_modules(self):
        """Non-system modules that must be shipped with the job to Hadoop workers."""
        import cjson
        import boto
        import edx.analytics.util
        return [boto, edx.analytics.util, cjson]
class FirstCourseEnrollmentTotalsPerDay(BaseCourseEnrollmentTotalsPerDay):
    """Accumulates first-enrollment counts per course across days."""
    name = luigi.Parameter()
    src = luigi.Parameter()
    dest = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))
    run_locally = luigi.BooleanParameter()

    def requires(self):
        """Consume the per-day output of the first-enrollment changes job."""
        return FirstCourseEnrollmentChangesPerDay(self.name, self.src, self.dest, self.include, self.run_locally)

    def output(self):
        """Write a single output file under `dest`, tagged with this job's `name`."""
        return get_target_for_url(
            self.dest,
            'first_course_enrollment_totals_per_day_%s' % self.name,
            self.run_locally)

    def extra_modules(self):
        """Non-system modules that must be shipped with the job to Hadoop workers."""
        import cjson
        import boto
        import edx.analytics.util
        return [boto, edx.analytics.util, cjson]
################################
# Running tasks
################################
def main():
    """Command-line entry point: hand control to the luigi scheduler/runner."""
    import argparse
    import boto
    import cjson
    # Attach third-party modules so they travel with the job to remote
    # Hadoop workers (presumably required because workers won't have them
    # installed -- TODO confirm against deployment setup).
    luigi.hadoop.attach(boto, argparse, cjson)
    luigi.run()


if __name__ == '__main__':
    main()
"""
Helper classes to specify file dependencies for input and output.
Supports inputs from S3 and local FS.
Supports outputs to HDFS, S3, and local FS.
"""
import os
import boto
import glob
from urlparse import urlparse
from fnmatch import fnmatch
import luigi
import luigi.s3
import luigi.hdfs
import luigi.format
def get_s3_bucket_key_names(url):
    """Extract (bucket_name, root_key) from an S3 URL, with surrounding slashes removed."""
    parsed = urlparse(url)
    bucket_name = parsed.netloc.strip('/')
    root = parsed.path.strip('/')
    return (bucket_name, root)
def join_as_s3_url(bucket, root, path):
    """Construct a URL for accessing S3, given its bucket, root key, and relative path."""
    return 's3://' + bucket + '/' + root + '/' + path
class LocalPathTask(luigi.ExternalTask):
    """
    An external task requiring the existence of a path on the local file system.

    Files whose names end with .gz are treated as Gzip files.
    """
    path = luigi.Parameter()

    def output(self):
        if not self.path.endswith('.gz'):
            yield luigi.LocalTarget(self.path)
        else:
            yield luigi.LocalTarget(self.path, format=luigi.format.Gzip)
class HdfsPathTask(luigi.ExternalTask):
    """An external task requiring the existence of a path in HDFS."""
    path = luigi.Parameter()

    def output(self):
        target = luigi.hdfs.HdfsTarget(self.path)
        return target
class PathSetTask(luigi.Task):
    """
    A task to select a subset of files in an S3 bucket or local FS.

    Parameters:
      src: a URL pointing to a folder in s3:// or local FS.
      include: a list of patterns to use to select.  Multiple patterns are OR'd.
      run_locally: if True, use S3PathTask instead of HdfsPathTask, to permit
          reading S3 data when running in local mode.
    """
    src = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))
    run_locally = luigi.BooleanParameter()

    def __init__(self, *args, **kwargs):
        super(PathSetTask, self).__init__(*args, **kwargs)
        self.s3 = boto.connect_s3()

    def requires(self):
        """Yield one external task per matching source file (S3 or local)."""
        if self.src.startswith('s3'):
            for bucket, root, path in self._generate_sources():
                source = join_as_s3_url(bucket, root, path)
                if self.run_locally:
                    yield luigi.s3.S3PathTask(source)
                else:
                    yield HdfsPathTask(source)
        else:
            filelist = []
            for include_val in self.include:
                glob_pattern = "{src}/{include}".format(src=self.src, include=include_val)
                filelist.extend(glob.glob(glob_pattern))
            for filepath in filelist:
                yield LocalPathTask(filepath)

    def complete(self):
        # An optimization: just declare that the task is always
        # complete, by definition, because it is whatever files were
        # requested that match the filter, not a set of files whose
        # existence needs to be checked or generated again.
        return True

    def output(self):
        return [task.output() for task in self.requires()]

    def _generate_sources(self):
        """Yield (bucket_name, root, relative_path) for each non-empty matching S3 key."""
        bucket_name, root = get_s3_bucket_key_names(self.src)
        bucket = self.s3.get_bucket(bucket_name)
        keys = (s.key for s in bucket.list(root) if s.size > 0)
        # Remove the root prefix from each key to get the path relative to root.
        # NOTE: the previous k.lstrip(root) was a bug -- lstrip() strips any
        # leading characters drawn from `root` as a character *set*, not the
        # prefix string, so it could mangle key names (e.g. root='a/b',
        # key='a/b/bar' -> 'r').  Slice the prefix off instead.
        paths = (k[len(root):].strip('/') for k in keys if k.startswith(root))
        paths = self._filter_matches(paths)
        return ((bucket.name, root, path) for path in paths)

    def _filter_matches(self, names):
        """Return only those names matching at least one of the include patterns."""
        patterns = self.include

        def matches(name):
            return any(fnmatch(name, pattern) for pattern in patterns)

        return (name for name in names if matches(name))
def get_target_for_url(dest, output_name, run_locally=False):
    """
    Generate an appropriate target for a given path, depending on protocol.

    Parameters:
      dest: a URL pointing to a folder in s3:// or hdfs:// or local FS.
      output_name: name of file to be output.
      run_locally: if True, use S3Target instead of HdfsTarget, to permit
          writing S3 data when running in local mode.
    """
    output_url = os.path.join(dest, output_name)
    if output_url.startswith('s3://'):
        if run_locally:
            return luigi.s3.S3Target(output_url)
        return luigi.hdfs.HdfsTarget(output_url)
    if output_url.startswith('hdfs://'):
        return luigi.hdfs.HdfsTarget(output_url)
    return luigi.LocalTarget(output_url)
"""Support for reading tracking event logs."""
import sys
import cjson
import datetime
import re
PATTERN_JSON = re.compile(r'^.*?(\{.*\})\s*$')
def get_datetime_string(timestamp):
    """Format a datetime as an ISO-8601-style string with seconds precision."""
    return '{0:%Y-%m-%dT%H:%M:%S}'.format(timestamp)
def get_date_string(timestamp):
    """Format the date portion of a datetime as YYYY-MM-DD."""
    return '{0:%Y-%m-%d}'.format(timestamp)
def get_date_from_datetime(datetime_string):
    """Return the date part (before the 'T') of an ISO-style datetime string."""
    return datetime_string.partition('T')[0]
def json_decode(line):
    """Wrapper to decode JSON string in implementation-independent way."""
    # Single point of indirection so callers don't depend on the JSON library
    # directly (cjson was presumably chosen for speed -- TODO confirm).
    # The exception type raised on bad input is implementation-defined.
    return cjson.decode(line)
def parse_eventlog_item(line, nested=False):
    """
    Parse a tracking log input line as JSON to create a dict representation.

    If the whole line fails to parse and this is the top-level call, retry on
    the trailing {...} portion of the line, to tolerate leading cruft (e.g.
    syslog prefixes).  Returns None if no valid JSON can be extracted.
    """
    try:
        parsed = json_decode(line)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed; the decoder's exception type is
        # implementation-defined, so Exception is the tightest safe catch.
        if not nested:
            json_match = PATTERN_JSON.match(line)
            if json_match:
                return parse_eventlog_item(json_match.group(1), nested=True)

        # Seem to be truncated in input data at 10000 for some log files, 2043 for others...
        # First filter out common ones:
        # if 'save_problem_check' not in line:
        #     sys.stderr.write("ERROR: encountered event with bad json: length = {len} start={start}\n".format(len=len(line), start=line[:40]))
        # Even that leaves too many to log.
        # TODO: Might be good going forward to collect stats on the length of truncation and the counts for
        # different event "names" (normalized event_type values).
        return None
    return parsed
def log_item(msg, item, level='ERROR'):
    """Write a leveled diagnostic message about an event item to stderr."""
    sys.stderr.write("%s: %s: %s\n" % (level, msg, item))
def get_timestamp(item):
    """
    Extract the event's 'time' field as a datetime, dropping fractional seconds.

    Returns None if the field is missing, not a string, or unparseable.
    """
    try:
        timestamp = item['time']
        # Strip fractional seconds (anything after the first '.') before parsing.
        removed_ms = timestamp.split('.')[0]
        return datetime.datetime.strptime(removed_ms, '%Y-%m-%dT%H:%M:%S')
    except (KeyError, TypeError, AttributeError, ValueError):
        # Narrowed from a bare except (which would also have swallowed
        # KeyboardInterrupt/SystemExit): missing key, non-dict item,
        # non-string time value, or a malformed timestamp.
        return None
def get_event_data(item):
    """
    Return the 'event' payload of an event item as a dict.

    If the payload is a string, it is decoded as JSON first.
    Returns None (after logging) if the payload is missing, unparsable,
    or not a dict.
    """
    event_value = item.get('event')
    if event_value is None:
        log_item("encountered event with missing event value", item)
        return None

    if isinstance(event_value, basestring):
        # If the value is a string, try to parse as JSON into a dict:
        try:
            event_value = json_decode(event_value)
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # are not swallowed; the decoder's exception type is
            # implementation-defined.
            log_item("encountered event with unparsable event value", item)
            return None

    if isinstance(event_value, dict):
        # It's fine, just return.
        return event_value
    else:
        log_item("encountered event data with unrecognized type", item)
        return None
"""
Tests for utilities that parse event logs.
"""
import unittest
import edx.analytics.util.eventlog as eventlog
class EventLogTest(unittest.TestCase):
    """
    Tests to verify that event log parsing works correctly.
    """
    def test_parse_valid_eventlog_item(self):
        parsed = eventlog.parse_eventlog_item('{"username": "successful"}')
        self.assertIsInstance(parsed, dict)

    def test_parse_eventlog_item_truncated(self):
        parsed = eventlog.parse_eventlog_item('{"username": "unsuccessful')
        self.assertIsNone(parsed)

    def test_parse_eventlog_item_with_cruft(self):
        parsed = eventlog.parse_eventlog_item('leading cruft here {"username": "successful"} ')
        self.assertIsInstance(parsed, dict)

    def test_parse_eventlog_item_with_nonascii(self):
        parsed = eventlog.parse_eventlog_item('{"username": "b\ufffdb"}')
        self.assertIsInstance(parsed, dict)
        self.assertEqual(parsed['username'], u'b\ufffdb')
[MASTER]
# Specify a configuration file.
#rcfile=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Profiled execution.
profile=no
# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=CVS, migrations
# Pickle collected data for later comparisons.
persistent=yes
# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=
[MESSAGES CONTROL]
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time.
#enable=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
# it should appear only once).
disable=
# Never going to use these
# I0011: Locally disabling W0232
# W0141: Used builtin function 'map'
# W0142: Used * or ** magic
# R0921: Abstract class not referenced
# R0922: Abstract class is only referenced 1 times
I0011,W0141,W0142,R0921,R0922,
# Django makes classes that trigger these
# W0232: Class has no __init__ method
W0232,
# Might use these when the code is in better shape
# C0302: Too many lines in module
# R0201: Method could be a function
# R0901: Too many ancestors
# R0902: Too many instance attributes
# R0903: Too few public methods (1/2)
# R0904: Too many public methods
# R0911: Too many return statements
# R0912: Too many branches
# R0913: Too many arguments
# R0914: Too many local variables
C0302,R0201,R0901,R0902,R0903,R0904,R0911,R0912,R0913,R0914
[REPORTS]
# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html
output-format=text
# Include message's id in output
include-ids=yes
# Put messages in a separate file for each module / package specified on the
# command line instead of printing them on stdout. Reports (if any) will be
# written in a file name "pylint_global.[txt|html]".
files-output=no
# Tells whether to display a full report or only the messages
reports=no
# Python expression which should return a note less than 10 (10 is the highest
# note). You have access to the variables errors warning, statement which
# respectively contain the number of errors / warnings messages and the total
# number of statements analyzed. This is used by the global evaluation report
# (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Add a comment according to your evaluation note. This is used by the global
# evaluation report (RP0004).
comment=no
[TYPECHECK]
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# List of classes names for which member attributes should not be checked
# (useful for classes with attributes dynamically set).
ignored-classes=SQLObject
# When zope mode is activated, add a predefined set of Zope acquired attributes
# to generated-members.
zope=no
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E0201 when accessed. Python regular
# expressions are accepted.
generated-members=
REQUEST,
acl_users,
aq_parent,
objects,
DoesNotExist,
can_read,
can_write,
get_url,
size,
content,
status_code,
# For factory_boy factories
create
[BASIC]
# Required attributes for module, separated by a comma
required-attributes=
# List of builtins function names that should not be used, separated by a comma
bad-functions=map,filter,apply,input
# Regular expression which should only match correct module names
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Regular expression which should only match correct module level names
const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__)|log|urlpatterns)$
# Regular expression which should only match correct class names
class-rgx=[A-Z_][a-zA-Z0-9]+$
# Regular expression which should only match correct function names
function-rgx=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match correct method names
method-rgx=([a-z_][a-z0-9_]{2,60}|setUp|set[Uu]pClass|tearDown|tear[Dd]ownClass|assert[A-Z]\w*)$
# Regular expression which should only match correct instance attribute names
attr-rgx=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match correct argument names
argument-rgx=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match correct variable names
variable-rgx=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match correct list comprehension /
# generator expression variable names
inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
# Good variable names which should always be accepted, separated by a comma
good-names=i,j,k,ex,Run,_
# Bad variable names which should always be refused, separated by a comma
bad-names=foo,bar,baz,toto,tutu,tata
# Regular expression which should only match functions or classes name which do
# not require a docstring
no-docstring-rgx=__.*__|test_.*|setUp|tearDown
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,XXX,TODO
[FORMAT]
# Maximum number of characters on a single line.
max-line-length=120
# Maximum number of lines in a module
max-module-lines=1000
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
# tab).
indent-string=' '
[SIMILARITIES]
# Minimum lines number of a similarity.
min-similarity-lines=4
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
[VARIABLES]
# Tells whether we should check for unused import in __init__ files.
init-import=no
# A regular expression matching the beginning of the name of dummy variables
# (i.e. not used).
dummy-variables-rgx=_|dummy|unused|.*_unused
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid to define new builtins when possible.
additional-builtins=
[IMPORTS]
# Deprecated modules which should not be used, separated by a comma
deprecated-modules=regsub,string,TERMIOS,Bastion,rexec
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=
[DESIGN]
# Maximum number of arguments for function / method
max-args=5
# Argument names that match this expression will be ignored. Default to name
# with leading underscore
ignored-argument-names=_.*
# Maximum number of locals for function / method body
max-locals=15
# Maximum number of return / yield for function / method body
max-returns=6
# Maximum number of branch for function / method body
max-branchs=12
# Maximum number of statements in function / method body
max-statements=50
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Minimum number of public methods for a class (see R0903).
min-public-methods=2
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
[CLASSES]
# List of interface methods to ignore, separated by a comma. This is used for
# instance to not check methods defines in Zope's Interface base class.
ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,__new__,setUp
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=Exception
......@@ -6,5 +6,6 @@ pbr==0.5.23
stevedore==0.13
tornado==3.1.1
ansible==1.4.4
python-cjson==1.0.5
-e git+https://github.com/spotify/luigi.git@a33756c781b9bf7e51384f0eb19d6a25050ef136#egg=luigi
nose
nose-ignore-docstring
coverage==3.7
pep8==1.4.5
pylint==0.28
diff-cover >= 0.2.1
......@@ -19,8 +19,10 @@ data_files =
console_scripts =
launch-task = edx.analytics.tasks.main:main
remote-task = edx.analytics.tasks.remote:main
edx.analytics.tasks =
s3-copy = edx.analytics.tasks.s3:S3Copy
s3-sync = edx.analytics.tasks.s3:S3Sync
sync-events = edx.analytics.tasks.eventlogs:SyncEventLogs
enrollments-report = edx.analytics.reports.enrollments:EnrollmentsByWeek
course-enroll = edx.analytics.tasks.course_enroll:CourseEnrollmentTotalsPerDay
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment