Commit 126ae6f7 by Brian Wilson

Define logging via config file.

Also other refactoring based on PR feedback.
parent f724d3de
[pep8]
ignore=E501
\ No newline at end of file
ignore=E501
max_line_length=119
......@@ -13,6 +13,7 @@ test-requirements: requirements
pip install -r requirements/test.txt
test: test-requirements
# TODO: when we have better coverage, modify this to actually fail when coverage is too low.
rm -rf .coverage
python -m coverage run --rcfile=./.coveragerc `which nosetests`
......
[core]
logging_conf_file=logging.cfg
[event-logs]
source = s3://edx-all-tracking-logs
destination = s3://edx-analytics-events/raw/by-server
include = prod-edx-*/tracking.log-*.gz
prod-edxapp-*/tracking.log-*.gz
\ No newline at end of file
prod-edxapp-*/tracking.log-*.gz
"""
Main method for running tasks.
Invoke a task by running `launch-task` with task's classname and
arguments for Luigi and for the task. Use `remote-task` to
submit the task to run on an EMR cluster.
Example command lines for various tasks:
* CourseEnrollmentChangesPerDay:
launch-task --local-scheduler CourseEnrollmentChangesPerDay
--name mytest --src input --include 'tracking*' --include '2012*'
--dest output7
remote-task --job-flow-id <job-id> --branch <branch-name> --remote-name run-20140204
--local-scheduler CourseEnrollmentChangesPerDay
--name run-20140204 --src s3://edx-all-tracking-logs --include 'prod-edx*/tracking.*-201312*.gz'
--include 'prod-edx*/tracking.*-2014*.gz' --dest s3://edx-analytics-scratch/output
"""
import os.path
import logging
......@@ -26,7 +48,7 @@ def main():
configuration.add_config_path(DEFAULT_CONFIGURATION_FILE)
if not os.path.isfile(DEFAULT_CONFIGURATION_FILE):
log.warning('Default configuration file not found:', DEFAULT_CONFIGURATION_FILE)
log.warning('Default configuration file not found: %s', DEFAULT_CONFIGURATION_FILE)
# Tell luigi what dependencies to pass to the Hadoop nodes
# - argparse is not included by default in python 2.6
......
......@@ -9,15 +9,13 @@ Supports outputs to HDFS, S3, and local FS.
import os
import boto
import glob
from urlparse import urlparse
from fnmatch import fnmatch
import luigi
import luigi.s3
import luigi.hdfs
import luigi.format
from s3_util import join_as_s3_url, generate_s3_sources
from edx.analytics.tasks.s3_util import join_as_s3_url, generate_s3_sources
class LocalPathTask(luigi.ExternalTask):
......
......@@ -19,7 +19,7 @@ def main():
change_directory_to_ansible_script_home()
extra_vars = convert_cli_arguments_to_ansible_extra_vars(arguments)
run_ansible_playbook(arguments.verbose, extra_vars)
......@@ -43,7 +43,7 @@ def convert_cli_arguments_to_ansible_extra_vars(arguments):
def run_ansible_playbook(verbose, extra_vars):
ansible_playbook_path = os.path.join(sys.prefix, 'bin', 'ansible-playbook')
command = [
ansible_playbook_path, '-i', 'ec2.py', 'task.yml', '-e', extra_vars
ansible_playbook_path, '-i', 'ec2.py', 'task.yml', '-e', extra_vars
]
if verbose:
command.append('-vvvv')
......
......@@ -4,7 +4,8 @@ import boto
import luigi
import luigi.s3
from s3_util import join_as_s3_url, get_s3_bucket_key_names, generate_s3_sources, get_s3_key
from edx.analytics.tasks.s3_util import join_as_s3_url, get_s3_bucket_key_names, generate_s3_sources, get_s3_key
class S3Copy(luigi.Task):
"""
......@@ -113,4 +114,3 @@ class S3Sync(luigi.Task):
def output(self):
for task in self.requires():
yield task.output()
......@@ -5,8 +5,6 @@ Utility methods for interacting with S3 via boto.
from fnmatch import fnmatch
from urlparse import urlparse
import boto
def get_s3_bucket_key_names(url):
"""Extract the bucket and key names from a S3 URL"""
......@@ -18,6 +16,7 @@ def join_as_s3_url(bucket, root, path):
"""Combine bucket name, root path and relative path into a S3 URL"""
return 's3://{0}/{1}/{2}'.format(bucket, root, path)
def get_s3_key(s3_conn, url):
"""Returns an S3 key for use in further boto actions."""
bucket_name, key_name = get_s3_bucket_key_names(url)
......@@ -25,6 +24,7 @@ def get_s3_key(s3_conn, url):
key = bucket.get_key(key_name)
return key
def generate_s3_sources(s3_conn, source, patterns):
"""
Returns a list of S3 sources that match filters.
......@@ -61,8 +61,8 @@ def generate_s3_sources(s3_conn, source, patterns):
return ((bucket.name, root, path) for path in paths)
def _filter_matches(patterns, names):
"""Return only key names that match any of the include patterns."""
fn = lambda n: any(fnmatch(n, p) for p in patterns)
return (n for n in names if fn(n))
func = lambda n: any(fnmatch(n, p) for p in patterns)
return (n for n in names if func(n))
"""Support for reading tracking event logs."""
import sys
import cjson
import datetime
import re
import logging
logger = logging.getLogger(__name__)
PATTERN_JSON = re.compile(r'^.*?(\{.*\})\s*$')
......@@ -32,13 +33,13 @@ def is_valid_course_id(course_id):
return all(PATTERN_COURSEID.match(component) for component in components)
def json_decode(line):
def decode_json(line):
"""Wrapper to decode JSON string in an implementation-independent way."""
# TODO: Verify correctness of cjson
return cjson.decode(line)
def parse_eventlog_item(line, nested=False):
def parse_json_event(line, nested=False):
"""
Parse a tracking log input line as JSON to create a dict representation.
......@@ -50,12 +51,12 @@ def parse_eventlog_item(line, nested=False):
JSON that are prepended by a timestamp.
"""
try:
parsed = json_decode(line)
parsed = decode_json(line)
except Exception:
if not nested:
json_match = PATTERN_JSON.match(line)
if json_match:
return parse_eventlog_item(json_match.group(1), nested=True)
return parse_json_event(json_match.group(1), nested=True)
# TODO: There are too many to be logged. It might be useful
# at some point to collect stats on the length of truncation
......@@ -65,23 +66,24 @@ def parse_eventlog_item(line, nested=False):
# Note that empirically some seem to be truncated in input
# data at 10000 characters, 2043 for others...
return None
return parsed
# TODO: add basic validation here.
def log_item(msg, item, level='ERROR'):
"""Writes a message about an eventlog item."""
# TODO: replace this with real logging.
sys.stderr.write("{level}: {msg}: {item}\n".format(msg=msg, item=item, level=level))
return parsed
# Time-related terminology:
# * datetime: a datetime object.
# * timestamp: a string, with date and time (to second), in ISO format.
# * timestamp: a string, with date and time (to millisecond), in ISO format.
# * datestamp: a string with only date information, in ISO format.
def datetime_to_timestamp(datetime_obj):
"""Returns a string with the datetime value of the provided datetime object."""
return datetime_obj.strftime('%Y-%m-%dT%H:%M:%S')
"""
Returns a string with the datetime value of the provided datetime object.
Note that if the datetime has zero microseconds, the microseconds will not be output.
"""
return datetime_obj.isoformat()
def datetime_to_datestamp(datetime_obj):
......@@ -94,39 +96,42 @@ def timestamp_to_datestamp(timestamp):
return timestamp.split('T')[0]
def get_event_time(item):
"""Returns a datetime object from an event item, if present."""
def get_event_time(event):
"""Returns a datetime object from an event object, if present."""
try:
timestamp = item['time']
removed_ms = timestamp.split('.')[0]
return datetime.datetime.strptime(removed_ms, '%Y-%m-%dT%H:%M:%S')
# Get entry, and strip off time zone information. Keep microseconds, if any.
raw_timestamp = event['time']
timestamp = raw_timestamp.split('+')[0]
if '.' not in timestamp:
timestamp = '{datetime}.000000'.format(datetime=timestamp)
return datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%f')
except Exception:
return None
def get_event_data(item):
def get_event_data(event):
"""
Returns event data from an event log item as a dict object.
Returns event data from an event log entry as a dict object.
Returns None if not found.
"""
event_value = item.get('event')
event_value = event.get('event')
if event_value is None:
log_item("encountered event with missing event value", item)
logger.error("encountered event with missing event value: %s", event)
return None
if isinstance(event_value, basestring):
# If the value is a string, try to parse as JSON into a dict.
try:
event_value = json_decode(event_value)
event_value = decode_json(event_value)
except Exception:
log_item("encountered event with unparsable event value", item)
logger.error("encountered event with unparsable event value: %s", event)
return None
if isinstance(event_value, dict):
# It's fine, just return.
return event_value
else:
log_item("encountered event data with unrecognized type", item)
logger.error("encountered event data with unrecognized type: %s", event)
return None
......@@ -31,24 +31,24 @@ class ParseEventLogTest(unittest.TestCase):
Verify that event log parsing works correctly.
"""
def test_parse_valid_eventlog_item(self):
def test_parse_valid_json_event(self):
line = '{"username": "successful"}'
result = eventlog.parse_eventlog_item(line)
result = eventlog.parse_json_event(line)
self.assertTrue(isinstance(result, dict))
def test_parse_eventlog_item_truncated(self):
def test_parse_json_event_truncated(self):
line = '{"username": "unsuccessful'
result = eventlog.parse_eventlog_item(line)
result = eventlog.parse_json_event(line)
self.assertIsNone(result)
def test_parse_eventlog_item_with_cruft(self):
def test_parse_json_event_with_cruft(self):
line = 'leading cruft here {"username": "successful"} '
result = eventlog.parse_eventlog_item(line)
result = eventlog.parse_json_event(line)
self.assertTrue(isinstance(result, dict))
def test_parse_eventlog_item_with_nonascii(self):
def test_parse_json_event_with_nonascii(self):
line = '{"username": "b\ufffdb"}'
result = eventlog.parse_eventlog_item(line)
result = eventlog.parse_json_event(line)
self.assertTrue(isinstance(result, dict))
self.assertEquals(result['username'], u'b\ufffdb')
......@@ -57,16 +57,38 @@ class TimestampTest(unittest.TestCase):
"""Verify timestamp-related functions."""
def test_datestamp_from_timestamp(self):
timestamp = "2013-12-17T15:38:32"
timestamp = "2013-12-17T15:38:32.805444"
self.assertEquals(eventlog.timestamp_to_datestamp(timestamp), "2013-12-17")
def test_missing_datetime(self):
item = {"something else": "not an event"}
self.assertIsNone(eventlog.get_event_time(item))
def test_good_datetime(self):
def test_good_datetime_with_microseconds_and_timezone(self):
item = {"time": "2013-12-17T15:38:32.805444+00:00"}
dt_value = eventlog.get_event_time(item)
self.assertIsNotNone(dt_value)
self.assertEquals(eventlog.datetime_to_timestamp(dt_value), "2013-12-17T15:38:32.805444")
self.assertEquals(eventlog.datetime_to_datestamp(dt_value), "2013-12-17")
def test_good_datetime_with_timezone(self):
item = {"time": "2013-12-17T15:38:32+00:00"}
dt_value = eventlog.get_event_time(item)
self.assertIsNotNone(dt_value)
self.assertEquals(eventlog.datetime_to_timestamp(dt_value), "2013-12-17T15:38:32")
self.assertEquals(eventlog.datetime_to_datestamp(dt_value), "2013-12-17")
def test_good_datetime_with_microseconds(self):
item = {"time": "2013-12-17T15:38:32.805444"}
dt_value = eventlog.get_event_time(item)
self.assertIsNotNone(dt_value)
self.assertEquals(eventlog.datetime_to_timestamp(dt_value), "2013-12-17T15:38:32.805444")
self.assertEquals(eventlog.datetime_to_datestamp(dt_value), "2013-12-17")
def test_good_datetime_with_no_microseconds_or_timezone(self):
item = {"time": "2013-12-17T15:38:32"}
dt_value = eventlog.get_event_time(item)
self.assertIsNotNone(dt_value)
self.assertEquals(eventlog.datetime_to_timestamp(dt_value), "2013-12-17T15:38:32")
self.assertEquals(eventlog.datetime_to_datestamp(dt_value), "2013-12-17")
......
#
# Define logging for use with analytics tasks.
#
# This defines handlers for logging coming from
# edx/analytics code, and from luigi code.
# Luigi messages go to stdout, while edx messages
# are routed to stderr.
[loggers]
keys=root,edx_analytics,luigi_interface
[handlers]
keys=stderrHandler,luigiHandler,localHandler
[formatters]
keys=standard,luigi_default
[logger_root]
level=DEBUG
handlers=localHandler
[logger_edx_analytics]
# Messages from edx/analytics at WARNING level and above get routed to stderr.
level=WARNING
handlers=stderrHandler
qualname=edx.analytics
propagate=0
[logger_luigi_interface]
# Messages from luigi-interface at INFO level and above get routed to stdout.
level=INFO
handlers=luigiHandler
qualname=luigi-interface
propagate=0
[handler_stderrHandler]
class=StreamHandler
formatter=standard
args=(sys.stderr,)
[handler_luigiHandler]
# Define as in luigi/interface.py.
class=StreamHandler
formatter=luigi_default
args=(sys.stdout,)
[handler_localHandler]
# Define as in edx-platform/common/lib/logsettings.py (for dev logging, not syslog).
class=logging.handlers.RotatingFileHandler
formatter=standard
args=('edx_analytics.log', 'w')
# 'maxBytes': 1024 * 1024 * 2,
# 'backupCount': 5,
[formatter_standard]
# Define as in edx-platform/common/lib/logsettings.py (for dev logging, not syslog).
format=%(asctime)s %(levelname)s %(process)d [%(name)s] %(filename)s:%(lineno)d - %(message)s
[formatter_luigi_default]
# Define as in luigi/interface.py.
format=%(levelname)s: %(message)s
......@@ -25,4 +25,4 @@ edx.analytics.tasks =
s3-sync = edx.analytics.tasks.s3:S3Sync
sync-events = edx.analytics.tasks.eventlogs:SyncEventLogs
enrollments-report = edx.analytics.reports.enrollments:EnrollmentsByWeek
course-enroll = edx.analytics.tasks.course_enroll:CourseEnrollmentTotalsPerDay
course-enroll = edx.analytics.tasks.course_enroll:CourseEnrollmentChangesPerDay
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment