Commit f2eafda8 authored by Clinton Blackburn, committed by Clinton Blackburn

Re-architected Course Validation Scripts

 - Storing data in Elasticsearch instead of writing to CSV
 - Using a base reporter class to allow for the addition of future report generators
 - Added report generator for graded content data
 - Added report generator for graded content page screenshots
parent a4aa0e81
import os

from acceptance_tests import str2bool

ENABLE_AUTO_AUTH = str2bool(os.environ.get('ENABLE_AUTO_AUTH', False))
DASHBOARD_SERVER_URL = os.environ['DASHBOARD_SERVER_URL'].strip('/')
API_SERVER_URL = os.environ['API_SERVER_URL']
API_AUTH_TOKEN = os.environ['API_AUTH_TOKEN']
LMS_URL = os.environ.get('LMS_URL')
LMS_USERNAME = os.environ.get('LMS_USERNAME')
LMS_PASSWORD = os.environ.get('LMS_PASSWORD')
BASIC_AUTH_USERNAME = os.environ.get('BASIC_AUTH_USERNAME')
BASIC_AUTH_PASSWORD = os.environ.get('BASIC_AUTH_PASSWORD')
BASIC_AUTH_CREDENTIALS = None
if BASIC_AUTH_USERNAME and BASIC_AUTH_PASSWORD:
    BASIC_AUTH_CREDENTIALS = (BASIC_AUTH_USERNAME, BASIC_AUTH_PASSWORD)
COURSE_API_URL = os.environ.get('COURSE_API_URL')
COURSE_API_KEY = os.environ.get('COURSE_API_KEY')
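
# A minimal environment sketch for running these scripts (the values below are
# placeholders, not real endpoints or credentials):
#   export DASHBOARD_SERVER_URL='http://127.0.0.1:9000'
#   export API_SERVER_URL='http://127.0.0.1:9001/api/v0'
#   export API_AUTH_TOKEN='changeme'
#   export COURSE_API_URL='http://127.0.0.1:8000/api/course_structure/v0/'
#   export COURSE_API_KEY='changeme'
#   export LMS_URL='http://127.0.0.1:8000'
#   export LMS_USERNAME='staff@example.com'
#   export LMS_PASSWORD='edx'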
from analyticsclient.constants import demographic
from analyticsclient.exceptions import ClientError
import requests

from acceptance_tests.course_validation import DASHBOARD_SERVER_URL

COURSE_PAGES = ['enrollment/activity', 'enrollment/geography', 'engagement/content']
API_REPORT_KEYS = ['api_enrollment_activity', 'api_enrollment_geography', 'api_activity']
class CourseReporter(object):
    course = None
    course_id = None

    def __init__(self, course, cookies=None):
        self.course = course
        self.course_id = course.course_id
        self.http_client = requests.Session()
        self.http_client.cookies = cookies

    def _http_status(self, url):
        r = self.http_client.get(url)
        return r.status_code

    def _build_course_url(self, path):
        return '{0}/courses/{1}/{2}/'.format(DASHBOARD_SERVER_URL, self.course_id, path)

    def has_enrollment_activity(self):
        try:
            self.course.enrollment()
            return True
        except ClientError:
            return False

    def has_enrollment_geography(self):
        try:
            self.course.enrollment(demographic.LOCATION)
            return True
        except ClientError:
            return False

    def has_engagement_activity(self):
        try:
            self.course.activity()
            return True
        except ClientError:
            return False

    def report(self):
        report = {
            'course_id': self.course_id
        }

        # Check that the pages load
        for page in COURSE_PAGES:
            report[page] = self._http_status(self._build_course_url(page))

        # Check API for data
        report['api_enrollment_activity'] = self.has_enrollment_activity()
        report['api_enrollment_geography'] = self.has_enrollment_geography()
        report['api_activity'] = self.has_engagement_activity()

        return report
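
# Example usage (sketch): report on a single course, assuming an analyticsclient
# Client configured as in generate_report below. The course ID is illustrative.
#
#   api_client = Client(base_url=API_SERVER_URL, auth_token=API_AUTH_TOKEN)
#   course = api_client.courses('edX/DemoX/Demo_Course')
#   print CourseReporter(course).report()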
#! /usr/bin/env python
"""
This script generates a report containing the following details for each course:
* HTTP status codes for every course page in Insights
* Boolean indicating if the API has activity and enrollment data for the course
A live feed of the report can be tailed from <TIMESTAMP>-course_report.log. The final output CSV is available
in the file <TIMESTAMP>-course_report.csv. <TIMESTAMP> is the time, in UTC, at which this script was initialized.
To execute this script run the following command from the parent directory of acceptance_tests:
$ python -m acceptance_tests.course_validation.generate_report
"""
import csv
import datetime
import logging
import time
from multiprocessing import Queue, Pool

from pyquery import PyQuery as pq
import requests
from analyticsclient.client import Client

from acceptance_tests.course_validation import API_SERVER_URL, API_AUTH_TOKEN, LMS_URL, LMS_USERNAME, LMS_PASSWORD, \
    BASIC_AUTH_CREDENTIALS, DASHBOARD_SERVER_URL
from acceptance_tests.course_validation.course_reporter import CourseReporter, API_REPORT_KEYS, COURSE_PAGES

NUM_PROCESSES = 8
TIMESTAMP = datetime.datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S')

logger = logging.getLogger(__name__)
def _setup_logging():
    logger.setLevel(logging.DEBUG)

    # Log all debug and higher to a file
    fh = logging.FileHandler('{}-course_report.log'.format(TIMESTAMP))
    fh.setLevel(logging.DEBUG)

    # Log debug and higher to the console as well
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)

    # Set up log formatting
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)

    # Add the handlers to the logger
    logger.addHandler(fh)
    logger.addHandler(ch)

    # Disable requests debug logs
    logging.getLogger("requests").setLevel(logging.WARNING)
def check_course(course_id):
    """
    Gather info on the given course.
    """
    logger.debug('Checking %s...', course_id)

    course = check_course.api_client.courses(course_id)
    reporter = CourseReporter(course, check_course.cookies)
    report = reporter.report()

    logger.info(report)
    check_course.q.put(report)


def pool_init(q, api_client, cookies):
    """
    Initialize the variables needed by the mapping function.
    """
    check_course.q = q
    check_course.api_client = api_client
    check_course.cookies = cookies
def write_csv(reports):
    """
    Write the data from the Queue to a CSV.
    """
    logger.info('Writing data to CSV...')

    keys = ['course_id'] + COURSE_PAGES + API_REPORT_KEYS
    filename = '{}-course_report.csv'.format(TIMESTAMP)

    # Use a with block so the file is closed (and flushed) even if a write fails.
    with open(filename, 'wb') as f:
        dict_writer = csv.DictWriter(f, keys)
        dict_writer.writeheader()

        while not reports.empty():
            dict_writer.writerow(reports.get())

    logger.info('Data was saved to %s.', filename)
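
# For reference, the resulting CSV header is the keys list above, i.e.:
#   course_id,enrollment/activity,enrollment/geography,engagement/content,
#   api_enrollment_activity,api_enrollment_geography,api_activity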
def login(http_client):
    logger.info('Logging into LMS...')

    lms_login = '{}/login'.format(LMS_URL)
    lms_login_ajax = '{}/login_ajax'.format(LMS_URL)

    # Make a call to the login page to get cookies (esp. the CSRF token)
    http_client.get(lms_login)

    # Set the headers and data for the actual login request.
    headers = {
        'referer': lms_login
    }
    data = {
        'email': LMS_USERNAME,
        'password': LMS_PASSWORD,
        'csrfmiddlewaretoken': http_client.cookies['csrftoken']
    }

    # Login!
    r = http_client.post(lms_login_ajax, data=data, headers=headers)
    success = r.json().get('success', False)

    if not success:
        msg = 'Login failed!'
        logger.error(msg)
        raise Exception(msg)

    logger.info('Login succeeded.')
def get_courses(http_client):
    course_list_url = '{}/courses/'.format(DASHBOARD_SERVER_URL)
    r = http_client.get(course_list_url)

    if r.status_code != 200:
        msg = 'Failed to retrieve course list!'
        logger.error(msg)
        raise Exception(msg)

    d = pq(r.text)
    courses = []
    elements = d('.course-list .course-title')

    for element in elements:
        courses.append(element.text.strip())

    logger.info('Retrieved %s courses from %s.', len(courses), course_list_url)

    return courses
def main():
    start = time.time()

    api_client = Client(base_url=API_SERVER_URL, auth_token=API_AUTH_TOKEN, timeout=1000)

    http_client = requests.Session()
    if BASIC_AUTH_CREDENTIALS:
        http_client.auth = BASIC_AUTH_CREDENTIALS

    login(http_client)

    # Basic auth is no longer needed
    http_client.auth = None

    # Get courses
    courses = get_courses(http_client)

    # Collect the data
    reports = Queue()
    try:
        p = Pool(NUM_PROCESSES, pool_init, [reports, api_client, http_client.cookies])
        p.map(check_course, courses)
    except Exception as e:  # pylint: disable=broad-except
        logger.error('Validation failed to finish: %s', e)

    # Write the data to an external file
    write_csv(reports)

    end = time.time()
    logger.info('Finished in %d seconds.', end - start)


if __name__ == "__main__":
    _setup_logging()
    main()
{
  "course_performance_screenshot": {
    "properties": {
      "approved": {
        "type": "boolean"
      },
      "course_id": {
        "type": "string"
      },
      "course_valid": {
        "type": "boolean"
      },
      "error": {
        "type": "string"
      },
      "pages": {
        "properties": {
          "filename": {
            "type": "string"
          },
          "url_path": {
            "type": "string"
          }
        }
      },
      "reviewed": {
        "type": "boolean"
      },
      "start": {
        "type": "date",
        "format": "dateOptionalTime"
      }
    }
  },
  "course_performance": {
    "properties": {
      "assignment_types": {
        "properties": {
          "actual": {
            "type": "string"
          },
          "expected": {
            "type": "string"
          },
          "results": {
            "properties": {
              "actual": {
                "type": "integer"
              },
              "expected": {
                "type": "integer"
              },
              "load_time": {
                "type": "double"
              },
              "name": {
                "type": "string"
              },
              "problems": {
                "properties": {
                  "number_in_structure": {
                    "type": "integer"
                  },
                  "number_with_submissions": {
                    "type": "integer"
                  },
                  "total_submissions": {
                    "type": "integer"
                  },
                  "valid": {
                    "type": "boolean"
                  }
                }
              },
              "status": {
                "type": "integer"
              },
              "valid": {
                "type": "boolean"
              }
            }
          },
          "valid": {
            "type": "boolean"
          }
        }
      },
      "course_id": {
        "type": "string"
      },
      "course_valid": {
        "type": "boolean"
      },
      "error": {
        "type": "string"
      },
      "has_submissions": {
        "type": "boolean"
      },
      "start": {
        "type": "date",
        "format": "dateOptionalTime"
      }
    }
  }
}
"""
Script that produces min-max-avg for problem counts across all courses.
"""
from elasticsearch import Elasticsearch
es = Elasticsearch(retry_on_timeout=True)
index = 'course_reports_stage'
body = {
'size': 1000,
'query': {'match_all': {}},
"fields": [
"course_id",
"num_problems"
],
"script_fields": {
"num_problems": {
"script": "if (_source.assignment_types) { result=0; for (element in _source.assignment_types.results) { "
"result = result + element.problems.number_in_structure; }; result;} else { 0 }",
"type": "number"
}
}
}
res = es.search(index=index, doc_type='course_performance', body=body)
num_courses = res['hits']['total']
print "Courses: %d" % num_courses
data = {}
for hit in res['hits']['hits']:
fields = hit['fields']
data[fields['course_id'][0]] = fields['num_problems'][0]
sum_counts = sum(data.values())
print 'Max: %d' % max(data.values())
print 'Avg: %d' % (sum_counts / num_courses)
print 'Min: %d' % min(data.values())
print 'Sum: %d' % sum_counts
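
# An alternative sketch: push the min/max/avg/sum computation into Elasticsearch
# itself with a stats aggregation over the same script field, instead of looping
# client-side. This assumes the cluster allows dynamic scripting in the same
# language as the query above.
#
#   body = {
#       'size': 0,
#       'aggs': {
#           'problem_stats': {
#               'stats': {
#                   'script': "if (_source.assignment_types) { result=0; "
#                             "for (element in _source.assignment_types.results) { "
#                             "result = result + element.problems.number_in_structure; }; "
#                             "result;} else { 0 }"
#               }
#           }
#       }
#   }
#   res = es.search(index=index, doc_type='course_performance', body=body)
#   print res['aggregations']['problem_stats']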
#! /usr/bin/env python
"""
This script executes reporters that provide data about course pages. Any reporter inheriting from ReporterBase
can be used.
All collected data is logged to a local file and indexed in an Elasticsearch index. The local file can be found at
<TIMESTAMP>-course_report.log. <TIMESTAMP> is the time, in UTC, at which this script was initialized.
Elasticsearch data is written to the index named course_reports. Note that subsequent runs of this script will overwrite
existing data as new data is collected.
The list of courses on which to report is pulled from the course structure API and saved locally in a file named
courses.json. Subsequent script runs will use this file instead of the API. If you want fresh data,
simply delete the file.
To execute this script run the following command from the parent directory of acceptance_tests:
$ python -m acceptance_tests.course_validation.report_runner
"""
import datetime
import io
import json
import logging
import time
import traceback
from multiprocessing import Pool
from os.path import abspath, dirname, join

from elasticsearch import Elasticsearch
import requests

from acceptance_tests.course_validation import LMS_URL, LMS_USERNAME, LMS_PASSWORD, \
    BASIC_AUTH_CREDENTIALS, COURSE_API_URL, COURSE_API_KEY, ENABLE_AUTO_AUTH, DASHBOARD_SERVER_URL
from acceptance_tests.course_validation.report_generators import CoursePerformanceReportGenerator
from common.clients import CourseStructureApiClient

NUM_PROCESSES = 8
TIMESTAMP = datetime.datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S')

logger = logging.getLogger(__name__)
es = Elasticsearch(retry_on_timeout=True)
index_name = 'course_reports'

# Add additional reporters here to get different information.
# For example, if you want screenshots, add the CoursePerformanceScreenshotReporter.
# Obvious note: the more reporters you run, the longer the script will take.
reporters = [CoursePerformanceReportGenerator, ]
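
# The report_generators module is not shown in this diff. A minimal sketch of the
# contract the runner assumes (hypothetical class; the real base class is
# ReporterBase in acceptance_tests.course_validation.report_generators):
#
#   class ExampleReporter(object):
#       REPORT_NAME = 'example_report'  # used below as the Elasticsearch doc_type
#
#       def __init__(self, course_id, cookies=None):
#           self.course_id = course_id
#           self.cookies = cookies
#
#       def generate_report(self):
#           # Returns (valid, report): a boolean plus a JSON-serializable dict.
#           return True, {'course_id': self.course_id, 'course_valid': True}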
def _setup_logging():
    level = logging.DEBUG
    msg_format = '%(asctime)s - %(levelname)s - %(message)s'

    logging.basicConfig(
        filename='{}-course_report.log'.format(TIMESTAMP),
        format=msg_format,
        level=level)

    # Log to console, in addition to the file
    ch = logging.StreamHandler()
    ch.setLevel(level)
    ch.setFormatter(logging.Formatter(msg_format))
    logging.root.addHandler(ch)

    # Disable requests and elasticsearch debug logs
    logging.getLogger('requests').setLevel(logging.WARNING)
    logging.getLogger('elasticsearch').setLevel(logging.WARNING)
    logging.getLogger('elasticsearch.trace').setLevel(logging.WARNING)
def check_course(course_id):
    """
    Gather info on the given course.
    """
    logger.debug('Checking %s...', course_id)

    valid = True

    for reporter_class in reporters:
        try:
            # Generate a report for each reporter
            reporter = reporter_class(course_id, check_course.cookies)
            _valid, report = reporter.generate_report()
            valid &= _valid
        except Exception as e:  # pylint: disable=broad-except
            logger.error('Validation for course %s failed: %s\n%s', course_id, e, traceback.format_exc())
            valid = False
            report = {'course_id': course_id, 'course_valid': False, 'error': unicode(e)}

        # Dump the info to the log and Elasticsearch
        logger.info(json.dumps(report))

        try:
            doc_type = reporter_class.REPORT_NAME
            es.index(index=index_name, doc_type=doc_type, body=report, id=course_id)
        except Exception as e:  # pylint: disable=broad-except
            logger.error('%s\n%s', e, traceback.format_exc())
            raise

    if valid:
        logger.info('Successfully validated %s.', course_id)
    else:
        logger.error('Course %s is not valid!', course_id)
def pool_init(cookies):
    """
    Initialize the variables needed by the mapping function.
    """
    check_course.cookies = cookies
def login(http_client):
    failure_msg = 'Login failed!'

    if ENABLE_AUTO_AUTH:
        logger.info('Logging into dashboard with auto auth...')
        response = http_client.get('{}/test/auto_auth/'.format(DASHBOARD_SERVER_URL))

        if response.status_code == 200:
            logger.info('Login succeeded.')
            return
        else:
            logger.fatal(failure_msg)
            raise Exception(failure_msg)

    logger.info('Logging into LMS...')

    if BASIC_AUTH_CREDENTIALS:
        http_client.auth = BASIC_AUTH_CREDENTIALS

    lms_login = '{}/login'.format(LMS_URL)
    lms_login_ajax = '{}/login_ajax'.format(LMS_URL)

    # Make a call to the login page to get cookies (esp. the CSRF token)
    http_client.get(lms_login)

    # Set the headers and data for the actual login request.
    headers = {
        'referer': lms_login
    }
    data = {
        'email': LMS_USERNAME,
        'password': LMS_PASSWORD,
        'csrfmiddlewaretoken': http_client.cookies['csrftoken']
    }

    # Login!
    r = http_client.post(lms_login_ajax, data=data, headers=headers)
    success = r.json().get('success', False)

    if not success:
        logger.error(failure_msg)
        raise Exception(failure_msg)

    # Basic auth is no longer needed
    http_client.auth = None

    logger.info('Login succeeded.')
def get_courses():
    filename = 'courses.json'

    try:
        with io.open(filename, 'r', encoding='utf-8') as f:
            courses = json.load(f)
    except Exception as e:  # pylint: disable=broad-except
        logger.warning('Failed to read courses from file: %s', e)
        courses = []

    if not courses:
        logger.info('Retrieving courses from API...')
        client = CourseStructureApiClient(COURSE_API_URL, COURSE_API_KEY)
        courses = client.all_courses
        courses = [course['id'] for course in courses]
        courses.sort(key=lambda course: course.lower())

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(courses, ensure_ascii=False)))

    logger.info('Retrieved %s courses.', len(courses))

    return courses
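
# courses.json is simply a JSON array of course ID strings (the cached result of
# the course structure API), e.g. ["course-a", "course-b"]; those IDs are
# placeholders. Delete the file to force a refresh from the API.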
def main():
    start = time.time()

    http_client = requests.Session()

    # Log into Insights using either OIDC (via the LMS) or auto auth.
    login(http_client)

    # Get the courses on which to report
    courses = get_courses()

    # Create the index used to store the results
    if not es.indices.exists(index_name):
        es.indices.create(index_name)

    # Create the mappings
    mappings_file = join(dirname(abspath(__file__)), 'mappings.json')
    with io.open(mappings_file, 'r', encoding='utf-8') as f:
        mappings = json.load(f)

    for doc_type, body in mappings.iteritems():
        es.indices.put_mapping(index=index_name, doc_type=doc_type, body=body)

    def finish():
        end = time.time()
        logger.info('Finished in %d seconds.', end - start)

    try:
        p = Pool(NUM_PROCESSES, pool_init, [http_client.cookies])
        p.map(check_course, courses)
        p.close()
    except (KeyboardInterrupt, SystemExit):
        p.terminate()
        finish()
        raise
    except Exception as e:  # pylint: disable=broad-except
        logger.error('Validation failed to finish: %s', e)

    finish()


if __name__ == "__main__":
    _setup_logging()
    main()
import logging

import slumber
from slumber.exceptions import HttpClientError

from common.auth import BearerAuth

logger = logging.getLogger(__name__)


class CourseStructureApiClient(slumber.API):
    """
    Course Structure API Client
    """

    def __init__(self, url, access_token):
        super(CourseStructureApiClient, self).__init__(url, auth=BearerAuth(access_token))
    @property
    def all_courses(self):
        courses = []
        page = 1

        while page:
            try:
                logger.debug('Retrieving page %d of course info...', page)
                response = self.courses.get(page=page, page_size=100)
                course_details = response['results']
                courses += course_details

                if response['next']:
                    page += 1
                else:
                    page = None
                    logger.debug('Completed retrieval of course info. Retrieved info for %d courses.', len(courses))
            except HttpClientError as e:
                logger.error("Unable to retrieve course data: %s", e)
                page = None
                break

        return courses
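
# Example usage (sketch; COURSE_API_URL and COURSE_API_KEY come from the
# environment, as in report_runner above):
#   client = CourseStructureApiClient(COURSE_API_URL, COURSE_API_KEY)
#   course_ids = [course['id'] for course in client.all_courses]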
coverage==3.7.1
ddt==1.0.0
django-dynamic-fixture==1.8.1
django-nose==1.3
elasticsearch==1.4.0
unittest2==0.8.0
httpretty==0.8.4
mock==1.0.1
nose==1.3.4
pep257==0.4.1
pep8==1.6.0
pylint==1.4.1
pyquery>=1.2.9
selenium>=2.44.0
sure==1.2.7
testfixtures==4.1.2