Commit f2eafda8 authored by Clinton Blackburn, committed by Clinton Blackburn

Re-architected Course Validation Scripts

 - Storing data in Elasticsearch instead of writing to CSV
 - Using a base reporter class to allow for the addition of future report generators
 - Added report generator for graded content data
 - Added report generator for graded content page screenshots
parent a4aa0e81
import os

from acceptance_tests import str2bool

ENABLE_AUTO_AUTH = str2bool(os.environ.get('ENABLE_AUTO_AUTH', False))
DASHBOARD_SERVER_URL = os.environ['DASHBOARD_SERVER_URL'].strip('/')
API_SERVER_URL = os.environ['API_SERVER_URL']
API_AUTH_TOKEN = os.environ['API_AUTH_TOKEN']
LMS_URL = os.environ.get('LMS_URL')
LMS_USERNAME = os.environ.get('LMS_USERNAME')
LMS_PASSWORD = os.environ.get('LMS_PASSWORD')
BASIC_AUTH_USERNAME = os.environ.get('BASIC_AUTH_USERNAME')
BASIC_AUTH_PASSWORD = os.environ.get('BASIC_AUTH_PASSWORD')
BASIC_AUTH_CREDENTIALS = None
if BASIC_AUTH_USERNAME and BASIC_AUTH_PASSWORD:
    BASIC_AUTH_CREDENTIALS = (BASIC_AUTH_USERNAME, BASIC_AUTH_PASSWORD)
COURSE_API_URL = os.environ.get('COURSE_API_URL')
COURSE_API_KEY = os.environ.get('COURSE_API_KEY')
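
# A minimal environment sketch for running these scripts (the values below are
# placeholders, not real endpoints or credentials):
#   export DASHBOARD_SERVER_URL='http://127.0.0.1:9000'
#   export API_SERVER_URL='http://127.0.0.1:9001/api/v0'
#   export API_AUTH_TOKEN='changeme'
#   export COURSE_API_URL='http://127.0.0.1:8000/api/course_structure/v0/'
#   export COURSE_API_KEY='changeme'
#   export LMS_URL='http://127.0.0.1:8000'
#   export LMS_USERNAME='staff@example.com'
#   export LMS_PASSWORD='edx'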
from analyticsclient.constants import demographic
from analyticsclient.exceptions import ClientError
import requests

from acceptance_tests.course_validation import DASHBOARD_SERVER_URL

COURSE_PAGES = ['enrollment/activity', 'enrollment/geography', 'engagement/content']
API_REPORT_KEYS = ['api_enrollment_activity', 'api_enrollment_geography', 'api_activity']
class CourseReporter(object):
    course = None
    course_id = None

    def __init__(self, course, cookies=None):
        self.course = course
        self.course_id = course.course_id
        self.http_client = requests.Session()
        self.http_client.cookies = cookies

    def _http_status(self, url):
        r = self.http_client.get(url)
        return r.status_code

    def _build_course_url(self, path):
        return '{0}/courses/{1}/{2}/'.format(DASHBOARD_SERVER_URL, self.course_id, path)

    def has_enrollment_activity(self):
        try:
            self.course.enrollment()
            return True
        except ClientError:
            return False

    def has_enrollment_geography(self):
        try:
            self.course.enrollment(demographic.LOCATION)
            return True
        except ClientError:
            return False

    def has_engagement_activity(self):
        try:
            self.course.activity()
            return True
        except ClientError:
            return False

    def report(self):
        report = {
            'course_id': self.course_id
        }

        # Check that the pages load
        for page in COURSE_PAGES:
            report[page] = self._http_status(self._build_course_url(page))

        # Check API for data
        report['api_enrollment_activity'] = self.has_enrollment_activity()
        report['api_enrollment_geography'] = self.has_enrollment_geography()
        report['api_activity'] = self.has_engagement_activity()

        return report
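
# Example usage (sketch): report on a single course, assuming an analyticsclient
# Client configured as in generate_report below. The course ID is illustrative.
#
#   api_client = Client(base_url=API_SERVER_URL, auth_token=API_AUTH_TOKEN)
#   course = api_client.courses('edX/DemoX/Demo_Course')
#   print CourseReporter(course).report()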
#! /usr/bin/env python
"""
This script generates a report containing the following details for each course:
* HTTP status codes for every course page in Insights
* Boolean indicating if the API has activity and enrollment data for the course
A live feed of the report can be tailed from <TIMESTAMP>-course_report.log. The final output CSV is available
in the file <TIMESTAMP>-course_report.csv. <TIMESTAMP> is the time, in UTC, at which this script was initialized.
To execute this script run the following command from the parent directory of acceptance_tests:
$ python -m acceptance_tests.course_validation.generate_report
"""
import csv
import datetime
import logging
import time
from multiprocessing import Queue, Pool

from pyquery import PyQuery as pq
import requests
from analyticsclient.client import Client

from acceptance_tests.course_validation import API_SERVER_URL, API_AUTH_TOKEN, LMS_URL, LMS_USERNAME, LMS_PASSWORD, \
    BASIC_AUTH_CREDENTIALS, DASHBOARD_SERVER_URL
from acceptance_tests.course_validation.course_reporter import CourseReporter, API_REPORT_KEYS, COURSE_PAGES

NUM_PROCESSES = 8
TIMESTAMP = datetime.datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S')

logger = logging.getLogger(__name__)
def _setup_logging():
    logger.setLevel(logging.DEBUG)

    # Log all debug and higher to a file
    fh = logging.FileHandler('{}-course_report.log'.format(TIMESTAMP))
    fh.setLevel(logging.DEBUG)

    # Log debug and higher to the console as well
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)

    # Set up log formatting
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)

    # Add the handlers to the logger
    logger.addHandler(fh)
    logger.addHandler(ch)

    # Disable requests debug logs
    logging.getLogger("requests").setLevel(logging.WARNING)
def check_course(course_id):
    """
    Gather info on the given course.
    """
    logger.debug('Checking %s...', course_id)

    course = check_course.api_client.courses(course_id)
    reporter = CourseReporter(course, check_course.cookies)
    report = reporter.report()

    logger.info(report)
    check_course.q.put(report)


def pool_init(q, api_client, cookies):
    """
    Initialize the variables needed by the mapping function.
    """
    check_course.q = q
    check_course.api_client = api_client
    check_course.cookies = cookies
def write_csv(reports):
    """
    Write the data from the Queue to a CSV.
    """
    logger.info('Writing data to CSV...')

    keys = ['course_id'] + COURSE_PAGES + API_REPORT_KEYS
    filename = '{}-course_report.csv'.format(TIMESTAMP)

    # Use a with block so the file is closed (and flushed) even if a write fails.
    with open(filename, 'wb') as f:
        dict_writer = csv.DictWriter(f, keys)
        dict_writer.writeheader()

        while not reports.empty():
            dict_writer.writerow(reports.get())

    logger.info('Data was saved to %s.', filename)
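
# For reference, the resulting CSV header is the keys list above, i.e.:
#   course_id,enrollment/activity,enrollment/geography,engagement/content,
#   api_enrollment_activity,api_enrollment_geography,api_activity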
def login(http_client):
    logger.info('Logging into LMS...')

    lms_login = '{}/login'.format(LMS_URL)
    lms_login_ajax = '{}/login_ajax'.format(LMS_URL)

    # Make a call to the login page to get cookies (esp. the CSRF token)
    http_client.get(lms_login)

    # Set the headers and data for the actual login request.
    headers = {
        'referer': lms_login
    }
    data = {
        'email': LMS_USERNAME,
        'password': LMS_PASSWORD,
        'csrfmiddlewaretoken': http_client.cookies['csrftoken']
    }

    # Login!
    r = http_client.post(lms_login_ajax, data=data, headers=headers)
    success = r.json().get('success', False)

    if not success:
        msg = 'Login failed!'
        logger.error(msg)
        raise Exception(msg)

    logger.info('Login succeeded.')
def get_courses(http_client):
    course_list_url = '{}/courses/'.format(DASHBOARD_SERVER_URL)
    r = http_client.get(course_list_url)

    if r.status_code != 200:
        msg = 'Failed to retrieve course list!'
        logger.error(msg)
        raise Exception(msg)

    d = pq(r.text)
    courses = []
    elements = d('.course-list .course-title')

    for element in elements:
        courses.append(element.text.strip())

    logger.info('Retrieved %s courses from %s.', len(courses), course_list_url)

    return courses
def main():
    start = time.time()

    api_client = Client(base_url=API_SERVER_URL, auth_token=API_AUTH_TOKEN, timeout=1000)

    http_client = requests.Session()
    if BASIC_AUTH_CREDENTIALS:
        http_client.auth = BASIC_AUTH_CREDENTIALS

    login(http_client)

    # Basic auth is no longer needed
    http_client.auth = None

    # Get courses
    courses = get_courses(http_client)

    # Collect the data
    reports = Queue()
    try:
        p = Pool(NUM_PROCESSES, pool_init, [reports, api_client, http_client.cookies])
        p.map(check_course, courses)
    except Exception as e:  # pylint: disable=broad-except
        logger.error('Validation failed to finish: %s', e)

    # Write the data to an external file
    write_csv(reports)

    end = time.time()
    logger.info('Finished in %d seconds.', end - start)


if __name__ == "__main__":
    _setup_logging()
    main()
{
  "course_performance_screenshot": {
    "properties": {
      "approved": {
        "type": "boolean"
      },
      "course_id": {
        "type": "string"
      },
      "course_valid": {
        "type": "boolean"
      },
      "error": {
        "type": "string"
      },
      "pages": {
        "properties": {
          "filename": {
            "type": "string"
          },
          "url_path": {
            "type": "string"
          }
        }
      },
      "reviewed": {
        "type": "boolean"
      },
      "start": {
        "type": "date",
        "format": "dateOptionalTime"
      }
    }
  },
  "course_performance": {
    "properties": {
      "assignment_types": {
        "properties": {
          "actual": {
            "type": "string"
          },
          "expected": {
            "type": "string"
          },
          "results": {
            "properties": {
              "actual": {
                "type": "integer"
              },
              "expected": {
                "type": "integer"
              },
              "load_time": {
                "type": "double"
              },
              "name": {
                "type": "string"
              },
              "problems": {
                "properties": {
                  "number_in_structure": {
                    "type": "integer"
                  },
                  "number_with_submissions": {
                    "type": "integer"
                  },
                  "total_submissions": {
                    "type": "integer"
                  },
                  "valid": {
                    "type": "boolean"
                  }
                }
              },
              "status": {
                "type": "integer"
              },
              "valid": {
                "type": "boolean"
              }
            }
          },
          "valid": {
            "type": "boolean"
          }
        }
      },
      "course_id": {
        "type": "string"
      },
      "course_valid": {
        "type": "boolean"
      },
      "error": {
        "type": "string"
      },
      "has_submissions": {
        "type": "boolean"
      },
      "start": {
        "type": "date",
        "format": "dateOptionalTime"
      }
    }
  }
}
"""
Script that produces min-max-avg for problem counts across all courses.
"""
from elasticsearch import Elasticsearch
es = Elasticsearch(retry_on_timeout=True)
index = 'course_reports_stage'
body = {
'size': 1000,
'query': {'match_all': {}},
"fields": [
"course_id",
"num_problems"
],
"script_fields": {
"num_problems": {
"script": "if (_source.assignment_types) { result=0; for (element in _source.assignment_types.results) { "
"result = result + element.problems.number_in_structure; }; result;} else { 0 }",
"type": "number"
}
}
}
res = es.search(index=index, doc_type='course_performance', body=body)
num_courses = res['hits']['total']
print "Courses: %d" % num_courses
data = {}
for hit in res['hits']['hits']:
fields = hit['fields']
data[fields['course_id'][0]] = fields['num_problems'][0]
sum_counts = sum(data.values())
print 'Max: %d' % max(data.values())
print 'Avg: %d' % (sum_counts / num_courses)
print 'Min: %d' % min(data.values())
print 'Sum: %d' % sum_counts
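
# An alternative sketch: push the min/max/avg/sum computation into Elasticsearch
# itself with a stats aggregation over the same script field, instead of looping
# client-side. This assumes the cluster allows dynamic scripting in the same
# language as the query above.
#
#   body = {
#       'size': 0,
#       'aggs': {
#           'problem_stats': {
#               'stats': {
#                   'script': "if (_source.assignment_types) { result=0; "
#                             "for (element in _source.assignment_types.results) { "
#                             "result = result + element.problems.number_in_structure; }; "
#                             "result;} else { 0 }"
#               }
#           }
#       }
#   }
#   res = es.search(index=index, doc_type='course_performance', body=body)
#   print res['aggregations']['problem_stats']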
#! /usr/bin/env python
"""
This script executes reporters that provide data about course pages. Any reporter inheriting from ReporterBase
can be used.
All collected data is logged to a local file and indexed in an Elasticsearch index. The local file can be found at
<TIMESTAMP>-course_report.log. <TIMESTAMP> is the time, in UTC, at which this script was initialized.
Elasticsearch data is written to the index named course_reports. Note that subsequent runs of this script will overwrite
existing data as new data is collected.
The list of courses on which to report is pulled from the course structure API and saved locally in a file named
courses.json. Subsequent script runs will use this file instead of the API. If you want fresh data,
simply delete the file.
To execute this script run the following command from the parent directory of acceptance_tests:
$ python -m acceptance_tests.course_validation.report_runner
"""
import datetime
import io
import json
import logging
import time
import traceback
from multiprocessing import Pool
from os.path import abspath, dirname, join

from elasticsearch import Elasticsearch
import requests

from acceptance_tests.course_validation import LMS_URL, LMS_USERNAME, LMS_PASSWORD, \
    BASIC_AUTH_CREDENTIALS, COURSE_API_URL, COURSE_API_KEY, ENABLE_AUTO_AUTH, DASHBOARD_SERVER_URL
from acceptance_tests.course_validation.report_generators import CoursePerformanceReportGenerator
from common.clients import CourseStructureApiClient

NUM_PROCESSES = 8
TIMESTAMP = datetime.datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S')

logger = logging.getLogger(__name__)
es = Elasticsearch(retry_on_timeout=True)
index_name = 'course_reports'

# Add additional reporters here to get different information.
# For example, if you want screenshots, add the CoursePerformanceScreenshotReporter.
# Obvious note: the more reporters you run, the longer the script will take.
reporters = [CoursePerformanceReportGenerator, ]
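
# The report_generators module is not shown in this diff. A minimal sketch of the
# contract the runner assumes (hypothetical class; the real base class is
# ReporterBase in acceptance_tests.course_validation.report_generators):
#
#   class ExampleReporter(object):
#       REPORT_NAME = 'example_report'  # used below as the Elasticsearch doc_type
#
#       def __init__(self, course_id, cookies=None):
#           self.course_id = course_id
#           self.cookies = cookies
#
#       def generate_report(self):
#           # Returns (valid, report): a boolean plus a JSON-serializable dict.
#           return True, {'course_id': self.course_id, 'course_valid': True}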
def _setup_logging():
    level = logging.DEBUG
    msg_format = '%(asctime)s - %(levelname)s - %(message)s'

    logging.basicConfig(
        filename='{}-course_report.log'.format(TIMESTAMP),
        format=msg_format,
        level=level)

    # Log to console, in addition to the file
    ch = logging.StreamHandler()
    ch.setLevel(level)
    ch.setFormatter(logging.Formatter(msg_format))
    logging.root.addHandler(ch)

    # Disable requests and elasticsearch debug logs
    logging.getLogger('requests').setLevel(logging.WARNING)
    logging.getLogger('elasticsearch').setLevel(logging.WARNING)
    logging.getLogger('elasticsearch.trace').setLevel(logging.WARNING)
def check_course(course_id):
    """
    Gather info on the given course.
    """
    logger.debug('Checking %s...', course_id)

    valid = True

    for reporter_class in reporters:
        try:
            # Generate a report for each reporter
            reporter = reporter_class(course_id, check_course.cookies)
            _valid, report = reporter.generate_report()
            valid &= _valid
        except Exception as e:  # pylint: disable=broad-except
            logger.error('Validation for course %s failed: %s\n%s', course_id, e, traceback.format_exc())
            valid = False
            report = {'course_id': course_id, 'course_valid': False, 'error': unicode(e)}

        # Dump the info to the log and Elasticsearch
        logger.info(json.dumps(report))

        try:
            doc_type = reporter_class.REPORT_NAME
            es.index(index=index_name, doc_type=doc_type, body=report, id=course_id)
        except Exception as e:  # pylint: disable=broad-except
            logger.error('%s\n%s', e, traceback.format_exc())
            raise

    if valid:
        logger.info('Successfully validated %s.', course_id)
    else:
        logger.error('Course %s is not valid!', course_id)
def pool_init(cookies):
    """
    Initialize the variables needed by the mapping function.
    """
    check_course.cookies = cookies
def login(http_client):
    failure_msg = 'Login failed!'

    if ENABLE_AUTO_AUTH:
        logger.info('Logging into dashboard with auto auth...')
        response = http_client.get('{}/test/auto_auth/'.format(DASHBOARD_SERVER_URL))

        if response.status_code == 200:
            logger.info('Login succeeded.')
            return
        else:
            logger.fatal(failure_msg)
            raise Exception(failure_msg)

    logger.info('Logging into LMS...')

    if BASIC_AUTH_CREDENTIALS:
        http_client.auth = BASIC_AUTH_CREDENTIALS

    lms_login = '{}/login'.format(LMS_URL)
    lms_login_ajax = '{}/login_ajax'.format(LMS_URL)

    # Make a call to the login page to get cookies (esp. the CSRF token)
    http_client.get(lms_login)

    # Set the headers and data for the actual login request.
    headers = {
        'referer': lms_login
    }
    data = {
        'email': LMS_USERNAME,
        'password': LMS_PASSWORD,
        'csrfmiddlewaretoken': http_client.cookies['csrftoken']
    }

    # Login!
    r = http_client.post(lms_login_ajax, data=data, headers=headers)
    success = r.json().get('success', False)

    if not success:
        logger.error(failure_msg)
        raise Exception(failure_msg)

    # Basic auth is no longer needed
    http_client.auth = None

    logger.info('Login succeeded.')
def get_courses():
    filename = 'courses.json'

    try:
        with io.open(filename, 'r', encoding='utf-8') as f:
            courses = json.load(f)
    except Exception as e:  # pylint: disable=broad-except
        logger.warning('Failed to read courses from file: %s', e)
        courses = []

    if not courses:
        logger.info('Retrieving courses from API...')
        client = CourseStructureApiClient(COURSE_API_URL, COURSE_API_KEY)
        courses = client.all_courses
        courses = [course['id'] for course in courses]
        courses.sort(key=lambda course: course.lower())

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(courses, ensure_ascii=False)))

    logger.info('Retrieved %s courses.', len(courses))

    return courses
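
# courses.json is simply a JSON array of course ID strings (the cached result of
# the course structure API), e.g. ["course-a", "course-b"]; those IDs are
# placeholders. Delete the file to force a refresh from the API.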
def main():
    start = time.time()

    http_client = requests.Session()

    # Log into Insights using either OIDC (via the LMS) or auto auth.
    login(http_client)

    # Get the courses on which to report
    courses = get_courses()

    # Create the index used to store the results
    if not es.indices.exists(index_name):
        es.indices.create(index_name)

    # Create the mappings
    mappings_file = join(dirname(abspath(__file__)), 'mappings.json')
    with io.open(mappings_file, 'r', encoding='utf-8') as f:
        mappings = json.load(f)

    for doc_type, body in mappings.iteritems():
        es.indices.put_mapping(index=index_name, doc_type=doc_type, body=body)

    def finish():
        end = time.time()
        logger.info('Finished in %d seconds.', end - start)

    try:
        p = Pool(NUM_PROCESSES, pool_init, [http_client.cookies])
        p.map(check_course, courses)
        p.close()
    except (KeyboardInterrupt, SystemExit):
        p.terminate()
        finish()
        raise
    except Exception as e:  # pylint: disable=broad-except
        logger.error('Validation failed to finish: %s', e)

    finish()


if __name__ == "__main__":
    _setup_logging()
    main()
import logging

import slumber
from slumber.exceptions import HttpClientError

from common.auth import BearerAuth

logger = logging.getLogger(__name__)


class CourseStructureApiClient(slumber.API):
    """
    Course Structure API Client
    """

    def __init__(self, url, access_token):
        super(CourseStructureApiClient, self).__init__(url, auth=BearerAuth(access_token))
    @property
    def all_courses(self):
        courses = []
        page = 1

        while page:
            try:
                logger.debug('Retrieving page %d of course info...', page)
                response = self.courses.get(page=page, page_size=100)
                course_details = response['results']
                courses += course_details

                if response['next']:
                    page += 1
                else:
                    page = None
                    logger.debug('Completed retrieval of course info. Retrieved info for %d courses.', len(courses))
            except HttpClientError as e:
                logger.error("Unable to retrieve course data: %s", e)
                page = None
                break

        return courses
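
# Example usage (sketch; COURSE_API_URL and COURSE_API_KEY come from the
# environment, as in report_runner above):
#   client = CourseStructureApiClient(COURSE_API_URL, COURSE_API_KEY)
#   course_ids = [course['id'] for course in client.all_courses]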
coverage==3.7.1
ddt==1.0.0
django-dynamic-fixture==1.8.1
django-nose==1.3
elasticsearch==1.4.0
unittest2==0.8.0
httpretty==0.8.4
mock==1.0.1
nose==1.3.4
pep257==0.4.1
pep8==1.6.0
pylint==1.4.1
pyquery>=1.2.9
selenium>=2.44.0
sure==1.2.7
testfixtures==4.1.2