Commit f0e6cdd3 by Will Daly

Merge pull request #6110 from edx/will/email-opt-in-analytics

Add management command to create the email opt-in list.
parents c74b05e9 61ecae98
"""Generate a list indicating whether users have opted in or out of receiving email from an org.
Email opt-in is stored as an org-level preference.
When reports are generated, we need to handle:
1) Org aliases: some organizations might have multiple course key "org" values.
We choose the most recently set preference among all org aliases.
Since this information isn't stored anywhere in edx-platform,
the caller needs to pass in the list of orgs and aliases.
2) No preference set: Some users may not have an opt-in preference set
if they enrolled before the preference was introduced.
These users are opted in by default.
3) Restricting to a subset of courses in an org: Some orgs have courses
that we don't want to include in the results (e.g. EdX-created test courses).
Allow the caller to explicitly specify the list of courses in the org.
The command will always use the read replica database if one is configured.
"""
import contextlib
import csv
import logging
import optparse
import os.path
import time

from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.db import connections
from opaque_keys.edx.keys import CourseKey

from xmodule.modulestore.django import modulestore
LOGGER = logging.getLogger(__name__)
class Command(BaseCommand):
    """Generate a list of email opt-in values for user enrollments.

    Positional arguments are the output file path followed by one or more
    org aliases.  The optional ``--courses`` flag restricts the report to a
    comma-separated list of course keys.
    """

    # Usage string; also echoed in the error raised when too few
    # positional arguments are supplied.
    args = "<OUTPUT_FILENAME> <ORG_ALIASES> --courses=COURSE_ID_LIST"
    help = "Generate a list of email opt-in values for user enrollments."

    # Declare the --courses flag advertised in `args` so the option parser
    # accepts it and hands it to `handle` as `options["courses"]`.
    option_list = BaseCommand.option_list + (
        optparse.make_option(
            '--courses',
            action='store',
            dest='courses',
            default=None,
            help="Comma-separated list of course keys to include in the report.",
        ),
    )

    # Fields output in the CSV
    OUTPUT_FIELD_NAMES = [
        "email",
        "full_name",
        "course_id",
        "is_opted_in_for_email",
        "preference_set_date"
    ]

    # Number of records to read at a time when making
    # multiple queries over a potentially large dataset.
    QUERY_INTERVAL = 1000
def handle(self, *args, **options):
    """Execute the command.

    Arguments:
        file_path (str): Path to the output file.
        *org_list (unicode): List of organization aliases.

    Keyword Arguments:
        courses (unicode): Comma-separated list of course keys.  If provided,
            include only these courses in the results.  Blank entries
            (for example from a trailing comma) are ignored.

    Raises:
        CommandError

    """
    file_path, org_list = self._parse_args(args)

    # Retrieve all the courses for the org.
    # If we were given a specific list of courses to include,
    # filter out anything not in that list.
    courses = self._get_courses_for_org(org_list)
    only_courses = options.get("courses")
    if only_courses is not None:
        only_courses = [
            CourseKey.from_string(course_key.strip())
            for course_key in only_courses.split(",")
            # Skip empty pieces so "a,b," does not try to parse "" as a key
            if course_key.strip()
        ]
        courses = list(set(courses) & set(only_courses))

    # Add in organizations from the course keys, to ensure
    # we're including orgs with different capitalizations
    org_list = list(set(org_list) | set(course.org for course in courses))

    # If no courses are found, abort
    if not courses:
        raise CommandError(
            u"No courses found for orgs: {orgs}".format(
                orgs=", ".join(org_list)
            )
        )

    # Let the user know what's about to happen
    LOGGER.info(
        u"Retrieving data for courses: {courses}".format(
            courses=", ".join([unicode(course) for course in courses])
        )
    )

    # Open the output file and generate the report.
    # The Python 2 csv module expects its file object in binary mode;
    # text mode can corrupt line endings on some platforms.
    with open(file_path, "wb") as file_handle:
        with self._log_execution_time():
            self._write_email_opt_in_prefs(file_handle, org_list, courses)

    # Remind the user where the output file is
    LOGGER.info(u"Output file: {file_path}".format(file_path=file_path))
def _parse_args(self, args):
"""Check and parse arguments.
Validates that the right number of args were provided
and that the output file doesn't already exist.
Arguments:
args (list): List of arguments given at the command line.
Returns:
Tuple of (file_path, org_list)
Raises:
CommandError
"""
if len(args) < 2:
raise CommandError(u"Usage: {args}".format(args=self.args))
file_path = args[0]
org_list = args[1:]
if os.path.exists(file_path):
raise CommandError("File already exists at '{path}'".format(path=file_path))
return file_path, org_list
def _get_courses_for_org(self, org_aliases):
    """Collect the keys of every course whose org matches one of the aliases.

    Org names are compared case-insensitively.

    Arguments:
        org_aliases (list): List of aliases for the org.

    Returns:
        List of `CourseKey`s

    """
    all_courses = modulestore().get_courses()
    wanted_orgs = [alias.lower() for alias in org_aliases]

    matching_keys = []
    for course in all_courses:
        if course.id.org.lower() in wanted_orgs:
            matching_keys.append(course.id)
    return matching_keys
@contextlib.contextmanager
def _log_execution_time(self):
    """Context manager that logs how long the wrapped block took to run.

    The elapsed time is logged even when the wrapped block raises, so a
    failed run still reports how long it ran.  The exception itself
    propagates unchanged.
    """
    start_time = time.time()
    try:
        yield
    finally:
        execution_time = time.time() - start_time
        LOGGER.info(u"Execution time: {time} seconds".format(time=execution_time))
def _write_email_opt_in_prefs(self, file_handle, org_aliases, courses):
    """Write email opt-in preferences to the output file.

    This will generate a CSV with one row for each enrollment.
    This means that the user's "opt in" preference will be specified
    multiple times if the user has enrolled in multiple courses
    within the org.  However, the values should always be the same:
    if the user is listed as "opted out" for course A, she will
    also be listed as "opted out" for courses B, C, and D.

    Users with no stored preference are written as opted in ("True")
    with an empty preference date.  A missing email or full name
    (NULL from the LEFT JOINs) is written as an empty string.

    Arguments:
        file_handle (file): Handle to the output file.
        org_aliases (list): List of aliases for the org.
        courses (list): List of course keys in the org.

    Returns:
        None

    """
    writer = csv.DictWriter(file_handle, fieldnames=self.OUTPUT_FIELD_NAMES)

    # Bind the org aliases and course keys as query parameters instead of
    # interpolating them into the SQL text, so a quote character in a
    # value cannot break (or inject into) the statement.
    org_params = list(org_aliases)
    course_params = [unicode(course) for course in courses]
    org_placeholders = u", ".join([u"%s"] * len(org_params))
    course_placeholders = u", ".join([u"%s"] * len(course_params))

    query = (
        u"""
        SELECT
            user.`email` AS `email`,
            profile.`name` AS `full_name`,
            enrollment.`course_id` AS `course_id`,
            (
                SELECT value
                FROM user_api_userorgtag
                WHERE org IN ( {org_list} )
                AND `key`="email-optin"
                AND `user_id`=user.`id`
                ORDER BY modified DESC
                LIMIT 1
            ) AS `is_opted_in_for_email`,
            (
                SELECT modified
                FROM user_api_userorgtag
                WHERE org IN ( {org_list} )
                AND `key`="email-optin"
                AND `user_id`=user.`id`
                ORDER BY modified DESC
                LIMIT 1
            ) AS `preference_set_date`
        FROM
            student_courseenrollment AS enrollment
            LEFT JOIN auth_user AS user ON user.id=enrollment.user_id
            LEFT JOIN auth_userprofile AS profile ON profile.user_id=user.id
        WHERE enrollment.course_id IN ( {course_id_list} )
        """
    ).format(
        course_id_list=course_placeholders,
        org_list=org_placeholders
    )

    # Parameters are consumed in the order their placeholders appear:
    # the org list twice (once per subquery), then the course list.
    params = org_params + org_params + course_params

    def _encode(value):
        """UTF-8 encode a text column, mapping NULL to an empty string."""
        return value.encode('utf-8') if value is not None else ""

    cursor = self._db_cursor()
    row_count = 0
    try:
        cursor.execute(query, params)
        for row in self._iterate_results(cursor):
            email, full_name, course_id, is_opted_in, pref_set_date = row
            writer.writerow({
                "email": _encode(email),
                "full_name": _encode(full_name),
                "course_id": _encode(course_id),
                # No stored preference means opted in by default
                "is_opted_in_for_email": is_opted_in if is_opted_in else "True",
                "preference_set_date": pref_set_date,
            })
            row_count += 1
    finally:
        # Release the cursor whether or not the report completed
        cursor.close()

    # Log the number of rows we processed
    LOGGER.info(u"Retrieved {num_rows} records.".format(num_rows=row_count))
def _iterate_results(self, cursor):
"""Iterate through the results of a database query, fetching in chunks.
Arguments:
cursor: The database cursor
Yields:
tuple of row values from the query
"""
while True:
rows = cursor.fetchmany(self.QUERY_INTERVAL)
if not rows:
break
for row in rows:
yield row
def _sql_list(self, values):
"""Serialize a list of values for including in a SQL "IN" statement. """
return u",".join([u'"{}"'.format(val) for val in values])
def _db_cursor(self):
    """Open a cursor on the read replica when configured, else on the default database. """
    if 'read_replica' in settings.DATABASES:
        alias = 'read_replica'
    else:
        alias = 'default'
    return connections[alias].cursor()
# -*- coding: utf-8 -*-
"""Tests for the email opt-in list management command. """
import os.path
import tempfile
import shutil
import csv
from collections import defaultdict
from unittest import skipUnless
import ddt
from django.conf import settings
from django.test.utils import override_settings
from django.core.management.base import CommandError
from xmodule.modulestore.tests.django_utils import ModuleStoreTestCase, mixed_store_config
from xmodule.modulestore.tests.factories import CourseFactory
from student.tests.factories import UserFactory, CourseEnrollmentFactory
from student.models import CourseEnrollment
import user_api.api.profile as profile_api
from user_api.models import UserOrgTag
from user_api.management.commands import email_opt_in_list
# Modulestore configuration applied to the test case through `override_settings`.
MODULESTORE_CONFIG = mixed_store_config(settings.COMMON_TEST_DATA_ROOT, {}, include_xml=False)
@ddt.ddt
@skipUnless(settings.ROOT_URLCONF == 'lms.urls', 'Test only valid in lms')
@override_settings(MODULESTORE=MODULESTORE_CONFIG)
class EmailOptInListTest(ModuleStoreTestCase):
"""Tests for the email opt-in list management command. """
# Identity of the user created in setUp; the first and last names contain non-ASCII characters.
USER_USERNAME = "test_user"
USER_FIRST_NAME = u"Ṫëṡẗ"
USER_LAST_NAME = u"Űśéŕ"
# Org used for most test courses; also non-ASCII.
TEST_ORG = u"téśt_őŕǵ"
# File name of the report written inside each run's temporary directory.
OUTPUT_FILE_NAME = "test_org_email_opt_in.csv"
# Column names used to read the generated report back as dictionaries.
OUTPUT_FIELD_NAMES = [
"email",
"full_name",
"course_id",
"is_opted_in_for_email",
"preference_set_date"
]
def setUp(self):
    """Create the test user and reset the per-test course and enrollment records."""
    # Let the base test case run its own setup first
    super(EmailOptInListTest, self).setUp()

    self.user = UserFactory.create(
        username=self.USER_USERNAME,
        first_name=self.USER_FIRST_NAME,
        last_name=self.USER_LAST_NAME
    )
    # Filled in by _create_courses_and_enrollments
    self.courses = []
    self.enrollments = defaultdict(list)
def test_not_enrolled(self):
    """A course with no enrollment for the user produces an empty report."""
    self._create_courses_and_enrollments((self.TEST_ORG, False))
    report_rows = self._run_command(self.TEST_ORG)

    # Nobody is enrolled, so no rows are expected
    self._assert_output(report_rows)
def test_enrolled_no_pref(self):
    """An enrolled user without a stored preference is reported as opted in."""
    self._create_courses_and_enrollments((self.TEST_ORG, True))
    report_rows = self._run_command(self.TEST_ORG)

    # No preference is set but the user is enrolled: opted in by default
    expected_row = (self.user, self.courses[0].id, True)
    self._assert_output(report_rows, expected_row)
def test_enrolled_pref_opted_in(self):
    """An explicit opt-in preference is reported as opted in."""
    self._create_courses_and_enrollments((self.TEST_ORG, True))
    self._set_opt_in_pref(self.user, self.TEST_ORG, True)
    report_rows = self._run_command(self.TEST_ORG)

    expected_row = (self.user, self.courses[0].id, True)
    self._assert_output(report_rows, expected_row)
def test_enrolled_pref_opted_out(self):
    """An explicit opt-out preference is reported as opted out."""
    self._create_courses_and_enrollments((self.TEST_ORG, True))
    self._set_opt_in_pref(self.user, self.TEST_ORG, False)
    report_rows = self._run_command(self.TEST_ORG)

    expected_row = (self.user, self.courses[0].id, False)
    self._assert_output(report_rows, expected_row)
def test_opt_in_then_opt_out(self):
    """When the preference is changed, the latest value is reported."""
    self._create_courses_and_enrollments((self.TEST_ORG, True))

    # Opt in first, then reverse the decision
    for preference in (True, False):
        self._set_opt_in_pref(self.user, self.TEST_ORG, preference)

    report_rows = self._run_command(self.TEST_ORG)
    self._assert_output(report_rows, (self.user, self.courses[0].id, False))
def test_exclude_non_org_courses(self):
    """Courses and preferences belonging to another org are left out of the report."""
    # One course in the reported org, one in an unrelated org
    self._create_courses_and_enrollments(
        (self.TEST_ORG, True),
        ("other_org", True)
    )

    # The only stored preference is an opt-out for the unrelated org
    self._set_opt_in_pref(self.user, "other_org", False)

    report_rows = self._run_command(self.TEST_ORG)

    # Only the first course is reported.  No preference exists for its org,
    # so the user is opted in by default and no preference date is expected.
    in_org_row = (self.user, self.courses[0].id, True)
    self._assert_output(report_rows, in_org_row, expect_pref_datetime=False)
def test_enrolled_conflicting_prefs(self):
    """With conflicting preferences across aliases, the most recent one applies to every row."""
    alias = "org_alias"

    # Two courses that both count as the same org
    self._create_courses_and_enrollments(
        (self.TEST_ORG, True),
        (alias, True)
    )

    # Opt in under the main org name, then opt out under the alias
    self._set_opt_in_pref(self.user, self.TEST_ORG, True)
    self._set_opt_in_pref(self.user, alias, False)

    # The later change wins.  Both courses are listed, with the same value.
    report_rows = self._run_command(self.TEST_ORG, other_names=[alias])
    self._assert_output(
        report_rows,
        (self.user, self.courses[0].id, False),
        (self.user, self.courses[1].id, False)
    )

    # Opt back in under the main org name.  The alias still stores an
    # opt-out, but the newest preference takes precedence.
    self._set_opt_in_pref(self.user, self.TEST_ORG, True)
    report_rows = self._run_command(self.TEST_ORG, other_names=[alias])
    self._assert_output(
        report_rows,
        (self.user, self.courses[0].id, True),
        (self.user, self.courses[1].id, True)
    )
@ddt.data(True, False)
def test_unenrolled_from_all_courses(self, opt_in_pref):
    """An enrollment is still reported, with its preference, after the user unenrolls."""
    # Enroll and record a preference
    self._create_courses_and_enrollments((self.TEST_ORG, True))
    self._set_opt_in_pref(self.user, self.TEST_ORG, opt_in_pref)

    # Leave the course again
    only_course_id = self.courses[0].id
    CourseEnrollment.unenroll(self.user, only_course_id, skip_refund=True)

    # Enrollments should still appear in the output
    report_rows = self._run_command(self.TEST_ORG)
    self._assert_output(report_rows, (self.user, self.courses[0].id, opt_in_pref))
def test_unenrolled_from_some_courses(self):
    """A preference set via an alias applies to all enrollments, even after unenrolling there."""
    alias = "org_alias"

    # Three courses under the main org name and one under the alias
    self._create_courses_and_enrollments(
        (self.TEST_ORG, True),
        (self.TEST_ORG, True),
        (self.TEST_ORG, True),
        (alias, True)
    )

    # Opt out through the aliased org, then leave the aliased course
    self._set_opt_in_pref(self.user, alias, False)
    CourseEnrollment.unenroll(self.user, self.courses[3].id, skip_refund=True)

    # The preference still applies, and every enrollment
    # (including the inactive one) appears in the list
    report_rows = self._run_command(self.TEST_ORG, other_names=[alias])
    expected_rows = [(self.user, course.id, False) for course in self.courses]
    self._assert_output(report_rows, *expected_rows)
def test_no_courses_for_org_name(self):
    """Requesting an org that has no courses raises a CommandError."""
    self._create_courses_and_enrollments((self.TEST_ORG, True))
    self._set_opt_in_pref(self.user, self.TEST_ORG, True)

    # Nothing was created under this org name
    missing_org = "other_org"
    with self.assertRaisesRegexp(CommandError, "^No courses found for orgs:"):
        self._run_command(missing_org)
def test_specify_subset_of_courses(self):
    """Only the courses passed via `only_courses` appear in the report."""
    # Create several courses in the same org
    self._create_courses_and_enrollments(
        (self.TEST_ORG, True),
        (self.TEST_ORG, True),
        (self.TEST_ORG, True),
    )

    # Execute the command, but exclude the third course from the list
    only_courses = [self.courses[0].id, self.courses[1].id]
    output = self._run_command(self.TEST_ORG, only_courses=only_courses)

    # Expect rows for exactly the two requested courses.  No preference
    # was set, so the user is opted in by default with no preference date.
    self._assert_output(
        output,
        (self.user, self.courses[0].id, True),
        (self.user, self.courses[1].id, True),
        expect_pref_datetime=False
    )
# Choose numbers before and after the query interval boundary
@ddt.data(2, 3, 4, 5, 6, 7, 8, 9)
def test_many_users(self, num_users):
    """Every enrollment is reported when the results span several fetch batches."""
    # Create many users and enroll them in the test course
    course = CourseFactory.create(org=self.TEST_ORG)
    expected_emails = []
    for _ in range(num_users):
        user = UserFactory.create()
        # The report stores emails UTF-8 encoded
        expected_emails.append(user.email.encode('utf-8'))
        CourseEnrollmentFactory.create(course_id=course.id, user=user)

    # Generate the report
    output = self._run_command(self.TEST_ORG, query_interval=4)

    # Expect that every enrollment shows up in the report, and nothing else
    output_emails = [row["email"] for row in output]
    self.assertEqual(len(output_emails), num_users)
    for email in expected_emails:
        self.assertIn(email, output_emails)
def test_org_capitalization(self):
    """Org names differing only by case are treated as the same org."""
    mixed_case_org, lower_case_org = "MyOrg", "myorg"

    # Same org name, two capitalizations in the course IDs
    self._create_courses_and_enrollments(
        (mixed_case_org, True),
        (lower_case_org, True)
    )

    # Set a preference under each spelling; the lowercase one is set last
    self._set_opt_in_pref(self.user, mixed_case_org, True)
    self._set_opt_in_pref(self.user, lower_case_org, False)

    # Only the mixed-case org is passed in, yet both enrollments should
    # show up, carrying the more recent (lowercase) preference.
    report_rows = self._run_command(mixed_case_org)
    expected_rows = [
        (self.user, self.courses[0].id, False),
        (self.user, self.courses[1].id, False),
    ]
    self._assert_output(report_rows, *expected_rows)
@ddt.data(0, 1)
def test_not_enough_args(self, num_args):
    """Fewer than two positional arguments produces the usage error."""
    too_few_args = ["dummy" for _ in range(num_args)]
    usage_regex = "^Usage: <OUTPUT_FILENAME> <ORG_ALIASES> --courses=COURSE_ID_LIST$"

    with self.assertRaisesRegexp(CommandError, usage_regex):
        email_opt_in_list.Command().handle(*too_few_args)
def test_file_already_exists(self):
    """The command refuses to run when the output file already exists."""
    temp_file = tempfile.NamedTemporaryFile(delete=True)

    # Close (and thereby delete) the temporary file once the test finishes
    self.addCleanup(temp_file.close)

    with self.assertRaisesRegexp(CommandError, "^File already exists"):
        email_opt_in_list.Command().handle(temp_file.name, self.TEST_ORG)
def _create_courses_and_enrollments(self, *args):
"""Create courses and enrollments.
Created courses and enrollments are stored in instance variables
(`self.courses` and `self.enrollments`) so tests can refer to them later.
Each course's number is its position in `args`, as a string.
Arguments:
*args: Tuples of (course_org, should_enroll), where
course_org is the name of the org in the course key
and should_enroll is a boolean indicating whether to enroll
the user in the course.
Returns:
None
"""
for course_number, (course_org, should_enroll) in enumerate(args):
course = CourseFactory.create(org=course_org, number=str(course_number))
if should_enroll:
# Enroll the shared test user and remember the enrollment by course key
enrollment = CourseEnrollmentFactory.create(
is_active=True,
course_id=course.id,
user=self.user
)
self.enrollments[course.id].append(enrollment)
# NOTE(review): indentation was lost in this copy; presumably the course is
# recorded whether or not the user was enrolled -- confirm against the original file.
self.courses.append(course)
def _set_opt_in_pref(self, user, org, is_opted_in):
    """Record an email opt-in preference for the user through the profile API.

    Arguments:
        user (User): The user model.
        org (unicode): The org in the course key.
        is_opted_in (bool): Whether the user is opted in or out of emails.

    Returns:
        None

    """
    username = user.username
    profile_api.update_email_opt_in(username, org, is_opted_in)
def _latest_pref_set_date(self, user):
    """Return when the user's most recently modified org tag was set.

    All of the user's org tags are considered, regardless of org or key.

    Arguments:
        user (User): The user whose preference was set.

    Returns:
        ISO-formatted date string or empty string

    """
    tags_newest_first = UserOrgTag.objects.filter(user=user).order_by("-modified")
    if len(tags_newest_first) > 0:
        return tags_newest_first[0].modified.isoformat(' ')
    return ""
def _run_command(self, org, other_names=None, only_courses=None, query_interval=None):
    """Run the email opt-in list command and read back the report it wrote.

    Arguments:
        org (unicode): The org to generate the report for.

    Keyword Arguments:
        other_names (list): List of other aliases for the org.
        only_courses (list): If provided, include only these course IDs in the report.
        query_interval (int): If provided, override the default query interval.

    Returns:
        list: The rows of the generated CSV report.  Each item is a dictionary.

    """
    # The report goes into a throwaway directory that is removed on cleanup
    temp_dir_path = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, temp_dir_path)
    output_path = os.path.join(temp_dir_path, self.OUTPUT_FILE_NAME)

    # Normalize the arguments into the form the command expects
    org_list = [org] + ([] if other_names is None else other_names)
    courses_option = None
    if only_courses is not None:
        courses_option = ",".join(unicode(course_id) for course_id in only_courses)

    command = email_opt_in_list.Command()

    # A smaller query interval lets tests exercise batching cheaply
    if query_interval is not None:
        command.QUERY_INTERVAL = query_interval

    command.handle(output_path, *org_list, courses=courses_option)

    # Read the report back as a list of dictionaries
    try:
        with open(output_path) as output_file:
            reader = csv.DictReader(output_file, fieldnames=self.OUTPUT_FIELD_NAMES)
            rows = list(reader)
    except IOError:
        self.fail("Could not find or open output file at '{path}'".format(path=output_path))

    return rows
def _assert_output(self, output, *args, **kwargs):
    """Check the output of the report.

    The report must contain exactly one row per expected tuple, and
    each expected row must be present (in any order).

    Arguments:
        output (list): List of rows in the output CSV file.
        *args: Tuples of (user, course_id, opt_in_pref)

    Keyword Arguments:
        expect_pref_datetime (bool): If false, expect an empty
            string for the preference date.  Defaults to true.

    Returns:
        None

    Raises:
        AssertionError

    """
    expect_pref_datetime = kwargs.get("expect_pref_datetime", True)

    self.assertEqual(len(output), len(args))

    for user, course_id, opt_in_pref in args:
        # Look up the date for the user in this row rather than always
        # using self.user, so rows for other users are checked correctly.
        expected_date = (
            self._latest_pref_set_date(user)
            if expect_pref_datetime
            else ""
        )
        self.assertIn({
            "email": user.email.encode('utf-8'),
            "full_name": user.profile.name.encode('utf-8'),
            "course_id": unicode(course_id).encode('utf-8'),
            "is_opted_in_for_email": unicode(opt_in_pref),
            "preference_set_date": expected_date
        }, output)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment