Commit e617b248 by Clinton Blackburn Committed by GitHub

Updated programs-related data loaders (#217)

- Creating programs only based on data from the Programs API
- Pulling additional info from the marketing site
- Associating programs with courses

ECOM-5099
parent 82ee72f0
import abc
import html2text
from dateutil.parser import parse
from django.utils.functional import cached_property
from edx_rest_api_client.client import EdxRestApiClient
......@@ -78,6 +79,15 @@ class AbstractDataLoader(metaclass=abc.ABCMeta):
return {k: cls.clean_string(v) for k, v in data.items()}
@classmethod
def clean_html(cls, content):
"""Cleans HTML from a string and returns a Markdown version."""
stripped = content.replace(' ', '')
html_converter = html2text.HTML2Text()
html_converter.wrap_links = False
html_converter.body_width = None
return html_converter.handle(stripped).strip()
@classmethod
def parse_date(cls, date_string):
"""
Returns a parsed date.
......@@ -113,3 +123,12 @@ class AbstractDataLoader(metaclass=abc.ABCMeta):
""" Remove orphaned objects from the database. """
for model in (Image, Person, Video):
delete_orphans(model)
@classmethod
def get_or_create_video(cls, url):
video = None
if url:
video, __ = Video.objects.get_or_create(src=url)
return video
......@@ -238,8 +238,8 @@ class EcommerceApiDataLoader(AbstractDataLoader):
class ProgramsApiDataLoader(AbstractDataLoader):
""" Loads programs from the Programs API. """
image_width = 435
image_height = 145
image_width = 1440
image_height = 480
def ingest(self):
api_url = self.partner.programs_api_url
......@@ -281,15 +281,31 @@ class ProgramsApiDataLoader(AbstractDataLoader):
program, __ = Program.objects.update_or_create(uuid=uuid, defaults=defaults)
organizations = []
for org in body['organizations']:
organization, __ = Organization.objects.get_or_create(
key=org['key'], defaults={'name': org['display_name'], 'partner': self.partner}
)
organizations.append(organization)
org_keys = [org['key'] for org in body['organizations']]
organizations = Organization.objects.filter(key__in=org_keys, partner=self.partner)
if len(org_keys) != organizations.count():
logger.error('Organizations for program [%s] are invalid!', uuid)
program.authoring_organizations.clear()
program.authoring_organizations.add(*organizations)
course_run_keys = set()
for course_code in body.get('course_codes', []):
course_run_keys.update([course_run['course_key'] for course_run in course_code['run_modes']])
# The course_code key field is technically useless, so we must build the course list from the
# associated course runs.
courses = Course.objects.filter(course_runs__key__in=course_run_keys).distinct()
program.courses.clear()
program.courses.add(*courses)
excluded_course_runs = CourseRun.objects.filter(course__in=courses). \
exclude(key__in=course_run_keys)
program.excluded_course_runs.clear()
program.excluded_course_runs.add(*excluded_course_runs)
program.save()
except Exception: # pylint: disable=broad-except
logger.exception('Failed to load program %s', uuid)
......
import abc
import logging
from urllib.parse import urljoin, urlencode
import html2text
import requests
from django.utils.functional import cached_property
......@@ -152,14 +152,6 @@ class DrupalApiDataLoader(AbstractDataLoader):
logger.warning('Could not find language with ISO code [%s].', iso_code)
return None
def clean_html(self, content):
"""Cleans HTML from a string and returns a Markdown version."""
stripped = content.replace(' ', '')
html_converter = html2text.HTML2Text()
html_converter.wrap_links = False
html_converter.body_width = None
return html_converter.handle(stripped).strip()
def get_courserun_image(self, body):
image = None
image_url = body['image']
......@@ -170,9 +162,9 @@ class DrupalApiDataLoader(AbstractDataLoader):
return image
class MarketingSiteDataLoader(AbstractDataLoader):
class AbstractMarketingSiteDataLoader(AbstractDataLoader):
def __init__(self, partner, api_url, access_token=None, token_type=None):
super(MarketingSiteDataLoader, self).__init__(partner, api_url, access_token, token_type)
super(AbstractMarketingSiteDataLoader, self).__init__(partner, api_url, access_token, token_type)
if not (self.partner.marketing_site_api_username and self.partner.marketing_site_api_password):
msg = 'Marketing Site API credentials are not properly configured for Partner [{partner}]!'.format(
......@@ -200,30 +192,23 @@ class MarketingSiteDataLoader(AbstractDataLoader):
return session
def ingest(self): # pragma: no cover
""" Load data for all supported objects (e.g. courses, runs). """
# TODO Ingest schools
# TODO Ingest instructors
# TODO Ingest course runs (courses)
self.retrieve_and_ingest_node_type('xseries', self.update_xseries)
def get_query_kwargs(self):
return {}
def retrieve_and_ingest_node_type(self, node_type, update_method):
"""
Retrieves all nodes of the specified type, and calls `update_method` for each node.
Args:
node_type (str): Type of node to retrieve (e.g. course, xseries, school, instructor)
update_method: Method to which the retrieved data should be passed.
"""
def ingest(self):
""" Load data for all supported objects (e.g. courses, runs). """
page = 0
query_kwargs = self.get_query_kwargs()
while page is not None and page >= 0:
while page is not None and page >= 0: # pragma: no cover
kwargs = {
'type': node_type,
'type': self.node_type,
'max-depth': 2,
'load-entity-refs': 'subject,file,taxonomy_term,taxonomy_vocabulary,node,field_collection_item',
'page': page,
}
kwargs.update(query_kwargs)
qs = urlencode(kwargs)
url = '{root}/node.json?{qs}'.format(root=self.api_url, qs=qs)
response = self.api_client.get(url)
......@@ -241,7 +226,7 @@ class MarketingSiteDataLoader(AbstractDataLoader):
try:
url = datum['url']
datum = self.clean_strings(datum)
update_method(datum)
self.process_node(datum)
except: # pylint: disable=bare-except
logger.exception('Failed to load %s.', url)
......@@ -250,16 +235,54 @@ class MarketingSiteDataLoader(AbstractDataLoader):
else:
break
def update_xseries(self, data):
def _get_nested_url(self, field):
""" Helper method that retrieves the nested `url` field in the specified field, if it exists.
This works around the fact that Drupal represents empty objects as arrays instead of objects."""
field = field or {}
return field.get('url')
@abc.abstractmethod
def process_node(self, data): # pragma: no cover
pass
@abc.abstractproperty
def node_type(self): # pragma: no cover
pass
class XSeriesMarketingSiteDataLoader(AbstractMarketingSiteDataLoader):
@property
def node_type(self):
return 'xseries'
def process_node(self, data):
marketing_slug = data['url'].split('/')[-1]
card_image_url = data.get('field_card_image', {}).get('url')
defaults = {
'title': data['title'],
try:
program = Program.objects.get(marketing_slug=marketing_slug, partner=self.partner)
except Program.DoesNotExist:
logger.error('Program [%s] exists on the marketing site, but not in the Programs Service!', marketing_slug)
return None
card_image_url = self._get_nested_url(data.get('field_card_image'))
video_url = self._get_nested_url(data.get('field_product_video'))
# NOTE (CCB): Remove the heading at the beginning of the overview. Why this isn't part of the template
# is beyond me. It's just silly.
overview = self.clean_html(data['body']['value'])
overview = overview.lstrip('### XSeries Program Overview').strip()
data = {
'subtitle': data.get('field_xseries_subtitle_short'),
'category': 'XSeries',
'partner': self.partner,
'card_image_url': card_image_url,
'overview': overview,
'video': self.get_or_create_video(video_url)
}
Program.objects.update_or_create(marketing_slug=marketing_slug, defaults=defaults)
for field, value in data.items():
setattr(program, field, value)
program.save()
logger.info('Processed XSeries with marketing_slug [%s].', marketing_slug)
return program
......@@ -18,8 +18,8 @@ from course_discovery.apps.course_metadata.models import (
)
from course_discovery.apps.course_metadata.tests import mock_data
from course_discovery.apps.course_metadata.tests.factories import (
CourseRunFactory, SeatFactory, ImageFactory, PersonFactory, VideoFactory
)
CourseRunFactory, SeatFactory, ImageFactory, PersonFactory, VideoFactory,
OrganizationFactory, CourseFactory)
LOGGER_PATH = 'course_discovery.apps.course_metadata.data_loaders.api.logger'
......@@ -354,8 +354,28 @@ class ProgramsApiDataLoaderTests(ApiClientTestMixin, DataLoaderTestMixin, TestCa
def api_url(self):
return self.partner.programs_api_url
def create_mock_organizations(self, programs):
for program in programs:
for organization in program.get('organizations', []):
OrganizationFactory(key=organization['key'], partner=self.partner)
def create_mock_courses_and_runs(self, programs):
for program in programs:
for course_code in program.get('course_codes', []):
key = '{org}+{course}'.format(org=course_code['organization']['key'], course=course_code['key'])
course = CourseFactory(key=key, partner=self.partner)
for course_run in course_code['run_modes']:
CourseRunFactory(course=course, key=course_run['course_key'])
# Add an additional course run that should be excluded
CourseRunFactory(course=course)
def mock_api(self):
bodies = mock_data.PROGRAMS_API_BODIES
self.create_mock_organizations(bodies)
self.create_mock_courses_and_runs(bodies)
url = self.api_url + 'programs/'
responses.add_callback(
responses.GET,
......@@ -369,7 +389,7 @@ class ProgramsApiDataLoaderTests(ApiClientTestMixin, DataLoaderTestMixin, TestCa
def assert_program_loaded(self, body):
""" Assert a Program corresponding to the specified data body was properly loaded into the database. """
program = Program.objects.get(uuid=AbstractDataLoader.clean_string(body['uuid']))
program = Program.objects.get(uuid=AbstractDataLoader.clean_string(body['uuid']), partner=self.partner)
self.assertEqual(program.title, body['name'])
for attr in ('subtitle', 'category', 'status', 'marketing_slug',):
......@@ -380,9 +400,20 @@ class ProgramsApiDataLoaderTests(ApiClientTestMixin, DataLoaderTestMixin, TestCa
self.assertEqual(keys, [org.key for org in expected_organizations])
self.assertListEqual(list(program.authoring_organizations.all()), expected_organizations)
banner_image_url = body.get('banner_image_urls', {}).get('w435h145')
banner_image_url = body.get('banner_image_urls', {}).get('w1440h480')
self.assertEqual(program.banner_image_url, banner_image_url)
course_run_keys = set()
course_codes = body.get('course_codes', [])
for course_code in course_codes:
course_run_keys.update([course_run['course_key'] for course_run in course_code['run_modes']])
courses = list(Course.objects.filter(course_runs__key__in=course_run_keys).distinct().order_by('key'))
self.assertEqual(list(program.courses.order_by('key')), courses)
# Verify the additional course runs added in create_mock_courses_and_runs are excluded.
self.assertEqual(program.excluded_course_runs.count(), len(course_codes))
@responses.activate
def test_ingest(self):
""" Verify the method ingests data from the Organizations API. """
......@@ -401,3 +432,19 @@ class ProgramsApiDataLoaderTests(ApiClientTestMixin, DataLoaderTestMixin, TestCa
self.assert_program_loaded(datum)
self.loader.ingest()
@responses.activate
def test_ingest_with_missing_organizations(self):
api_data = self.mock_api()
Organization.objects.all().delete()
self.assertEqual(Program.objects.count(), 0)
self.assertEqual(Organization.objects.count(), 0)
with mock.patch(LOGGER_PATH) as mock_logger:
self.loader.ingest()
calls = [mock.call('Organizations for program [%s] are invalid!', datum['uuid']) for datum in api_data]
mock_logger.error.assert_has_calls(calls)
self.assertEqual(Program.objects.count(), len(api_data))
self.assertEqual(Organization.objects.count(), 0)
......@@ -8,14 +8,15 @@ from django.test import TestCase
from opaque_keys.edx.keys import CourseKey
from course_discovery.apps.course_metadata.data_loaders.marketing_site import (
DrupalApiDataLoader, MarketingSiteDataLoader
DrupalApiDataLoader, XSeriesMarketingSiteDataLoader,
)
from course_discovery.apps.course_metadata.data_loaders.tests import JSON
from course_discovery.apps.course_metadata.data_loaders.tests.mixins import ApiClientTestMixin, DataLoaderTestMixin
from course_discovery.apps.course_metadata.models import (
Course, CourseOrganization, CourseRun, Organization, Person, Subject, Program
Course, CourseOrganization, CourseRun, Organization, Person, Subject, Program, Video,
)
from course_discovery.apps.course_metadata.tests import mock_data
from course_discovery.apps.course_metadata.tests.factories import ProgramFactory
from course_discovery.apps.ietf_language_tags.models import LanguageTag
ENGLISH_LANGUAGE_TAG = LanguageTag(code='en-us', name='English - United States')
......@@ -215,26 +216,11 @@ class DrupalApiDataLoaderTests(ApiClientTestMixin, DataLoaderTestMixin, TestCase
self.assertEqual(self.loader.get_language_tag(body), expected)
class MarketingSiteDataLoaderTests(DataLoaderTestMixin, TestCase):
loader_class = MarketingSiteDataLoader
LOGIN_COOKIE = ('session_id', 'abc123')
class AbstractMarketingSiteDataLoaderTestMixin(DataLoaderTestMixin):
@property
def api_url(self):
return self.partner.marketing_site_url_root
def mock_login_response(self, failure=False):
url = self.api_url + 'user'
landing_url = '{base}users/{username}'.format(base=self.api_url,
username=self.partner.marketing_site_api_username)
status = 500 if failure else 302
adding_headers = {}
if not failure:
adding_headers['Location'] = landing_url
responses.add(responses.POST, url, status=status, adding_headers=adding_headers)
responses.add(responses.GET, landing_url)
def mock_api_callback(self, url, data):
""" Paginate the data, one item per page. """
......@@ -260,8 +246,68 @@ class MarketingSiteDataLoaderTests(DataLoaderTestMixin, TestCase):
return request_callback
def mock_login_response(self, failure=False):
url = self.api_url + 'user'
landing_url = '{base}users/{username}'.format(base=self.api_url,
username=self.partner.marketing_site_api_username)
status = 500 if failure else 302
adding_headers = {}
if not failure:
adding_headers['Location'] = landing_url
responses.add(responses.POST, url, status=status, adding_headers=adding_headers)
responses.add(responses.GET, landing_url)
def mock_api_failure(self):
url = self.api_url + 'node.json'
responses.add(responses.GET, url, status=500)
@responses.activate
def test_ingest_with_api_failure(self):
self.mock_login_response()
self.mock_api_failure()
with self.assertRaises(Exception):
self.loader.ingest()
@responses.activate
def test_ingest_exception_handling(self):
""" Verify the data loader properly handles exceptions during processing of the data from the API. """
self.mock_login_response()
api_data = self.mock_api()
with mock.patch.object(self.loader, 'clean_strings', side_effect=Exception):
with mock.patch(LOGGER_PATH) as mock_logger:
self.loader.ingest()
self.assertEqual(mock_logger.exception.call_count, len(api_data))
calls = [mock.call('Failed to load %s.', datum['url']) for datum in api_data]
mock_logger.exception.assert_has_calls(calls)
@responses.activate
def test_api_client_login_failure(self):
self.mock_login_response(failure=True)
with self.assertRaises(Exception):
self.loader.api_client # pylint: disable=pointless-statement
def test_constructor_without_credentials(self):
""" Verify the constructor raises an exception if the Partner has no marketing site credentials set. """
self.partner.marketing_site_api_username = None
with self.assertRaises(Exception):
self.loader_class(self.partner, self.api_url) # pylint: disable=not-callable
class XSeriesMarketingSiteDataLoaderTests(AbstractMarketingSiteDataLoaderTestMixin, TestCase):
loader_class = XSeriesMarketingSiteDataLoader
LOGIN_COOKIE = ('session_id', 'abc123')
def create_mock_programs(self, programs):
for program in programs:
marketing_slug = program['url'].split('/')[-1]
ProgramFactory(marketing_slug=marketing_slug, partner=self.partner)
def mock_api(self):
bodies = mock_data.MARKETING_SITE_API_XSERIES_BODIES
self.create_mock_programs(bodies)
url = self.api_url + 'node.json'
responses.add_callback(
......@@ -273,63 +319,49 @@ class MarketingSiteDataLoaderTests(DataLoaderTestMixin, TestCase):
return bodies
def mock_api_failure(self):
url = self.api_url + 'node.json'
responses.add(responses.GET, url, status=500)
def assert_program_loaded(self, data):
marketing_slug = data['url'].split('/')[-1]
program = Program.objects.get(marketing_slug=marketing_slug)
program = Program.objects.get(marketing_slug=marketing_slug, partner=self.partner)
overview = self.loader.clean_html(data['body']['value'])
overview = overview.lstrip('### XSeries Program Overview').strip()
self.assertEqual(program.overview, overview)
self.assertEqual(program.title, data['title'])
self.assertEqual(program.subtitle, data.get('field_xseries_subtitle_short'))
self.assertEqual(program.category, 'XSeries')
self.assertEqual(program.partner, self.partner)
card_image_url = data.get('field_card_image', {}).get('url')
self.assertEqual(program.card_image_url, card_image_url)
def test_constructor_without_credentials(self):
""" Verify the constructor raises an exception if the Partner has no marketing site credentials set. """
self.partner.marketing_site_api_username = None
with self.assertRaises(Exception):
self.loader_class(self.partner, self.api_url)
@responses.activate
def test_api_client_login_failure(self):
self.mock_login_response(failure=True)
with self.assertRaises(Exception):
self.loader.api_client # pylint: disable=pointless-statement
video_url = data.get('field_product_video', {}).get('url')
if video_url:
video = Video.objects.get(src=video_url)
self.assertEqual(program.video, video)
@responses.activate
def test_ingest(self):
self.mock_login_response()
api_data = self.mock_api()
self.assertEqual(Program.objects.count(), 0)
self.loader.ingest()
for datum in api_data:
self.assert_program_loaded(datum)
@responses.activate
def test_ingest_with_api_failure(self):
def test_ingest_with_missing_programs(self):
""" Verify ingestion properly logs issues when programs exist on the marketing site,
but not the Programs API. """
self.mock_login_response()
self.mock_api_failure()
api_data = self.mock_api()
with self.assertRaises(Exception):
self.loader.ingest()
Program.objects.all().delete()
self.assertEqual(Program.objects.count(), 0)
@responses.activate
def test_ingest_exception_handling(self):
""" Verify the data loader properly handles exceptions during processing of the data from the API. """
self.mock_login_response()
api_data = self.mock_api()
with mock.patch(LOGGER_PATH) as mock_logger:
self.loader.ingest()
self.assertEqual(Program.objects.count(), 0)
with mock.patch.object(self.loader, 'clean_strings', side_effect=Exception):
with mock.patch(LOGGER_PATH) as mock_logger:
self.loader.ingest()
self.assertEqual(mock_logger.exception.call_count, len(api_data))
calls = [mock.call('Failed to load %s.', datum['url']) for datum in api_data]
mock_logger.exception.assert_has_calls(calls)
calls = [mock.call('Program [%s] exists on the marketing site, but not in the Programs Service!',
datum['url'].split('/')[-1]) for datum in api_data]
mock_logger.error.assert_has_calls(calls)
......@@ -8,7 +8,7 @@ from course_discovery.apps.course_metadata.data_loaders.api import (
CoursesApiDataLoader, OrganizationsApiDataLoader, EcommerceApiDataLoader, ProgramsApiDataLoader,
)
from course_discovery.apps.course_metadata.data_loaders.marketing_site import (
DrupalApiDataLoader, MarketingSiteDataLoader,
DrupalApiDataLoader, XSeriesMarketingSiteDataLoader,
)
logger = logging.getLogger(__name__)
......@@ -83,7 +83,7 @@ class Command(BaseCommand):
(partner.ecommerce_api_url, EcommerceApiDataLoader,),
(partner.programs_api_url, ProgramsApiDataLoader,),
(partner.marketing_site_api_url, DrupalApiDataLoader,),
(partner.marketing_site_url_root, MarketingSiteDataLoader,),
(partner.marketing_site_url_root, XSeriesMarketingSiteDataLoader,),
)
for api_url, loader_class in data_loaders:
......
......@@ -11,7 +11,7 @@ from course_discovery.apps.course_metadata.data_loaders.api import (
CoursesApiDataLoader, OrganizationsApiDataLoader, EcommerceApiDataLoader, ProgramsApiDataLoader,
)
from course_discovery.apps.course_metadata.data_loaders.marketing_site import (
DrupalApiDataLoader, MarketingSiteDataLoader,
DrupalApiDataLoader, XSeriesMarketingSiteDataLoader,
)
from course_discovery.apps.course_metadata.models import Course, CourseRun, Organization, Program
from course_discovery.apps.course_metadata.tests import mock_data
......@@ -165,6 +165,6 @@ class RefreshCourseMetadataCommandTests(TestCase):
call_command('refresh_course_metadata')
loader_classes = (OrganizationsApiDataLoader, CoursesApiDataLoader, EcommerceApiDataLoader,
ProgramsApiDataLoader, DrupalApiDataLoader, MarketingSiteDataLoader)
ProgramsApiDataLoader, DrupalApiDataLoader, XSeriesMarketingSiteDataLoader)
expected_calls = [mock.call('%s failed!', loader_class.__name__) for loader_class in loader_classes]
mock_logger.exception.assert_has_calls(expected_calls)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment