Commit fe5f29da by Peter Fogg

Ingest data from Drupal.

ECOM-3983
parent 39214f4d
""" Data loaders. """ """ Data loaders. """
import abc import abc
import logging import logging
from urllib.parse import urljoin
from dateutil.parser import parse from dateutil.parser import parse
from django.conf import settings
from edx_rest_api_client.client import EdxRestApiClient from edx_rest_api_client.client import EdxRestApiClient
import html2text
from opaque_keys.edx.keys import CourseKey from opaque_keys.edx.keys import CourseKey
from course_discovery.apps.course_metadata.models import ( from course_discovery.apps.course_metadata.models import (
Organization, Image, Course, CourseRun, CourseOrganization, Video Course, CourseOrganization, CourseRun, Image, LanguageTag, LevelType, Organization, Subject, Video
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -24,7 +27,7 @@ class AbstractDataLoader(metaclass=abc.ABCMeta): ...@@ -24,7 +27,7 @@ class AbstractDataLoader(metaclass=abc.ABCMeta):
PAGE_SIZE = 50 PAGE_SIZE = 50
def __init__(self, api_url, access_token): def __init__(self, api_url, access_token=None):
""" """
Arguments: Arguments:
api_url (str): URL of the API from which data is loaded api_url (str): URL of the API from which data is loaded
...@@ -68,6 +71,21 @@ class AbstractDataLoader(metaclass=abc.ABCMeta): ...@@ -68,6 +71,21 @@ class AbstractDataLoader(metaclass=abc.ABCMeta):
return None return None
@classmethod
def convert_course_run_key(cls, course_run_key_str):
    """
    Derive the serialized course key from a serialized course run key.

    The run key is parsed with `CourseKey.from_string`, and its `org` and
    `course` parts are joined with a `+`.

    Args:
        course_run_key_str (str): The serialized course run key.

    Returns:
        str: The serialized course key, in the form `<org>+<course>`.
    """
    parsed_key = CourseKey.from_string(course_run_key_str)
    template = '{org}+{course}'
    return template.format(org=parsed_key.org, course=parsed_key.course)
class OrganizationsApiDataLoader(AbstractDataLoader): class OrganizationsApiDataLoader(AbstractDataLoader):
""" Loads organizations from the Organizations API. """ """ Loads organizations from the Organizations API. """
...@@ -141,9 +159,10 @@ class CoursesApiDataLoader(AbstractDataLoader): ...@@ -141,9 +159,10 @@ class CoursesApiDataLoader(AbstractDataLoader):
def update_course(self, body): def update_course(self, body):
# NOTE (CCB): Use the data from the CourseKey since the Course API exposes display names for org and number, # NOTE (CCB): Use the data from the CourseKey since the Course API exposes display names for org and number,
# which may not be unique for an organization. # which may not be unique for an organization.
course_run_key = CourseKey.from_string(body['id']) course_run_key_str = body['id']
course_run_key = CourseKey.from_string(course_run_key_str)
organization, __ = Organization.objects.get_or_create(key=course_run_key.org) organization, __ = Organization.objects.get_or_create(key=course_run_key.org)
course_key = '{org}+{course}'.format(org=organization.key, course=course_run_key.course) course_key = self.convert_course_run_key(course_run_key_str)
defaults = { defaults = {
'title': body['name'] 'title': body['name']
} }
...@@ -202,3 +221,86 @@ class CoursesApiDataLoader(AbstractDataLoader): ...@@ -202,3 +221,86 @@ class CoursesApiDataLoader(AbstractDataLoader):
video, __ = Video.objects.get_or_create(src=video_url) video, __ = Video.objects.get_or_create(src=video_url)
return video return video
class DrupalApiDataLoader(AbstractDataLoader):
    """Loads course and course run marketing data from the Drupal API.

    Only records that already exist locally are updated: a course or course
    run that cannot be found is logged and skipped rather than created.
    """

    def ingest(self):
        """Fetch all course runs from the Drupal API and update matching local records."""
        client = EdxRestApiClient(self.api_url)

        logger.info('Refreshing Courses and CourseRuns from %s...', self.api_url)
        response = client.courses.get()

        data = response['items']
        logger.info('Retrieved %d course runs...', len(data))

        for body in data:
            # NOTE(review): clean_strings is inherited and not visible in this diff;
            # presumably it normalizes string values in the body -- confirm.
            cleaned_body = self.clean_strings(body)
            course = self.update_course(cleaned_body)

            if course is None:
                # update_course has already logged the miss. Skip the run so an
                # existing CourseRun never has its course overwritten with None.
                continue

            self.update_course_run(course, cleaned_body)

        logger.info('Retrieved %d course runs from %s.', len(data), self.api_url)

    def update_course(self, body):
        """Update an existing course from Drupal data given by `body`.

        Args:
            body (dict): A single item from the Drupal API response.

        Returns:
            Course: The updated course, or None if no course with the derived
                key exists.
        """
        course_key = self.convert_course_run_key(body['course_id'])

        try:
            course = Course.objects.get(key=course_key)
        except Course.DoesNotExist:
            logger.warning('Could not find course [%s]', course_key)
            return None

        course.full_description = self.clean_html(body['description'])
        course.short_description = self.clean_html(body['subtitle'])
        course.marketing_url = urljoin(settings.MARKETING_URL_ROOT, body['course_about_uri'])

        level_type, __ = LevelType.objects.get_or_create(name=body['level']['title'])
        course.level_type = level_type

        self.set_subjects(course, body)
        course.save()
        return course

    def set_subjects(self, course, body):
        """Replace the subjects of `course` with the subjects listed in `body`."""
        course.subjects.clear()

        for subject_data in body['subjects']:
            # Normalize subject names with title case
            subject, __ = Subject.objects.get_or_create(name=subject_data['title'].title())
            course.subjects.add(subject)

    def update_course_run(self, course, body):
        """
        Update an existing run of `course` from Drupal data given by `body`.

        Args:
            course (Course): The course the run should be attached to.
            body (dict): A single item from the Drupal API response.

        Returns:
            CourseRun: The updated course run, or None if no course run with
                the given key exists.
        """
        course_run_key = body['course_id']

        try:
            course_run = CourseRun.objects.get(key=course_run_key)
        except CourseRun.DoesNotExist:
            logger.warning('Could not find course run [%s]', course_run_key)
            return None

        course_run.language = self.get_language_tag(body)
        course_run.course = course
        course_run.save()
        return course_run

    def get_language_tag(self, body):
        """Get a language tag from Drupal data given by `body`.

        Returns:
            LanguageTag: The tag whose code matches `body['current_language']`,
                or None if the code is missing, empty, or unknown.
        """
        iso_code = body['current_language']

        # A missing or empty code can never match a tag; skip the lookup (and
        # the misleading warning) entirely.
        if not iso_code:
            return None

        try:
            return LanguageTag.objects.get(code=iso_code)
        except LanguageTag.DoesNotExist:
            logger.warning('Could not find language with ISO code [%s].', iso_code)
            return None

    def clean_html(self, content):
        """Cleans HTML from a string and returns a Markdown version.

        Args:
            content (str): The HTML to convert.

        Returns:
            str: The Markdown rendering, with surrounding whitespace stripped.
        """
        # Drop non-breaking spaces, both as the HTML entity and as the literal
        # character. Ordinary spaces must be kept: removing them would corrupt
        # tags that carry attributes (e.g. `<a href="...">`).
        stripped = content.replace('&nbsp;', '').replace('\xa0', '')

        html_converter = html2text.HTML2Text()
        html_converter.wrap_links = False
        # NOTE(review): presumably disables hard-wrapping of the output -- confirm
        # against the html2text documentation.
        html_converter.body_width = None
        return html_converter.handle(stripped).strip()
...@@ -4,7 +4,11 @@ from django.conf import settings ...@@ -4,7 +4,11 @@ from django.conf import settings
from django.core.management import BaseCommand from django.core.management import BaseCommand
from edx_rest_api_client.client import EdxRestApiClient from edx_rest_api_client.client import EdxRestApiClient
from course_discovery.apps.course_metadata.data_loaders import OrganizationsApiDataLoader, CoursesApiDataLoader from course_discovery.apps.course_metadata.data_loaders import (
CoursesApiDataLoader,
DrupalApiDataLoader,
OrganizationsApiDataLoader,
)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -39,3 +43,4 @@ class Command(BaseCommand): ...@@ -39,3 +43,4 @@ class Command(BaseCommand):
OrganizationsApiDataLoader(settings.ORGANIZATIONS_API_URL, access_token).ingest() OrganizationsApiDataLoader(settings.ORGANIZATIONS_API_URL, access_token).ingest()
CoursesApiDataLoader(settings.COURSES_API_URL, access_token).ingest() CoursesApiDataLoader(settings.COURSES_API_URL, access_token).ingest()
DrupalApiDataLoader(settings.MARKETING_API_URL).ingest()
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import migrations, models
class Migration(migrations.Migration):
    """Adds a nullable `marketing_url` URL field to `course` and `historicalcourse`."""

    dependencies = [
        ('course_metadata', '0001_initial'),
    ]

    # The same column is added to the model and to its historical counterpart.
    operations = [
        migrations.AddField(
            model_name=target_model,
            name='marketing_url',
            field=models.URLField(max_length=255, null=True, blank=True),
        )
        for target_model in ('course', 'historicalcourse')
    ]
...@@ -127,6 +127,7 @@ class Course(TimeStampedModel): ...@@ -127,6 +127,7 @@ class Course(TimeStampedModel):
expected_learning_items = SortedManyToManyField(ExpectedLearningItem, blank=True) expected_learning_items = SortedManyToManyField(ExpectedLearningItem, blank=True)
image = models.ForeignKey(Image, default=None, null=True, blank=True) image = models.ForeignKey(Image, default=None, null=True, blank=True)
video = models.ForeignKey(Video, default=None, null=True, blank=True) video = models.ForeignKey(Video, default=None, null=True, blank=True)
marketing_url = models.URLField(max_length=255, null=True, blank=True)
history = HistoricalRecords() history = HistoricalRecords()
......
""" Tests for data loaders. """ """ Tests for data loaders. """
import datetime import datetime
import json import json
from urllib.parse import parse_qs, urlparse from urllib.parse import parse_qs, urlparse, urljoin
import ddt import ddt
import responses import responses
from django.conf import settings from django.conf import settings
from django.test import TestCase, override_settings from django.test import TestCase, override_settings
from opaque_keys.edx.keys import CourseKey
from course_discovery.apps.course_metadata.data_loaders import OrganizationsApiDataLoader, CoursesApiDataLoader, \ from course_discovery.apps.course_metadata.data_loaders import (
AbstractDataLoader OrganizationsApiDataLoader, CoursesApiDataLoader, AbstractDataLoader, DrupalApiDataLoader
from course_discovery.apps.course_metadata.models import Organization, Image, Course, CourseRun )
from course_discovery.apps.course_metadata.models import (
Course, CourseRun, Image, LanguageTag, Organization, Subject
)
ACCESS_TOKEN = 'secret' ACCESS_TOKEN = 'secret'
COURSES_API_URL = 'https://lms.example.com/api/courses/v1' COURSES_API_URL = 'https://lms.example.com/api/courses/v1'
ORGANIZATIONS_API_URL = 'https://lms.example.com/api/organizations/v0' ORGANIZATIONS_API_URL = 'https://lms.example.com/api/organizations/v0'
MARKETING_API_URL = 'https://example.com/api/catalog/v2/'
JSON = 'application/json' JSON = 'application/json'
...@@ -53,10 +58,11 @@ class DataLoaderTestMixin(object): ...@@ -53,10 +58,11 @@ class DataLoaderTestMixin(object):
super(DataLoaderTestMixin, self).setUp() super(DataLoaderTestMixin, self).setUp()
self.loader = self.loader_class(self.api_url, ACCESS_TOKEN) # pylint: disable=not-callable self.loader = self.loader_class(self.api_url, ACCESS_TOKEN) # pylint: disable=not-callable
def assert_api_called(self, expected_num_calls, check_auth=True):
    """
    Asserts the API was called the expected number of times and, unless `check_auth`
    is False, that the first request carried the appropriate Authorization header.
    """
    self.assertEqual(len(responses.calls), expected_num_calls)

    if not check_auth:
        return

    first_request = responses.calls[0].request
    self.assertEqual(first_request.headers['Authorization'], 'Bearer {}'.format(ACCESS_TOKEN))
def test_init(self): def test_init(self):
""" Verify the constructor sets the appropriate attributes. """ """ Verify the constructor sets the appropriate attributes. """
...@@ -287,7 +293,7 @@ class CoursesApiDataLoaderTests(DataLoaderTestMixin, TestCase): ...@@ -287,7 +293,7 @@ class CoursesApiDataLoaderTests(DataLoaderTestMixin, TestCase):
expected_num_course_runs = len(data) expected_num_course_runs = len(data)
self.assert_api_called(expected_num_course_runs) self.assert_api_called(expected_num_course_runs)
# Verify the Organizations were created correctly # Verify the CourseRuns were created correctly
self.assertEqual(CourseRun.objects.count(), expected_num_course_runs) self.assertEqual(CourseRun.objects.count(), expected_num_course_runs)
for datum in data: for datum in data:
...@@ -350,3 +356,190 @@ class CoursesApiDataLoaderTests(DataLoaderTestMixin, TestCase): ...@@ -350,3 +356,190 @@ class CoursesApiDataLoaderTests(DataLoaderTestMixin, TestCase):
self.assertEqual(actual.src, expected_video_src) self.assertEqual(actual.src, expected_video_src)
else: else:
self.assertIsNone(actual) self.assertIsNone(actual)
@override_settings(MARKETING_API_URL=MARKETING_API_URL)
@ddt.ddt
class DrupalApiDataLoaderTests(DataLoaderTestMixin, TestCase):
    """ Tests for DrupalApiDataLoader. """

    # Courses which exist locally, each with one course run
    EXISTING_COURSE_AND_RUN_DATA = ({
        'course_run_key': 'course-v1:SC+BreadX+3T2015',
        'course_key': 'SC+BreadX',
        'title': 'Bread Baking 101',
        'current_language': 'en-us',
    }, {
        'course_run_key': 'course-v1:TX+T201+3T2015',
        'course_key': 'TX+T201',
        'title': 'Testing 201',
        'current_language': ''
    })

    # A course which exists, but has no associated runs
    EXISTING_COURSE = {
        'course_key': 'PartialX+P102',
        'title': 'A partial course',
    }

    api_url = MARKETING_API_URL
    loader_class = DrupalApiDataLoader

    def setUp(self):
        """Create the courses and course runs the loader is expected to find."""
        super(DrupalApiDataLoaderTests, self).setUp()

        for existing in self.EXISTING_COURSE_AND_RUN_DATA:
            course = Course.objects.create(key=existing['course_key'], title=existing['title'])
            language = self.loader.get_language_tag(existing)
            CourseRun.objects.create(key=existing['course_run_key'], language=language, course=course)

        Course.objects.create(key=self.EXISTING_COURSE['course_key'], title=self.EXISTING_COURSE['title'])

    def mock_api(self):
        """Mock out the Drupal API. Returns a list of mocked-out course runs."""
        first_existing, second_existing = self.EXISTING_COURSE_AND_RUN_DATA
        partial_course = self.EXISTING_COURSE

        items = [{
            'title': first_existing['title'],
            'level': {
                'title': 'Introductory',
            },
            'course_about_uri': '/course/bread-baking-101',
            'course_id': first_existing['course_run_key'],
            'subjects': [{
                'title': 'Bread baking',
            }],
            'current_language': first_existing['current_language'],
            'subtitle': 'Learn about Bread',
            'description': '<p><b>Bread</b> is a <a href="/wiki/Staple_food" title="Staple food">staple food</a>.',
        }, {
            'title': second_existing['title'],
            'level': {
                'title': 'Intermediate',
            },
            'course_about_uri': '/course/testing-201',
            'course_id': second_existing['course_run_key'],
            'subjects': [{
                'title': 'testing',
            }],
            'current_language': second_existing['current_language'],
            'subtitle': 'Testing 201',
            'description': "how to test better",
        }, {  # Create a course which exists in LMS/Otto, but without course runs
            'title': partial_course['title'],
            'level': {
                'title': 'Advanced',
            },
            'course_about_uri': '/course/partial-101',
            'course_id': 'course-v1:{course_key}+run'.format(course_key=partial_course['course_key']),
            'subjects': [{
                'title': 'partially fake',
            }],
            'current_language': 'en-us',
            'subtitle': 'Nope',
            'description': 'what is fake?',
        }, {  # Create a fake course run which doesn't exist in LMS/Otto
            'title': 'A partial course',
            'level': {
                'title': 'Advanced',
            },
            'course_about_uri': '/course/partial-101',
            'course_id': 'course-v1:fakeX+fake+reallyfake',
            'subjects': [{
                'title': 'seriously fake',
            }],
            'current_language': 'en-us',
            'subtitle': 'Nope',
            'description': 'what is real?',
        }]

        responses.add(
            responses.GET,
            settings.MARKETING_API_URL + 'courses/',
            body=json.dumps({'items': items}),
            status=200,
            content_type='application/json'
        )

        return items

    def assert_course_run_loaded(self, body):
        """
        Verify that the course run described by `body` was saved correctly,
        along with the course it belongs to.
        """
        run_key_str = body['course_id']
        parsed_key = CourseKey.from_string(run_key_str)
        expected_course_key = '{org}+{course}'.format(org=parsed_key.org, course=parsed_key.course)

        course = Course.objects.get(key=expected_course_key)
        course_run = CourseRun.objects.get(key=run_key_str)

        self.assertEqual(course_run.course, course)
        self.assert_course_loaded(course, body)

        if not course_run.language:
            # A run without a language is only expected when the API sent an empty code
            self.assertEqual(body['current_language'], '')
        else:
            self.assertEqual(course_run.language.code, body['current_language'])

    def assert_course_loaded(self, course, body):
        """Verify that `course` carries the values given by `body`."""
        expected_full_description = self.loader.clean_html(body['description'])
        expected_short_description = self.loader.clean_html(body['subtitle'])
        expected_marketing_url = urljoin(settings.MARKETING_URL_ROOT, body['course_about_uri'])

        self.assertEqual(course.title, body['title'])
        self.assertEqual(course.full_description, expected_full_description)
        self.assertEqual(course.short_description, expected_short_description)
        self.assertEqual(course.marketing_url, expected_marketing_url)
        self.assertEqual(course.level_type.name, body['level']['title'])
        self.assert_subjects_loaded(course, body)

    def assert_subjects_loaded(self, course, body):
        """Verify that the subjects of `course` match those given by `body`."""
        loaded_subjects = course.subjects.all()
        expected_subjects = body['subjects']
        self.assertEqual(len(loaded_subjects), len(expected_subjects))

        for expected in expected_subjects:
            # The loader stores subject names in title case
            subject = Subject.objects.get(name=expected['title'].title())
            self.assertIn(subject, loaded_subjects)

    @responses.activate
    def test_ingest(self):
        """Verify the data loader ingests data from Drupal."""
        api_items = self.mock_api()

        # The last two items (the course without runs and the fake course run)
        # should not be loaded from Drupal
        expected_items = api_items[:-2]

        self.loader.ingest()

        # Drupal does not paginate its response or check authorization
        self.assert_api_called(1, check_auth=False)

        # Assert that no course runs were created for the last two items
        self.assertEqual(CourseRun.objects.count(), len(expected_items))

        for item in expected_items:
            self.assert_course_run_loaded(item)

        # The run-less course must still exist (raises DoesNotExist otherwise)
        Course.objects.get(key=self.EXISTING_COURSE['course_key'], title=self.EXISTING_COURSE['title'])

    @ddt.data(
        ('', ''),
        ('<h1>foo</h1>', '# foo'),
        ('<a href="http://example.com">link</a>', '[link](http://example.com)'),
        ('<strong>foo</strong>', '**foo**'),
        ('<em>foo</em>', '_foo_'),
        ('\nfoo\n', 'foo'),
        ('<span>foo</span>', 'foo'),
        ('<div>foo</div>', 'foo'),
    )
    @ddt.unpack
    def test_clean_html(self, to_clean, expected):
        """Verify HTML is converted to the expected Markdown."""
        actual = self.loader.clean_html(to_clean)
        self.assertEqual(actual, expected)

    @ddt.data(
        ({'current_language': ''}, None),
        ({'current_language': 'not-real'}, None),
        ({'current_language': 'en-us'}, LanguageTag(code='en-us', name='English - United States')),
        ({'current_language': None}, None),
    )
    @ddt.unpack
    def test_get_language_tag(self, body, expected):
        """Verify the language tag lookup handles known, unknown, and missing codes."""
        actual = self.loader.get_language_tag(body)
        self.assertEqual(actual, expected)
...@@ -312,6 +312,8 @@ HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor' ...@@ -312,6 +312,8 @@ HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor'
COURSES_API_URL = 'http://127.0.0.1:8000/api/courses/v1/' COURSES_API_URL = 'http://127.0.0.1:8000/api/courses/v1/'
ECOMMERCE_API_URL = 'http://127.0.0.1:8002/api/v2/' ECOMMERCE_API_URL = 'http://127.0.0.1:8002/api/v2/'
ORGANIZATIONS_API_URL = 'http://127.0.0.1:8000/api/organizations/v0/' ORGANIZATIONS_API_URL = 'http://127.0.0.1:8000/api/organizations/v0/'
MARKETING_API_URL = 'http://example.org/api/catalog/v2/'
MARKETING_URL_ROOT = 'http://example.org/'
EDX_DRF_EXTENSIONS = { EDX_DRF_EXTENSIONS = {
'OAUTH2_USER_INFO_URL': 'http://localhost:8000/oauth2/user_info', 'OAUTH2_USER_INFO_URL': 'http://localhost:8000/oauth2/user_info',
......
...@@ -15,6 +15,7 @@ edx-drf-extensions==0.2.0 ...@@ -15,6 +15,7 @@ edx-drf-extensions==0.2.0
edx-opaque-keys==0.3.0 edx-opaque-keys==0.3.0
edx-rest-api-client==1.5.0 edx-rest-api-client==1.5.0
elasticsearch>=1.0.0,<2.0.0 elasticsearch>=1.0.0,<2.0.0
html2text==2016.4.2
pycountry==1.20 pycountry==1.20
python-dateutil==2.5.2 python-dateutil==2.5.2
pytz==2015.7 pytz==2015.7
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment