Commit fe5f29da by Peter Fogg

Ingest data from Drupal.

ECOM-3983
parent 39214f4d
""" Data loaders. """
import abc
import logging
from urllib.parse import urljoin
from dateutil.parser import parse
from django.conf import settings
from edx_rest_api_client.client import EdxRestApiClient
import html2text
from opaque_keys.edx.keys import CourseKey
from course_discovery.apps.course_metadata.models import (
Organization, Image, Course, CourseRun, CourseOrganization, Video
Course, CourseOrganization, CourseRun, Image, LanguageTag, LevelType, Organization, Subject, Video
)
logger = logging.getLogger(__name__)
......@@ -24,7 +27,7 @@ class AbstractDataLoader(metaclass=abc.ABCMeta):
PAGE_SIZE = 50
def __init__(self, api_url, access_token):
def __init__(self, api_url, access_token=None):
"""
Arguments:
api_url (str): URL of the API from which data is loaded
......@@ -68,6 +71,21 @@ class AbstractDataLoader(metaclass=abc.ABCMeta):
return None
@classmethod
def convert_course_run_key(cls, course_run_key_str):
    """
    Derive the serialized course key from a serialized course run key.

    The run key is parsed with `CourseKey.from_string`, and its `org` and
    `course` parts are joined with a `+`.

    Args:
        course_run_key_str (str): The serialized course run key.

    Returns:
        str: The serialized course key, in the form `<org>+<course>`.
    """
    parsed_run_key = CourseKey.from_string(course_run_key_str)
    org, course = parsed_run_key.org, parsed_run_key.course
    return '{org}+{course}'.format(org=org, course=course)
class OrganizationsApiDataLoader(AbstractDataLoader):
""" Loads organizations from the Organizations API. """
......@@ -141,9 +159,10 @@ class CoursesApiDataLoader(AbstractDataLoader):
def update_course(self, body):
# NOTE (CCB): Use the data from the CourseKey since the Course API exposes display names for org and number,
# which may not be unique for an organization.
course_run_key = CourseKey.from_string(body['id'])
course_run_key_str = body['id']
course_run_key = CourseKey.from_string(course_run_key_str)
organization, __ = Organization.objects.get_or_create(key=course_run_key.org)
course_key = '{org}+{course}'.format(org=organization.key, course=course_run_key.course)
course_key = self.convert_course_run_key(course_run_key_str)
defaults = {
'title': body['name']
}
......@@ -202,3 +221,86 @@ class CoursesApiDataLoader(AbstractDataLoader):
video, __ = Video.objects.get_or_create(src=video_url)
return video
class DrupalApiDataLoader(AbstractDataLoader):
    """
    Loads course and course run marketing data from the Drupal API.

    Only records that already exist locally are updated: a course or course
    run that cannot be found is logged and skipped, never created.
    """

    def ingest(self):
        """Fetch the course runs exposed by Drupal and update the matching Course and CourseRun records."""
        # No credentials are passed to the client; only the API URL is used.
        client = EdxRestApiClient(self.api_url)

        logger.info('Refreshing Courses and CourseRuns from %s...', self.api_url)
        response = client.courses.get()

        data = response['items']
        logger.info('Retrieved %d course runs...', len(data))

        for body in data:
            cleaned_body = self.clean_strings(body)
            # `course` is None when Drupal references a course that is unknown locally.
            course = self.update_course(cleaned_body)
            self.update_course_run(course, cleaned_body)

        logger.info('Retrieved %d course runs from %s.', len(data), self.api_url)

    def update_course(self, body):
        """
        Update an existing course from Drupal data given by `body`.

        Arguments:
            body (dict): A single item of the Drupal response. Its `course_id` is a serialized course run key.

        Returns:
            Course: The updated course, or None if no course matches the key derived from `course_id`.
        """
        course_key = self.convert_course_run_key(body['course_id'])

        try:
            course = Course.objects.get(key=course_key)
        except Course.DoesNotExist:
            logger.warning('Could not find course [%s]', course_key)
            return None

        course.full_description = self.clean_html(body['description'])
        course.short_description = self.clean_html(body['subtitle'])
        course.marketing_url = urljoin(settings.MARKETING_URL_ROOT, body['course_about_uri'])

        level_type, __ = LevelType.objects.get_or_create(name=body['level']['title'])
        course.level_type = level_type

        self.set_subjects(course, body)
        course.save()
        return course

    def set_subjects(self, course, body):
        """Replace the subjects of `course` with the subjects listed in `body`."""
        course.subjects.clear()
        for subject_data in body['subjects']:
            # Normalize subject names with title case
            subject, __ = Subject.objects.get_or_create(name=subject_data['title'].title())
            course.subjects.add(subject)

    def update_course_run(self, course, body):
        """
        Update an existing run of `course` from Drupal data given by `body`.

        Arguments:
            course (Course): The course the run belongs to, or None if the course could not be found.
            body (dict): A single item of the Drupal response. Its `course_id` is the course run key.

        Returns:
            CourseRun: The updated course run, or None if no course run has that key.
        """
        course_run_key = body['course_id']

        try:
            course_run = CourseRun.objects.get(key=course_run_key)
        except CourseRun.DoesNotExist:
            logger.warning('Could not find course run [%s]', course_run_key)
            return None

        course_run.language = self.get_language_tag(body)
        # Only re-parent the run when its course was found. `update_course` returns None for unknown
        # courses, and assigning that would detach the run from its current course (or fail on save).
        if course is not None:
            course_run.course = course
        course_run.save()
        return course_run

    def get_language_tag(self, body):
        """
        Get a language tag from Drupal data given by `body`.

        Returns:
            LanguageTag: The tag whose code equals `body['current_language']`, or None if that value
                is None or no tag has that code.
        """
        iso_code = body['current_language']

        if iso_code is None:
            return None

        try:
            return LanguageTag.objects.get(code=iso_code)
        except LanguageTag.DoesNotExist:
            logger.warning('Could not find language with ISO code [%s].', iso_code)
            return None

    def clean_html(self, content):
        """Cleans HTML from a string and returns a Markdown version."""
        # Drop non-breaking spaces, both as the HTML entity and as the already-decoded U+00A0 character.
        # Ordinary spaces must be kept: removing them would mangle the text and break tags such as `<a href=...>`.
        stripped = content.replace('&nbsp;', '').replace('\u00a0', '')

        html_converter = html2text.HTML2Text()
        html_converter.wrap_links = False
        html_converter.body_width = None
        return html_converter.handle(stripped).strip()
......@@ -4,7 +4,11 @@ from django.conf import settings
from django.core.management import BaseCommand
from edx_rest_api_client.client import EdxRestApiClient
from course_discovery.apps.course_metadata.data_loaders import OrganizationsApiDataLoader, CoursesApiDataLoader
from course_discovery.apps.course_metadata.data_loaders import (
CoursesApiDataLoader,
DrupalApiDataLoader,
OrganizationsApiDataLoader,
)
logger = logging.getLogger(__name__)
......@@ -39,3 +43,4 @@ class Command(BaseCommand):
OrganizationsApiDataLoader(settings.ORGANIZATIONS_API_URL, access_token).ingest()
CoursesApiDataLoader(settings.COURSES_API_URL, access_token).ingest()
DrupalApiDataLoader(settings.MARKETING_API_URL).ingest()
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add a nullable `marketing_url` URL field to the `course` and `historicalcourse` models."""

    dependencies = [('course_metadata', '0001_initial')]

    # The same column is added to both models; each operation gets its own field instance.
    operations = [
        migrations.AddField(
            model_name=target_model,
            name='marketing_url',
            field=models.URLField(null=True, max_length=255, blank=True),
        )
        for target_model in ('course', 'historicalcourse')
    ]
......@@ -127,6 +127,7 @@ class Course(TimeStampedModel):
expected_learning_items = SortedManyToManyField(ExpectedLearningItem, blank=True)
image = models.ForeignKey(Image, default=None, null=True, blank=True)
video = models.ForeignKey(Video, default=None, null=True, blank=True)
marketing_url = models.URLField(max_length=255, null=True, blank=True)
history = HistoricalRecords()
......
""" Tests for data loaders. """
import datetime
import json
from urllib.parse import parse_qs, urlparse
from urllib.parse import parse_qs, urlparse, urljoin
import ddt
import responses
from django.conf import settings
from django.test import TestCase, override_settings
from opaque_keys.edx.keys import CourseKey
from course_discovery.apps.course_metadata.data_loaders import OrganizationsApiDataLoader, CoursesApiDataLoader, \
AbstractDataLoader
from course_discovery.apps.course_metadata.models import Organization, Image, Course, CourseRun
from course_discovery.apps.course_metadata.data_loaders import (
OrganizationsApiDataLoader, CoursesApiDataLoader, AbstractDataLoader, DrupalApiDataLoader
)
from course_discovery.apps.course_metadata.models import (
Course, CourseRun, Image, LanguageTag, Organization, Subject
)
ACCESS_TOKEN = 'secret'
COURSES_API_URL = 'https://lms.example.com/api/courses/v1'
ORGANIZATIONS_API_URL = 'https://lms.example.com/api/organizations/v0'
MARKETING_API_URL = 'https://example.com/api/catalog/v2/'
JSON = 'application/json'
......@@ -53,10 +58,11 @@ class DataLoaderTestMixin(object):
super(DataLoaderTestMixin, self).setUp()
self.loader = self.loader_class(self.api_url, ACCESS_TOKEN) # pylint: disable=not-callable
def assert_api_called(self, expected_num_calls):
def assert_api_called(self, expected_num_calls, check_auth=True):
""" Asserts the API was called with the correct number of calls, and the appropriate Authorization header. """
self.assertEqual(len(responses.calls), expected_num_calls)
self.assertEqual(responses.calls[0].request.headers['Authorization'], 'Bearer {}'.format(ACCESS_TOKEN))
if check_auth:
self.assertEqual(responses.calls[0].request.headers['Authorization'], 'Bearer {}'.format(ACCESS_TOKEN))
def test_init(self):
""" Verify the constructor sets the appropriate attributes. """
......@@ -287,7 +293,7 @@ class CoursesApiDataLoaderTests(DataLoaderTestMixin, TestCase):
expected_num_course_runs = len(data)
self.assert_api_called(expected_num_course_runs)
# Verify the Organizations were created correctly
# Verify the CourseRuns were created correctly
self.assertEqual(CourseRun.objects.count(), expected_num_course_runs)
for datum in data:
......@@ -350,3 +356,190 @@ class CoursesApiDataLoaderTests(DataLoaderTestMixin, TestCase):
self.assertEqual(actual.src, expected_video_src)
else:
self.assertIsNone(actual)
@override_settings(MARKETING_API_URL=MARKETING_API_URL)
@ddt.ddt
class DrupalApiDataLoaderTests(DataLoaderTestMixin, TestCase):
    """Tests for DrupalApiDataLoader, run against a mocked Drupal catalog endpoint."""

    # Courses created in setUp, each together with a single course run.
    EXISTING_COURSE_AND_RUN_DATA = (
        {
            'course_run_key': 'course-v1:SC+BreadX+3T2015',
            'course_key': 'SC+BreadX',
            'title': 'Bread Baking 101',
            'current_language': 'en-us',
        },
        {
            'course_run_key': 'course-v1:TX+T201+3T2015',
            'course_key': 'TX+T201',
            'title': 'Testing 201',
            'current_language': '',
        },
    )

    # A course which exists, but has no associated runs
    EXISTING_COURSE = {
        'course_key': 'PartialX+P102',
        'title': 'A partial course',
    }

    api_url = MARKETING_API_URL
    loader_class = DrupalApiDataLoader

    def setUp(self):
        """Create the courses and course runs the loader is expected to update."""
        super(DrupalApiDataLoaderTests, self).setUp()

        for existing in self.EXISTING_COURSE_AND_RUN_DATA:
            parent_course = Course.objects.create(key=existing['course_key'], title=existing['title'])
            CourseRun.objects.create(
                key=existing['course_run_key'],
                language=self.loader.get_language_tag(existing),
                course=parent_course,
            )

        partial = self.EXISTING_COURSE
        Course.objects.create(key=partial['course_key'], title=partial['title'])

    @staticmethod
    def _api_item(title, level, about_uri, course_id, subject, language, subtitle, description):
        """Build one course run entry in the shape returned by the mocked Drupal API."""
        return {
            'title': title,
            'level': {
                'title': level,
            },
            'course_about_uri': about_uri,
            'course_id': course_id,
            'subjects': [{
                'title': subject,
            }],
            'current_language': language,
            'subtitle': subtitle,
            'description': description,
        }

    def mock_api(self):
        """Mock out the Drupal API. Returns a list of mocked-out course runs."""
        bread, testing = self.EXISTING_COURSE_AND_RUN_DATA
        partial = self.EXISTING_COURSE

        items = [
            self._api_item(
                bread['title'],
                'Introductory',
                '/course/bread-baking-101',
                bread['course_run_key'],
                'Bread baking',
                bread['current_language'],
                'Learn about Bread',
                '<p><b>Bread</b> is a <a href="/wiki/Staple_food" title="Staple food">staple food</a>.',
            ),
            self._api_item(
                testing['title'],
                'Intermediate',
                '/course/testing-201',
                testing['course_run_key'],
                'testing',
                testing['current_language'],
                'Testing 201',
                "how to test better",
            ),
            # Create a course which exists in LMS/Otto, but without course runs
            self._api_item(
                partial['title'],
                'Advanced',
                '/course/partial-101',
                'course-v1:{course_key}+run'.format(course_key=partial['course_key']),
                'partially fake',
                'en-us',
                'Nope',
                'what is fake?',
            ),
            # Create a fake course run which doesn't exist in LMS/Otto
            self._api_item(
                'A partial course',
                'Advanced',
                '/course/partial-101',
                'course-v1:fakeX+fake+reallyfake',
                'seriously fake',
                'en-us',
                'Nope',
                'what is real?',
            ),
        ]
        body = {'items': items}

        responses.add(
            responses.GET,
            settings.MARKETING_API_URL + 'courses/',
            body=json.dumps(body),
            status=200,
            content_type='application/json'
        )
        return body['items']

    def assert_course_run_loaded(self, body):
        """
        Verify that the course run corresponding to `body` has been saved
        correctly.
        """
        run_key_str = body['course_id']
        parsed_key = CourseKey.from_string(run_key_str)
        expected_course_key = '{org}+{course}'.format(org=parsed_key.org, course=parsed_key.course)

        course = Course.objects.get(key=expected_course_key)
        course_run = CourseRun.objects.get(key=run_key_str)

        self.assertEqual(course_run.course, course)
        self.assert_course_loaded(course, body)

        # A run without a language must correspond to an empty language code in the API data.
        if not course_run.language:
            self.assertEqual(body['current_language'], '')
        else:
            self.assertEqual(course_run.language.code, body['current_language'])

    def assert_course_loaded(self, course, body):
        """Verify that the course has been loaded correctly."""
        expected_marketing_url = urljoin(settings.MARKETING_URL_ROOT, body['course_about_uri'])

        self.assertEqual(course.title, body['title'])
        self.assertEqual(course.full_description, self.loader.clean_html(body['description']))
        self.assertEqual(course.short_description, self.loader.clean_html(body['subtitle']))
        self.assertEqual(course.marketing_url, expected_marketing_url)
        self.assertEqual(course.level_type.name, body['level']['title'])
        self.assert_subjects_loaded(course, body)

    def assert_subjects_loaded(self, course, body):
        """Verify that subjects have been loaded correctly."""
        saved_subjects = course.subjects.all()
        expected_subjects = body['subjects']
        self.assertEqual(len(saved_subjects), len(expected_subjects))

        for expected in expected_subjects:
            # Subject names are stored in title case.
            stored = Subject.objects.get(name=expected['title'].title())
            self.assertIn(stored, saved_subjects)

    @responses.activate
    def test_ingest(self):
        """Verify the data loader ingests data from Drupal."""
        api_items = self.mock_api()
        # The last two items have no course run in the database, so no run should be loaded for them
        expected_loaded = api_items[:-2]

        self.loader.ingest()

        # Drupal does not paginate its response or check authorization
        self.assert_api_called(1, check_auth=False)

        # Assert that the fake course was not created
        self.assertEqual(CourseRun.objects.count(), len(expected_loaded))

        for item in expected_loaded:
            self.assert_course_run_loaded(item)

        # The course without runs must still exist, with its title unchanged.
        Course.objects.get(key=self.EXISTING_COURSE['course_key'], title=self.EXISTING_COURSE['title'])

    @ddt.data(
        ('', ''),
        ('<h1>foo</h1>', '# foo'),
        ('<a href="http://example.com">link</a>', '[link](http://example.com)'),
        ('<strong>foo</strong>', '**foo**'),
        ('<em>foo</em>', '_foo_'),
        ('\nfoo\n', 'foo'),
        ('<span>foo</span>', 'foo'),
        ('<div>foo</div>', 'foo'),
    )
    @ddt.unpack
    def test_clean_html(self, to_clean, expected):
        """Verify HTML is converted to the expected Markdown."""
        cleaned = self.loader.clean_html(to_clean)
        self.assertEqual(cleaned, expected)

    @ddt.data(
        ({'current_language': ''}, None),
        ({'current_language': 'not-real'}, None),
        ({'current_language': 'en-us'}, LanguageTag(code='en-us', name='English - United States')),
        ({'current_language': None}, None),
    )
    @ddt.unpack
    def test_get_language_tag(self, body, expected):
        """Verify the language tag lookup returns the matching tag, or None when there is none."""
        actual = self.loader.get_language_tag(body)
        self.assertEqual(actual, expected)
......@@ -312,6 +312,8 @@ HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor'
COURSES_API_URL = 'http://127.0.0.1:8000/api/courses/v1/'
ECOMMERCE_API_URL = 'http://127.0.0.1:8002/api/v2/'
ORGANIZATIONS_API_URL = 'http://127.0.0.1:8000/api/organizations/v0/'
MARKETING_API_URL = 'http://example.org/api/catalog/v2/'
MARKETING_URL_ROOT = 'http://example.org/'
EDX_DRF_EXTENSIONS = {
'OAUTH2_USER_INFO_URL': 'http://localhost:8000/oauth2/user_info',
......
......@@ -15,6 +15,7 @@ edx-drf-extensions==0.2.0
edx-opaque-keys==0.3.0
edx-rest-api-client==1.5.0
elasticsearch>=1.0.0,<2.0.0
html2text==2016.4.2
pycountry==1.20
python-dateutil==2.5.2
pytz==2015.7
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment