Commit fe5f29da by Peter Fogg

Ingest data from Drupal.

ECOM-3983
parent 39214f4d
""" Data loaders. """ """ Data loaders. """
import abc import abc
import logging import logging
from urllib.parse import urljoin
from dateutil.parser import parse from dateutil.parser import parse
from django.conf import settings
from edx_rest_api_client.client import EdxRestApiClient from edx_rest_api_client.client import EdxRestApiClient
import html2text
from opaque_keys.edx.keys import CourseKey from opaque_keys.edx.keys import CourseKey
from course_discovery.apps.course_metadata.models import ( from course_discovery.apps.course_metadata.models import (
Organization, Image, Course, CourseRun, CourseOrganization, Video Course, CourseOrganization, CourseRun, Image, LanguageTag, LevelType, Organization, Subject, Video
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -24,7 +27,7 @@ class AbstractDataLoader(metaclass=abc.ABCMeta): ...@@ -24,7 +27,7 @@ class AbstractDataLoader(metaclass=abc.ABCMeta):
PAGE_SIZE = 50 PAGE_SIZE = 50
def __init__(self, api_url, access_token): def __init__(self, api_url, access_token=None):
""" """
Arguments: Arguments:
api_url (str): URL of the API from which data is loaded api_url (str): URL of the API from which data is loaded
...@@ -68,6 +71,21 @@ class AbstractDataLoader(metaclass=abc.ABCMeta): ...@@ -68,6 +71,21 @@ class AbstractDataLoader(metaclass=abc.ABCMeta):
return None return None
@classmethod
def convert_course_run_key(cls, course_run_key_str):
    """
    Derive the serialized course key from a serialized course run key.

    The run key is parsed with `CourseKey.from_string`; the parsed key's
    `org` and `course` attributes are then joined with a `+`.

    Args:
        course_run_key_str (str): The serialized course run key.

    Returns:
        str
    """
    parsed_key = CourseKey.from_string(course_run_key_str)
    key_parts = {'org': parsed_key.org, 'course': parsed_key.course}
    return '{org}+{course}'.format(**key_parts)
class OrganizationsApiDataLoader(AbstractDataLoader): class OrganizationsApiDataLoader(AbstractDataLoader):
""" Loads organizations from the Organizations API. """ """ Loads organizations from the Organizations API. """
...@@ -141,9 +159,10 @@ class CoursesApiDataLoader(AbstractDataLoader): ...@@ -141,9 +159,10 @@ class CoursesApiDataLoader(AbstractDataLoader):
def update_course(self, body): def update_course(self, body):
# NOTE (CCB): Use the data from the CourseKey since the Course API exposes display names for org and number, # NOTE (CCB): Use the data from the CourseKey since the Course API exposes display names for org and number,
# which may not be unique for an organization. # which may not be unique for an organization.
course_run_key = CourseKey.from_string(body['id']) course_run_key_str = body['id']
course_run_key = CourseKey.from_string(course_run_key_str)
organization, __ = Organization.objects.get_or_create(key=course_run_key.org) organization, __ = Organization.objects.get_or_create(key=course_run_key.org)
course_key = '{org}+{course}'.format(org=organization.key, course=course_run_key.course) course_key = self.convert_course_run_key(course_run_key_str)
defaults = { defaults = {
'title': body['name'] 'title': body['name']
} }
...@@ -202,3 +221,86 @@ class CoursesApiDataLoader(AbstractDataLoader): ...@@ -202,3 +221,86 @@ class CoursesApiDataLoader(AbstractDataLoader):
video, __ = Video.objects.get_or_create(src=video_url) video, __ = Video.objects.get_or_create(src=video_url)
return video return video
class DrupalApiDataLoader(AbstractDataLoader):
    """
    Loads course and course run marketing data from the Drupal API.

    Only `Course` and `CourseRun` records that already exist are updated; records that
    cannot be found are logged and skipped.
    """

    def ingest(self):
        """
        Fetch the course run listing from the Drupal API and apply each entry to the
        matching course and course run.
        """
        client = EdxRestApiClient(self.api_url)
        logger.info('Refreshing Courses and CourseRuns from %s...', self.api_url)
        response = client.courses.get()

        data = response['items']
        logger.info('Retrieved %d course runs...', len(data))

        for body in data:
            # NOTE(review): clean_strings() is inherited and not visible here; presumably it
            # normalizes the string values of `body` -- confirm against AbstractDataLoader.
            cleaned_body = self.clean_strings(body)
            course = self.update_course(cleaned_body)

            if course is None:
                # update_course() has already logged the missing course. Do not fall through to
                # update_course_run(): it would assign `course=None` to an existing course run.
                continue

            self.update_course_run(course, cleaned_body)

        logger.info('Retrieved %d course runs from %s.', len(data), self.api_url)

    def update_course(self, body):
        """
        Update an existing course from Drupal data given by `body`.

        Args:
            body (dict): A single item of the Drupal API response.

        Returns:
            Course: The updated course, or None if no course matches the key derived
                from `body['course_id']`.
        """
        course_key = self.convert_course_run_key(body['course_id'])

        try:
            course = Course.objects.get(key=course_key)
        except Course.DoesNotExist:
            logger.warning('Could not find course [%s]', course_key)
            return None

        course.full_description = self.clean_html(body['description'])
        course.short_description = self.clean_html(body['subtitle'])
        course.marketing_url = urljoin(settings.MARKETING_URL_ROOT, body['course_about_uri'])

        level_type, __ = LevelType.objects.get_or_create(name=body['level']['title'])
        course.level_type = level_type

        self.set_subjects(course, body)
        course.save()
        return course

    def set_subjects(self, course, body):
        """
        Replace the subjects of `course` with the subjects listed in `body`.

        Args:
            course (Course): The course whose subjects are replaced.
            body (dict): A single item of the Drupal API response.
        """
        course.subjects.clear()

        for subject_data in body['subjects']:
            # Normalize subject names with title case
            subject, __ = Subject.objects.get_or_create(name=subject_data['title'].title())
            course.subjects.add(subject)

    def update_course_run(self, course, body):
        """
        Update an existing run of `course` from Drupal data given by `body`.

        Args:
            course (Course): The course the run is attached to.
            body (dict): A single item of the Drupal API response.

        Returns:
            CourseRun: The updated course run, or None if no course run has the key
                `body['course_id']`.
        """
        course_run_key = body['course_id']

        try:
            course_run = CourseRun.objects.get(key=course_run_key)
        except CourseRun.DoesNotExist:
            logger.warning('Could not find course run [%s]', course_run_key)
            return None

        course_run.language = self.get_language_tag(body)
        course_run.course = course
        course_run.save()
        return course_run

    def get_language_tag(self, body):
        """
        Get a language tag from Drupal data given by `body`.

        Returns:
            LanguageTag: The tag whose code equals `body['current_language']`, or None
                if that value is None or no such tag exists.
        """
        iso_code = body['current_language']

        if iso_code is None:
            return None

        try:
            return LanguageTag.objects.get(code=iso_code)
        except LanguageTag.DoesNotExist:
            logger.warning('Could not find language with ISO code [%s].', iso_code)
            return None

    def clean_html(self, content):
        """
        Cleans HTML from a string and returns a Markdown version.

        Args:
            content (str): HTML markup.

        Returns:
            str: The Markdown rendering of `content`, stripped of surrounding whitespace.
        """
        # Drop non-breaking-space entities before conversion. Only the `&nbsp;` entity is
        # removed; ordinary spaces must be preserved so that words are not run together.
        stripped = content.replace('&nbsp;', '')

        html_converter = html2text.HTML2Text()
        # Disable line wrapping so the converter does not insert hard line breaks.
        html_converter.wrap_links = False
        html_converter.body_width = None
        return html_converter.handle(stripped).strip()
...@@ -4,7 +4,11 @@ from django.conf import settings ...@@ -4,7 +4,11 @@ from django.conf import settings
from django.core.management import BaseCommand from django.core.management import BaseCommand
from edx_rest_api_client.client import EdxRestApiClient from edx_rest_api_client.client import EdxRestApiClient
from course_discovery.apps.course_metadata.data_loaders import OrganizationsApiDataLoader, CoursesApiDataLoader from course_discovery.apps.course_metadata.data_loaders import (
CoursesApiDataLoader,
DrupalApiDataLoader,
OrganizationsApiDataLoader,
)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -39,3 +43,4 @@ class Command(BaseCommand): ...@@ -39,3 +43,4 @@ class Command(BaseCommand):
OrganizationsApiDataLoader(settings.ORGANIZATIONS_API_URL, access_token).ingest() OrganizationsApiDataLoader(settings.ORGANIZATIONS_API_URL, access_token).ingest()
CoursesApiDataLoader(settings.COURSES_API_URL, access_token).ingest() CoursesApiDataLoader(settings.COURSES_API_URL, access_token).ingest()
DrupalApiDataLoader(settings.MARKETING_API_URL).ingest()
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add an optional `marketing_url` URL field to `course` and `historicalcourse`."""

    dependencies = [('course_metadata', '0001_initial')]

    # One AddField operation per model, in the order: course, historicalcourse.
    # Each operation gets its own field instance.
    operations = [
        migrations.AddField(
            model_name=target_model,
            name='marketing_url',
            field=models.URLField(max_length=255, null=True, blank=True),
        )
        for target_model in ('course', 'historicalcourse')
    ]
...@@ -127,6 +127,7 @@ class Course(TimeStampedModel): ...@@ -127,6 +127,7 @@ class Course(TimeStampedModel):
expected_learning_items = SortedManyToManyField(ExpectedLearningItem, blank=True) expected_learning_items = SortedManyToManyField(ExpectedLearningItem, blank=True)
image = models.ForeignKey(Image, default=None, null=True, blank=True) image = models.ForeignKey(Image, default=None, null=True, blank=True)
video = models.ForeignKey(Video, default=None, null=True, blank=True) video = models.ForeignKey(Video, default=None, null=True, blank=True)
marketing_url = models.URLField(max_length=255, null=True, blank=True)
history = HistoricalRecords() history = HistoricalRecords()
......
...@@ -312,6 +312,8 @@ HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor' ...@@ -312,6 +312,8 @@ HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor'
COURSES_API_URL = 'http://127.0.0.1:8000/api/courses/v1/' COURSES_API_URL = 'http://127.0.0.1:8000/api/courses/v1/'
ECOMMERCE_API_URL = 'http://127.0.0.1:8002/api/v2/' ECOMMERCE_API_URL = 'http://127.0.0.1:8002/api/v2/'
ORGANIZATIONS_API_URL = 'http://127.0.0.1:8000/api/organizations/v0/' ORGANIZATIONS_API_URL = 'http://127.0.0.1:8000/api/organizations/v0/'
MARKETING_API_URL = 'http://example.org/api/catalog/v2/'
MARKETING_URL_ROOT = 'http://example.org/'
EDX_DRF_EXTENSIONS = { EDX_DRF_EXTENSIONS = {
'OAUTH2_USER_INFO_URL': 'http://localhost:8000/oauth2/user_info', 'OAUTH2_USER_INFO_URL': 'http://localhost:8000/oauth2/user_info',
......
...@@ -15,6 +15,7 @@ edx-drf-extensions==0.2.0 ...@@ -15,6 +15,7 @@ edx-drf-extensions==0.2.0
edx-opaque-keys==0.3.0 edx-opaque-keys==0.3.0
edx-rest-api-client==1.5.0 edx-rest-api-client==1.5.0
elasticsearch>=1.0.0,<2.0.0 elasticsearch>=1.0.0,<2.0.0
html2text==2016.4.2
pycountry==1.20 pycountry==1.20
python-dateutil==2.5.2 python-dateutil==2.5.2
pytz==2015.7 pytz==2015.7
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment