Commit fe5f29da by Peter Fogg

Ingest data from Drupal.

ECOM-3983
parent 39214f4d
""" Data loaders. """
import abc
import logging
from urllib.parse import urljoin
from dateutil.parser import parse
from django.conf import settings
from edx_rest_api_client.client import EdxRestApiClient
import html2text
from opaque_keys.edx.keys import CourseKey
from course_discovery.apps.course_metadata.models import (
Organization, Image, Course, CourseRun, CourseOrganization, Video
Course, CourseOrganization, CourseRun, Image, LanguageTag, LevelType, Organization, Subject, Video
)
logger = logging.getLogger(__name__)
......@@ -24,7 +27,7 @@ class AbstractDataLoader(metaclass=abc.ABCMeta):
PAGE_SIZE = 50
def __init__(self, api_url, access_token):
def __init__(self, api_url, access_token=None):
"""
Arguments:
api_url (str): URL of the API from which data is loaded
......@@ -68,6 +71,21 @@ class AbstractDataLoader(metaclass=abc.ABCMeta):
return None
@classmethod
def convert_course_run_key(cls, course_run_key_str):
    """
    Derive the serialized course key from a serialized course run key.

    The run key is parsed with `CourseKey.from_string`, and the parsed
    key's `org` and `course` parts are joined with a `+`.

    Args:
        course_run_key_str (str): The serialized course run key.

    Returns:
        str: The serialized course key, in the form `org+course`.
    """
    parsed_key = CourseKey.from_string(course_run_key_str)
    org, course = parsed_key.org, parsed_key.course
    return '{0}+{1}'.format(org, course)
class OrganizationsApiDataLoader(AbstractDataLoader):
""" Loads organizations from the Organizations API. """
......@@ -141,9 +159,10 @@ class CoursesApiDataLoader(AbstractDataLoader):
def update_course(self, body):
# NOTE (CCB): Use the data from the CourseKey since the Course API exposes display names for org and number,
# which may not be unique for an organization.
course_run_key = CourseKey.from_string(body['id'])
course_run_key_str = body['id']
course_run_key = CourseKey.from_string(course_run_key_str)
organization, __ = Organization.objects.get_or_create(key=course_run_key.org)
course_key = '{org}+{course}'.format(org=organization.key, course=course_run_key.course)
course_key = self.convert_course_run_key(course_run_key_str)
defaults = {
'title': body['name']
}
......@@ -202,3 +221,86 @@ class CoursesApiDataLoader(AbstractDataLoader):
video, __ = Video.objects.get_or_create(src=video_url)
return video
class DrupalApiDataLoader(AbstractDataLoader):
    """Loads course and course run marketing data from the Drupal API.

    Only records that already exist locally are updated: a course or course
    run that cannot be found by key is logged and skipped, never created.
    """

    def ingest(self):
        """Fetch the course listing from the Drupal API and update local records."""
        client = EdxRestApiClient(self.api_url)
        logger.info('Refreshing Courses and CourseRuns from %s...', self.api_url)
        response = client.courses.get()
        data = response['items']
        logger.info('Retrieved %d course runs...', len(data))

        for body in data:
            # NOTE(review): clean_strings is not defined in this class; presumably
            # inherited from AbstractDataLoader -- confirm.
            cleaned_body = self.clean_strings(body)
            # `course` is None when no matching course exists; update_course_run
            # tolerates that and leaves the run's existing course link alone.
            course = self.update_course(cleaned_body)
            self.update_course_run(course, cleaned_body)

        logger.info('Retrieved %d course runs from %s.', len(data), self.api_url)

    def update_course(self, body):
        """Update an existing course from Drupal data given by `body`.

        The course is looked up by the course key derived from
        `body['course_id']`; it is never created here.

        Returns:
            Course: The saved course, or None if no course has that key.
        """
        course_key = self.convert_course_run_key(body['course_id'])

        try:
            course = Course.objects.get(key=course_key)
        except Course.DoesNotExist:
            logger.warning('Could not find course [%s]', course_key)
            return None

        course.full_description = self.clean_html(body['description'])
        course.short_description = self.clean_html(body['subtitle'])
        course.marketing_url = urljoin(settings.MARKETING_URL_ROOT, body['course_about_uri'])

        level_type, __ = LevelType.objects.get_or_create(name=body['level']['title'])
        course.level_type = level_type

        self.set_subjects(course, body)
        course.save()
        return course

    def set_subjects(self, course, body):
        """Replace the subjects of `course` with those listed in `body`."""
        course.subjects.clear()

        for subject_data in body['subjects']:
            # Normalize subject names with title case
            subject, __ = Subject.objects.get_or_create(name=subject_data['title'].title())
            course.subjects.add(subject)

    def update_course_run(self, course, body):
        """
        Update an existing run of `course` from Drupal data given by `body`.

        The run is looked up by `body['course_id']`; it is never created here.

        Arguments:
            course (Course): Course to attach the run to. May be None (the
                course lookup failed), in which case the run keeps whatever
                course it already has.
            body (dict): Drupal data for the course run.

        Returns:
            CourseRun: The saved course run, or None if no run has that key.
        """
        course_run_key = body['course_id']

        try:
            course_run = CourseRun.objects.get(key=course_run_key)
        except CourseRun.DoesNotExist:
            logger.warning('Could not find course run [%s]', course_run_key)
            return None

        course_run.language = self.get_language_tag(body)
        # Never overwrite an existing course link with None: that would either
        # detach the run from its course or fail on save.
        if course is not None:
            course_run.course = course
        course_run.save()
        return course_run

    def get_language_tag(self, body):
        """Get a language tag from Drupal data given by `body`.

        Returns:
            LanguageTag: The tag whose code equals `body['current_language']`,
            or None if that value is None or no such tag exists.
        """
        iso_code = body['current_language']
        if iso_code is None:
            return None

        try:
            return LanguageTag.objects.get(code=iso_code)
        except LanguageTag.DoesNotExist:
            logger.warning('Could not find language with ISO code [%s].', iso_code)
            return None

    def clean_html(self, content):
        """Cleans HTML from a string and returns a Markdown version."""
        # Drop non-breaking spaces only (as an entity or as the literal U+00A0
        # character). Ordinary spaces must survive, or words would run together.
        stripped = content.replace('&nbsp;', '').replace('\xa0', '')
        html_converter = html2text.HTML2Text()
        html_converter.wrap_links = False
        html_converter.body_width = None
        return html_converter.handle(stripped).strip()
......@@ -4,7 +4,11 @@ from django.conf import settings
from django.core.management import BaseCommand
from edx_rest_api_client.client import EdxRestApiClient
from course_discovery.apps.course_metadata.data_loaders import OrganizationsApiDataLoader, CoursesApiDataLoader
from course_discovery.apps.course_metadata.data_loaders import (
CoursesApiDataLoader,
DrupalApiDataLoader,
OrganizationsApiDataLoader,
)
logger = logging.getLogger(__name__)
......@@ -39,3 +43,4 @@ class Command(BaseCommand):
OrganizationsApiDataLoader(settings.ORGANIZATIONS_API_URL, access_token).ingest()
CoursesApiDataLoader(settings.COURSES_API_URL, access_token).ingest()
DrupalApiDataLoader(settings.MARKETING_API_URL).ingest()
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add a nullable, blank-able `marketing_url` URL field to `course` and `historicalcourse`."""

    dependencies = [('course_metadata', '0001_initial')]

    # Both models receive the same column definition; each operation gets its
    # own field instance.
    operations = [
        migrations.AddField(
            model_name=target_model,
            name='marketing_url',
            field=models.URLField(max_length=255, null=True, blank=True),
        )
        for target_model in ('course', 'historicalcourse')
    ]
......@@ -127,6 +127,7 @@ class Course(TimeStampedModel):
expected_learning_items = SortedManyToManyField(ExpectedLearningItem, blank=True)
image = models.ForeignKey(Image, default=None, null=True, blank=True)
video = models.ForeignKey(Video, default=None, null=True, blank=True)
marketing_url = models.URLField(max_length=255, null=True, blank=True)
history = HistoricalRecords()
......
......@@ -312,6 +312,8 @@ HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor'
# API endpoints used for data loading. The refresh management command passes
# COURSES_API_URL to CoursesApiDataLoader and ORGANIZATIONS_API_URL to
# OrganizationsApiDataLoader.
COURSES_API_URL = 'http://127.0.0.1:8000/api/courses/v1/'
ECOMMERCE_API_URL = 'http://127.0.0.1:8002/api/v2/'
ORGANIZATIONS_API_URL = 'http://127.0.0.1:8000/api/organizations/v0/'
# Drupal catalog API endpoint passed to DrupalApiDataLoader.
MARKETING_API_URL = 'http://example.org/api/catalog/v2/'
# Root URL that DrupalApiDataLoader joins with a course's `course_about_uri`
# (via urljoin) to build the course's marketing URL.
MARKETING_URL_ROOT = 'http://example.org/'
EDX_DRF_EXTENSIONS = {
'OAUTH2_USER_INFO_URL': 'http://localhost:8000/oauth2/user_info',
......
......@@ -15,6 +15,7 @@ edx-drf-extensions==0.2.0
edx-opaque-keys==0.3.0
edx-rest-api-client==1.5.0
elasticsearch>=1.0.0,<2.0.0
html2text==2016.4.2
pycountry==1.20
python-dateutil==2.5.2
pytz==2015.7
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment