Commit 3acb36fb by Clinton Blackburn Committed by GitHub

Storing ingested data as HTML instead of Markdown (#328)

This removes the burden of conversion from our API clients.

ECOM-5623
parent 4a4cebc9
import abc
import re
import html2text
import markdown
from dateutil.parser import parse
from django.utils.functional import cached_property
from edx_rest_api_client.client import EdxRestApiClient
......@@ -22,6 +24,7 @@ class AbstractDataLoader(metaclass=abc.ABCMeta):
# NOTE(review): presumably the number of records requested per API page; its usage is outside
# this view -- confirm against the concrete loaders.
PAGE_SIZE = 50
# NOTE(review): presumably the token_type values accepted by __init__; the validation itself is
# outside this view -- confirm.
SUPPORTED_TOKEN_TYPES = ('bearer', 'jwt',)
# Matches a string that is, in its entirety, one line wrapped in <p>...</p> and captures the inner
# content. clean_html() uses it to drop that wrapper from the rendered HTML.
MARKDOWN_CLEANUP_REGEX = re.compile(r'^<p>(.*)</p>$')
def __init__(self, partner, api_url, access_token=None, token_type=None, max_workers=None):
"""
......@@ -82,12 +85,19 @@ class AbstractDataLoader(metaclass=abc.ABCMeta):
@classmethod
def clean_html(cls, content):
    """Clean an HTML string by round-tripping it through Markdown.

    The HTML is converted to a Markdown string (to remove styles, classes, and other unsupported
    attributes), and the Markdown is then converted back to HTML. If the rendered output is a
    single line wrapped in ``<p>...</p>``, that wrapper is removed as well.

    Arguments:
        content (str): HTML to clean.

    Returns:
        str: The cleaned HTML.
    """
    # '&nbsp;' entities are dropped outright (not replaced with a space) before conversion.
    cleaned = content.replace('&nbsp;', '')

    html_converter = html2text.HTML2Text()
    # Turn off html2text's wrapping of links and body text.
    # NOTE(review): presumably so no artificial line breaks end up in the Markdown -- confirm.
    html_converter.wrap_links = False
    html_converter.body_width = None
    cleaned = html_converter.handle(cleaned).strip()

    # Render the intermediate Markdown back to HTML; this is the value that gets returned.
    cleaned = markdown.markdown(cleaned)

    # The pattern is anchored and '.' does not match newlines, so multi-line output is left as-is.
    cleaned = cls.MARKDOWN_CLEANUP_REGEX.sub(r'\1', cleaned)
    return cleaned
@classmethod
def parse_date(cls, date_string):
......
......@@ -58,6 +58,17 @@ class AbstractDataLoaderTest(TestCase):
for instance in instances:
self.assertFalse(instance.__class__.objects.filter(pk=instance.pk).exists()) # pylint: disable=no-member
def test_clean_html(self):
    """ Verify clean_html returns the expected HTML for each sample input.

    The samples cover an empty string, a single paragraph (whose wrapping <p> tag is expected to
    be removed), and inline markup that is expected to come back unchanged.
    """
    data = (
        ('', '',),
        ('<p>Hello!</p>', 'Hello!'),
        ('<em>Testing</em>', '<em>Testing</em>'),
    )

    for content, expected in data:
        # subTest reports the failing input and lets the remaining cases run after a failure.
        with self.subTest(content=content):
            self.assertEqual(AbstractDataLoader.clean_html(content), expected)
@ddt.ddt
class OrganizationsApiDataLoaderTests(ApiClientTestMixin, DataLoaderTestMixin, TestCase):
......
......@@ -35,6 +35,7 @@ edx-rest-api-client==1.6.0
elasticsearch>=1.0.0,<2.0.0
html2text==2016.5.29
jsonfield==1.0.3
markdown==2.6.6
pillow==3.3.0
pycountry==1.20
python-dateutil==2.5.3
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment