Commit 450c9313 by Clinton Blackburn Committed by Clinton Blackburn

Removing ampersand HTML entities from ingested data

Data containing &amp; will now have that entity converted to &.

ECOM-5997
parent 9fc6f989
......@@ -100,6 +100,11 @@ class AbstractDataLoader(metaclass=abc.ABCMeta):
cleaned = html_converter.handle(cleaned).strip()
cleaned = markdown.markdown(cleaned)
cleaned = cls.MARKDOWN_CLEANUP_REGEX.sub(r'\1', cleaned)
# html2text does not handle ampersands properly.
# See https://github.com/Alir3z4/html2text/issues/109.
cleaned = cleaned.replace('&amp;', '&')
return cleaned
@classmethod
......
......@@ -383,7 +383,7 @@ class CourseMarketingSiteDataLoader(AbstractMarketingSiteDataLoader):
defaults = {
'key': key,
'title': data['field_course_course_title']['value'],
'title': self.clean_html(data['field_course_course_title']['value']),
'number': data['field_course_code'],
'full_description': self.get_description(data),
'video': self.get_video(data),
......@@ -454,7 +454,7 @@ class CourseMarketingSiteDataLoader(AbstractMarketingSiteDataLoader):
'key': key,
'course': course,
'uuid': uuid,
'title_override': data['field_course_course_title']['value'],
'title_override': self.clean_html(data['field_course_course_title']['value']),
'language': language,
'slug': slug,
'card_image_url': self._get_nested_url(data.get('field_course_image_promoted')),
......
......@@ -64,6 +64,7 @@ class AbstractDataLoaderTest(TestCase):
('', '',),
('<p>Hello!</p>', 'Hello!'),
('<em>Testing</em>', '<em>Testing</em>'),
('Hello&amp;world&nbsp;!', 'Hello&world!')
)
for content, expected in data:
......
......@@ -423,7 +423,7 @@ class CourseMarketingSiteDataLoaderTests(AbstractMarketingSiteDataLoaderTestMixi
course = self._get_course(data)
expected_values = {
'title': data['field_course_course_title']['value'],
'title': self.loader.clean_html(data['field_course_course_title']['value']),
'number': data['field_course_code'],
'full_description': self.loader.get_description(data),
'video': self.loader.get_video(data),
......@@ -473,7 +473,7 @@ class CourseMarketingSiteDataLoaderTests(AbstractMarketingSiteDataLoaderTestMixi
expected_values = {
'key': data['field_course_id'],
'title_override': data['field_course_course_title']['value'],
'title_override': self.loader.clean_html(data['field_course_course_title']['value']),
'language': language,
'slug': data['url'].split('/')[-1],
'card_image_url': (data.get('field_course_image_promoted') or {}).get('url'),
......
......@@ -33,7 +33,7 @@ edx-drf-extensions==1.1.1
edx-opaque-keys==0.3.1
edx-rest-api-client==1.6.0
elasticsearch>=1.0.0,<2.0.0
html2text==2016.5.29
html2text==2016.9.19
jsonfield==1.0.3
markdown==2.6.6
pillow==3.3.0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment