Commit 450c9313 by Clinton Blackburn Committed by Clinton Blackburn

Removing ampersand HTML entities from ingested data

Data containing &amp; will now have that entity converted to &.

ECOM-5997
parent 9fc6f989
...@@ -100,6 +100,11 @@ class AbstractDataLoader(metaclass=abc.ABCMeta): ...@@ -100,6 +100,11 @@ class AbstractDataLoader(metaclass=abc.ABCMeta):
cleaned = html_converter.handle(cleaned).strip() cleaned = html_converter.handle(cleaned).strip()
cleaned = markdown.markdown(cleaned) cleaned = markdown.markdown(cleaned)
cleaned = cls.MARKDOWN_CLEANUP_REGEX.sub(r'\1', cleaned) cleaned = cls.MARKDOWN_CLEANUP_REGEX.sub(r'\1', cleaned)
# html2text does not handle ampersands properly.
# See https://github.com/Alir3z4/html2text/issues/109.
cleaned = cleaned.replace('&amp;', '&')
return cleaned return cleaned
@classmethod @classmethod
......
...@@ -383,7 +383,7 @@ class CourseMarketingSiteDataLoader(AbstractMarketingSiteDataLoader): ...@@ -383,7 +383,7 @@ class CourseMarketingSiteDataLoader(AbstractMarketingSiteDataLoader):
defaults = { defaults = {
'key': key, 'key': key,
'title': data['field_course_course_title']['value'], 'title': self.clean_html(data['field_course_course_title']['value']),
'number': data['field_course_code'], 'number': data['field_course_code'],
'full_description': self.get_description(data), 'full_description': self.get_description(data),
'video': self.get_video(data), 'video': self.get_video(data),
...@@ -454,7 +454,7 @@ class CourseMarketingSiteDataLoader(AbstractMarketingSiteDataLoader): ...@@ -454,7 +454,7 @@ class CourseMarketingSiteDataLoader(AbstractMarketingSiteDataLoader):
'key': key, 'key': key,
'course': course, 'course': course,
'uuid': uuid, 'uuid': uuid,
'title_override': data['field_course_course_title']['value'], 'title_override': self.clean_html(data['field_course_course_title']['value']),
'language': language, 'language': language,
'slug': slug, 'slug': slug,
'card_image_url': self._get_nested_url(data.get('field_course_image_promoted')), 'card_image_url': self._get_nested_url(data.get('field_course_image_promoted')),
......
...@@ -64,6 +64,7 @@ class AbstractDataLoaderTest(TestCase): ...@@ -64,6 +64,7 @@ class AbstractDataLoaderTest(TestCase):
('', '',), ('', '',),
('<p>Hello!</p>', 'Hello!'), ('<p>Hello!</p>', 'Hello!'),
('<em>Testing</em>', '<em>Testing</em>'), ('<em>Testing</em>', '<em>Testing</em>'),
('Hello&amp;world&nbsp;!', 'Hello&world!')
) )
for content, expected in data: for content, expected in data:
......
...@@ -423,7 +423,7 @@ class CourseMarketingSiteDataLoaderTests(AbstractMarketingSiteDataLoaderTestMixi ...@@ -423,7 +423,7 @@ class CourseMarketingSiteDataLoaderTests(AbstractMarketingSiteDataLoaderTestMixi
course = self._get_course(data) course = self._get_course(data)
expected_values = { expected_values = {
'title': data['field_course_course_title']['value'], 'title': self.loader.clean_html(data['field_course_course_title']['value']),
'number': data['field_course_code'], 'number': data['field_course_code'],
'full_description': self.loader.get_description(data), 'full_description': self.loader.get_description(data),
'video': self.loader.get_video(data), 'video': self.loader.get_video(data),
...@@ -473,7 +473,7 @@ class CourseMarketingSiteDataLoaderTests(AbstractMarketingSiteDataLoaderTestMixi ...@@ -473,7 +473,7 @@ class CourseMarketingSiteDataLoaderTests(AbstractMarketingSiteDataLoaderTestMixi
expected_values = { expected_values = {
'key': data['field_course_id'], 'key': data['field_course_id'],
'title_override': data['field_course_course_title']['value'], 'title_override': self.loader.clean_html(data['field_course_course_title']['value']),
'language': language, 'language': language,
'slug': data['url'].split('/')[-1], 'slug': data['url'].split('/')[-1],
'card_image_url': (data.get('field_course_image_promoted') or {}).get('url'), 'card_image_url': (data.get('field_course_image_promoted') or {}).get('url'),
......
...@@ -33,7 +33,7 @@ edx-drf-extensions==1.1.1 ...@@ -33,7 +33,7 @@ edx-drf-extensions==1.1.1
edx-opaque-keys==0.3.1 edx-opaque-keys==0.3.1
edx-rest-api-client==1.6.0 edx-rest-api-client==1.6.0
elasticsearch>=1.0.0,<2.0.0 elasticsearch>=1.0.0,<2.0.0
html2text==2016.5.29 html2text==2016.9.19
jsonfield==1.0.3 jsonfield==1.0.3
markdown==2.6.6 markdown==2.6.6
pillow==3.3.0 pillow==3.3.0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment