Commit 6e1b8749 by Jason Bau

Merge pull request #1440 from edx/jbau/COE-sanitize-with-bleach

Jbau/coe sanitize with bleach
parents 9936fc0a 651ef209
import json import json
import logging import logging
from lxml.html.clean import Cleaner, autolink_html
import re import re
import bleach
from html5lib.tokenizer import HTMLTokenizer
from xmodule.progress import Progress from xmodule.progress import Progress
import capa.xqueue_interface as xqueue_interface import capa.xqueue_interface as xqueue_interface
from capa.util import * from capa.util import *
...@@ -50,24 +51,14 @@ def upload_to_s3(file_to_upload, keyname, s3_interface): ...@@ -50,24 +51,14 @@ def upload_to_s3(file_to_upload, keyname, s3_interface):
return public_url return public_url
class WhiteListCleaner(Cleaner): # Used by sanitize_html
""" ALLOWED_HTML_ATTRS = {
By default, lxml cleaner strips out all links that are not in a defined whitelist. '*': ['id', 'class', 'height', 'width', 'alt'],
We want to allow all links, and rely on the peer grading flagging mechanic to catch 'a': ['href', 'title', 'rel', 'target'],
the "bad" ones. So, don't define a whitelist at all. 'embed': ['src'],
""" 'iframe': ['src'],
def allow_embedded_url(self, el, url): 'img': ['src'],
""" }
Override the Cleaner allow_embedded_url method to remove the whitelist url requirement.
Ensure that any tags not in the whitelist are stripped beforehand.
"""
# Tell cleaner to strip any element with a tag that isn't whitelisted.
if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
return False
# Tell cleaner to allow all urls.
return True
class OpenEndedChild(object): class OpenEndedChild(object):
...@@ -228,22 +219,23 @@ class OpenEndedChild(object): ...@@ -228,22 +219,23 @@ class OpenEndedChild(object):
answer - any string answer - any string
return - a cleaned version of the string return - a cleaned version of the string
""" """
try: clean_html = bleach.clean(answer,
answer = autolink_html(answer) tags=['embed', 'iframe', 'a', 'img', 'br'],
cleaner = WhiteListCleaner( attributes=ALLOWED_HTML_ATTRS,
style=True, strip=True)
links=True, autolinked = bleach.linkify(clean_html,
add_nofollow=False, callbacks=[bleach.callbacks.target_blank],
page_structure=True, skip_pre=True,
safe_attrs_only=True, tokenizer=HTMLTokenizer)
whitelist_tags=('embed', 'iframe', 'a', 'img', 'br',) return OpenEndedChild.replace_newlines(autolinked)
)
clean_html = cleaner.clean_html(answer) @staticmethod
clean_html = re.sub(r'</p>$', '', re.sub(r'^<p>', '', clean_html)) def replace_newlines(html):
clean_html = re.sub("\n","<br/>", clean_html) """
except Exception: Replaces "\n" newlines with <br/>
clean_html = answer """
return clean_html retv = re.sub(r'</p>$', '', re.sub(r'^<p>', '', html))
return re.sub("\n","<br/>", retv)
def new_history_entry(self, answer): def new_history_entry(self, answer):
""" """
......
...@@ -1001,3 +1001,92 @@ class OpenEndedModuleXmlImageUploadTest(unittest.TestCase, DummyModulestore): ...@@ -1001,3 +1001,92 @@ class OpenEndedModuleXmlImageUploadTest(unittest.TestCase, DummyModulestore):
self.assertTrue(response['success']) self.assertTrue(response['success'])
self.assertIn(self.answer_link, response['student_response']) self.assertIn(self.answer_link, response['student_response'])
self.assertIn(self.autolink_tag, response['student_response']) self.assertIn(self.autolink_tag, response['student_response'])
class OpenEndedModuleUtilTest(unittest.TestCase):
    """
    Tests for the HTML utility functions on OpenEndedChild: the sanitizer
    (sanitize_html) and the newline-to-<br/> inserter (replace_newlines).

    Each ``*_dirty`` / ``*_noencd`` / ``link_text`` fixture is an input and the
    matching ``*_clean`` / ``*_encode`` / ``link_atag`` fixture is the exact
    output the tests expect for it.
    """
    # <script> tag: the tests expect the tag removed and its text content kept.
    script_dirty = u'<script>alert("xss!")</script>'
    script_clean = u'alert("xss!")'

    # Allowed tags carrying an event-handler attribute: the tests expect the
    # tag to survive with the handler attribute dropped.
    img_dirty = u'<img alt="cats" height="200" onclick="eval()" src="http://example.com/lolcats.jpg" width="200">'
    img_clean = u'<img alt="cats" height="200" src="http://example.com/lolcats.jpg" width="200">'
    embed_dirty = u'<embed height="200" id="cats" onhover="eval()" src="http://example.com/lolcats.swf" width="200"/>'
    embed_clean = u'<embed height="200" id="cats" src="http://example.com/lolcats.swf" width="200">'
    iframe_dirty = u'<iframe class="cats" height="200" onerror="eval()" src="http://example.com/lolcats" width="200"/>'
    iframe_clean = u'<iframe class="cats" height="200" src="http://example.com/lolcats" width="200"></iframe>'

    # Plain text with non-ASCII characters (curly quotes, u-umlaut).
    text = u'I am a \u201c\xfcber student\u201d'

    # Bare "<" and ">" in running text and their entity-encoded form.
    text_lessthan_noencd = u'This used to be broken < by the other parser. 3>5'
    text_lessthan_encode = u'This used to be broken &lt; by the other parser. 3&gt;5'

    # Newline-separated text and the same text with <br/> in place of "\n".
    text_linebreaks = u"St\xfcdent submission:\nI like lamp."
    text_brs = u"St\xfcdent submission:<br/>I like lamp."

    # A bare URL in text and the anchor tag expected after auto-linking.
    link_text = u'I love going to www.lolcatz.com'
    link_atag = u'I love going to <a href="http://www.lolcatz.com" target="_blank">www.lolcatz.com</a>'

    def test_script(self):
        """
        Basic test for stripping <script>
        """
        self.assertEqual(OpenEndedChild.sanitize_html(self.script_dirty), self.script_clean)

    def test_img(self):
        """
        Basic test for passing through img, but stripping bad attr
        """
        self.assertEqual(OpenEndedChild.sanitize_html(self.img_dirty), self.img_clean)

    def test_embed(self):
        """
        Basic test for passing through embed, but stripping bad attr
        """
        self.assertEqual(OpenEndedChild.sanitize_html(self.embed_dirty), self.embed_clean)

    def test_iframe(self):
        """
        Basic test for passing through iframe, but stripping bad attr
        """
        self.assertEqual(OpenEndedChild.sanitize_html(self.iframe_dirty), self.iframe_clean)

    def test_text(self):
        """
        Test for passing through text unchanged, including unicode
        """
        self.assertEqual(OpenEndedChild.sanitize_html(self.text), self.text)

    def test_lessthan(self):
        """
        Tests that a bare `<` or `>` in text context is entity-encoded
        rather than treated as markup
        """
        self.assertEqual(OpenEndedChild.sanitize_html(self.text_lessthan_noencd), self.text_lessthan_encode)

    def test_linebreaks(self):
        """
        Tests the replace_newlines function
        """
        self.assertEqual(OpenEndedChild.replace_newlines(self.text_linebreaks), self.text_brs)

    def test_linkify(self):
        """
        Tests that sanitize_html linkifies a bare URL into an <a> tag
        """
        self.assertEqual(OpenEndedChild.sanitize_html(self.link_text), self.link_atag)

    def test_combined(self):
        """
        Tests a combination of inputs: every fixture joined with newlines
        must come out as the matching clean fixtures joined with <br/>
        """
        test_input = u"{}\n{}\n{}\n\n{}{}\n{}".format(self.link_text,
                                                      self.text,
                                                      self.script_dirty,
                                                      self.embed_dirty,
                                                      self.text_lessthan_noencd,
                                                      self.img_dirty)
        test_output = u"{}<br/>{}<br/>{}<br/><br/>{}{}<br/>{}".format(self.link_atag,
                                                                      self.text,
                                                                      self.script_clean,
                                                                      self.embed_clean,
                                                                      self.text_lessthan_encode,
                                                                      self.img_clean)
        self.assertEqual(OpenEndedChild.sanitize_html(test_input), test_output)
...@@ -8,6 +8,8 @@ ...@@ -8,6 +8,8 @@
beautifulsoup4==4.1.3 beautifulsoup4==4.1.3
beautifulsoup==3.2.1 beautifulsoup==3.2.1
bleach==1.2.2
html5lib==0.95
boto==2.6.0 boto==2.6.0
celery==3.0.19 celery==3.0.19
dealer==0.2.3 dealer==0.2.3
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment