# render.py
"""
Content rendering functionality

Note that this module is designed to imitate the front end behavior as
implemented in Markdown.Sanitizer.js.
"""
import re

import markdown

# These patterns could be more flexible about things like attributes and
# whitespace, but this is imitating Markdown.Sanitizer.js, so it uses the
# patterns defined therein.

# Anything tag-like: "<", any run of characters other than ">", and an
# optional closing ">". Because the ">" is optional, an unterminated "<..."
# that runs to the end of the input is matched too.
TAG_PATTERN = re.compile(r"<[^>]*>?")
# A complete tag in text that has already been sanitized. Group 1 is "/" for
# an end tag (empty for a start tag); group 2 is the tag name.
SANITIZED_TAG_PATTERN = re.compile(r"<(/?)(\w+)[^>]*>")
# Attribute-free start/end tags for the listed formatting elements, plus
# <br>/<hr> with at most one whitespace character and an optional "/".
ALLOWED_BASIC_TAG_PATTERN = re.compile(
    r"^(</?(b|blockquote|code|del|dd|dl|dt|em|h1|h2|h3|i|kbd|li|ol|p|pre|s|sup|sub|strong|strike|ul)>|<(br|hr)\s?/?>)$"
)
# Either "</a>", or an <a> start tag whose href is an http(s)/ftp URL or a
# path starting with "/", built from a restricted character set and
# optionally followed by a title attribute. No other attributes are accepted.
ALLOWED_A_PATTERN = re.compile(
    r'^(<a\shref="((https?|ftp)://|/)[-A-Za-z0-9+&@#/%?=~_|!:,.;\(\)]+"(\stitle="[^"<>]+")?\s?>|</a>)$'
)
# An <img> tag whose src is an http(s) URL or a path starting with "/",
# optionally followed by width, height (1-3 digits each), alt and title
# attributes, which must appear in exactly that order.
ALLOWED_IMG_PATTERN = re.compile(
    r'^(<img\ssrc="(https?://|/)[-A-Za-z0-9+&@#/%?=~_|!:,.;\(\)]+"(\swidth="\d{1,3}")?'
    r'(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?/?>)$'
)


def _sanitize_tag(match):
    """
    Substitution callback: keep a whitelisted tag, strip anything else

    match is a regex match whose full text is a single tag-like string. The
    tag is returned unchanged when it satisfies one of the whitelist patterns
    (basic tag, anchor or image); otherwise the empty string is returned.
    """
    candidate = match.group(0)
    whitelist = (
        ALLOWED_BASIC_TAG_PATTERN,
        ALLOWED_A_PATTERN,
        ALLOWED_IMG_PATTERN,
    )
    if any(pattern.match(candidate) for pattern in whitelist):
        return candidate
    return ""


def _sanitize_html(source):
    """
    Strip every tag that is not whitelisted from source

    Each tag-like substring is passed through _sanitize_tag, which either
    keeps it verbatim or replaces it with the empty string. Text outside of
    tags is left untouched.
    """
    sanitized = re.sub(TAG_PATTERN, _sanitize_tag, source)
    return sanitized


def _remove_unpaired_tags(source):
    """
    Drop start/end tags that have no partner, keeping all text content

    source is expected to have been sanitized already. Tags that do not need
    closing (p, img, br, li, hr) are always kept. An end tag is kept only if
    a matching start tag is still open; any start tags opened in between are
    discarded (their text is kept). Start tags still open at the end of the
    input are discarded as well.
    """
    matches = list(SANITIZED_TAG_PATTERN.finditer(source))
    if not matches:
        return source

    no_close_needed = ("p", "img", "br", "li", "hr")

    # The text trailing each tag runs up to the next tag, or to the end of
    # the source for the final tag.
    text_ends = [later.start() for later in matches[1:]] + [len(source)]

    # open_tags holds (name, markup) for every start tag awaiting its end tag.
    # buffers always has exactly one more entry than open_tags: buffers[0] is
    # the settled output so far, and buffers[k] is the text collected since
    # open_tags[k - 1] was opened.
    open_tags = []
    buffers = [source[:matches[0].start()]]

    for found, text_end in zip(matches, text_ends):
        markup = found.group(0)
        is_end_tag = found.group(1)
        name = found.group(2)
        trailing = source[found.end():text_end]

        if name in no_close_needed:
            buffers[-1] += markup + trailing
            continue

        if not is_end_tag:
            open_tags.append((name, markup))
            buffers.append(trailing)
            continue

        if any(open_name == name for open_name, _ in open_tags):
            # Unwind to the matching start tag. Start tags passed on the way
            # are unpaired, so only their text is carried down.
            while open_tags:
                open_name, open_markup = open_tags.pop()
                inner = buffers.pop()
                if open_name == name:
                    buffers[-1] += open_markup + inner + markup
                    break
                buffers[-1] += inner
        # An end tag with no open partner is simply omitted; the text after
        # it is kept either way.
        buffers[-1] += trailing

    # Whatever is still open never got closed: join the text, omit the tags.
    return "".join(buffers)


def render_body(raw_body):
    """
    Return raw_body rendered as sanitized HTML.

    The pipeline is:

    1. Convert the Markdown source to HTML
    2. Remove every HTML tag that is not whitelisted
    3. Remove tags that are left without a matching partner

    Unlike the forums JavaScript code, this does not shield MathJax blocks:
    Markdown syntax inside them is still processed.
    """
    html = markdown.markdown(raw_body)
    return _remove_unpaired_tags(_sanitize_html(html))