render.py

"""
Content rendering functionality

Note that this module is designed to imitate the front end behavior as
implemented in Markdown.Sanitizer.js.
"""
import re

import markdown

# These patterns could be more flexible about things like attributes and
# whitespace, but this is imitating Markdown.Sanitizer.js, so it uses the
# patterns defined therein.

# Anything that looks like a tag: "<", any run of non-">" characters, and an
# optional closing ">" (so an unterminated "<..." is matched as well).
TAG_PATTERN = re.compile(r"<[^>]*>?")
# A complete tag. Group 1 is "/" for an end tag (empty for a start tag) and
# group 2 is the tag name.
SANITIZED_TAG_PATTERN = re.compile(r"<(/?)(\w+)[^>]*>")
# Attribute-free start/end tags for the listed elements, plus <br> and <hr>
# with at most one whitespace character and an optional self-closing slash.
ALLOWED_BASIC_TAG_PATTERN = re.compile(
    r"^(</?(b|blockquote|code|del|dd|dl|dt|em|h1|h2|h3|i|kbd|li|ol|p|pre|s|sup|sub|strong|strike|ul)>|<(br|hr)\s?/?>)$"
)
# Either "</a>" or an <a> start tag whose first attribute is an href with an
# http, https, or ftp URL (or a path starting with "/"), optionally followed
# by a non-empty title attribute.
ALLOWED_A_PATTERN = re.compile(
    r'^(<a\shref="((https?|ftp)://|/)[-A-Za-z0-9+&@#/%?=~_|!:,.;\(\)]+"(\stitle="[^"<>]+")?\s?>|</a>)$'
)
# An <img> tag whose first attribute is a src with an http or https URL (or a
# path starting with "/"), optionally followed by width, height (1-3 digits
# each), alt, and title attributes, in exactly that order.
ALLOWED_IMG_PATTERN = re.compile(
    r'^(<img\ssrc="(https?://|/)[-A-Za-z0-9+&@#/%?=~_|!:,.;\(\)]+"(\swidth="\d{1,3}")?'
    r'(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?/?>)$'
)


def _sanitize_tag(match):
    """
    Substitution callback: keep an allowed tag, blank out anything else

    match is a regex match whose full text is the candidate tag. The tag text
    is returned unchanged when it matches one of the allowlist patterns
    (basic tags, anchors, images); otherwise the empty string is returned.
    """
    candidate = match.group(0)
    allowlist = (
        ALLOWED_BASIC_TAG_PATTERN,
        ALLOWED_A_PATTERN,
        ALLOWED_IMG_PATTERN,
    )
    # any() short-circuits, so the patterns are tried in the listed order
    if any(pattern.match(candidate) for pattern in allowlist):
        return candidate
    return ""


def _sanitize_html(source):
    """
    Strip every tag that is not on the allowlist from source

    Each tag-like span is passed through _sanitize_tag; the text between the
    tags is left untouched.
    """
    sanitized = TAG_PATTERN.sub(_sanitize_tag, source)
    return sanitized


def _remove_unpaired_tags(source):
    """
    Drop start tags with no end tag and end tags with no start tag

    The text content around removed tags is kept. Tags that do not require
    closing (p, img, br, li, hr) are always kept as they are.

    source should have already been sanitized
    """
    matches = list(SANITIZED_TAG_PATTERN.finditer(source))
    if len(matches) == 0:
        return source

    optional_close = ("p", "img", "br", "li", "hr")

    # Each frame is [tag name, start tag text, text accumulated after it].
    # The bottom frame is a sentinel for output that is not inside any open
    # tag; its name is None so it can never be matched by an end tag.
    frames = [[None, "", source[:matches[0].start()]]]

    # The text following a tag runs up to the next tag, or to the end of
    # source for the last tag.
    next_starts = [later.start() for later in matches[1:]] + [len(source)]

    for match, next_start in zip(matches, next_starts):
        tag = match.group(0)
        is_end_tag = match.group(1)
        name = match.group(2)
        trailing = source[match.end():next_start]

        if name in optional_close:
            # never needs a partner; emit into the innermost open frame
            frames[-1][2] += tag + trailing
            continue

        if not is_end_tag:
            # start tag: hold it back until its end tag shows up
            frames.append([name, tag, trailing])
            continue

        if any(frame[0] == name for frame in frames):
            # Unwind to the matching start tag. Frames popped on the way are
            # unclosed start tags: their text is kept, the tags are dropped.
            while True:
                open_name, open_tag, inner = frames.pop()
                if open_name == name:
                    frames[-1][2] += open_tag + inner + tag
                    break
                frames[-1][2] += inner
        # an end tag with no matching start tag is simply dropped
        frames[-1][2] += trailing

    # Frames still open here are start tags that were never closed; only
    # their text is emitted.
    return "".join(frame[2] for frame in frames)


def render_body(raw_body):
    """
    Produce sanitized HTML from the Markdown text in raw_body.

    The text goes through three stages, in order:

    1. Markdown is converted to HTML
    2. HTML tags that are not on the allowlist are stripped
    3. Unbalanced HTML tags are removed

    Note that this does not prevent Markdown syntax inside a MathJax block from
    being processed, which the forums JavaScript code does.
    """
    html = markdown.markdown(raw_body)
    return _remove_unpaired_tags(_sanitize_html(html))