html_to_text.py 705 Bytes
Newer Older
1
"""Provides a function to convert html to plaintext."""
2
import logging
3 4
from subprocess import Popen, PIPE

5 6
log = logging.getLogger(__name__)

7

8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
def html_to_text(html_message):
    """
    Converts an html message to plaintext.
    Currently uses lynx in a subprocess; should be refactored to
    use something more pythonic.

    :param html_message: the html document as a text string; it is
        encoded to UTF-8 before being piped to lynx on stdin.
    :returns: whatever lynx wrote to stdout (the ``-dump`` rendering),
        exactly as returned by ``Popen.communicate`` for a pipe that
        was not opened in text mode.

    Anything lynx writes to stderr is captured and logged at INFO level.
    The ``lynx`` executable must be available on the PATH; if it cannot
    be started, the exception raised by ``Popen`` propagates to the caller.
    """
    process = Popen(
        ['lynx', '-stdin', '-display_charset=UTF-8', '-assume_charset=UTF-8', '-dump'],
        stdin=PIPE,
        stdout=PIPE,
        # stderr must be a pipe as well: without it communicate() always
        # returns None for the error stream and the logging below never runs.
        stderr=PIPE
    )
    # use lynx to get plaintext
    (plaintext, err_from_stderr) = process.communicate(
        input=html_message.encode('utf-8')
    )

    # Surface any diagnostics lynx emitted instead of silently dropping them.
    if err_from_stderr:
        log.info(err_from_stderr)

    return plaintext