handle non-BMP unicode chars in _trunc()

c8cdcd6c · jsa · 5d425578 · c8cdcd6c · c8cdcd6c
Commit c8cdcd6c authored Dec 20, 2013 by jsa
Hide whitespace changes
Inline Side-by-side

Showing with 23 additions and 9 deletions

notifier/digest.py
+23 -6

notifier/tests/test_digest.py
+0 -3

No files found.
--- a/notifier/digest.py
+++ b/notifier/digest.py
@@ -4,6 +4,7 @@ General formatting and rendering helpers for digest notifications.
 import datetime
 import logging
+import struct
 from django.conf import settings
 from django.template.loader import get_template
@@ -34,17 +35,33 @@ def _trunc(s, length):
    Truncate the string `s` to no more than `length`, using ellipsis and
    without chopping words.
-    >>> _trunc("one two three", 13)
+    This function works on both str and unicode objects.  If a str
-    'one two three'
+    is passed, it may return a unicode. If a unicode is passed, it will
-    >>> _trunc("one two three", 12)
+    always return a unicode.
-    'one two...'
+    >>> _trunc(u"one two three", 13)
+    u'one two three'
+    >>> _trunc(u"one two three", 12)
+    u'one two...'
    """
+    # Some Python 2.7 builds ("narrow" builds) do not support non-BMP unicode characters.
+    # To function properly on such systems, we convert to code points
+    # inside this function before counting / slicing characters, and
+    # decode again prior to concatenating the output value.
    s = s.strip()
-    if len(s) <= length:
+    u = s.encode('utf-32-le')
+    pts = struct.unpack('<{}L'.format(len(u) / 4), u)
+    if len(pts) <= length:
        # nothing to do
        return s
    # truncate, taking an extra -3 off the orig string for the ellipsis itself
-    return s[:length - 3].rsplit(' ', 1)[0].strip() + '...'
+    # see above comment about non-BMP support for why this is done in such
+    # an elaborate fashion.
+    uchr = lambda x: '\U{0:08x}'.format(x).decode('unicode-escape')
+    return ''.join(uchr(p) for p in pts[:length - 3]).rsplit(' ', 1)[0].strip() + '...'
 def _make_text_list(values):

--- a/notifier/tests/test_digest.py
+++ b/notifier/tests/test_digest.py
@@ -23,7 +23,6 @@ class DigestItemTestCase(TestCase):
    def test_CJK(self):
        self._test_unicode_data(u"ｲんﾉ丂 ｱo丂ｲ co刀ｲﾑﾉ刀丂 cﾌズ", u"ｲんﾉ丂 ｱo丂ｲ...")
-    @skip("Non-BMP characters are not handled correctly")
    def test_non_BMP(self):
        self._test_unicode_data(u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥 𝕔𝕠𝕟𝕥𝕒𝕚𝕟𝕤 𝕔𝕙𝕒𝕣𝕒𝕔𝕥𝕖𝕣𝕤 𝕠𝕦𝕥𝕤𝕚𝕕𝕖 𝕥𝕙𝕖 𝔹𝕄ℙ", u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥...")
@@ -48,7 +47,6 @@ class DigestThreadTestCase(TestCase):
    def test_CJK(self):
        self._test_unicode_data(u"ｲんﾉ丂 ｱo丂ｲ co刀ｲﾑﾉ刀丂 cﾌズ", u"ｲんﾉ丂 ｱo丂ｲ...")
-    @skip("Non-BMP characters are not handled correctly")
    def test_non_BMP(self):
        self._test_unicode_data(u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥 𝕔𝕠𝕟𝕥𝕒𝕚𝕟𝕤 𝕔𝕙𝕒𝕣𝕒𝕔𝕥𝕖𝕣𝕤 𝕠𝕦𝕥𝕤𝕚𝕕𝕖 𝕥𝕙𝕖 𝔹𝕄ℙ", u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥...")
@@ -91,7 +89,6 @@ class RenderDigestTestCase(TestCase):
    def test_CJK(self):
        self._test_unicode_data(u"ｲんﾉ丂 ｱo丂ｲ co刀ｲﾑﾉ刀丂 cﾌズ", u"ｲんﾉ丂 ｱo丂ｲ...")
-    @skip("Non-BMP characters are not handled correctly")
    def test_non_BMP(self):
        self._test_unicode_data(u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥 𝕔𝕠𝕟𝕥𝕒𝕚𝕟𝕤 𝕔𝕙𝕒𝕣𝕒𝕔𝕥𝕖𝕣𝕤 𝕠𝕦𝕥𝕤𝕚𝕕𝕖 𝕥𝕙𝕖 𝔹𝕄ℙ", u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥...")