Commit c8cdcd6c by jsa

handle non-BMP unicode chars in _trunc()

parent 5d425578
...@@ -4,6 +4,7 @@ General formatting and rendering helpers for digest notifications. ...@@ -4,6 +4,7 @@ General formatting and rendering helpers for digest notifications.
import datetime import datetime
import logging import logging
import struct
from django.conf import settings from django.conf import settings
from django.template.loader import get_template from django.template.loader import get_template
...@@ -34,17 +35,33 @@ def _trunc(s, length): ...@@ -34,17 +35,33 @@ def _trunc(s, length):
Truncate the string `s` to no more than `length`, using ellipsis and Truncate the string `s` to no more than `length`, using ellipsis and
without chopping words. without chopping words.
>>> _trunc("one two three", 13) This function works on both str and unicode objects. If a str
'one two three' is passed, it may return a unicode. If a unicode is passed, it will
>>> _trunc("one two three", 12) always return a unicode.
'one two...'
>>> _trunc(u"one two three", 13)
u'one two three'
>>> _trunc(u"one two three", 12)
u'one two...'
""" """
# Narrow Python 2.7 builds store non-BMP unicode characters as surrogate
# pairs, so len() and slicing count UTF-16 code units, not characters.
# To function properly on such systems, we convert to code points
# inside this function before counting / slicing characters, and
# decode again prior to concatenating the output value.
s = s.strip() s = s.strip()
if len(s) <= length: u = s.encode('utf-32-le')
pts = struct.unpack('<{}L'.format(len(u) / 4), u)
if len(pts) <= length:
# nothing to do # nothing to do
return s return s
# truncate, taking an extra -3 off the orig string for the ellipsis itself # truncate, taking an extra -3 off the orig string for the ellipsis itself
return s[:length - 3].rsplit(' ', 1)[0].strip() + '...' # see above comment about non-BMP support for why this is done in such
# elaborate fashion.
uchr = lambda x: '\U{0:08x}'.format(x).decode('unicode-escape')
return ''.join(uchr(p) for p in pts[:length - 3]).rsplit(' ', 1)[0].strip() + '...'
def _make_text_list(values): def _make_text_list(values):
......
...@@ -23,7 +23,6 @@ class DigestItemTestCase(TestCase): ...@@ -23,7 +23,6 @@ class DigestItemTestCase(TestCase):
def test_CJK(self): def test_CJK(self):
self._test_unicode_data(u"イんノ丂 アo丂イ co刀イムノ刀丂 cフズ", u"イんノ丂 アo丂イ...") self._test_unicode_data(u"イんノ丂 アo丂イ co刀イムノ刀丂 cフズ", u"イんノ丂 アo丂イ...")
@skip("Non-BMP characters are not handled correctly")
def test_non_BMP(self): def test_non_BMP(self):
self._test_unicode_data(u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥 𝕔𝕠𝕟𝕥𝕒𝕚𝕟𝕤 𝕔𝕙𝕒𝕣𝕒𝕔𝕥𝕖𝕣𝕤 𝕠𝕦𝕥𝕤𝕚𝕕𝕖 𝕥𝕙𝕖 𝔹𝕄ℙ", u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥...") self._test_unicode_data(u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥 𝕔𝕠𝕟𝕥𝕒𝕚𝕟𝕤 𝕔𝕙𝕒𝕣𝕒𝕔𝕥𝕖𝕣𝕤 𝕠𝕦𝕥𝕤𝕚𝕕𝕖 𝕥𝕙𝕖 𝔹𝕄ℙ", u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥...")
...@@ -48,7 +47,6 @@ class DigestThreadTestCase(TestCase): ...@@ -48,7 +47,6 @@ class DigestThreadTestCase(TestCase):
def test_CJK(self): def test_CJK(self):
self._test_unicode_data(u"イんノ丂 アo丂イ co刀イムノ刀丂 cフズ", u"イんノ丂 アo丂イ...") self._test_unicode_data(u"イんノ丂 アo丂イ co刀イムノ刀丂 cフズ", u"イんノ丂 アo丂イ...")
@skip("Non-BMP characters are not handled correctly")
def test_non_BMP(self): def test_non_BMP(self):
self._test_unicode_data(u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥 𝕔𝕠𝕟𝕥𝕒𝕚𝕟𝕤 𝕔𝕙𝕒𝕣𝕒𝕔𝕥𝕖𝕣𝕤 𝕠𝕦𝕥𝕤𝕚𝕕𝕖 𝕥𝕙𝕖 𝔹𝕄ℙ", u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥...") self._test_unicode_data(u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥 𝕔𝕠𝕟𝕥𝕒𝕚𝕟𝕤 𝕔𝕙𝕒𝕣𝕒𝕔𝕥𝕖𝕣𝕤 𝕠𝕦𝕥𝕤𝕚𝕕𝕖 𝕥𝕙𝕖 𝔹𝕄ℙ", u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥...")
...@@ -91,7 +89,6 @@ class RenderDigestTestCase(TestCase): ...@@ -91,7 +89,6 @@ class RenderDigestTestCase(TestCase):
def test_CJK(self): def test_CJK(self):
self._test_unicode_data(u"イんノ丂 アo丂イ co刀イムノ刀丂 cフズ", u"イんノ丂 アo丂イ...") self._test_unicode_data(u"イんノ丂 アo丂イ co刀イムノ刀丂 cフズ", u"イんノ丂 アo丂イ...")
@skip("Non-BMP characters are not handled correctly")
def test_non_BMP(self): def test_non_BMP(self):
self._test_unicode_data(u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥 𝕔𝕠𝕟𝕥𝕒𝕚𝕟𝕤 𝕔𝕙𝕒𝕣𝕒𝕔𝕥𝕖𝕣𝕤 𝕠𝕦𝕥𝕤𝕚𝕕𝕖 𝕥𝕙𝕖 𝔹𝕄ℙ", u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥...") self._test_unicode_data(u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥 𝕔𝕠𝕟𝕥𝕒𝕚𝕟𝕤 𝕔𝕙𝕒𝕣𝕒𝕔𝕥𝕖𝕣𝕤 𝕠𝕦𝕥𝕤𝕚𝕕𝕖 𝕥𝕙𝕖 𝔹𝕄ℙ", u"𝕋𝕙𝕚𝕤 𝕡𝕠𝕤𝕥...")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment