Sender side seems to work.

8f2ca4ae · Gabriel · 040f262e · 8f2ca4ae
Commit 8f2ca4ae authored Jan 28, 2012 by Gabriel
Hide whitespace changes
Inline Side-by-side

Showing with 63 additions and 16 deletions

rfc6266.py
+63 -16

No files found.
--- a/rfc6266.py
+++ b/rfc6266.py
@@ -5,8 +5,8 @@
 from lepl import *
 from collections import namedtuple
-from urllib import unquote
+from urllib import quote, unquote
-from string import hexdigits
+from string import hexdigits, ascii_letters, digits
 import re
 __all__ = ('ContentDisposition', )
@@ -41,6 +41,9 @@ class ContentDisposition(object):
    def from_header(cls, hdrval):
        # Require hdrval to be ascii bytes (0-127),
        # or characters in the ascii range
+        # XXX We might allow non-ascii here (see the definition of qdtext),
+        # but parsing it would still be ambiguous. OTOH, we might allow it
+        # just so that the non-ambiguous filename* value does get parsed.
        hdrval = hdrval.encode('ascii')
        rv, = content_disposition_value.parse(hdrval)
        return rv
@@ -75,20 +78,39 @@ def CaseInsensitiveLiteral(lit):
    return Regexp('(?i)' + re.escape(lit))
-# To debug, wrap in this block:
+# RFC 2616
-#with TraceVariables():
 separator_chars = "()<>@,;:\\\"/[]?={} \t"
 ctl_chars = ''.join(chr(i) for i in xrange(32)) + chr(127)
 nontoken_chars = separator_chars + ctl_chars
+# RFC 5987
+attr_chars_nonalnum = '!#$&+-.^_`|~'
+attr_chars = ascii_letters + digits + attr_chars_nonalnum
+# RFC 5987 gives this alternative construction of the token character class
+token_chars = attr_chars + "*'%"
+# To debug, wrap in this block:
+#with TraceVariables():
 # Definitions from https://tools.ietf.org/html/rfc2616#section-2.2
-token = AnyBut(nontoken_chars)[1:, ...]
+# token was redefined from attr_chars to avoid using AnyBut,
+# which might include non-ascii octets.
+token = Any(token_chars)[1:, ...]
 # RFC 2616 says some linear whitespace (LWS) is in fact allowed in text
 # and qdtext; however it also mentions folding that whitespace into
 # a single SP (which isn't in CTL).
 # Assume the caller already did that folding when parsing headers.
+# XXX qdtext also allows non-ascii, which might be
+# parsed as ISO-8859-1 (but is ambiguous). We should probably reject it.
+# Everything else in this grammar (including RFC 5987 ext values)
+# is ascii-safe.
+# Because of this, this is the only character class to use AnyBut,
+# and all the others are defined with Any.
 qdtext = AnyBut('"' + ctl_chars)
 char = Any(''.join(chr(i) for i in xrange(128)))  # ascii range: 0-127
@@ -102,9 +124,11 @@ value = token | quoted_string
 # for future evolutions.
 charset = (CaseInsensitiveLiteral('UTF-8')
           | CaseInsensitiveLiteral('ISO-8859-1'))
 # XXX See RFC 5646 for the correct definition
 language = token
-attr_char = AnyBut(nontoken_chars + "*'%")
+attr_char = Any(attr_chars)
 hexdig = Any(hexdigits)
 pct_encoded = '%' + hexdig + hexdig >> unquote
 value_chars = (pct_encoded | attr_char)[...]
@@ -132,22 +156,36 @@ def is_token_char(ch):
    return 31 < asciicode < 127 and ch not in separator_chars
+def usesonlycharsfrom(candidate, chars):
+    # Found that shortcut in urllib.quote
+    return not candidate.rstrip(chars)
 def is_token(candidate):
    return all(is_token_char(ch) for ch in candidate)
-def header_for_filename(filename, filename_ascii=None):
+def header_for_filename(filename, compat='ignore', filename_ascii=None):
+    # Compat methods (fallback for receivers that can't handle filename*):
+    # - ignore (give only filename*);
+    # - strip accents using unicode's decomposing normalisations,
+    # which can be done from unicode data (stdlib), and keep only ascii;
+    # - use the ascii transliteration tables from Unidecode (PyPI);
+    # - use iso-8859-1.
+    # Ignore is the safest, and can be used to trigger a fallback
+    # to the document location.
+    # While this method exists, it could also sanitize the filename
+    # by rejecting slashes or other weirdness that might upset a receiver.
+    if compat != 'ignore':
+        raise NotImplementedError
    if is_token(filename):
        return 'attachment; filename=%s' % filename
-    try:
+    return "attachment; filename*=utf-8''%s" % quote(
-        asc = filename.encode('ascii')
+        filename.encode('utf-8'), safe=attr_chars_nonalnum)
-        iso = filename.encode('iso-8859-1')
-    except UnicodeEncodeError:
-        return 'attachment; filename=%s; filename*=%s' % (fn1, fn2)
-    else:
-        # The filename is ascii already
-        pass
 def test_cdfh():
@@ -160,4 +198,13 @@ def test_cdfh():
        'attachment; filename="EURO rates"; filename*=utf-8\'\'%e2%82%ac%20rates')
    assert cd.filename == u'€ rates'
+    def roundtrip(filename):
+        return ContentDisposition.from_header(
+            header_for_filename(filename)).filename
+    def assert_roundtrip(filename):
+        assert roundtrip(filename) == filename
+    assert_roundtrip(u'aéioou"qfsdf!')