Commit 9112d87e by Gabriel

Python3 compatibility with 2to3, testing with tox.

2to3 is enabled by switching from distutils to Distribute.

LEPL now works on character strings; I'd prefer byte strings,
but this seems to work.
parent 94ddf76a
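
For context, here is a minimal sketch (not part of the commit) of what the 2to3 fixers do to the Python-2-only names this diff relies on; Distribute runs the conversion at build time, so the checked-in source stays Python 2:

    # Python 2 spellings as they appear in the module:
    from urllib import quote, unquote   # 2to3 rewrites to: from urllib.parse import quote, unquote
    from urlparse import urlsplit       # 2to3 rewrites to: from urllib.parse import urlsplit

    # 2to3 also rewrites the other Python-2-only names used in the module,
    # for example xrange -> range and unicode -> str.
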
@@ -29,12 +29,18 @@ Read RFC 6266 section 4.3 for more details.
 # Compatibility
 
-Currently tested under Python 2.7.
+Currently tested under Python 2.7, Python 2.6, and Python 3.2.
 
 # Testing
 
+To test in the current Python implementation:
+
     nosetests --all-modules --detailed-errors
 
+To test with tox:
+
+    tox
+
 # References
 
 * RFC 6266 <https://tools.ietf.org/html/rfc6266>
...
@@ -14,12 +14,13 @@ build_header handles the sender side.
 from lepl import *
 from collections import namedtuple
 from urllib import quote, unquote
+from urlparse import urlsplit
 from string import hexdigits, ascii_letters, digits
 import posixpath
 import os.path
 import re
-import urlparse
+import sys
 
 __all__ = (
     'ContentDisposition',
@@ -30,9 +31,24 @@ __all__ = (
 )
 
+PY3K = sys.version_info >= (3,)
+
 LangTagged = namedtuple('LangTagged', 'string langtag')
 
+if PY3K:
+    percent_encode = quote
+    percent_decode = unquote
+else:
+    def percent_encode(string, **kwargs):
+        encoding = kwargs.pop('encoding')
+        return quote(string.encode(encoding), **kwargs)
+
+    def percent_decode(string, **kwargs):
+        encoding = kwargs.pop('encoding')
+        return unquote(string, **kwargs).decode(encoding)
+
 class ContentDisposition(object):
     """
     Records various indications and hints about content disposition.
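
As a quick illustration of the shim above (a sketch, not part of the diff; it assumes the module is importable as rfc6266), both interpreters end up with the same character-string behaviour:

    from rfc6266 import percent_encode, percent_decode

    # Python 2 encodes the text to bytes first; Python 3's quote() takes the
    # encoding keyword directly. Either way the result is the same.
    percent_encode(u'sol\xe9.pdf', safe='', encoding='utf-8')
    # -> 'sol%C3%A9.pdf'

    # Decoding returns a character string on both versions.
    percent_decode('sol%C3%A9.pdf', encoding='utf-8')
    # -> u'sol\xe9.pdf'
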
@@ -86,8 +102,9 @@ class ContentDisposition(object):
     @property
     def location_path(self):
         if self.location:
-            return unquote(
-                urlparse.urlsplit(self.location, scheme='http').path)
+            return percent_decode(
+                urlsplit(self.location, scheme='http').path,
+                encoding='utf-8')
 
     def filename_sanitized(self, extension, default_filename='file'):
         """Returns a filename that is safer to use on the filesystem.
@@ -135,6 +152,14 @@ class ContentDisposition(object):
             self.disposition, self.assocs, self.location)
 
+def ensure_charset(text, encoding):
+    if isinstance(text, bytes):
+        return text.decode(encoding)
+    else:
+        assert fits_inside_codec(text, encoding)
+        return text
+
 def parse_headers(content_disposition, location=None):
     """Build a ContentDisposition from header values.
     """
@@ -146,7 +171,7 @@ def parse_headers(content_disposition, location=None):
     if False:
         # Require content_disposition to be ascii bytes (0-127),
         # or characters in the ascii range
-        content_disposition = content_disposition.encode('ascii')
+        content_disposition = ensure_charset(content_disposition, 'ascii')
     else:
         # We allow non-ascii here (it will only be parsed inside of
         # qdtext, and rejected by the grammar if it appears in
@@ -155,14 +180,12 @@ def parse_headers(content_disposition, location=None):
         # won't get dismissed because of an unrelated ambiguity
         # in the filename parameter. But it does mean we occasionally
         # give less-than-certain values for some legacy senders.
-        content_disposition = content_disposition.encode('iso-8859-1')
+        content_disposition = ensure_charset(content_disposition, 'iso-8859-1')
 
     # Check the caller already did LWS-folding (normally done
     # when separating header names and values; RFC 2616 section 2.2
     # says it should be done before interpretation at any rate).
-    # Since this is ascii the definition of space is known; I don't know
-    # what Python's definition of space chars will be if we allow
-    # iso-8859-1.
+    # Hopefully space still means what it should in iso-8859-1.
     # This check is a bit stronger that LWS folding, it will
     # remove CR and LF even if they aren't part of a CRLF.
     # However http doesn't allow isolated CR and LF in headers outside
@@ -197,7 +220,9 @@ def parse_ext_value(val):
     else:
         charset, coded = val
         langtag = None
-    decoded = coded.decode(charset)
+    if not PY3K and isinstance(coded, unicode):
+        coded = coded.encode('ascii')
+    decoded = percent_decode(coded, encoding=charset)
     return LangTagged(decoded, langtag)
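
For reference, this is what the new decoding path does once the grammar has split an RFC 5987-style ext-value such as filename*=UTF-8''%c2%a3%20rates into its charset and coded parts (a sketch, not part of the diff):

    from rfc6266 import percent_decode

    # charset='UTF-8', coded='%c2%a3%20rates' after parsing; the percent
    # escapes are undone and the bytes decoded in one step.
    percent_decode('%c2%a3%20rates', encoding='UTF-8')
    # -> u'\xa3 rates'  (i.e. u'£ rates')
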
@@ -253,7 +278,7 @@ char = Any(''.join(chr(i) for i in xrange(128))) # ascii range: 0-127
 quoted_pair = Drop('\\') + char
 quoted_string = (Drop('"') & (quoted_pair | qdtext)[:, ...] & Drop('"')
-    ) > parse_iso
+    ) #> parse_iso
 
 value = token | quoted_string
@@ -267,7 +292,7 @@ language = token
 attr_char = Any(attr_chars)
 hexdig = Any(hexdigits)
-pct_encoded = '%' + hexdig + hexdig >> unquote
+pct_encoded = '%' + hexdig + hexdig
 value_chars = (pct_encoded | attr_char)[...]
 ext_value = (
     charset & Drop("'") & Optional(language) & Drop("'")
@@ -312,6 +337,15 @@ def is_ascii(text):
     return all(ord(ch) < 128 for ch in text)
 
+def fits_inside_codec(text, codec):
+    try:
+        text.encode(codec)
+    except UnicodeEncodeError:
+        return False
+    else:
+        return True
+
 def is_lws_safe(text):
     return ' '.join(text.split()) == text
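
fits_inside_codec simply reports whether a text value can be represented in the given codec; a couple of illustrative calls (not from the source):

    from rfc6266 import fits_inside_codec

    fits_inside_codec(u'file.txt', 'ascii')          # True
    fits_inside_codec(u'sol\xe9.pdf', 'ascii')       # False: é is outside ascii
    fits_inside_codec(u'sol\xe9.pdf', 'iso-8859-1')  # True
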
@@ -371,8 +405,8 @@ def build_header(
         # alnum are already considered always-safe, but the rest isn't.
         # Python encodes ~ when it shouldn't, for example.
-        rv += "; filename*=utf-8''%s" % (quote(
-            filename.encode('utf-8'), safe=attr_chars_nonalnum), )
+        rv += "; filename*=utf-8''%s" % (percent_encode(
+            filename, safe=attr_chars_nonalnum, encoding='utf-8'), )
     # This will only encode filename_compat, if it used non-ascii iso-8859-1.
     return rv.encode('iso-8859-1')
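
A sketch of the filename* parameter the changed build_header code produces for a non-ascii name; safe='' is used here instead of the module's attr_chars_nonalnum, which is defined outside this diff:

    from rfc6266 import percent_encode

    u"; filename*=utf-8''%s" % percent_encode(u'\xa3 rates', safe='', encoding='utf-8')
    # -> u"; filename*=utf-8''%C2%A3%20rates"
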
...
-from distutils.core import setup
+from setuptools import setup
 
 setup(
     name='rfc6266',
     version='0.0.1', # symver
     py_modules=['rfc6266'],
     install_requires=['LEPL'],
+    use_2to3=True,
 )
+[tox]
+envlist=py27,py26,py32
+
+[testenv]
+deps=nose
+# changedir is a hack to prevent nose from finding the non-2to3 source;
+# now nose will use import, which has the converted modules in its path
+changedir=.tox
+commands=nosetests --detailed-errors rfc6266