changeset 8202:6c381371d106

py3: fix non-ASCII URLs - decode unicode correctly before passing them to controllers as unicode strings. This is needed for supporting localized repo path names in the path of URLs. Some references: https://www.python.org/dev/peps/pep-0333/#unicode-issues , https://bugs.python.org/issue16679 , http://lucumr.pocoo.org/2010/5/25/wsgi-on-python-3/ , https://bugs.launchpad.net/pecan/+bug/1451842 , https://github.com/tipabu/eventlet/commit/a5a7751b013fe99b6d30acbca79e819770e9ae5d
author Mads Kiilerich <mads@kiilerich.com>
date Mon, 23 Dec 2019 00:56:45 +0100
parents 620c13a373c5
children c146a2ab50a8
files kallithea/config/routing.py kallithea/lib/base.py kallithea/lib/middleware/permanent_repo_url.py
diffstat 3 files changed, 30 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/kallithea/config/routing.py	Thu Dec 26 15:17:51 2019 +0100
+++ b/kallithea/config/routing.py	Mon Dec 23 00:56:45 2019 +0100
@@ -19,14 +19,34 @@
 refer to the routes manual at http://routes.groovie.org/docs/
 """
 
-from routes import Mapper
+import routes
 from tg import request
 
+from kallithea.lib.utils2 import safe_str
+
 
 # prefix for non repository related links needs to be prefixed with `/`
 ADMIN_PREFIX = '/_admin'
 
 
+class Mapper(routes.Mapper):
+    """
+    Subclassed Mapper with routematch patched to decode "unicode" str url to
+    *real* unicode str before applying matches and invoking controller methods.
+    """
+
+    def routematch(self, url=None, environ=None):
+        """
+        routematch that also decodes url from "fake bytes" to real unicode
+        string before matching and invoking controllers.
+        """
+        # Process url like get_path_info does ... but PATH_INFO has already
+        # been retrieved from environ and passed in, so let's just use that
+        # instead.
+        url = safe_str(url.encode('latin1'))
+        return super().routematch(url=url, environ=environ)
+
+
 def make_map(config):
     """Create, configure and return the routes Mapper"""
     rmap = Mapper(directory=config['paths']['controllers'],
--- a/kallithea/lib/base.py	Thu Dec 26 15:17:51 2019 +0100
+++ b/kallithea/lib/base.py	Mon Dec 23 00:56:45 2019 +0100
@@ -97,12 +97,17 @@
 
 
 def get_path_info(environ):
-    """Return unicode PATH_INFO from environ ... using tg.original_request if available.
+    """Return PATH_INFO from environ ... using tg.original_request if available.
+
+    In Python 3 WSGI, PATH_INFO is a unicode str, but kind of contains encoded
+    bytes. The code points are guaranteed to only use the lower 8 bits, and
+    encoding the string with the 1:1 encoding latin1 will give the
+    corresponding byte string ... which then can be decoded to proper unicode.
     """
     org_req = environ.get('tg.original_request')
     if org_req is not None:
         environ = org_req.environ
-    return safe_str(environ['PATH_INFO'])
+    return safe_str(environ['PATH_INFO'].encode('latin1'))
 
 
 def log_in_user(user, remember, is_external_auth, ip_addr):
--- a/kallithea/lib/middleware/permanent_repo_url.py	Thu Dec 26 15:17:51 2019 +0100
+++ b/kallithea/lib/middleware/permanent_repo_url.py	Mon Dec 23 00:56:45 2019 +0100
@@ -33,9 +33,9 @@
     def __call__(self, environ, start_response):
         # Extract path_info as get_path_info does, but do it explicitly because
         # we also have to do the reverse operation when patching it back in
-        path_info = safe_str(environ['PATH_INFO'])
+        path_info = safe_str(environ['PATH_INFO'].encode('latin1'))
         if path_info.startswith('/'): # it must
             path_info = '/' + fix_repo_id_name(path_info[1:])
-            environ['PATH_INFO'] = safe_bytes(path_info)
+            environ['PATH_INFO'] = safe_bytes(path_info).decode('latin1')
 
         return self.application(environ, start_response)