changeset 8040:0f69b5c35b2b

lib: introduce string conversion functions for ASCII without further encoding concerns. Avoid the trial-and-error and vagueness of the "safe" functions. This should replace some use of safe_unicode and safe_str. It will mostly be a noop in py2 but will be crucial in py3.
author Mads Kiilerich <mads@kiilerich.com>
date Fri, 27 Dec 2019 23:30:56 +0100
parents 4e565c5d7b7d
children 5f3101d54c32
files kallithea/lib/utils2.py kallithea/lib/vcs/utils/__init__.py
diffstat 2 files changed, 57 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/kallithea/lib/utils2.py	Sun Dec 15 20:00:38 2019 +0100
+++ b/kallithea/lib/utils2.py	Fri Dec 27 23:30:56 2019 +0100
@@ -43,7 +43,7 @@
 from webhelpers2.text import collapse, remove_formatting, strip_tags
 
 from kallithea.lib.compat import json
-from kallithea.lib.vcs.utils import safe_bytes, safe_str, safe_unicode  # re-export
+from kallithea.lib.vcs.utils import ascii_bytes, ascii_str, safe_bytes, safe_str, safe_unicode  # re-export
 from kallithea.lib.vcs.utils.lazy import LazyProperty
 
 
--- a/kallithea/lib/vcs/utils/__init__.py	Sun Dec 15 20:00:38 2019 +0100
+++ b/kallithea/lib/vcs/utils/__init__.py	Fri Dec 27 23:30:56 2019 +0100
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 """
 This module provides some useful tools for ``vcs`` like annotate/diff html
 output. It also includes some internal helpers.
@@ -121,6 +123,60 @@
 safe_str = safe_bytes  # safe_str is deprecated - it will be redefined when changing to py3
 
 
+def ascii_bytes(s):
+    """
+    Simple conversion from unicode/str to bytes, *assuming* all codepoints are
+    7-bit and it thus is pure ASCII.
+    Will fail badly with UnicodeError on invalid input.
+    This should be used where encoding and "safe" ambiguity should be avoided:
+    where strings already have been encoded in other ways but still are unicode
+    strings - for example to hex, base64, json, urlencoding - or are known to
+    be identifiers.
+
+    >>> ascii_bytes('a')
+    'a'
+    >>> ascii_bytes(u'a')
+    'a'
+    >>> ascii_bytes('å')
+    Traceback (most recent call last):
+    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128)
+    >>> ascii_bytes(u'å')
+    Traceback (most recent call last):
+    UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-1: ordinal not in range(128)
+    """
+    assert isinstance(s, (unicode, str)), repr(s)
+    return s.encode('ascii')
+
+
+def ascii_str(s):
+    r"""
+    Simple conversion from bytes to str, *assuming* all codepoints are
+    7-bit and it thus is pure ASCII.
+    Will fail badly with UnicodeError on invalid input.
+    This should be used where encoding and "safe" ambiguity should be avoided:
+    where strings are encoded but also in other ways are known to be ASCII, and
+    where a unicode string is wanted without caring about encoding - for example
+    hex, base64, urlencoding, or strings that are known to be identifiers.
+
+    >>> ascii_str('a')
+    'a'
+    >>> ascii_str(u'a')
+    Traceback (most recent call last):
+    AssertionError: u'a'
+    >>> ascii_str('å')
+    Traceback (most recent call last):
+    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128)
+    >>> ascii_str(u'å')
+    Traceback (most recent call last):
+    AssertionError: u'\xc3\xa5'
+    """
+    assert isinstance(s, bytes), repr(s)
+    # Note: we use "encode", even though we really *should* use "decode". But
+    # we are in py2 where "decode" would give unicode, which we don't want, and
+    # "encode" is doing what we need for the ascii subset.
+    return s.encode('ascii')
+
+
 # Regex taken from http://www.regular-expressions.info/email.html
 email_re = re.compile(
     r"""[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@"""