Mercurial > kallithea
changeset 8040:0f69b5c35b2b
lib: introduce string conversion functions for ASCII without further encoding concerns
Avoid the trial-and-error and vagueness of the "safe" functions.
This should replace some use of safe_unicode and safe_str. It will mostly be a
noop in py2 but will be crucial in py3.
author | Mads Kiilerich <mads@kiilerich.com> |
---|---|
date | Fri, 27 Dec 2019 23:30:56 +0100 |
parents | 4e565c5d7b7d |
children | 5f3101d54c32 |
files | kallithea/lib/utils2.py kallithea/lib/vcs/utils/__init__.py |
diffstat | 2 files changed, 57 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
--- a/kallithea/lib/utils2.py Sun Dec 15 20:00:38 2019 +0100 +++ b/kallithea/lib/utils2.py Fri Dec 27 23:30:56 2019 +0100 @@ -43,7 +43,7 @@ from webhelpers2.text import collapse, remove_formatting, strip_tags from kallithea.lib.compat import json -from kallithea.lib.vcs.utils import safe_bytes, safe_str, safe_unicode # re-export +from kallithea.lib.vcs.utils import ascii_bytes, ascii_str, safe_bytes, safe_str, safe_unicode # re-export from kallithea.lib.vcs.utils.lazy import LazyProperty
--- a/kallithea/lib/vcs/utils/__init__.py Sun Dec 15 20:00:38 2019 +0100 +++ b/kallithea/lib/vcs/utils/__init__.py Fri Dec 27 23:30:56 2019 +0100 @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- + """ This module provides some useful tools for ``vcs`` like annotate/diff html output. It also includes some internal helpers. @@ -121,6 +123,60 @@ safe_str = safe_bytes # safe_str is deprecated - it will be redefined when changing to py3 +def ascii_bytes(s): + """ + Simple conversion from unicode/str to bytes, *assuming* all codepoints are + 7-bit and it thus is pure ASCII. + Will fail badly with UnicodeError on invalid input. + This should be used where encoding and "safe" ambiguity should be avoided. + Where strings already have been encoded in other ways but still are unicode + string - for example to hex, base64, json, urlencoding, or are known to be + identifiers. + + >>> ascii_bytes('a') + 'a' + >>> ascii_bytes(u'a') + 'a' + >>> ascii_bytes('å') + Traceback (most recent call last): + UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128) + >>> ascii_bytes(u'å') + Traceback (most recent call last): + UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-1: ordinal not in range(128) + """ + assert isinstance(s, (unicode, str)), repr(s) + return s.encode('ascii') + + +def ascii_str(s): + r""" + Simple conversion from bytes to str, *assuming* all codepoints are + 7-bit and it thus is pure ASCII. + Will fail badly with UnicodeError on invalid input. + This should be used where encoding and "safe" ambiguity should be avoided. + Where strings are encoded but also in other ways are known to be ASCII, and + where a unicode string is wanted without caring about encoding. For example + to hex, base64, urlencoding, or are known to be identifiers. 
+ + >>> ascii_str('a') + 'a' + >>> ascii_str(u'a') + Traceback (most recent call last): + AssertionError: u'a' + >>> ascii_str('å') + Traceback (most recent call last): + UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128) + >>> ascii_str(u'å') + Traceback (most recent call last): + AssertionError: u'\xc3\xa5' + """ + assert isinstance(s, bytes), repr(s) + # Note: we use "encode", even though we really *should* use "decode". But + # we are in py2 and don't want unicode, and encode is doing what we need for the + # ascii subset. + return s.encode('ascii') + + # Regex taken from http://www.regular-expressions.info/email.html email_re = re.compile( r"""[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@"""