changeset 8769:d35d14b05b82

diff: handle some escaped characters in Git diffs. There are some odd characters (like \r and \n) that the Kallithea UI doesn't allow in filenames in repos. Kallithea (through the routes module) will fail to generate URLs when browsing Files. That is a known limitation with minimal real-world impact, non-trivial to work around or fix. There are very few relevant use cases for tracking files with odd filenames. \t is valid but is hard to render in a meaningful way in the UI. And ASCII characters like \ and " are not usable on Windows and should just be avoided. Kallithea would parse Git diffs with odd characters incorrectly or fail, even before hitting the known limitation. With this change, Kallithea will parse diffs with odd filenames correctly (and then hit the limitation). Git will quote odd filenames and escape the odd characters when emitting diffs. (Mercurial by design does not allow \r and \n, and Mercurial will thus never have to quote file names in diffs.) Quotes are already handled (and ignored). With this change, Kallithea will handle \ unescaping of \\ and \", the usual letters like \r and \n and \t, and octal numbers like \033 (for ESC). Filenames with \ and " will work perfectly (when not on Windows). Filenames with \t and ESC will work fine, but without helpful display in the UI. Filenames with \r and \n will still make the UI fail when trying to generate URLs. Thanks to stypr of Flatt Security for raising this.
author Mads Kiilerich <mads@kiilerich.com>
date Sat, 14 Nov 2020 15:41:39 +0100
parents fd61f678577f
children 0a84ef075575
files kallithea/lib/diffs.py kallithea/tests/models/test_diff_parsers.py
diffstat 2 files changed, 24 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/kallithea/lib/diffs.py	Sat Nov 14 15:20:40 2020 +0100
+++ b/kallithea/lib/diffs.py	Sat Nov 14 15:41:39 2020 +0100
@@ -549,6 +549,19 @@
 _header_next_check = re.compile(br'''(?!@)(?!literal )(?!delta )''')
 
 
+_git_bs_escape_re = re.compile(r'\\(?:([^0-9])|([0-9]{3}))')
+
+
+_git_bs_escape_dict = {'\\': '\\', '"': '"', 'r': '\r', 'n': '\n', 't': '\t'}
+
+
+def _git_bs_unescape_m(m):
+    c = m.group(1)
+    if c is not None:
+        return _git_bs_escape_dict.get(c) or ('\\' + c)
+    return chr(int(m.group(2), 8))
+
+
 def _get_header(vcs, diff_chunk):
     """
     Parses a Git diff for a single file (header and chunks) and returns a tuple with:
@@ -569,6 +582,11 @@
     if match is None:
         raise Exception('diff not recognized as valid %s diff: %r' % (vcs, safe_str(bytes(diff_chunk[:1000]))))
     meta_info = {k: None if v is None else safe_str(v) for k, v in match.groupdict().items()}
+    if vcs == 'git':
+        for k in ['a_path', 'b_path', 'a_file', 'b_file']:
+            v = meta_info.get(k)
+            if v:
+                meta_info[k] = _git_bs_escape_re.sub(_git_bs_unescape_m, v)
     rest = diff_chunk[match.end():]
     if rest:
         if _header_next_check.match(rest):
--- a/kallithea/tests/models/test_diff_parsers.py	Sat Nov 14 15:20:40 2020 +0100
+++ b/kallithea/tests/models/test_diff_parsers.py	Sat Nov 14 15:41:39 2020 +0100
@@ -269,7 +269,7 @@
           'ops': {RENAMED_FILENODE: 'file renamed from oh no to oh yes'}}),
     ],
     'git_diff_quoting.diff': [
-        (r'\"foo\"',  # TODO: quotes should not be escaped
+        ('"foo"',
          'added',
          {'added': 1,
           'binary': False,
@@ -281,19 +281,19 @@
           'binary': False,
           'deleted': 0,
           'ops': {1: 'new file 100644'}}),
-        ("'foo'" r'\"foo\"',  # TODO: quotes should not be escaped
+        ("'foo'" '"foo"',
          'added',
          {'added': 1,
           'binary': False,
           'deleted': 0,
           'ops': {1: 'new file 100644'}}),
-        (r'a\r\nb',  # TODO: escaped
+        ('a\r\nb',  # Note: will be parsed correctly, but other parts of Kallithea can't handle it
          'added',
          {'added': 1,
           'binary': False,
           'deleted': 0,
           'ops': {1: 'new file 100644'}}),
-        (r'foo\rfoo',  # TODO: escaped
+        ('foo\rfoo',  # Note: will be parsed correctly, but other parts of Kallithea can't handle it
          'added',
         {'added': 0,
          'binary': True,
@@ -311,13 +311,13 @@
           'binary': False,
           'deleted': 0,
           'ops': {1: 'new file 100644'}}),
-        (r'esc\033foo',  # TODO: escaped
+        ('esc\033foo',  # Note: will be parsed and handled correctly, but without good UI
          'added',
          {'added': 0,
           'binary': True,
           'deleted': 0,
           'ops': {1: 'new file 100644'}}),
-        (r'tab\tfoo',  # TODO: escaped
+        ('tab\tfoo',  # Note: will be parsed and handled correctly, but without good UI
          'added',
          {'added': 0,
           'binary': True,