changeset 8062:9203621cae03

vcs: always return bytes from node.content. We would rather have the unicode conversions explicit. Note: Py3 bytes doesn't have .startswith - replace that with a regexp.
author Mads Kiilerich <mads@kiilerich.com>
date Sat, 28 Dec 2019 01:08:48 +0100
parents 7c43e15fb8bc
children 9bc709aa0614
files kallithea/controllers/admin/gists.py kallithea/controllers/compare.py kallithea/controllers/feed.py kallithea/controllers/files.py kallithea/controllers/pullrequests.py kallithea/controllers/summary.py kallithea/lib/annotate.py kallithea/lib/diffs.py kallithea/lib/helpers.py kallithea/lib/indexers/daemon.py kallithea/lib/vcs/backends/git/inmemory.py kallithea/lib/vcs/backends/hg/inmemory.py kallithea/lib/vcs/nodes.py kallithea/lib/vcs/utils/annotate.py kallithea/templates/admin/gists/edit.html kallithea/templates/files/files_edit.html kallithea/tests/vcs/test_git.py kallithea/tests/vcs/test_hg.py
diffstat 18 files changed, 43 insertions(+), 51 deletions(-) [+]
line wrap: on
line diff
--- a/kallithea/controllers/admin/gists.py	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/controllers/admin/gists.py	Sat Dec 28 01:08:48 2019 +0100
@@ -182,7 +182,7 @@
             log.error(traceback.format_exc())
             raise HTTPNotFound()
         if format == 'raw':
-            content = '\n\n'.join([f.content for f in c.files if (f_path is None or safe_unicode(f.path) == f_path)])
+            content = '\n\n'.join([safe_unicode(f.content) for f in c.files if (f_path is None or safe_unicode(f.path) == f_path)])
             response.content_type = 'text/plain'
             return content
         return render('admin/gists/show.html')
--- a/kallithea/controllers/compare.py	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/controllers/compare.py	Sat Dec 28 01:08:48 2019 +0100
@@ -272,7 +272,7 @@
                                       ignore_whitespace=ignore_whitespace,
                                       context=line_context)
 
-        diff_processor = diffs.DiffProcessor(raw_diff or '', diff_limit=diff_limit)
+        diff_processor = diffs.DiffProcessor(raw_diff, diff_limit=diff_limit)
         c.limited_diff = diff_processor.limited_diff
         c.file_diff_data = []
         c.lines_added = 0
--- a/kallithea/controllers/feed.py	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/controllers/feed.py	Sat Dec 28 01:08:48 2019 +0100
@@ -94,7 +94,7 @@
         desc_msg.extend(changes)
         if str2bool(CONFIG.get('rss_include_diff', False)):
             desc_msg.append('\n\n')
-            desc_msg.append(raw_diff)
+            desc_msg.append(safe_unicode(raw_diff))
         desc_msg.append('</pre>')
         return [safe_unicode(chunk) for chunk in desc_msg]
 
--- a/kallithea/controllers/files.py	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/controllers/files.py	Sat Dec 28 01:08:48 2019 +0100
@@ -46,7 +46,7 @@
 from kallithea.lib.base import BaseRepoController, jsonify, render
 from kallithea.lib.exceptions import NonRelativePathError
 from kallithea.lib.utils import action_logger
-from kallithea.lib.utils2 import convert_line_endings, detect_mode, safe_int, safe_str, str2bool
+from kallithea.lib.utils2 import convert_line_endings, detect_mode, safe_int, safe_str, safe_unicode, str2bool
 from kallithea.lib.vcs.backends.base import EmptyChangeset
 from kallithea.lib.vcs.conf import settings
 from kallithea.lib.vcs.exceptions import (
@@ -365,8 +365,7 @@
         c.f_path = f_path
 
         if r_post:
-
-            old_content = c.file.content
+            old_content = safe_unicode(c.file.content)
             sl = old_content.splitlines(1)
             first_line = sl[0] if sl else ''
             # modes:  0 - Unix, 1 - Mac, 2 - DOS
--- a/kallithea/controllers/pullrequests.py	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/controllers/pullrequests.py	Sat Dec 28 01:08:48 2019 +0100
@@ -591,7 +591,7 @@
                                       ignore_whitespace=ignore_whitespace, context=line_context)
         except ChangesetDoesNotExistError:
             raw_diff = _("The diff can't be shown - the PR revisions could not be found.")
-        diff_processor = diffs.DiffProcessor(raw_diff or '', diff_limit=diff_limit)
+        diff_processor = diffs.DiffProcessor(raw_diff, diff_limit=diff_limit)
         c.limited_diff = diff_processor.limited_diff
         c.file_diff_data = []
         c.lines_added = 0
--- a/kallithea/controllers/summary.py	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/controllers/summary.py	Sat Dec 28 01:08:48 2019 +0100
@@ -46,7 +46,7 @@
 from kallithea.lib.compat import json
 from kallithea.lib.markup_renderer import MarkupRenderer
 from kallithea.lib.page import Page
-from kallithea.lib.utils2 import safe_int
+from kallithea.lib.utils2 import safe_int, safe_unicode
 from kallithea.lib.vcs.backends.base import EmptyChangeset
 from kallithea.lib.vcs.exceptions import ChangesetError, EmptyRepositoryError, NodeDoesNotExistError
 from kallithea.lib.vcs.nodes import FileNode
@@ -84,7 +84,7 @@
                         readme_file = f
                         log.debug('Found README file `%s` rendering...',
                                   readme_file)
-                        readme_data = renderer.render(readme.content,
+                        readme_data = renderer.render(safe_unicode(readme.content),
                                                       filename=f)
                         break
                     except NodeDoesNotExistError:
--- a/kallithea/lib/annotate.py	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/lib/annotate.py	Sat Dec 28 01:08:48 2019 +0100
@@ -30,6 +30,7 @@
 
 from kallithea.lib.vcs.exceptions import VCSError
 from kallithea.lib.vcs.nodes import FileNode
+from kallithea.lib.vcs.utils import safe_unicode
 
 
 def annotate_highlight(filenode, annotate_from_changeset_func=None,
@@ -53,7 +54,7 @@
         headers=headers,
         annotate_from_changeset_func=annotate_from_changeset_func, **options)
     lexer = get_custom_lexer(filenode.extension) or filenode.lexer
-    highlighted = highlight(filenode.content, lexer, formatter)
+    highlighted = highlight(safe_unicode(filenode.content), lexer, formatter)
     return highlighted
 
 
--- a/kallithea/lib/diffs.py	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/lib/diffs.py	Sat Dec 28 01:08:48 2019 +0100
@@ -289,8 +289,8 @@
             based on that parameter cut off will be triggered, set to None
             to show full diff
         """
-        if not isinstance(diff, basestring):
-            raise Exception('Diff must be a basestring got %s instead' % type(diff))
+        if not isinstance(diff, bytes):
+            raise Exception('Diff must be bytes - got %s' % type(diff))
 
         self._diff = diff
         self.adds = 0
@@ -516,6 +516,9 @@
 """, re.VERBOSE | re.MULTILINE)
 
 
+_header_next_check = re.compile(br'''(?!@)(?!literal )(?!delta )''')
+
+
 def _get_header(vcs, diff_chunk):
     """
     Parses a Git diff for a single file (header and chunks) and returns a tuple with:
@@ -537,7 +540,7 @@
         raise Exception('diff not recognized as valid %s diff' % vcs)
     meta_info = match.groupdict()
     rest = diff_chunk[match.end():]
-    if rest and not rest.startswith('@') and not rest.startswith('literal ') and not rest.startswith('delta '):
+    if rest and _header_next_check.match(rest):
         raise Exception('cannot parse %s diff header: %r followed by %r' % (vcs, diff_chunk[:match.end()], rest[:1000]))
     diff_lines = (_escaper(m.group(0)) for m in re.finditer(r'.*\n|.+$', rest)) # don't split on \r as str.splitlines do
     return meta_info, diff_lines
--- a/kallithea/lib/helpers.py	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/lib/helpers.py	Sat Dec 28 01:08:48 2019 +0100
@@ -330,7 +330,7 @@
     """
     lexer = get_custom_lexer(filenode.extension) or filenode.lexer
     return literal(markup_whitespace(
-        code_highlight(filenode.content, lexer, CodeHtmlFormatter(**kwargs))))
+        code_highlight(safe_unicode(filenode.content), lexer, CodeHtmlFormatter(**kwargs))))
 
 
 def pygmentize_annotation(repo_name, filenode, **kwargs):
--- a/kallithea/lib/indexers/daemon.py	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/lib/indexers/daemon.py	Sat Dec 28 01:08:48 2019 +0100
@@ -182,12 +182,13 @@
 
         indexed = indexed_w_content = 0
         if self.is_indexable_node(node):
-            u_content = node.content
-            if not isinstance(u_content, unicode):
+            bytes_content = node.content
+            if b'\0' in bytes_content:
                 log.warning('    >> %s - no text content', path)
                 u_content = u''
             else:
                 log.debug('    >> %s', path)
+                u_content = safe_unicode(bytes_content)
                 indexed_w_content += 1
 
         else:
--- a/kallithea/lib/vcs/backends/git/inmemory.py	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/lib/vcs/backends/git/inmemory.py	Sat Dec 28 01:08:48 2019 +0100
@@ -68,11 +68,7 @@
             # for dirnames (in reverse order) [this only applies for nodes from added]
             new_trees = []
 
-            if not node.is_binary:
-                content = node.content.encode(ENCODING)
-            else:
-                content = node.content
-            blob = objects.Blob.from_string(content)
+            blob = objects.Blob.from_string(node.content)
 
             node_path = safe_bytes(node.name)
             if dirnames:
--- a/kallithea/lib/vcs/backends/hg/inmemory.py	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/lib/vcs/backends/hg/inmemory.py	Sat Dec 28 01:08:48 2019 +0100
@@ -52,8 +52,7 @@
             for node in self.added:
                 if node.path == path:
                     return memfilectx(_repo, memctx, path=node.path,
-                        data=(node.content.encode('utf-8')
-                              if not node.is_binary else node.content),
+                        data=node.content,
                         islink=False,
                         isexec=node.is_executable,
                         copysource=False)
@@ -62,8 +61,7 @@
             for node in self.changed:
                 if node.path == path:
                     return memfilectx(_repo, memctx, path=node.path,
-                        data=(node.content.encode('utf-8')
-                              if not node.is_binary else node.content),
+                        data=node.content,
                         islink=False,
                         isexec=node.is_executable,
                         copysource=False)
--- a/kallithea/lib/vcs/nodes.py	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/lib/vcs/nodes.py	Sat Dec 28 01:08:48 2019 +0100
@@ -16,7 +16,7 @@
 
 from kallithea.lib.vcs.backends.base import EmptyChangeset
 from kallithea.lib.vcs.exceptions import NodeError, RemovedFileNodeError
-from kallithea.lib.vcs.utils import safe_str, safe_unicode
+from kallithea.lib.vcs.utils import safe_bytes, safe_str, safe_unicode
 from kallithea.lib.vcs.utils.lazy import LazyProperty
 
 
@@ -263,6 +263,10 @@
             raise NodeError("Cannot use both content and changeset")
         super(FileNode, self).__init__(path, kind=NodeKind.FILE)
         self.changeset = changeset
+        if not isinstance(content, bytes) and content is not None:
+            # File content is one thing that inherently must be bytes ... but
+            # VCS module tries to be "user friendly" and support unicode ...
+            content = safe_bytes(content)
         self._content = content
         self._mode = mode or 0o100644
 
@@ -278,25 +282,17 @@
             mode = self._mode
         return mode
 
-    def _get_content(self):
+    @property
+    def content(self):
+        """
+        Returns lazily byte content of the FileNode.
+        """
         if self.changeset:
             content = self.changeset.get_file_content(self.path)
         else:
             content = self._content
         return content
 
-    @property
-    def content(self):
-        """
-        Returns lazily content of the FileNode. If possible, would try to
-        decode content from UTF-8.
-        """
-        content = self._get_content()
-
-        if bool(content and '\0' in content):
-            return content
-        return safe_unicode(content)
-
     @LazyProperty
     def size(self):
         if self.changeset:
@@ -366,7 +362,7 @@
         """
         from pygments import lexers
         try:
-            lexer = lexers.guess_lexer_for_filename(self.name, self.content, stripnl=False)
+            lexer = lexers.guess_lexer_for_filename(self.name, safe_unicode(self.content), stripnl=False)
         except lexers.ClassNotFound:
             lexer = lexers.TextLexer(stripnl=False)
         # returns first alias
@@ -414,8 +410,7 @@
         """
         Returns True if file has binary content.
         """
-        _bin = '\0' in self._get_content()
-        return _bin
+        return b'\0' in self.content
 
     def is_browser_compatible_image(self):
         return self.mimetype in [
--- a/kallithea/lib/vcs/utils/annotate.py	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/lib/vcs/utils/annotate.py	Sat Dec 28 01:08:48 2019 +0100
@@ -3,6 +3,7 @@
 
 from kallithea.lib.vcs.exceptions import VCSError
 from kallithea.lib.vcs.nodes import FileNode
+from kallithea.lib.vcs.utils import safe_unicode
 
 
 def annotate_highlight(filenode, annotate_from_changeset_func=None,
@@ -24,9 +25,7 @@
     formatter = AnnotateHtmlFormatter(filenode=filenode, order=order,
         headers=headers,
         annotate_from_changeset_func=annotate_from_changeset_func, **options)
-    lexer = filenode.lexer
-    highlighted = highlight(filenode.content, lexer, formatter)
-    return highlighted
+    return highlight(safe_unicode(filenode.content), filenode.lexer, formatter)
 
 
 class AnnotateHtmlFormatter(HtmlFormatter):
--- a/kallithea/templates/admin/gists/edit.html	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/templates/admin/gists/edit.html	Sat Dec 28 01:08:48 2019 +0100
@@ -73,7 +73,7 @@
                     </div>
                     <div class="panel-body no-padding">
                         <div id="editor_container">
-                            <textarea id="editor_${h.FID('f',file.path)}" name="contents" style="display:none">${file.content}</textarea>
+                            <textarea id="editor_${h.FID('f',file.path)}" name="contents" style="display:none">${safe_unicode(file.content)}</textarea>
                         </div>
                     </div>
                 </div>
--- a/kallithea/templates/files/files_edit.html	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/templates/files/files_edit.html	Sat Dec 28 01:08:48 2019 +0100
@@ -59,7 +59,7 @@
                     </span>
               </div>
               <div class="panel-body no-padding">
-                <textarea id="editor" name="content" style="display:none">${h.escape(c.file.content)|n}</textarea>
+                <textarea id="editor" name="content" style="display:none">${h.escape(safe_unicode(c.file.content))|n}</textarea>
               </div>
             </div>
             <div>
--- a/kallithea/tests/vcs/test_git.py	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/tests/vcs/test_git.py	Sat Dec 28 01:08:48 2019 +0100
@@ -596,11 +596,11 @@
         for cs in self.repo:
             assert isinstance(cs.author, unicode)
 
-    def test_repo_files_content_is_unicode(self):
+    def test_repo_files_content_is_bytes(self):
         changeset = self.repo.get_changeset()
         for node in changeset.get_node('/'):
             if node.is_file():
-                assert isinstance(node.content, unicode)
+                assert isinstance(node.content, bytes)
 
     def test_wrong_path(self):
         # There is 'setup.py' in the root dir but not there:
--- a/kallithea/tests/vcs/test_hg.py	Fri Dec 27 00:26:14 2019 +0100
+++ b/kallithea/tests/vcs/test_hg.py	Sat Dec 28 01:08:48 2019 +0100
@@ -544,11 +544,11 @@
         for cm in self.repo:
             assert isinstance(cm.author, unicode)
 
-    def test_repo_files_content_is_unicode(self):
+    def test_repo_files_content_is_bytes(self):
         test_changeset = self.repo.get_changeset(100)
         for node in test_changeset.get_node('/'):
             if node.is_file():
-                assert isinstance(node.content, unicode)
+                assert isinstance(node.content, bytes)
 
     def test_wrong_path(self):
         # There is 'setup.py' in the root dir but not there: