changeset 6770:5cc6a3308a8f stable

repo-scan: rewrite get_filesystem_repos to use os.walk instead of stupid recursion. I think this is more readable. It is also faster, perhaps because the more readable implementation makes it easier to optimize.
author Mads Kiilerich <madski@unity3d.com>
date Tue, 18 Oct 2016 23:29:23 +0200
parents 1013af35fa60
children afda98017955
files kallithea/lib/paster_commands/repo_scan.py kallithea/lib/utils.py kallithea/model/scm.py
diffstat 3 files changed, 40 insertions(+), 28 deletions(-) [+]
line wrap: on
line diff
--- a/kallithea/lib/paster_commands/repo_scan.py	Fri Jun 10 01:19:58 2016 +0200
+++ b/kallithea/lib/paster_commands/repo_scan.py	Tue Oct 18 23:29:23 2016 +0200
@@ -61,7 +61,9 @@
                                         remove_obsolete=rm_obsolete)
         added = ', '.join(added) or '-'
         removed = ', '.join(removed) or '-'
-        print 'Scan completed added: %s removed: %s' % (added, removed)
+        print 'Scan completed.'
+        print 'Added: %s' % added
+        print 'Removed: %s' % removed
 
     def update_parser(self):
         self.parser.add_option(
--- a/kallithea/lib/utils.py	Fri Jun 10 01:19:58 2016 +0200
+++ b/kallithea/lib/utils.py	Tue Oct 18 23:29:23 2016 +0200
@@ -206,7 +206,7 @@
         sa.commit()
 
 
-def get_filesystem_repos(path, recursive=False, skip_removed_repos=True):
+def get_filesystem_repos(path):
     """
     Scans given path for repos and return (name,(type,path)) tuple
 
@@ -216,40 +216,50 @@
 
     # remove ending slash for better results
     path = safe_str(path.rstrip(os.sep))
-    log.debug('now scanning in %s location recursive:%s...', path, recursive)
+    log.debug('now scanning in %s', path)
+
+    def isdir(*n):
+        return os.path.isdir(os.path.join(*n))
 
-    def _get_repos(p):
-        if not os.access(p, os.R_OK) or not os.access(p, os.X_OK):
-            log.warning('ignoring repo path without access: %s', p)
-            return
-        if not os.access(p, os.W_OK):
-            log.warning('repo path without write access: %s', p)
-        for dirpath in os.listdir(p):
-            if os.path.isfile(os.path.join(p, dirpath)):
-                continue
-            cur_path = os.path.join(p, dirpath)
-
+    for root, dirs, _files in os.walk(path):
+        recurse_dirs = []
+        for subdir in dirs:
             # skip removed repos
-            if skip_removed_repos and REMOVED_REPO_PAT.match(dirpath):
+            if REMOVED_REPO_PAT.match(subdir):
                 continue
 
             #skip .<something> dirs TODO: rly? then we should prevent creating them ...
-            if dirpath.startswith('.'):
+            if subdir.startswith('.'):
                 continue
 
-            try:
-                scm_info = get_scm(cur_path)
-                yield scm_info[1].split(path, 1)[-1].lstrip(os.sep), scm_info
-            except VCSError:
-                if not recursive:
+            cur_path = os.path.join(root, subdir)
+            if (isdir(cur_path, '.hg') or
+                isdir(cur_path, '.git') or
+                isdir(cur_path, '.svn') or
+                isdir(cur_path, 'objects') and (isdir(cur_path, 'refs') or
+                                                os.path.isfile(os.path.join(cur_path, 'packed-refs')))):
+
+                if not os.access(cur_path, os.R_OK) or not os.access(cur_path, os.X_OK):
+                    log.warning('ignoring repo path without access: %s', cur_path)
                     continue
-                #check if this dir containts other repos for recursive scan
-                rec_path = os.path.join(p, dirpath)
-                if not os.path.islink(rec_path) and os.path.isdir(rec_path):
-                    for inner_scm in _get_repos(rec_path):
-                        yield inner_scm
+
+                if not os.access(cur_path, os.W_OK):
+                    log.warning('repo path without write access: %s', cur_path)
 
-    return _get_repos(path)
+                try:
+                    scm_info = get_scm(cur_path)
+                    assert cur_path.startswith(path)
+                    repo_path = cur_path[len(path) + 1:]
+                    yield repo_path, scm_info
+                    continue # no recursion
+                except VCSError:
+                    # We should perhaps ignore such broken repos, but especially
+                    # the bare git detection is unreliable so we dive into it
+                    pass
+
+            recurse_dirs.append(subdir)
+
+        dirs[:] = recurse_dirs
 
 
 def is_valid_repo(repo_name, base_path, scm=None):
--- a/kallithea/model/scm.py	Fri Jun 10 01:19:58 2016 +0200
+++ b/kallithea/model/scm.py	Tue Oct 18 23:29:23 2016 +0200
@@ -276,7 +276,7 @@
         baseui = make_ui('db')
         repos = {}
 
-        for name, path in get_filesystem_repos(repos_path, recursive=True):
+        for name, path in get_filesystem_repos(repos_path):
             # name need to be decomposed and put back together using the /
             # since this is internal storage separator for kallithea
             name = Repository.normalize_repo_name(name)