changeset 6051:186bf5fee0a1

repo-scan: rewrite get_filesystem_repos to use os.walk instead of stupid recursion. I think this is more readable. It is also faster, perhaps because the more readable implementation makes it easier to optimize.
author Mads Kiilerich <madski@unity3d.com>
date Thu, 28 Jul 2016 16:28:34 +0200
parents 17f9f921a538
children 039a3f88518a
files kallithea/lib/paster_commands/repo_scan.py kallithea/lib/utils.py kallithea/model/scm.py
diffstat 3 files changed, 39 insertions(+), 28 deletions(-) [+]
line wrap: on
line diff
--- a/kallithea/lib/paster_commands/repo_scan.py	Thu Jul 28 16:28:34 2016 +0200
+++ b/kallithea/lib/paster_commands/repo_scan.py	Thu Jul 28 16:28:34 2016 +0200
@@ -58,7 +58,9 @@
                                         remove_obsolete=rm_obsolete)
         added = ', '.join(added) or '-'
         removed = ', '.join(removed) or '-'
-        print 'Scan completed added: %s removed: %s' % (added, removed)
+        print 'Scan completed.'
+        print 'Added: %s' % added
+        print 'Removed: %s' % removed
 
     def update_parser(self):
         self.parser.add_option(
--- a/kallithea/lib/utils.py	Thu Jul 28 16:28:34 2016 +0200
+++ b/kallithea/lib/utils.py	Thu Jul 28 16:28:34 2016 +0200
@@ -204,7 +204,7 @@
         sa.commit()
 
 
-def get_filesystem_repos(path, recursive=False, skip_removed_repos=True):
+def get_filesystem_repos(path):
     """
     Scans given path for repos and return (name,(type,path)) tuple
 
@@ -214,40 +214,49 @@
 
     # remove ending slash for better results
     path = safe_str(path.rstrip(os.sep))
-    log.debug('now scanning in %s location recursive:%s...', path, recursive)
+    log.debug('now scanning in %s', path)
+
+    def isdir(*n):
+        return os.path.isdir(os.path.join(*n))
 
-    def _get_repos(p):
-        if not os.access(p, os.R_OK) or not os.access(p, os.X_OK):
-            log.warning('ignoring repo path without access: %s', p)
-            return
-        if not os.access(p, os.W_OK):
-            log.warning('repo path without write access: %s', p)
-        for dirpath in os.listdir(p):
-            if os.path.isfile(os.path.join(p, dirpath)):
-                continue
-            cur_path = os.path.join(p, dirpath)
-
+    for root, dirs, _files in os.walk(path):
+        recurse_dirs = []
+        for subdir in dirs:
             # skip removed repos
-            if skip_removed_repos and REMOVED_REPO_PAT.match(dirpath):
+            if REMOVED_REPO_PAT.match(subdir):
                 continue
 
             #skip .<something> dirs TODO: rly? then we should prevent creating them ...
-            if dirpath.startswith('.'):
+            if subdir.startswith('.'):
                 continue
 
-            try:
-                scm_info = get_scm(cur_path)
-                yield scm_info[1].split(path, 1)[-1].lstrip(os.sep), scm_info
-            except VCSError:
-                if not recursive:
+            cur_path = os.path.join(root, subdir)
+            if (isdir(cur_path, '.hg') or
+                isdir(cur_path, '.git') or
+                isdir(cur_path, '.svn') or
+                isdir(cur_path, 'objects') and (isdir(cur_path, 'refs') or isfile(cur_path, 'packed-refs'))):
+
+                if not os.access(cur_path, os.R_OK) or not os.access(cur_path, os.X_OK):
+                    log.warning('ignoring repo path without access: %s', cur_path)
                     continue
-                #check if this dir contains other repos for recursive scan
-                rec_path = os.path.join(p, dirpath)
-                if not os.path.islink(rec_path) and os.path.isdir(rec_path):
-                    for inner_scm in _get_repos(rec_path):
-                        yield inner_scm
+
+                if not os.access(cur_path, os.W_OK):
+                    log.warning('repo path without write access: %s', cur_path)
 
-    return _get_repos(path)
+                try:
+                    scm_info = get_scm(cur_path)
+                    assert cur_path.startswith(path)
+                    repo_path = cur_path[len(path) + 1:]
+                    yield repo_path, scm_info
+                    continue # no recursion
+                except VCSError:
+                    # We should perhaps ignore such broken repos, but especially
+                    # the bare git detection is unreliable so we dive into it
+                    pass
+
+            recurse_dirs.append(subdir)
+
+        dirs[:] = recurse_dirs
 
 
 def is_valid_repo(repo_name, base_path, scm=None):
--- a/kallithea/model/scm.py	Thu Jul 28 16:28:34 2016 +0200
+++ b/kallithea/model/scm.py	Thu Jul 28 16:28:34 2016 +0200
@@ -191,7 +191,7 @@
         baseui = make_ui('db')
         repos = {}
 
-        for name, path in get_filesystem_repos(repos_path, recursive=True):
+        for name, path in get_filesystem_repos(repos_path):
             # name need to be decomposed and put back together using the /
             # since this is internal storage separator for kallithea
             name = Repository.normalize_repo_name(name)