diff rhodecode/lib/indexers/daemon.py @ 560:3072935bdeed

Rewrote whoosh indexing to use the repository's internal walk() instead of walking the filesystem. Disabled the default hg update hook (no longer needed, since whoosh indexing does not depend on working-copy files on the filesystem).
author Marcin Kuzminski <marcin@python-works.com>
date Sat, 09 Oct 2010 00:22:19 +0200
parents 29ec9ddbe258
children 5f3b967d9d10
line wrap: on
line diff
--- a/rhodecode/lib/indexers/daemon.py	Thu Oct 07 22:01:51 2010 +0200
+++ b/rhodecode/lib/indexers/daemon.py	Sat Oct 09 00:22:19 2010 +0200
@@ -39,6 +39,9 @@
 from shutil import rmtree
 from rhodecode.lib.indexers import INDEX_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME
 
+from time import mktime
+from vcs.backends import hg
+
 import logging
 
 log = logging.getLogger('whooshIndexer')
@@ -62,7 +65,9 @@
     return HgModel.repo_scan('/', root_location, None, True)
 
 class WhooshIndexingDaemon(object):
-    """Deamon for atomic jobs"""
+    """
+    Deamon for atomic jobs
+    """
 
     def __init__(self, indexname='HG_INDEX', repo_location=None):
         self.indexname = indexname
@@ -73,55 +78,49 @@
             log.info('Cannot run incremental index since it does not'
                      ' yet exist running full build')
             self.initial = True
-    
+        
     def get_paths(self, root_dir):
-        """recursive walk in root dir and return a set of all path in that dir
-        excluding files in .hg dir"""
+        """
+        recursive walk in root dir and return a set of all path in that dir
+        based on repository walk function
+        """
+        repo = hg.MercurialRepository(root_dir)
         index_paths_ = set()
-        for path, dirs, files in os.walk(root_dir):
-            if path.find('.hg') == -1:
+        for topnode, dirs, files in repo.walk('/', 'tip'):
+            for f in files:
+                index_paths_.add(jn(root_dir, f.path))
+            for dir in dirs:
                 for f in files:
-                    index_paths_.add(jn(path, f))
-    
-        return index_paths_
-    
+                    index_paths_.add(jn(root_dir, f.path))
+            
+        return index_paths_        
+
+
     def add_doc(self, writer, path, repo):
         """Adding doc to writer"""
-        
-        ext = unicode(path.split('/')[-1].split('.')[-1].lower())
-        #we just index the content of choosen files
-        if ext in INDEX_EXTENSIONS:
+        n_path = path[len(repo.path) + 1:]
+        node = repo.get_changeset().get_node(n_path)
+
+        #we just index the content of chosen files
+        if node.extension in INDEX_EXTENSIONS:
             log.debug('    >> %s [WITH CONTENT]' % path)
-            fobj = open(path, 'rb')
-            content = fobj.read()
-            fobj.close()
-            u_content = safe_unicode(content)
+            u_content = node.content
         else:
             log.debug('    >> %s' % path)
             #just index file name without it's content
             u_content = u''
         
-        
-        
-        try:
-            os.stat(path)
-            writer.add_document(owner=unicode(repo.contact),
-                            repository=safe_unicode(repo.name),
-                            path=safe_unicode(path),
-                            content=u_content,
-                            modtime=os.path.getmtime(path),
-                            extension=ext)             
-        except OSError, e:
-            import errno
-            if e.errno == errno.ENOENT:
-                log.debug('path %s does not exist or is a broken symlink' % path)
-            else:
-                raise e                 
+        writer.add_document(owner=unicode(repo.contact),
+                        repository=safe_unicode(repo.name),
+                        path=safe_unicode(path),
+                        content=u_content,
+                        modtime=mktime(node.last_changeset.date.timetuple()),
+                        extension=node.extension)             
 
     
     def build_index(self):
         if os.path.exists(IDX_LOCATION):
-            log.debug('removing previos index')
+            log.debug('removing previous index')
             rmtree(IDX_LOCATION)
             
         if not os.path.exists(IDX_LOCATION):