Mercurial > kallithea
diff rhodecode/lib/indexers/daemon.py @ 560:3072935bdeed
Rewrote whoosh indexing to run the internal repository.walk() instead of walking the filesystem.
Disabled the default hg update hook (not needed, since whoosh does not depend on files on the filesystem for indexing).
author | Marcin Kuzminski <marcin@python-works.com> |
---|---|
date | Sat, 09 Oct 2010 00:22:19 +0200 |
parents | 29ec9ddbe258 |
children | 5f3b967d9d10 |
line wrap: on
line diff
--- a/rhodecode/lib/indexers/daemon.py Thu Oct 07 22:01:51 2010 +0200 +++ b/rhodecode/lib/indexers/daemon.py Sat Oct 09 00:22:19 2010 +0200 @@ -39,6 +39,9 @@ from shutil import rmtree from rhodecode.lib.indexers import INDEX_EXTENSIONS, IDX_LOCATION, SCHEMA, IDX_NAME +from time import mktime +from vcs.backends import hg + import logging log = logging.getLogger('whooshIndexer') @@ -62,7 +65,9 @@ return HgModel.repo_scan('/', root_location, None, True) class WhooshIndexingDaemon(object): - """Deamon for atomic jobs""" + """ + Deamon for atomic jobs + """ def __init__(self, indexname='HG_INDEX', repo_location=None): self.indexname = indexname @@ -73,55 +78,49 @@ log.info('Cannot run incremental index since it does not' ' yet exist running full build') self.initial = True - + def get_paths(self, root_dir): - """recursive walk in root dir and return a set of all path in that dir - excluding files in .hg dir""" + """ + recursive walk in root dir and return a set of all path in that dir + based on repository walk function + """ + repo = hg.MercurialRepository(root_dir) index_paths_ = set() - for path, dirs, files in os.walk(root_dir): - if path.find('.hg') == -1: + for topnode, dirs, files in repo.walk('/', 'tip'): + for f in files: + index_paths_.add(jn(root_dir, f.path)) + for dir in dirs: for f in files: - index_paths_.add(jn(path, f)) - - return index_paths_ - + index_paths_.add(jn(root_dir, f.path)) + + return index_paths_ + + def add_doc(self, writer, path, repo): """Adding doc to writer""" - - ext = unicode(path.split('/')[-1].split('.')[-1].lower()) - #we just index the content of choosen files - if ext in INDEX_EXTENSIONS: + n_path = path[len(repo.path) + 1:] + node = repo.get_changeset().get_node(n_path) + + #we just index the content of chosen files + if node.extension in INDEX_EXTENSIONS: log.debug(' >> %s [WITH CONTENT]' % path) - fobj = open(path, 'rb') - content = fobj.read() - fobj.close() - u_content = safe_unicode(content) + u_content = node.content 
else: log.debug(' >> %s' % path) #just index file name without it's content u_content = u'' - - - try: - os.stat(path) - writer.add_document(owner=unicode(repo.contact), - repository=safe_unicode(repo.name), - path=safe_unicode(path), - content=u_content, - modtime=os.path.getmtime(path), - extension=ext) - except OSError, e: - import errno - if e.errno == errno.ENOENT: - log.debug('path %s does not exist or is a broken symlink' % path) - else: - raise e + writer.add_document(owner=unicode(repo.contact), + repository=safe_unicode(repo.name), + path=safe_unicode(path), + content=u_content, + modtime=mktime(node.last_changeset.date.timetuple()), + extension=node.extension) def build_index(self): if os.path.exists(IDX_LOCATION): - log.debug('removing previos index') + log.debug('removing previous index') rmtree(IDX_LOCATION) if not os.path.exists(IDX_LOCATION):