Mercurial > kallithea
changeset 2640:5f21a9dcb09d beta
create an index for commit messages and the ability to search them and see results
author | Indra Talip <indra.talip@gmail.com> |
---|---|
date | Fri, 20 Jul 2012 12:50:56 +0200 |
parents | f597cfb492f9 |
children | cfcd981d6679 |
files | rhodecode/controllers/search.py rhodecode/lib/indexers/__init__.py rhodecode/lib/indexers/daemon.py rhodecode/templates/search/search.html rhodecode/templates/search/search_commit.html rhodecode/tests/functional/test_search.py |
diffstat | 6 files changed, 346 insertions(+), 116 deletions(-) [+] |
line wrap: on
line diff
--- a/rhodecode/controllers/search.py Wed Jul 18 22:07:46 2012 +0200 +++ b/rhodecode/controllers/search.py Fri Jul 20 12:50:56 2012 +0200 @@ -30,7 +30,7 @@ from rhodecode.lib.auth import LoginRequired from rhodecode.lib.base import BaseController, render -from rhodecode.lib.indexers import SCHEMA, IDX_NAME, WhooshResultWrapper +from rhodecode.lib.indexers import CHGSETS_SCHEMA, SCHEMA, CHGSET_IDX_NAME, IDX_NAME, WhooshResultWrapper from webhelpers.paginate import Page from webhelpers.util import update_params @@ -54,25 +54,41 @@ c.formated_results = [] c.runtime = '' c.cur_query = request.GET.get('q', None) - c.cur_type = request.GET.get('type', 'source') + c.cur_type = request.GET.get('type', 'content') c.cur_search = search_type = {'content': 'content', - 'commit': 'content', + 'commit': 'message', 'path': 'path', 'repository': 'repository'}\ .get(c.cur_type, 'content') + index_name = { + 'content': IDX_NAME, + 'commit': CHGSET_IDX_NAME, + 'path': IDX_NAME}\ + .get(c.cur_type, IDX_NAME) + + schema_defn = { + 'content': SCHEMA, + 'commit': CHGSETS_SCHEMA, + 'path': SCHEMA}\ + .get(c.cur_type, SCHEMA) + + log.debug('IDX: %s' % index_name) + log.debug('SCHEMA: %s' % schema_defn) + if c.cur_query: cur_query = c.cur_query.lower() + log.debug(cur_query) if c.cur_query: p = int(request.params.get('page', 1)) highlight_items = set() try: idx = open_dir(config['app_conf']['index_dir'], - indexname=IDX_NAME) + indexname=index_name) searcher = idx.searcher() - qp = QueryParser(search_type, schema=SCHEMA) + qp = QueryParser(search_type, schema=schema_defn) if c.repo_name: cur_query = u'repository:%s %s' % (c.repo_name, cur_query) try: @@ -84,13 +100,13 @@ highlight_items.add(query.text) else: for i in query.all_terms(): - if i[0] == 'content': + if i[0] in ['content', 'message']: highlight_items.add(i[1]) matcher = query.matcher(searcher) - log.debug(query) - log.debug(highlight_items) + log.debug('query: %s' % query) + log.debug('hl terms: %s' % highlight_items) results = searcher.search(query) res_ln = len(results) c.runtime = '%s results (%.3f seconds)' % ( @@ -99,7 +115,7 @@ def url_generator(**kw): return update_params("?q=%s&type=%s" \ - % (c.cur_query, c.cur_search), **kw) + % (c.cur_query, c.cur_type), **kw) repo_location = RepoModel().repos_path c.formated_results = Page( WhooshResultWrapper(search_type, searcher, matcher,
--- a/rhodecode/lib/indexers/__init__.py Wed Jul 18 22:07:46 2012 +0200 +++ b/rhodecode/lib/indexers/__init__.py Fri Jul 20 12:50:56 2012 +0200 @@ -35,7 +35,7 @@ from shutil import rmtree from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter -from whoosh.fields import TEXT, ID, STORED, Schema, FieldType +from whoosh.fields import TEXT, ID, STORED, NUMERIC, BOOLEAN, Schema, FieldType from whoosh.index import create_in, open_dir from whoosh.formats import Characters from whoosh.highlight import highlight, HtmlFormatter, ContextFragmenter @@ -51,10 +51,11 @@ from rhodecode.lib.utils import BasePasterCommand, Command, add_cache,\ load_rcextensions +log = logging.getLogger(__name__) + # CUSTOM ANALYZER wordsplit + lowercase filter ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter() - #INDEX SCHEMA DEFINITION SCHEMA = Schema( fileid=ID(unique=True), @@ -71,6 +72,22 @@ FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n') FRAGMENTER = ContextFragmenter(200) +CHGSETS_SCHEMA = Schema( + path=ID(unique=True, stored=True), + revision=NUMERIC(unique=True, stored=True), + last=BOOLEAN(), + owner=TEXT(), + repository=ID(unique=True, stored=True), + author=TEXT(stored=True), + message=FieldType(format=Characters(), analyzer=ANALYZER, + scorable=True, stored=True), + parents=TEXT(), + added=TEXT(), + removed=TEXT(), + changed=TEXT(), +) + +CHGSET_IDX_NAME = 'CHGSET_INDEX' class MakeIndex(BasePasterCommand): @@ -191,14 +208,20 @@ def get_full_content(self, docid): res = self.searcher.stored_fields(docid[0]) + log.debug('result: %s' % res) full_repo_path = jn(self.repo_location, res['repository']) f_path = res['path'].split(full_repo_path)[-1] f_path = f_path.lstrip(os.sep) + res.update({'f_path': f_path}) - content_short = self.get_short_content(res, docid[1]) - res.update({'content_short': content_short, - 'content_short_hl': self.highlight(content_short), - 'f_path': f_path}) + if self.search_type == 'content': + content_short = self.get_short_content(res, docid[1]) + res.update({'content_short': content_short, + 'content_short_hl': self.highlight(content_short)}) + elif self.search_type == 'message': + res.update({'message_hl': self.highlight(res['message'])}) + + log.debug('result: %s' % res) return res @@ -216,19 +239,20 @@ :param size: """ memory = [(0, 0)] - for span in self.matcher.spans(): - start = span.startchar or 0 - end = span.endchar or 0 - start_offseted = max(0, start - self.fragment_size) - end_offseted = end + self.fragment_size + if self.matcher.supports('positions'): + for span in self.matcher.spans(): + start = span.startchar or 0 + end = span.endchar or 0 + start_offseted = max(0, start - self.fragment_size) + end_offseted = end + self.fragment_size - if start_offseted < memory[-1][1]: - start_offseted = memory[-1][1] - memory.append((start_offseted, end_offseted,)) - yield (start_offseted, end_offseted,) + if start_offseted < memory[-1][1]: + start_offseted = memory[-1][1] + memory.append((start_offseted, end_offseted,)) + yield (start_offseted, end_offseted,) def highlight(self, content, top=5): - if self.search_type != 'content': + if self.search_type not in ['content', 'message']: return '' hl = highlight( text=content,
--- a/rhodecode/lib/indexers/daemon.py Wed Jul 18 22:07:46 2012 +0200 +++ b/rhodecode/lib/indexers/daemon.py Fri Jul 20 12:50:56 2012 +0200 @@ -41,12 +41,14 @@ from rhodecode.config.conf import INDEX_EXTENSIONS from rhodecode.model.scm import ScmModel from rhodecode.lib.utils2 import safe_unicode -from rhodecode.lib.indexers import SCHEMA, IDX_NAME +from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, CHGSET_IDX_NAME from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \ NodeDoesNotExistError -from whoosh.index import create_in, open_dir +from whoosh.index import create_in, open_dir, exists_in +from whoosh.query import * +from whoosh.qparser import QueryParser log = logging.getLogger('whoosh_indexer') @@ -89,12 +91,19 @@ self.filtered_repo_update_paths[repo_name] = repo self.repo_paths = self.filtered_repo_update_paths - self.initial = False + self.initial = True if not os.path.isdir(self.index_location): os.makedirs(self.index_location) log.info('Cannot run incremental index since it does not' ' yet exist running full build') - self.initial = True + elif not exists_in(self.index_location, IDX_NAME): + log.info('Running full index build as the file content' + ' index does not exist') + elif not exists_in(self.index_location, CHGSET_IDX_NAME): + log.info('Running full index build as the changeset' + ' index does not exist') + else: + self.initial = False def get_paths(self, repo): """ @@ -158,35 +167,86 @@ ) return indexed, indexed_w_content - def build_index(self): - if os.path.exists(self.index_location): - log.debug('removing previous index') - rmtree(self.index_location) + def index_changesets(self, writer, repo_name, repo, start_rev=0): + """ + Add all changeset in the vcs repo starting at start_rev + to the index writer + """ + + log.debug('indexing changesets in %s[%d:]' % (repo_name, start_rev)) - if not os.path.exists(self.index_location): - os.mkdir(self.index_location) + indexed=0 + for cs in repo[start_rev:]: + writer.add_document( + path=unicode(cs.raw_id), + owner=unicode(repo.contact), + repository=safe_unicode(repo_name), + author=cs.author, + message=cs.message, + revision=cs.revision, + last=cs.last, + added=u' '.join([node.path for node in cs.added]).lower(), + removed=u' '.join([node.path for node in cs.removed]).lower(), + changed=u' '.join([node.path for node in cs.changed]).lower(), + parents=u' '.join([cs.raw_id for cs in cs.parents]), + ) + indexed += 1 - idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME) - writer = idx.writer() - log.debug('BUILDING INDEX FOR EXTENSIONS %s ' - 'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys())) + log.debug('indexed %d changesets for repo %s' % (indexed, repo_name)) + + def index_files(self, file_idx_writer, repo_name, repo): + i_cnt = iwc_cnt = 0 + log.debug('building index for [%s]' % repo.path) + for idx_path in self.get_paths(repo): + i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name) + i_cnt += i + iwc_cnt += iwc + + log.debug('added %s files %s with content for repo %s' % (i_cnt + iwc_cnt, iwc_cnt, repo.path)) + + def update_changeset_index(self): + idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME) - for repo_name, repo in self.repo_paths.items(): - log.debug('building index @ %s' % repo.path) - i_cnt = iwc_cnt = 0 - for idx_path in self.get_paths(repo): - i, iwc = self.add_doc(writer, idx_path, repo, repo_name) - i_cnt += i - iwc_cnt += iwc - log.debug('added %s files %s with content for repo %s' % ( - i_cnt + iwc_cnt, iwc_cnt, repo.path) - ) + with idx.searcher() as searcher: + writer = idx.writer() + writer_is_dirty = False + try: + for repo_name, repo in self.repo_paths.items(): + # skip indexing if there aren't any revs in the repo + revs = repo.revisions + if len(revs) < 1: + continue + + qp = QueryParser('repository', schema=CHGSETS_SCHEMA) + q = qp.parse(u"last:t AND %s" % repo_name) + + results = searcher.search(q, sortedby='revision') + + last_rev = 0 + if len(results) > 0: + last_rev = results[0]['revision'] - log.debug('>> COMMITING CHANGES <<') - writer.commit(merge=True) - log.debug('>>> FINISHED BUILDING INDEX <<<') + # there are new changesets to index or a new repo to index + if last_rev == 0 or len(revs) > last_rev + 1: + # delete the docs in the index for the previous last changeset(s) + for hit in results: + q = qp.parse(u"last:t AND %s AND path:%s" % + (repo_name, hit['path'])) + writer.delete_by_query(q) - def update_index(self): + # index from the previous last changeset + all new ones + self.index_changesets(writer, repo_name, repo, last_rev) + writer_is_dirty = True + + finally: + if writer_is_dirty: + log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<') + writer.commit(merge=True) + log.debug('>> COMMITTED CHANGES TO CHANGESET INDEX<<') + else: + writer.cancel + + def update_file_index(self): log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s ' 'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys())) @@ -196,72 +256,117 @@ # The set of all paths we need to re-index to_index = set() - reader = idx.reader() writer = idx.writer() + writer_is_dirty = False + try: + with idx.reader() as reader: + + # Loop over the stored fields in the index + for fields in reader.all_stored_fields(): + indexed_path = fields['path'] + indexed_repo_path = fields['repository'] + indexed_paths.add(indexed_path) + + if not indexed_repo_path in self.filtered_repo_update_paths: + continue + + repo = self.repo_paths[indexed_repo_path] + + try: + node = self.get_node(repo, indexed_path) + # Check if this file was changed since it was indexed + indexed_time = fields['modtime'] + mtime = self.get_node_mtime(node) + if mtime > indexed_time: + # The file has changed, delete it and add it to the list of + # files to reindex + log.debug('adding to reindex list %s mtime: %s vs %s' % ( + indexed_path, mtime, indexed_time) + ) + writer.delete_by_term('fileid', indexed_path) + writer_is_dirty = True + + to_index.add(indexed_path) + except (ChangesetError, NodeDoesNotExistError): + # This file was deleted since it was indexed + log.debug('removing from index %s' % indexed_path) + writer.delete_by_term('path', indexed_path) + writer_is_dirty = True - # Loop over the stored fields in the index - for fields in reader.all_stored_fields(): - indexed_path = fields['path'] - indexed_repo_path = fields['repository'] - indexed_paths.add(indexed_path) + # Loop over the files in the filesystem + # Assume we have a function that gathers the filenames of the + # documents to be indexed + ri_cnt_total = 0 # indexed + riwc_cnt_total = 0 # indexed with content + for repo_name, repo in self.repo_paths.items(): + # skip indexing if there aren't any revisions + if len(repo) < 1: + continue + ri_cnt = 0 # indexed + riwc_cnt = 0 # indexed with content + for path in self.get_paths(repo): + path = safe_unicode(path) + if path in to_index or path not in indexed_paths: - if not indexed_repo_path in self.filtered_repo_update_paths: + # This is either a file that's changed, or a new file + # that wasn't indexed before. So index it! + i, iwc = self.add_doc(writer, path, repo, repo_name) + writer_is_dirty = True + log.debug('re indexing %s' % path) + ri_cnt += i + ri_cnt_total += 1 + riwc_cnt += iwc + riwc_cnt_total += iwc + log.debug('added %s files %s with content for repo %s' % ( + ri_cnt + riwc_cnt, riwc_cnt, repo.path) + ) + log.debug('indexed %s files in total and %s with content' % ( + ri_cnt_total, riwc_cnt_total) + ) + finally: + if writer_is_dirty: + log.debug('>> COMMITING CHANGES <<') + writer.commit(merge=True) + log.debug('>>> FINISHED REBUILDING INDEX <<<') + else: + writer.cancel() + + def build_indexes(self): + if os.path.exists(self.index_location): + log.debug('removing previous index') + rmtree(self.index_location) + + if not os.path.exists(self.index_location): + os.mkdir(self.index_location) + + chgset_idx = create_in(self.index_location, CHGSETS_SCHEMA, indexname=CHGSET_IDX_NAME) + chgset_idx_writer = chgset_idx.writer() + + file_idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME) + file_idx_writer = file_idx.writer() + log.debug('BUILDING INDEX FOR EXTENSIONS %s ' + 'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys())) + + for repo_name, repo in self.repo_paths.items(): + # skip indexing if there aren't any revisions + if len(repo) < 1: continue - repo = self.repo_paths[indexed_repo_path] - - try: - node = self.get_node(repo, indexed_path) - # Check if this file was changed since it was indexed - indexed_time = fields['modtime'] - mtime = self.get_node_mtime(node) - if mtime > indexed_time: - # The file has changed, delete it and add it to the list of - # files to reindex - log.debug('adding to reindex list %s mtime: %s vs %s' % ( - indexed_path, mtime, indexed_time) - ) - writer.delete_by_term('fileid', indexed_path) - - to_index.add(indexed_path) - except (ChangesetError, NodeDoesNotExistError): - # This file was deleted since it was indexed - log.debug('removing from index %s' % indexed_path) - writer.delete_by_term('path', indexed_path) + self.index_files(file_idx_writer, repo_name, repo) + self.index_changesets(chgset_idx_writer, repo_name, repo) - # Loop over the files in the filesystem - # Assume we have a function that gathers the filenames of the - # documents to be indexed - ri_cnt_total = 0 # indexed - riwc_cnt_total = 0 # indexed with content - for repo_name, repo in self.repo_paths.items(): - ri_cnt = 0 # indexed - riwc_cnt = 0 # indexed with content - for path in self.get_paths(repo): - path = safe_unicode(path) - if path in to_index or path not in indexed_paths: + log.debug('>> COMMITING CHANGES <<') + file_idx_writer.commit(merge=True) + chgset_idx_writer.commit(merge=True) + log.debug('>>> FINISHED BUILDING INDEX <<<') - # This is either a file that's changed, or a new file - # that wasn't indexed before. So index it! - i, iwc = self.add_doc(writer, path, repo, repo_name) - log.debug('re indexing %s' % path) - ri_cnt += i - ri_cnt_total += 1 - riwc_cnt += iwc - riwc_cnt_total += iwc - log.debug('added %s files %s with content for repo %s' % ( - ri_cnt + riwc_cnt, riwc_cnt, repo.path) - ) - log.debug('indexed %s files in total and %s with content' % ( - ri_cnt_total, riwc_cnt_total) - ) - log.debug('>> COMMITING CHANGES <<') - writer.commit(merge=True) - log.debug('>>> FINISHED REBUILDING INDEX <<<') + def update_indexes(self): + self.update_file_index() + self.update_changeset_index() def run(self, full_index=False): """Run daemon""" if full_index or self.initial: - self.build_index() + self.build_indexes() else: - self.update_index() + self.update_indexes()
--- a/rhodecode/templates/search/search.html Wed Jul 18 22:07:46 2012 +0200 +++ b/rhodecode/templates/search/search.html Fri Jul 20 12:50:56 2012 +0200 @@ -61,7 +61,7 @@ </div> <div class="select"> ${h.select('type',c.cur_type,[('content',_('File contents')), - ##('commit',_('Commit messages')), + ('commit',_('Commit messages')), ('path',_('File names')), ##('repository',_('Repository names')), ])} @@ -72,13 +72,13 @@ </div> ${h.end_form()} <div class="search"> - %if c.cur_search == 'content': + %if c.cur_type == 'content': <%include file='search_content.html'/> - %elif c.cur_search == 'path': + %elif c.cur_type == 'path': <%include file='search_path.html'/> - %elif c.cur_search == 'commit': + %elif c.cur_type == 'commit': <%include file='search_commit.html'/> - %elif c.cur_search == 'repository': + %elif c.cur_type == 'repository': <%include file='search_repository.html'/> %endif </div>
--- a/rhodecode/templates/search/search_commit.html Wed Jul 18 22:07:46 2012 +0200 +++ b/rhodecode/templates/search/search_commit.html Fri Jul 20 12:50:56 2012 +0200 @@ -0,0 +1,44 @@ +##commit highligthing + +%for cnt,sr in enumerate(c.formated_results): + %if h.HasRepoPermissionAny('repository.write','repository.read','repository.admin')(sr['repository'],'search results check'): + <div class="table"> + <div id="body${cnt}" class="codeblock"> + <div class="code-header"> + <div class="search-path">${h.link_to(h.literal('%s » %s' % (sr['repository'],sr['f_path'])), + h.url('changeset_home',repo_name=sr['repository'],revision=sr['path']))} + </div> + </div> + <div class="left"> + <div class="author"> + <div class="gravatar"> + <img alt="gravatar" src="${h.gravatar_url(h.email(sr['author']),20)}"/> + </div> + <span>${h.person(sr['author'])}</span><br/> + <span><a href="mailto:${h.email_or_none(sr['author'])}">${h.email_or_none(sr['author'])}</a></span><br/> + </div> + %if sr['message_hl']: + <div class="search-code-body"> + <pre>${h.literal(sr['message_hl'])}</pre> + </div> + %else: + <div class="message">${h.urlify_commit(sr['message'], sr['repository'])}</div> + %endif + </div> + </div> + </div> + %else: + %if cnt == 0: + <div class="table"> + <div id="body${cnt}" class="codeblock"> + <div class="error">${_('Permission denied')}</div> + </div> + </div> + %endif + %endif +%endfor +%if c.cur_query and c.formated_results: +<div class="pagination-wh pagination-left"> + ${c.formated_results.pager('$link_previous ~2~ $link_next')} +</div> +%endif
--- a/rhodecode/tests/functional/test_search.py Wed Jul 18 22:07:46 2012 +0200 +++ b/rhodecode/tests/functional/test_search.py Fri Jul 20 12:50:56 2012 +0200 @@ -27,7 +27,7 @@ self.log_user() response = self.app.get(url(controller='search', action='index'), {'q': 'def repo'}) - response.mustcontain('39 results') + response.mustcontain('10 results') def test_repo_search(self): self.log_user() @@ -35,3 +35,44 @@ {'q': 'repository:%s def test' % HG_REPO}) response.mustcontain('4 results') + + def test_search_last(self): + self.log_user() + response = self.app.get(url(controller='search', action='index'), + {'q': 'last:t', 'type': 'commit'}) + + response.mustcontain('1 results') + + def test_search_commit_message(self): + self.log_user() + response = self.app.get(url(controller='search', action='index'), + {'q': 'bother to ask where to fetch repo during tests', + 'type': 'commit'}) + + response.mustcontain('1 results') + response.mustcontain('a00c1b6f5d7a6ae678fd553a8b81d92367f7ecf1') + + def test_search_commit_changed_file(self): + self.log_user() + response = self.app.get(url(controller='search', action='index'), + {'q': 'changed:tests/utils.py', + 'type': 'commit'}) + + response.mustcontain('a00c1b6f5d7a6ae678fd553a8b81d92367f7ecf1') + + def test_search_commit_added_file(self): + self.log_user() + response = self.app.get(url(controller='search', action='index'), + {'q': 'added:README.rst', + 'type': 'commit'}) + + response.mustcontain('1 results') + response.mustcontain('3803844fdbd3b711175fc3da9bdacfcd6d29a6fb') + + def test_search_author(self): + self.log_user() + response = self.app.get(url(controller='search', action='index'), + {'q': 'author:marcin@python-blog.com revision:0', + 'type': 'commit'}) + + response.mustcontain('1 results')