changeset 6476:8b7c0ef62427

search: make "repository:" condition work case-insensitively as expected. Before this revision, a "repository:" condition when searching for "Commit messages" never showed revisions in a repository whose name contains upper case letters. Using ID for "repository" of CHGSETS_SCHEMA preserves the case of the repository name at indexing. On the other hand, the search condition itself is forcibly lowercased before parsing: - files in repository "FOO" are indexed as "FOO" in the "repository" field - a "repository:FOO" condition is treated as "repository:foo". The index search itself is then executed case-sensitively. Therefore, a "repository:FOO" condition never shows revisions in repository "FOO". But just making "repository" of CHGSETS_SCHEMA case-insensitive isn't good enough, because it breaks the assumptions below if there is a case-insensitive name collision between repositories, even though Kallithea itself can manage such repositories at the same time: - the combination of "raw_id" (= revision hash ID) and "repository" is unique among all known revisions under Kallithea. CHGSETS_SCHEMA assumes this. This uniqueness is required by the Whoosh library to determine whether the index table should be updated or not for that repository. - searching in a repository shows only revisions in that repository. Before this revision, this filtering was achieved by a "repository:" condition with the case-preserved repository name from the requested URL. To make the "repository:" search condition work case-insensitively as expected (without violating the assumptions above), this revision: - makes "repository" of CHGSETS_SCHEMA case-insensitive via "analyzer=ICASEIDANALYZER" - introduces "repository_rawname" into SCHEMA and CHGSETS_SCHEMA, to ensure the assumptions described above by preserving the case of the repository name. "repository_rawname" of SCHEMA uses not ID but TEXT, because the former disables the "positions" feature, which is required for highlighting file content (see the previous revision for details).
This revision requires fully rebuilding the index tables, because the indexing schemas have changed.
author FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
date Mon, 23 Jan 2017 02:17:38 +0900
parents caef0be39948
children 168cc92c1b53
files kallithea/controllers/search.py kallithea/lib/indexers/__init__.py kallithea/lib/indexers/daemon.py kallithea/tests/functional/test_search_indexing.py
diffstat 4 files changed, 22 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/kallithea/controllers/search.py	Mon Jan 23 02:17:38 2017 +0900
+++ b/kallithea/controllers/search.py	Mon Jan 23 02:17:38 2017 +0900
@@ -94,7 +94,9 @@
 
                 qp = QueryParser(search_type, schema=schema_defn)
                 if c.repo_name:
-                    cur_query = u'repository:%s %s' % (c.repo_name, cur_query)
+                    # use "repository_rawname:" instead of "repository:"
+                    # for case-sensitive matching
+                    cur_query = u'repository_rawname:%s %s' % (c.repo_name, cur_query)
                 try:
                     query = qp.parse(unicode(cur_query))
                     # extract words for highlight
--- a/kallithea/lib/indexers/__init__.py	Mon Jan 23 02:17:38 2017 +0900
+++ b/kallithea/lib/indexers/__init__.py	Mon Jan 23 02:17:38 2017 +0900
@@ -53,10 +53,20 @@
 #
 ICASEIDANALYZER = IDTokenizer() | LowercaseFilter()
 
+# CUSTOM ANALYZER raw-string
+#
+# This is useful to:
+# - avoid tokenization
+# - avoid removing "stop words" from text
+#
+IDANALYZER = IDTokenizer()
+
 #INDEX SCHEMA DEFINITION
 SCHEMA = Schema(
     fileid=ID(unique=True),
     owner=TEXT(),
+    # this field preserves case of repository name for exact matching
+    repository_rawname=TEXT(analyzer=IDANALYZER),
     repository=TEXT(stored=True, analyzer=ICASEIDANALYZER),
     path=TEXT(stored=True),
     content=FieldType(format=Characters(), analyzer=ANALYZER,
@@ -74,7 +84,10 @@
     date=NUMERIC(stored=True),
     last=BOOLEAN(),
     owner=TEXT(),
-    repository=ID(unique=True, stored=True),
+    # this field preserves case of repository name for exact matching
+    # and unique-ness in index table
+    repository_rawname=ID(unique=True),
+    repository=ID(stored=True, analyzer=ICASEIDANALYZER),
     author=TEXT(stored=True),
     message=FieldType(format=Characters(), analyzer=ANALYZER,
                       scorable=True, stored=True),
--- a/kallithea/lib/indexers/daemon.py	Mon Jan 23 02:17:38 2017 +0900
+++ b/kallithea/lib/indexers/daemon.py	Mon Jan 23 02:17:38 2017 +0900
@@ -203,6 +203,7 @@
         writer.add_document(
             fileid=p,
             owner=unicode(repo.contact),
+            repository_rawname=repo.name_unicode,
             repository=safe_unicode(repo_name),
             path=p,
             content=u_content,
@@ -241,6 +242,7 @@
                 raw_id=unicode(cs.raw_id),
                 owner=unicode(repo.contact),
                 date=cs._timestamp,
+                repository_rawname=repo.name_unicode,
                 repository=safe_unicode(repo_name),
                 author=cs.author,
                 message=cs.message,
--- a/kallithea/tests/functional/test_search_indexing.py	Mon Jan 23 02:17:38 2017 +0900
+++ b/kallithea/tests/functional/test_search_indexing.py	Mon Jan 23 02:17:38 2017 +0900
@@ -126,9 +126,9 @@
         response.mustcontain('>%d results' % hit)
 
     @parametrize('searchtype,query,hit', [
-        ('content', 'this_should_be_unique_content', 2),
+        ('content', 'this_should_be_unique_content', 1),
         ('commit', 'this_should_be_unique_commit_log', 1),
-        ('path', 'this_should_be_unique_filename.txt', 2),
+        ('path', 'this_should_be_unique_filename.txt', 1),
     ])
     def test_repository_case_sensitivity(self, searchtype, query, hit):
         self.log_user()
@@ -142,7 +142,7 @@
         response = self.app.get(url(controller='search', action='index'),
                                 {'q': q, 'type': searchtype})
 
-        response.mustcontain('>%d results' % hit)
+        response.mustcontain('>%d results' % (hit * 2))
 
         # (2) on the other hand, searching under the specific
         # repository should return results only for that repository,